In [1]:
    
%matplotlib inline
import tweepy as tw
import json
import pandas as pd
import numpy as np
from collections import defaultdict, Counter
import os
from IPython.display import clear_output
import networkx as nx
import matplotlib.pyplot as plt
import seaborn as sns
from io import StringIO
from pydotplus import graph_from_dot_data
import matplotlib.image as mpimg
    
In [2]:
    
sns.set_context("poster")
sns.set_style("ticks")
    
In [3]:
    
DATA_DIR="../data"
TWITTER_CONFIG_FILE=os.path.join(DATA_DIR, "twitter_config.json")
    
Before proceeding, you should have created your Twitter application by following the steps on the Twitter App Creation page.
Make sure you have the following details of your Twitter application readily available: the consumer key, consumer secret, access token, and access token secret.
When prompted by the code below, enter the value of each item as shown in your Twitter application.
In [4]:
    
if not os.path.isfile(TWITTER_CONFIG_FILE):
    # Bootstrap the credentials file from the bundled sample: load the sample
    # to learn which keys are required, then prompt the user for each value.
    with open(os.path.join(DATA_DIR, "twitter_config.sample.json")) as fp:
        creds = json.load(fp)
    for k in sorted(creds.keys()):
        # NOTE(review): input() echoes the secret while typing; getpass.getpass
        # would hide it. clear_output() below removes it from the cell output.
        creds[k] = input("Enter %s:\t" % k)
    # Do NOT print the credentials themselves -- printing would persist the
    # secrets in the saved notebook output.
    with open(TWITTER_CONFIG_FILE, "w") as fp:
        json.dump(creds, fp, indent=4, sort_keys=True)
    clear_output()
    print("Saved credentials to file %s" % TWITTER_CONFIG_FILE)
    
In [5]:
    
with open(TWITTER_CONFIG_FILE) as fp:
    creds = json.load(fp)
print(creds.keys())
    
    
In [6]:
    
# Authenticate against the Twitter API with the OAuth 1.0a credentials
# loaded from TWITTER_CONFIG_FILE in the previous cell.
auth = tw.OAuthHandler(creds["consumer_key"], creds["consumer_secret"])
auth.set_access_token(creds["access_token"], creds["access_token_secret"])
# wait_on_rate_limit(_notify): sleep (and print a notice) when Twitter's rate
# limit is hit instead of raising; retry transient failures up to 5 times
# with a 100-second delay between attempts.
# NOTE(review): wait_on_rate_limit_notify exists only in tweepy 3.x -- it was
# removed in tweepy 4.0; confirm the installed tweepy version.
api = tw.API(auth, wait_on_rate_limit=True, wait_on_rate_limit_notify=True,
             retry_count=5, retry_delay=100, 
            )
print("Tweepy ready for search")
    
    
In [7]:
    
statuses = api.search(q=input("What is your search term?"), count=10)
    
    
In [8]:
    
len(statuses)
    
    Out[8]:
In [9]:
    
for status in statuses:
    print(status.text)
    
    
In [10]:
    
def dict2df(data):
    """Convert an {item: count} mapping to a DataFrame sorted by count.

    Parameters
    ----------
    data : mapping of hashable item -> int count

    Returns
    -------
    pd.DataFrame with columns ["item", "counts"], highest count first.
    """
    return pd.DataFrame(
        list(data.items()),
        columns=["item", "counts"]
    ).sort_values("counts", ascending=False)


def get_entities(statuses):
    """Count hashtag and user-mention occurrences across a set of tweets.

    Parameters
    ----------
    statuses : iterable of tweepy Status objects (reads ``status.entities``)

    Returns
    -------
    (hashtags_df, mentions_df) : pair of DataFrames as built by dict2df.
    """
    # Counter (imported at the top of the notebook) replaces the manual
    # defaultdict(int) accumulation; the unused `keys` tuple was removed.
    hashtags = Counter()
    mentions = Counter()
    for s in statuses:
        entities = s.entities
        # Entity payloads: hashtags carry "text", user mentions "screen_name".
        hashtags.update(h["text"] for h in entities.get("hashtags", []))
        mentions.update(m["screen_name"] for m in entities.get("user_mentions", []))
    return dict2df(hashtags), dict2df(mentions)
    
In [11]:
    
hashtags, mentions = get_entities(statuses)
    
In [12]:
    
len(statuses)
    
    Out[12]:
In [13]:
    
hashtags
    
    Out[13]:
In [14]:
    
mentions
    
    Out[14]:
In [15]:
    
current_user = api.me()
current_user
    
    Out[15]:
In [16]:
    
status
    
    Out[16]:
In [17]:
    
print(
"""Username: {}
Full Name: {}
# Followers: {}
# Friends: {}
# Statuses: {}""".format(
        current_user.screen_name,
        current_user.name,
        current_user.followers_count,
        current_user.friends_count,
        current_user.statuses_count
    )
)
    
    
In [18]:
    
# Page through the authenticated user's friends (accounts they follow),
# 100 per API call, collecting every Friend object into a list.
friends = [friend for friend in tw.Cursor(api.friends, count=100).items()]
print("{} friends found for {}".format(len(friends), current_user.name))
    
    
In [19]:
    
# Summarize each friend as one row (id, name, follow counts, tweet count);
# sort so the most-followed accounts appear first.
friend_rows = [
    (f.id, f.name, f.friends_count, f.followers_count, f.statuses_count)
    for f in friends
]
df_friends = (
    pd.DataFrame(friend_rows,
                 columns=["id", "name", "friends", "followers", "statuses"])
    .sort_values("followers", ascending=False)
    .reset_index(drop=True)
)
df_friends.head(15)
    
    Out[19]:
In [20]:
    
network = np.zeros([df_friends.shape[0], df_friends.shape[0]])
network.shape
    
    Out[20]:
In [21]:
    
def get_friendship(id1, id2, verbose=False):
    """Return the mutual-follow status between two user ids.

    Calls api.show_friendship, which yields a (source, target) pair of
    Friendship objects; ``.following`` on each gives the tie direction.

    Returns
    -------
    (bool, bool) : (id1 follows id2, id2 follows id1)
    """
    friendship = api.show_friendship(source_id=id1, target_id=id2)
    if verbose:
        print(friendship)
    src, tgt = friendship[0], friendship[1]
    return src.following, tgt.following
    
In [22]:
    
get_friendship(df_friends["id"].values[0], df_friends["id"].values[1], verbose=True)
    
    
    Out[22]:
In [23]:
    
network[0, 0] = False
network[1, 0] = True
network[0:3, 0]
    
    Out[23]:
In [24]:
    
def generate_ego_network(df_friends):
    network = np.zeros([df_friends.shape[0], df_friends.shape[0]])
    processed_friendships=0
    for i, fid1 in enumerate(df_friends["id"].values):
        for j, fid2 in enumerate(df_friends["id"].values[i+1:], start=i+1):
            try:
                tie_labels = get_friendship(fid1, fid2)
                processed_friendships += 1
            except:
                print("Processed friendships = {}".format(processed_friendships))
                print("Error occurred")
                return network
            network[i, j] = tie_labels[0]
            network[j, i] = tie_labels[1]
    return network
    
In [25]:
    
df_friends.tail()
    
    Out[25]:
In [26]:
    
statuses = [status for status in tw.Cursor(
    api.search, q=input("What is your search term?"), count=1000).items(1000)]
    
    
In [27]:
    
len(statuses)
    
    Out[27]:
In [28]:
    
status = next(filter(lambda x: len(x.entities["hashtags"]), statuses))
    
In [29]:
    
status.entities
    
    Out[29]:
In [30]:
    
def get_entities(statuses, entity_type, text_property):
    """Tally entity occurrences and within-tweet co-occurrence pairs.

    Note: this redefines the two-argument get_entities helper from the
    earlier cell (the later definition shadows the earlier one).

    Parameters
    ----------
    statuses : iterable of tweepy Status objects (reads ``status.entities``)
    entity_type : key into status.entities, e.g. "user_mentions"
    text_property : key of each entity payload to count, e.g. "screen_name"

    Returns
    -------
    (entity_counts, entity_network) where entity_counts maps a lowercased
    entity string to its total occurrence count, and entity_network maps
    an (earlier, later) pair of entity strings -- ordered by position
    within the tweet -- to the number of times they co-occur.
    """
    entity_counts = defaultdict(int)
    entity_network = defaultdict(int)
    for status in statuses:
        names = [e[text_property].lower() for e in status.entities[entity_type]]
        for i, first in enumerate(names):
            entity_counts[first] += 1
            for second in names[i + 1:]:
                entity_network[(first, second)] += 1
    return entity_counts, entity_network
    
In [31]:
    
entity_type="user_mentions"
text_property="screen_name"
entity_counts, entity_network = get_entities(statuses, entity_type, text_property)
    
In [32]:
    
# Rank individual entities (user mentions) by how often they appear.
df_entities = pd.DataFrame(list(entity_counts.items()),
                           columns=["entity", "counts"]).sort_values(
    "counts", ascending=False
).reset_index(drop=True)
df_entities.head()
    
    Out[32]:
In [33]:
    
df_entities.head(20)
    
    Out[33]:
In [34]:
    
# Flatten the co-occurrence dict into rows of (entity_1, entity_2, counts),
# ranked by how often the pair appears together in a tweet.
df_entity_pairs = pd.DataFrame([(k1, k2, v) for (k1,k2), v in entity_network.items()],
                           columns=[
                               "{}_1".format(entity_type),
                               "{}_2".format(entity_type),
                               "counts"]).sort_values(
    "counts", ascending=False
).reset_index(drop=True)
df_entity_pairs.head()
    
    Out[34]:
In [35]:
    
df_entity_pairs.head(20)
    
    Out[35]:
In [36]:
    
G = nx.Graph()
    
In [37]:
    
G.add_nodes_from(entity_counts)
    
In [38]:
    
# Add one weighted edge per co-mentioned pair; the edge weight is the
# pair's co-occurrence count from entity_network.
G.add_edges_from([
    (k[0], k[1], {"weight": v})
    for k, v in entity_network.items()
])
    
In [39]:
    
# Draw the full mention graph; node size scales with node degree.
# NOTE(review): Graph.degree_iter() exists only in networkx 1.x (removed in
# 2.0, where G.degree() itself is iterable) -- confirm the installed version.
fig, ax = plt.subplots(1,1)
nx.draw_networkx(
    G, with_labels=True,
    node_size=[x[1]*3 for x in G.degree_iter()],
    pos=nx.spring_layout(G),
    ax=ax
)
ax.axis("off")
    
    Out[39]:
    
    
In [40]:
    
# Split the mention graph into its connected components, largest first.
# NOTE(review): nx.connected_component_subgraphs was removed in networkx 2.4;
# on newer versions use (G.subgraph(c) for c in nx.connected_components(G)).
connected_components = sorted(nx.connected_component_subgraphs(G), key = len, reverse=True)
print("{} connected components found.".format(len(connected_components)))
    
    
In [41]:
    
# Draw only the largest connected component; node size scales with degree.
# NOTE(review): degree_iter() is networkx 1.x API -- see note on the earlier
# plotting cell.
fig, ax = plt.subplots(1,1)
nx.draw_networkx(
    connected_components[0], with_labels=True,
    node_size=[x[1]*5 for x in connected_components[0].degree_iter()],
    pos=nx.spring_layout(connected_components[0]),
    ax=ax
)
ax.axis("off")
    
    Out[41]:
    
    
In [42]:
    
# Left: degree distribution of the mention graph; right: distribution of raw
# mention counts. Log-scaled y-axis to expose the heavy tail.
# NOTE(review): G.degree() returning a dict (with .values()) is networkx 1.x
# behavior; in 2.x use dict(G.degree()).values().
fig, ax = plt.subplots(1,2, figsize=(16,8))
ax[0].hist(list(G.degree().values()), bins=list(range(max(G.degree().values()))), log=True)
ax[0].set_xlabel("Degree")
ax[0].set_ylabel("Frequency")
ax[1].hist(list(entity_counts.values()), bins=list(range(max(entity_counts.values()))), log=True)
ax[1].set_xlabel("Counts")
ax[1].set_ylabel("Frequency")
sns.despine(offset=10)
    
    
    
In [ ]: