In [1]:
%matplotlib inline
import tweepy as tw
import json
import pandas as pd
import numpy as np
from collections import defaultdict, Counter
import os
from IPython.display import clear_output
import networkx as nx
import matplotlib.pyplot as plt
import seaborn as sns
from io import StringIO
from pydotplus import graph_from_dot_data
import matplotlib.image as mpimg
In [2]:
# Global seaborn styling for every figure in this notebook.
sns.set_context("poster")
sns.set_style("ticks")
In [3]:
# All data files, including the saved Twitter credentials, live under ../data.
DATA_DIR="../data"
TWITTER_CONFIG_FILE=os.path.join(DATA_DIR, "twitter_config.json")
Before proceeding, you should have created your Twitter application by following the steps on the Twitter App Creation page.
Make sure you have the following details of your Twitter application readily available:
When prompted by the code below, enter the value of each item exactly as shown in your Twitter application.
In [4]:
# One-time interactive setup: if no saved credential file exists, load the
# sample config to learn the expected keys, prompt for each value, and
# persist the completed config to TWITTER_CONFIG_FILE.
if not os.path.isfile(TWITTER_CONFIG_FILE):
    with open(os.path.join(DATA_DIR, "twitter_config.sample.json")) as fp:
        creds = json.load(fp)
    for k in sorted(creds.keys()):
        # NOTE(review): input() echoes secrets on screen; getpass.getpass()
        # would be safer — confirm before sharing this notebook.
        v = input("Enter %s:\t" % k)
        creds[k] = v
    print(creds)
    with open(TWITTER_CONFIG_FILE, "w+") as fp:
        json.dump(creds, fp, indent=4, sort_keys=True)
    # Scrub the echoed credentials from the cell output before saving/sharing.
    clear_output()
    print("Printed credentials to file %s" % TWITTER_CONFIG_FILE)
In [5]:
# Reload credentials from disk so later runs skip the interactive prompt.
with open(TWITTER_CONFIG_FILE) as fp:
    creds = json.load(fp)
# Show only the key names — never print the secret values here.
print(creds.keys())
In [6]:
# Authenticate via OAuth 1a and build the API client; retry/backoff settings
# keep long crawls alive through transient failures and rate limits.
# NOTE(review): OAuthHandler plus wait_on_rate_limit_notify is the tweepy v3
# API surface; tweepy v4 removed/renamed these — confirm the pinned version.
auth = tw.OAuthHandler(creds["consumer_key"], creds["consumer_secret"])
auth.set_access_token(creds["access_token"], creds["access_token_secret"])
api = tw.API(auth, wait_on_rate_limit=True, wait_on_rate_limit_notify=True,
             retry_count=5, retry_delay=100,
             )
print("Tweepy ready for search")
In [7]:
# Fetch a small sample of tweets matching an interactively entered query.
statuses = api.search(q=input("What is your search term?"), count=10)
In [8]:
len(statuses)
Out[8]:
In [9]:
# Show the text of each returned tweet.
for status in statuses:
    print(status.text)
In [10]:
def dict2df(data):
    """Turn a {item: count} mapping into a DataFrame sorted by count, descending.

    Parameters: data — any mapping from item to numeric count.
    Returns: DataFrame with columns ["item", "counts"].
    """
    rows = [(item, count) for item, count in data.items()]
    frame = pd.DataFrame(rows, columns=["item", "counts"])
    return frame.sort_values("counts", ascending=False)
def get_entities(statuses):
    """Tally hashtag and user-mention frequencies across a list of tweets.

    Parameters: statuses — tweepy Status objects exposing an .entities dict
    with optional "hashtags" (each with a "text" key) and "user_mentions"
    (each with a "screen_name" key) lists.
    Returns: (hashtags_df, mentions_df) — two DataFrames from dict2df,
    sorted by counts descending.
    """
    # Fix: the original built an unused `keys` tuple; dropped here.
    hashtags = defaultdict(int)
    mentions = defaultdict(int)
    for s in statuses:
        entities = s.entities
        # dict.get with a default replaces the explicit "in" checks.
        for tag in entities.get("hashtags", []):
            hashtags[tag["text"]] += 1
        for mention in entities.get("user_mentions", []):
            mentions[mention["screen_name"]] += 1
    return dict2df(hashtags), dict2df(mentions)
In [11]:
hashtags, mentions = get_entities(statuses)
In [12]:
len(statuses)
Out[12]:
In [13]:
hashtags
Out[13]:
In [14]:
mentions
Out[14]:
In [15]:
# Fetch and display the authenticated account's own profile object.
current_user = api.me()
current_user
Out[15]:
In [16]:
status
Out[16]:
In [17]:
# Print a short profile summary for the authenticated account.
profile_template = (
    "Username: {}\n"
    "Full Name: {}\n"
    "# Followers: {}\n"
    "# Friends: {}\n"
    "# Statuses: {}"
)
print(profile_template.format(
    current_user.screen_name,
    current_user.name,
    current_user.followers_count,
    current_user.friends_count,
    current_user.statuses_count
))
In [18]:
# Page through the authenticated user's friends (accounts they follow),
# 100 per API call; tweepy's Cursor handles pagination transparently.
friends = [friend for friend in tw.Cursor(api.friends, count=100).items()]
print("{} friends found for {}".format(len(friends), current_user.name))
In [19]:
# Summarize each friend as one row, ranked by follower count.
friend_records = [
    (f.id, f.name, f.friends_count, f.followers_count, f.statuses_count)
    for f in friends
]
df_friends = (
    pd.DataFrame(friend_records,
                 columns=["id", "name", "friends", "followers", "statuses"])
    .sort_values("followers", ascending=False)
    .reset_index(drop=True)
)
df_friends.head(15)
Out[19]:
In [20]:
# Preallocate an empty adjacency matrix: entry (i, j) will record whether
# friend i follows friend j.
network = np.zeros([df_friends.shape[0], df_friends.shape[0]])
network.shape
Out[20]:
In [21]:
def get_friendship(id1, id2, verbose=False):
    """Look up the follow relationship between two user ids via the Twitter API.

    Uses the module-level `api` client.  tweepy's show_friendship returns a
    (source, target) pair of relationship objects; this returns
    (response[0].following, response[1].following) — presumably
    (id1 follows id2, id2 follows id1).  TODO confirm against the tweepy
    version in use.

    Set verbose=True to print the raw response objects.
    """
    response = api.show_friendship(source_id=id1, target_id=id2)
    if verbose:
        print(response)
    return response[0].following, response[1].following
In [22]:
# Spot-check one friendship lookup between the two most-followed friends.
get_friendship(df_friends["id"].values[0], df_friends["id"].values[1], verbose=True)
Out[22]:
In [23]:
# Demo writes into the matrix: numpy stores the booleans as 0.0 / 1.0 floats.
network[0, 0] = False
network[1, 0] = True
network[0:3, 0]
Out[23]:
In [24]:
def generate_ego_network(df_friends, lookup=None):
    """Build the follow-tie adjacency matrix among the user's friends.

    Parameters
    ----------
    df_friends : DataFrame with an "id" column of user ids.
    lookup : optional callable (id1, id2) -> (bool, bool) reporting
        (id1 follows id2, id2 follows id1).  Defaults to get_friendship,
        which queries the Twitter API — inject a stub for testing.

    Returns the (n, n) float matrix; entry (i, j) is 1.0 when friend i
    follows friend j.  If any lookup fails, the partially filled matrix
    is returned immediately (the original best-effort behavior).
    """
    if lookup is None:
        lookup = get_friendship
    ids = df_friends["id"].values
    network = np.zeros([len(ids), len(ids)])
    processed_friendships = 0
    for i, fid1 in enumerate(ids):
        # Only pairs above the diagonal are queried; one call fills both
        # directions of the tie.
        for j, fid2 in enumerate(ids[i+1:], start=i+1):
            try:
                tie_labels = lookup(fid1, fid2)
                processed_friendships += 1
            except Exception as exc:
                # Fix: was a bare `except:`, which also swallowed
                # KeyboardInterrupt/SystemExit and hid the actual error.
                print("Processed friendships = {}".format(processed_friendships))
                print("Error occurred: {}".format(exc))
                return network
            network[i, j] = tie_labels[0]
            network[j, i] = tie_labels[1]
    return network
In [25]:
df_friends.tail()
Out[25]:
In [26]:
# Collect up to 1000 tweets for an interactively entered search term.
# NOTE(review): the standard search API caps `count` at 100 per request —
# values above that are presumably clamped server-side, with Cursor paging
# until .items(1000) is satisfied.  TODO confirm for this API tier.
statuses = [status for status in tw.Cursor(
    api.search, q=input("What is your search term?"), count=1000).items(1000)]
In [27]:
len(statuses)
Out[27]:
In [28]:
# Grab the first collected tweet that carries at least one hashtag.
status = next(filter(lambda x: len(x.entities["hashtags"]), statuses))
In [29]:
status.entities
Out[29]:
In [30]:
def get_entities(statuses, entity_type, text_property):
    """Count entities and co-occurring entity pairs across tweets.

    NOTE(review): this definition shadows the earlier one-argument
    get_entities defined in a previous cell; any call after this cell
    resolves to this three-argument version.

    Parameters
    ----------
    statuses : objects exposing an .entities dict of entity lists.
    entity_type : key into .entities, e.g. "hashtags" or "user_mentions".
    text_property : key holding each entity's text, e.g. "text" or
        "screen_name".

    Returns (entity_counts, entity_network): occurrence counts per
    lowercased entity, and counts per ordered co-occurrence pair within
    the same tweet.
    """
    entity_counts = defaultdict(int)
    entity_network = defaultdict(int)
    for status in statuses:
        entity_list = status.entities[entity_type]
        for i, entity in enumerate(entity_list):
            # Hoist the lowercased name; it was recomputed per inner pair.
            name = entity[text_property].lower()
            entity_counts[name] += 1
            # Pair with every later entity in the same tweet.
            # Fix: the inner enumerate index `j` was never used; dropped.
            for entity_2 in entity_list[i+1:]:
                entity_network[(name, entity_2[text_property].lower())] += 1
    return entity_counts, entity_network
In [31]:
# Count who gets mentioned, and which pairs of accounts are mentioned
# together in the same tweet.
entity_type="user_mentions"
text_property="screen_name"
entity_counts, entity_network = get_entities(statuses, entity_type, text_property)
In [32]:
# Rank mentioned accounts by how often they appear.
df_entities = (
    pd.DataFrame(list(entity_counts.items()), columns=["entity", "counts"])
    .sort_values("counts", ascending=False)
    .reset_index(drop=True)
)
df_entities.head()
Out[32]:
In [33]:
df_entities.head(20)
Out[33]:
In [34]:
# Flatten the pair-count mapping into one row per co-mentioned pair,
# ranked by co-occurrence count.
pair_rows = [(first, second, count)
             for (first, second), count in entity_network.items()]
pair_columns = ["{}_1".format(entity_type),
                "{}_2".format(entity_type),
                "counts"]
df_entity_pairs = (
    pd.DataFrame(pair_rows, columns=pair_columns)
    .sort_values("counts", ascending=False)
    .reset_index(drop=True)
)
df_entity_pairs.head()
Out[34]:
In [35]:
df_entity_pairs.head(20)
Out[35]:
In [36]:
# Undirected graph: nodes are mentioned accounts, edges are co-mentions.
G = nx.Graph()
In [37]:
# One node per mentioned account (iterating the dict yields its keys).
G.add_nodes_from(entity_counts)
In [38]:
# Attach co-mention counts as edge weights.
weighted_edges = [
    (source, target, {"weight": count})
    for (source, target), count in entity_network.items()
]
G.add_edges_from(weighted_edges)
In [39]:
# Draw the full mention graph with a force-directed (spring) layout,
# sizing each node at 3x its degree.
# NOTE(review): degree_iter() exists only in networkx 1.x; it was removed in
# networkx 2.0 (use `dict(G.degree())` there) — confirm the pinned version.
fig, ax = plt.subplots(1,1)
nx.draw_networkx(
    G, with_labels=True,
    node_size=[x[1]*3 for x in G.degree_iter()],
    pos=nx.spring_layout(G),
    ax=ax
)
ax.axis("off")
Out[39]:
In [40]:
# Split the mention graph into connected components, largest first.
# NOTE(review): connected_component_subgraphs() was removed in networkx 2.4;
# use (G.subgraph(c).copy() for c in nx.connected_components(G)) on newer
# versions — confirm the pinned version.
connected_components = sorted(nx.connected_component_subgraphs(G), key = len, reverse=True)
print("{} connected components found.".format(len(connected_components)))
In [41]:
# Zoom in on the largest component only, with bigger node scaling (5x degree).
# NOTE(review): degree_iter() is networkx 1.x-only (removed in 2.0).
fig, ax = plt.subplots(1,1)
nx.draw_networkx(
    connected_components[0], with_labels=True,
    node_size=[x[1]*5 for x in connected_components[0].degree_iter()],
    pos=nx.spring_layout(connected_components[0]),
    ax=ax
)
ax.axis("off")
Out[41]:
In [42]:
# Side-by-side log-frequency histograms: node degree vs. raw mention counts.
# NOTE(review): G.degree() returning a dict is networkx 1.x behavior (it is a
# DegreeView in 2.x).  Also, with bins given as range(max), values at the
# maximum lie beyond the last bin edge and are dropped — consider
# range(max + 2).  Confirm intent.
fig, ax = plt.subplots(1,2, figsize=(16,8))
ax[0].hist(list(G.degree().values()), bins=list(range(max(G.degree().values()))), log=True)
ax[0].set_xlabel("Degree")
ax[0].set_ylabel("Frequency")
ax[1].hist(list(entity_counts.values()), bins=list(range(max(entity_counts.values()))), log=True)
ax[1].set_xlabel("Counts")
ax[1].set_ylabel("Frequency")
sns.despine(offset=10)
In [ ]: