In this section we are going to parse the tweets we collected and build the social network of interactions between Twitter users. We will also see how to analyze the network using NetworkX.
Tweets are saved in JSON (JavaScript Object Notation) format. JSON is plain text written using JavaScript object notation.
The `json` Python module makes it easy to load JSON files into Python dictionaries.
In [1]:
# load tweets
import json

filename = 'AI2.txt'
tweet_list = []
# tweets are stored as UTF-8 JSON, one tweet per line
with open(filename, 'r', encoding='utf-8') as fopen:
    # each line corresponds to one tweet
    for line in fopen:
        # skip blank separator lines; the original test `line != '\n'`
        # let whitespace-only lines through, crashing json.loads
        if line.strip():
            tweet_list.append(json.loads(line))
Let's look at the information contained in a tweet
In [2]:
# take the third tweet of the list (index 2, not the first as the
# original comment claimed)
tweet = tweet_list[2]
In [3]:
# each tweet is a python dictionary
type(tweet)
Out[3]:
In [4]:
# all the 'entries' of the dictionary
tweet.keys()
Out[4]:
you can find a description of the fields in the Twitter API documentation: https://dev.twitter.com/overview/api/tweets
In [5]:
#creation time
tweet['created_at']
Out[5]:
In [6]:
# text of the tweet
print(tweet['text'])
In [7]:
# user info
tweet['user']
Out[7]:
In [8]:
# 'user' is itself a dict nested inside the tweet dict
print(type(tweet['user']))
tweet['user']['name']
Out[8]:
In [9]:
# unique id of the user
tweet['user']['id']
Out[9]:
In [10]:
#is the tweet a retweet?
'retweeted_status' in tweet
Out[10]:
In [11]:
if 'retweeted_status' in tweet:
print(tweet['retweeted_status'])
# the `retweeted_status` is also a tweet dictionary
In [12]:
# user id and name of the retweeted user?
if 'retweeted_status' in tweet:
print(tweet['retweeted_status']['user']['id'])
print(tweet['retweeted_status']['user']['name'])
In [13]:
# is the tweet a reply?
'in_reply_to_user_id' in tweet and tweet['in_reply_to_user_id'] is not None
Out[13]:
In [14]:
# 'entities' contains the hashtags, urls and usernames used in the tweet
tweet['entities']
Out[14]:
In [15]:
# user id of the mentioned users
for mention in tweet['entities']['user_mentions']:
print(mention['id'])
In [16]:
# is the tweet a quote?
'quoted_status' in tweet
Out[16]:
We will use the python module NetworkX
to construct and analyze the social network.
A short introduction to networkx: https://github.com/networkx/notebooks
There are four types of interactions between two users in Twitter:
In [17]:
# let's define some functions to extract the interactions from tweets
def getTweetID(tweet):
    """Return the unique ID of *tweet*, or None when the field is absent."""
    tweet_id = tweet.get('id')
    return tweet_id
def getUserIDandScreenName(tweet):
    """Return (user_id, screen_name) of the tweet's author.

    Falls back to (None, None) when the tweet carries no 'user' field.
    Either element may itself be None if the nested key is missing.
    """
    author = tweet.get('user')
    if author is None:
        return (None, None)
    return author.get('id'), author.get('screen_name')
def getRetweetedUserIDandSreenName(tweet):
    """Return (user_id, screen_name) of the retweeted user.

    Returns (None, None) when the tweet is not a retweet.
    NOTE: the "Sreen" typo in the name is kept because callers use it.
    """
    source = tweet.get('retweeted_status')
    if source is None:
        return (None, None)
    # a retweeted_status is itself a full tweet dict
    return getUserIDandScreenName(source)
def getRepliedUserIDandScreenName(tweet):
    """Return (user_id, screen_name) of the user this tweet replies to.

    Both elements are None when the tweet is not a reply.
    """
    return (tweet.get('in_reply_to_user_id'),
            tweet.get('in_reply_to_screen_name'))
def getUserMentionsIDandScreenName(tweet):
    """Return a list of (user_id, screen_name) tuples for every user
    mentioned in the tweet (this includes retweeted and replied users).

    Returns an empty list when the tweet has no 'entities' field or
    no 'user_mentions' entry.
    """
    mentions = []
    entities = tweet.get('entities')
    if entities is not None:
        # BUG FIX: 'user_mentions' itself may be missing, in which case
        # .get() returns None and the original raised TypeError when
        # iterating; default to an empty list instead
        for mention in entities.get('user_mentions') or []:
            mention_id = mention.get('id')
            screen_name = mention.get('screen_name')
            mentions.append((mention_id, screen_name))
    return mentions
def getQuotedUserIDandScreenName(tweet):
    """Return (user_id, screen_name) of the user this tweet quotes,
    or (None, None) when the tweet is not a quote."""
    quoted = tweet.get('quoted_status')
    if quoted is None:
        return (None, None)
    # a quoted_status is itself a full tweet dict
    return getUserIDandScreenName(quoted)
def getAllInteractions(tweet):
    """Return the tweeter and every user they interacted with.

    Returns:
        ((tweeter_id, tweeter_screenname), [(id, screenname), ...])
        The list is deduplicated, excludes the tweeter themself and the
        (None, None) placeholder. Returns ((None, None), []) when the
        tweeter cannot be identified.
    """
    tweeter = getUserIDandScreenName(tweet)
    # without an identifiable author there is nothing to record
    if tweeter[0] is None:
        return (None, None), []

    # a set deduplicates (id, screen_name) pairs: reply, retweet and
    # quote targets plus every explicit mention
    partners = {
        getRepliedUserIDandScreenName(tweet),
        getRetweetedUserIDandSreenName(tweet),
        getQuotedUserIDandScreenName(tweet),
    }
    partners.update(getUserMentionsIDandScreenName(tweet))
    # drop self-interactions and the "not present" marker
    partners.discard(tweeter)
    partners.discard((None, None))
    return tweeter, list(partners)
In [18]:
print(getUserIDandScreenName(tweet))
print(getAllInteractions(tweet))
In [19]:
import networkx as nx
# define an empty Directed Graph
# A directed graph is a graph where edges have a direction:
# here an edge goes from the user who sent the tweet to
# the user they interacted with (retweeted, replied, mentioned or quoted)
G = nx.DiGraph()
# loop over all the tweets and add edges if the tweet includes interactions
for tweet in tweet_list:
    # author of the tweet plus everyone they interacted with
    tweeter, interactions = getAllInteractions(tweet)
    tweeter_id, tweeter_name = tweeter
    # add an edge to the Graph for each interaction partner
    for interaction in interactions:
        interact_id, interact_name = interaction
        # add_edge creates the end nodes automatically if they are
        # not already in the network
        G.add_edge(tweeter_id, interact_id)
        # store the screen name as a node attribute
        # NOTE(review): `G.node[...]` is NetworkX 1.x API; in 2.x+ this
        # attribute access is `G.nodes[...]` — confirm installed version
        G.node[tweeter_id]['name'] = tweeter_name
        G.node[interact_id]['name'] = interact_name
In [20]:
# The graph's node are contained in a dictionary
print(type(G.node))
In [21]:
#print(G.node.keys())
# the keys are the user_id
print(G.node[tweeter_id])
In [22]:
# each node is itself a dictionary with node attributes as key,value pairs
print(type(G.node[tweeter_id]))
In [23]:
# edges are also contained in a dictionary
print(type(G.edge))
In [24]:
# we can see all the edges going out of this node
# each edge is a dictionary inside this dictionary with a key
# corresponding to the target user_id
print(G.edge[tweeter_id])
In [25]:
# so we can access the edge using the source user_id and the target user_id
G.edge[tweeter_id][interact_id]
Out[25]:
In [26]:
G.number_of_nodes()
Out[26]:
In [27]:
G.number_of_edges()
Out[27]:
In [28]:
# listing all nodes
node_list = G.nodes()
node_list[:3]
Out[28]:
In [29]:
# degree of a node
print(G.degree(node_list[2]))
print(G.in_degree(node_list[2]))
print(G.out_degree(node_list[2]))
In [30]:
# dictionaries with the degree of all nodes (NetworkX 1.x returns dicts)
all_degrees = G.degree(node_list)  # total degree, ignoring edge direction
in_degrees = G.in_degree(node_list)
# BUG FIX: the original called G.in_degree here as well, so
# 'out_degrees' silently duplicated 'in_degrees'
out_degrees = G.out_degree(node_list)
In [31]:
# average degree
2*G.number_of_edges()/G.number_of_nodes()
Out[31]:
In [32]:
import numpy as np
np.array(list(all_degrees.values())).mean()
Out[32]:
In [33]:
np.array(list(in_degrees.values())).mean()
Out[33]:
In [34]:
np.array(list(out_degrees.values())).mean()
Out[34]:
In [35]:
# maximum degree
max(all_degrees.values())
Out[35]:
In [36]:
# build a list of (user_id, username, degree) tuples for all nodes
degree_node_list = []
# NOTE(review): `nodes_iter` and `G.node` are NetworkX 1.x API
# (removed in 2.x) — confirm installed version
for node in G.nodes_iter():
    degree_node_list.append((node, G.node[node]['name'], G.degree(node)))
print('Unordered user, degree list')
print(degree_node_list[:10])
# sort the list by degree in descending order
degree_node_list = sorted(degree_node_list, key=lambda x:x[2], reverse=True)
print('Ordered user, degree list')
print(degree_node_list[:10])
In [37]:
# we need to import matplolib for making plots
# and numpy for numerical computations
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
For directed graphs we can define two types of components:
Weakly connected component (WCC): maximal set of nodes where there exists a path in at least one direction between each pair of nodes.
Strongly connected component (SCC): maximal set of nodes where there exists a path in both directions between each pair of nodes.
Weakly connected giant (largest) component (WCGC): Largest WCC Strongly connected giant (largest) component (SCGC): Largest SCC
In [38]:
# nx.weakly_connected_components yields one set of nodes per
# (weakly) connected component; collect them sorted from
# largest to smallest
components = sorted(nx.weakly_connected_components(G),
                    key=len, reverse=True)
In [39]:
# size of each component, in the same order as `components`
comp_sizes = [len(component) for component in components]
In [40]:
# plot the histogram of component sizes
hist = plt.hist(comp_sizes, bins=100)
In [41]:
# histogram with logarithmic y scale
hist = plt.hist(comp_sizes, bins=100, log=True)
plt.xlabel('component size')
plt.ylabel('number of components')
Out[41]:
In [42]:
# sizes of the ten largest components
comp_sizes[:10]
Out[42]:
In [43]:
# let's make a new graph which is the subgraph of G corresponding to
# the largest connected component
# let's find the largest component
largest_comp = components[0]
LCC = G.subgraph(largest_comp)
In [44]:
G.number_of_nodes()
Out[44]:
In [45]:
LCC.number_of_nodes()
Out[45]:
In [46]:
# let's plot the degree distribution inside the LCC
degrees = nx.degree(LCC)
degrees
Out[46]:
In [47]:
degree_array = np.array(list(degrees.values()))
hist = plt.hist(degree_array, bins=100)
In [48]:
# using logarithmic scales
hist = plt.hist(degree_array, bins=100, log=True)
plt.xscale('log')
In [49]:
# logarithmic scale with logarithmic bins
N, bins, patches = plt.hist(degree_array, bins=np.logspace(0,np.log10(degree_array.max()+1), 20), log=True)
plt.xscale('log')
plt.xlabel('k - degree')
plt.ylabel('number of nodes')
Out[49]:
In [50]:
# Degree probability distribution (P(k))
# since we have logarithmic bins, we need to
# take into account the fact that the bins
# have different lenghts when normalizing
bin_lengths = np.diff(bins) # length of each bin
# total histogram mass, weighting each count by its bin length
summ = np.sum(N*bin_lengths)
normalized_degree_dist = N/summ
# check normalization: should print 1.0
print(np.sum(normalized_degree_dist*bin_lengths))
hist = plt.bar(bins[:-1], normalized_degree_dist, width=np.diff(bins))
plt.xscale('log')
plt.yscale('log')
plt.xlabel('k (degree)')
plt.ylabel('P(k)')
Out[50]:
In [51]:
import random
def getGCsize(G):
    """Return the number of nodes in the largest connected component of G."""
    return max(len(component) for component in nx.connected_components(G))
In [52]:
# list that will contain the size of the GC as we remove nodes
rnd_attack_GC_sizes = []
# work on the undirected version of the largest component
LCCundirected = nx.Graph(LCC)
# NOTE(review): relies on nodes() returning a mutable list
# (NetworkX 1.x); in 2.x it returns an immutable NodeView — confirm
nodes_list = LCCundirected.nodes()
while len(nodes_list) > 1:
    # record the size of the current giant component
    rnd_attack_GC_sizes.append(getGCsize(LCCundirected))
    # pick a node uniformly at random
    rnd_node = random.choice(nodes_list)
    # remove it from the graph
    LCCundirected.remove_node(rnd_node)
    # keep the candidate list in sync with the graph
    nodes_list.remove(rnd_node)
In [53]:
# convert list to numpy array for elementwise arithmetic
rnd_attack_GC_sizes = np.array(rnd_attack_GC_sizes)
# normalize by the initial size of the GC (first recorded value)
GC_rnd = rnd_attack_GC_sizes/rnd_attack_GC_sizes[0]
# q = fraction of removed nodes, from 0 to 1
q = np.linspace(0,1,num=GC_rnd.size)
plt.plot(q,GC_rnd)
plt.xlabel('q')
plt.ylabel('GC')
Out[53]:
In [54]:
# high-degree (targeted) attack: remove nodes from highest degree down
LCCundirected = nx.Graph(LCC)
# node -> degree mapping (NetworkX 1.x nx.degree returns a dict here)
node_deg_dict = nx.degree(LCCundirected)
# node ids sorted by ASCENDING degree, so .pop() below yields the
# node with the highest initial degree.
# NOTE(review): degrees are computed once up front; the ranking is
# not recomputed as the graph shrinks
nodes_sorted = sorted(node_deg_dict, key=node_deg_dict.get)
# list that will contain the size of the GC as we remove nodes
hd_attack_GC_sizes = []
while len(nodes_sorted) > 1:
    hd_attack_GC_sizes.append(getGCsize(LCCundirected))
    # remove the highest-degree node remaining in the list
    node = nodes_sorted.pop()
    LCCundirected.remove_node(node)
In [55]:
hd_attack_GC_sizes = np.array(hd_attack_GC_sizes)
# normalize by the initial size of the GC
GC_hd = hd_attack_GC_sizes/hd_attack_GC_sizes[0]
# q = fraction of removed nodes, from 0 to 1
q = np.linspace(0,1,num=GC_hd.size)
# compare the two attack strategies on the same axes
plt.plot(q,GC_rnd, label='random attack')
plt.plot(q,GC_hd, label='High-Degree attack')
plt.xlabel('q')
plt.ylabel('GC')
plt.legend()
Out[55]:
In [57]:
# export the largest connected component in GraphML format
nx.write_graphml(LCC, 'twitter_lcc_AI2.graphml')
We can now open the file with Gephi to visualize the graph
In [ ]: