In [1]:
# Imports needed for this ipython file
from collections import Counter
import matplotlib.pyplot as plt
import networkx as nx
import pickle
import numpy as np
import operator
import math
%matplotlib inline
In [2]:
# Unpickle everything
# NOTE: pickle.load can execute arbitrary code embedded in the file; only load
# pickles that were produced by this project.
# Each file is opened in binary mode (pickles are binary data) and closed by
# its `with` block instead of being left open for the garbage collector.
with open('processed_data/iit_followers', 'rb') as pickle_file:
    iit_followers = pickle.load(pickle_file)
with open('raw_data/friends_of_iit_followers', 'rb') as pickle_file:
    friends_of_iit_followers = pickle.load(pickle_file)
with open('raw_data/friend_ids_of_iit_followers', 'rb') as pickle_file:
    friend_ids_of_iit_followers = pickle.load(pickle_file)
In [3]:
# Follower dictionaries of the three IIT accounts, one pickle per account.
# NOTE: pickle.load can execute arbitrary code; only load trusted files.
# Binary mode + `with` so every handle is closed deterministically.
with open('raw_data/iitalumniassn', 'rb') as pickle_file:
    iit_alumni_followers = pickle.load(pickle_file)
with open('raw_data/IITCareers', 'rb') as pickle_file:
    iit_career_followers = pickle.load(pickle_file)
with open('raw_data/illinoistech', 'rb') as pickle_file:
    iit_univ_followers = pickle.load(pickle_file)
In [6]:
def get_all_follower_objects(*args):
    """Merge several follower dictionaries into a single new dictionary.

    Args:
        *args: any number of dictionaries mapping a follower screen name to
            its follower object (one dictionary per account).
    Returns:
        A new dictionary of { follower_screen_name : follower_object } pairs.
        When the same key occurs in several inputs, the value from the
        right-most input is kept. The inputs themselves are not modified.
    """
    merged = {}
    for account_followers in args:
        merged.update(account_followers.items())
    return merged
In [5]:
# Union of the alumni, career and university follower dictionaries; when a
# screen name appears in more than one of them the right-most argument wins.
iit_follower_objects = get_all_follower_objects(iit_alumni_followers,iit_career_followers,iit_univ_followers)
In [7]:
# Reverse lookup for the IIT followers: follower id -> key of
# iit_follower_objects (the id is read from each follower object's "id" entry).
processed_iit_follower = {
    follower_object["id"]: follower
    for follower, follower_object in iit_follower_objects.items()
}
In [8]:
# Forward lookup for the IIT followers: key of iit_follower_objects -> the
# "id" entry of the corresponding follower object.
iit_follower_name_to_id = {
    follower: follower_object["id"]
    for follower, follower_object in iit_follower_objects.items()
}
In [ ]:
# Number of entries in the raw friend-id mapping.
print len(friend_ids_of_iit_followers.keys())
# Dumps the entire mapping to the cell output (can be very large).
print friend_ids_of_iit_followers
In [10]:
# Re-key the friend-id mapping: the keys of friend_ids_of_iit_followers are
# translated through iit_follower_name_to_id, the values are kept unchanged.
# A key missing from iit_follower_name_to_id raises KeyError.
friend_ids_of_iit_followers_corrected = {}
for key,value in friend_ids_of_iit_followers.items():
    friend_ids_of_iit_followers_corrected[iit_follower_name_to_id[key]] = value
# Persist the re-keyed mapping; `with` closes the file even if pickle.dump raises.
with open("processed_data/friend_ids_of_iit_followers",'wb') as fileObject:
    pickle.dump(friend_ids_of_iit_followers_corrected,fileObject)
In [2]:
# Reload the IIT data from disk so the graph cells can run without repeating
# the preprocessing cells.
# NOTE: pickle.load can execute arbitrary code; only load trusted files.
# Binary mode matches the 'wb' mode used when these files are written, and the
# `with` blocks close every handle.
with open('processed_data/friend_ids_of_iit_followers', 'rb') as pickle_file:
    friend_ids_of_iit_followers_corrected = pickle.load(pickle_file)
with open('processed_data/iit_follower_id_to_name_mapping', 'rb') as pickle_file:
    processed_iit_follower = pickle.load(pickle_file)
with open('raw_data/friend_ids_of_iit_followers', 'rb') as pickle_file:
    friend_ids_of_iit_followers = pickle.load(pickle_file)
In [12]:
# Tally the entries of friend_ids_of_iit_followers_corrected and print the totals.
# Values that are int/long fail the isinstance test below
# (presumably these are error codes stored instead of a friend list - TODO confirm).
# NOTE(review): the indentation of this cell was lost in this export, so it is
# not possible to tell from here whether follower_count.append(...) sits inside
# the `len(friends) > 1000` test or which `if` the `else` belongs to - verify
# against the original notebook before changing this logic.
error3_count = []
follower_count = []
other_error_count = []
for follower, friends in friend_ids_of_iit_followers_corrected.items():
if not isinstance(friends,(int,long)):
if len(friends) > 1000:
error3_count.append(len(friends))
follower_count.append(len(friends))
else:
other_error_count.append(follower)
# Total number of entries, followed by the size of each tally list.
print len(friend_ids_of_iit_followers_corrected.keys())
print len(error3_count)
print len(other_error_count)
print len(follower_count)
In [13]:
# Mean and median of the friend-list lengths collected in error3_count.
print np.mean(error3_count)
print np.median(error3_count)
In [14]:
# Size of the id -> name mapping next to the size of the re-keyed friend-id
# mapping, printed for comparison.
print len(processed_iit_follower.keys())
print len(friend_ids_of_iit_followers_corrected.keys())
In [15]:
# Pickling iit_follower_objects for future calculations.
# `with` guarantees the file is flushed and closed even if pickle.dump raises.
with open("raw_data/iit_follower_objects",'wb') as fileObject:
    pickle.dump(iit_follower_objects,fileObject)
In [16]:
# Pickling iit_follower_ids for graph calculations.
# `with` guarantees the file is flushed and closed even if pickle.dump raises.
with open("processed_data/iit_follower_id_to_name_mapping",'wb') as fileObject:
    pickle.dump(processed_iit_follower,fileObject)
In [3]:
# Empty directed graph for the IIT followers; the next cells add its nodes and edges.
G = nx.DiGraph()
In [4]:
# Add one node per follower id, carrying the mapped name as its `name`
# attribute. Followers whose friend entry is an int/long are left out.
for iit_follower_id, iit_follower_name in processed_iit_follower.items():
    if isinstance(friend_ids_of_iit_followers_corrected[iit_follower_id],(int,long)):
        continue
    G.add_node(iit_follower_id, name = iit_follower_name)
In [5]:
# Add an edge from each key of the re-keyed friend mapping to every one of its
# friend ids that is already a node of G. Entries whose value is an int/long
# are skipped.
for follower_name, friend_ids in friend_ids_of_iit_followers_corrected.items():
    if isinstance(friend_ids,(int,long)):
        continue
    for friend_id in friend_ids:
        if friend_id not in G:
            continue
        G.add_edge(follower_name,friend_id)
In [6]:
# Number of nodes and number of edges in G.
print len(G.nodes())
print len(G.edges())
In [4]:
def get_top_20_ranked_nodes(graph,n=20):
    """Return the n nodes of graph with the highest PageRank values.

    Args:
        graph: a graph accepted by nx.pagerank (the notebook passes a DiGraph).
        n: how many top-ranked nodes to return (default 20).
    Returns:
        A list of at most n nodes ordered by ascending PageRank value, i.e.
        the highest-ranked node is last. An empty list when n <= 0.
    """
    if n <= 0:
        # Without this guard the slice below misbehaves: [-0:] is the whole
        # list and a negative n drops items from the front instead.
        return []
    pr = nx.pagerank(graph, alpha=0.9)
    sorted_pr = sorted(pr.items(), key=operator.itemgetter(1))
    return [node for node, _ in sorted_pr[-n:]]
def get_color(node, pr):
    """Pick the drawing colour for a node.

    Args:
        node: the node to colour.
        pr: container of nodes that should be highlighted.
    Returns:
        'r' when node is contained in pr, 'w' otherwise.
    """
    return 'r' if node in pr else 'w'
def get_label(node, data, pr):
    """Pick the text label for a node.

    Args:
        node: the node to label.
        data: the label value to use when the node is highlighted.
        pr: container of nodes that should be labelled.
    Returns:
        data when node is contained in pr, the empty string otherwise.
    """
    if node not in pr:
        return ''
    return data
def get_edge_list(graph, nodelist):
    """Collect the edges reported by the graph for every node in nodelist.

    Args:
        graph: a graph exposing edges(node) (e.g. a networkx Graph/DiGraph).
        nodelist: iterable of hashable nodes to collect edges for.
    Returns:
        A list of edges, grouped in nodelist order, whose first endpoint is in
        nodelist. The second endpoint is deliberately not filtered:
        draw_network places such outside endpoints in their own shell.
    """
    # Set membership is O(1); the original tested against the list twice
    # (the same edge[0] condition was duplicated verbatim).
    members = set(nodelist)
    edges = []
    for n in nodelist:
        # graph.edges(n) is what nx.edges(graph, n) delegates to.
        edges.extend(edge for edge in graph.edges(n) if edge[0] in members)
    return edges
def get_node_list(graph, edgelist):
    """Return every distinct node that appears as an endpoint in edgelist.

    Args:
        graph: not used by this function.
        edgelist: iterable of edges; positions 0 and 1 of each edge are read.
    Returns:
        A list of the distinct endpoints, in no particular order.
    """
    endpoints = {
        endpoint
        for edge in edgelist
        for endpoint in (edge[0], edge[1])
    }
    return list(endpoints)
def draw_network(graph,title):
    """Draw the top PageRank nodes of graph with their edges, save and show it.

    The nodes returned by get_top_20_ranked_nodes are coloured and labelled
    via get_color / get_label using their `name` node attribute. The other
    endpoints of their edges are drawn too, with the non-highlight colour and
    no label. Top-ranked nodes that have no edge in the edge list are not drawn.

    Args:
        graph: networkx graph whose nodes may carry a `name` attribute.
        title: plot title; also used in the output file name
            "Output " + title + ".pdf".
    """
    pr = get_top_20_ranked_nodes(graph)
    edgelist = get_edge_list(graph, pr)
    nodelist = get_node_list(graph,edgelist)
    top_ranked = set(pr)  # constant-time membership tests below
    colors = [get_color(node,top_ranked) for node in nodelist]
    data = nx.get_node_attributes(graph,'name')
    # .get: a node without a `name` attribute gets an empty label instead of
    # raising KeyError.
    labels = {node: get_label(node,data.get(node,''),top_ranked) for node in nodelist}
    # Two shells for the layout: the remaining endpoints first, then the
    # top-ranked nodes.
    second_layer = [node for node in nodelist if node not in top_ranked]
    shells = [second_layer, pr]
    plt.figure(figsize=(10,10))
    nx.draw_networkx(graph,
                     pos=nx.shell_layout(graph,shells),
                     node_color=colors,
                     labels=labels,
                     nodelist = nodelist,
                     edgelist = edgelist,
                     alpha=.5,
                     width=.1,
                     arrows = False,
                     node_size=100)
    plt.axis("off")
    plt.title(title)
    plt.savefig("Output "+title+".pdf")
    plt.show()
#draw_network(G)
In [27]:
# Draw the top-ranked part of the IIT graph; the title is also used in the
# name of the PDF that draw_network saves.
draw_network(G," Illinois tech top 20 ranked network")
In [15]:
def _plot_degree_bars(position, x_pos, ks, p_k, panel_title, width):
    """Draw one P(k) bar-chart panel on the subplot identified by position."""
    plt.subplot(position)
    plt.xticks(x_pos, ks)
    plt.bar(x_pos, [probability for _, probability in p_k],
            align='center', alpha=0.4, width=width)
    plt.xlabel('$k$')
    plt.ylabel('$P(k)$')
    plt.title(panel_title)


def plot_degree_distribution(G,title,n):
    """Plot the degree distribution P(k) of G as three stacked bar charts.

    P(k) is the fraction of nodes having degree k. The distinct degree values
    are sorted in ascending order and shown as:
        panel 1 : the first 1/n of the distinct degree values,
        panel 2 : the 20 smallest distinct degree values,
        panel 3 : the 20 largest distinct degree values.
    The figure is saved as "Output " + title + ".pdf" and then shown.

    Args:
        G: a networkx graph.
        title: title of the first panel; also used in the output file name.
        n: non-zero integer divisor selecting how much of the sorted degree
            list is shown in the first panel.
    """
    # dict(...) accepts both a plain dict and a degree view, depending on
    # what nx.degree returns in the installed networkx version.
    degrees = dict(nx.degree(G))
    degree_counts = Counter(degrees.values())
    node_count = G.number_of_nodes()
    p_k = sorted((degree, 1. * count / node_count)
                 for degree, count in degree_counts.items())
    ks = [degree for degree, _ in p_k]
    x_pos = list(range(len(ks)))
    # Explicit floor division: slice bounds must be integers.
    head = len(ks) // n

    plt.figure(figsize=(20,20))
    _plot_degree_bars(311, x_pos[:head], ks[:head], p_k[:head],
                      title+" (first 1/"+str(n)+" part of graph)", 1)
    _plot_degree_bars(312, x_pos[:20], ks[:20], p_k[:20],
                      "Degree Distribution of first 20 degrees", 0.5)
    _plot_degree_bars(313, x_pos[-20:], ks[-20:], p_k[-20:],
                      "Degree Distribution of last 20 degrees", 0.5)
    plt.savefig("Output "+title+".pdf")
    plt.show()
#plot_degree_distribution(G)
In [16]:
# Degree distribution of the IIT graph; n = 3 limits the first panel to the
# first third of the sorted degree values.
plot_degree_distribution(G,title = "Degree Distribution of IIT", n =3)
In [6]:
def plot_clustering_coefficient(G,title):
    """Plot a histogram of the per-node clustering coefficients of G.

    The coefficients are computed on an undirected copy of G. The histogram
    is saved as "Output " + title + ".pdf" and then shown.

    Args:
        G: a networkx graph.
        title: used as the x-axis label and in the output file name.
    """
    undirected = G.to_undirected()
    coefficients = nx.clustering(undirected).values()
    plt.hist(coefficients)
    plt.xlabel(title)
    plt.ylabel('count')
    plt.savefig("Output "+title+".pdf")
    plt.show()
#plot_clustering_coefficient(G.copy())
In [33]:
# Clustering-coefficient histogram for the IIT graph, computed on a copy of G.
plot_clustering_coefficient(G.copy(),title = "Clustering coefficient of IIT")
In [14]:
def remove_zero_degree_nodes(G,verbose = False):
    """Find and remove the nodes of G that have no edges at all.

    Args:
        G: a networkx graph. It is modified in place.
        verbose: when True, print the removed nodes, or "No nodes removed"
            if there were none.
    Returns:
        The same graph object G, with its zero-degree nodes removed.
    """
    # G.degree counts incoming as well as outgoing edges. The previous test
    # used nx.edges(G, n), which only reports outgoing edges on a directed
    # graph and so also removed nodes that still had incoming edges.
    # The node list is materialised first so removal cannot disturb iteration.
    zero_degree_nodes = [node for node in list(G.nodes()) if G.degree(node) == 0]
    if verbose:
        if zero_degree_nodes:
            print(zero_degree_nodes)
        else:
            print("No nodes removed")
    G.remove_nodes_from(zero_degree_nodes)
    return G
#G = remove_zero_degree_nodes(G,verbose = True)
# NOTE(review): in the visible part of this file G2 is only created in a later
# cell, so this call depends on out-of-order execution and would raise
# NameError when the notebook is run top to bottom on a fresh kernel.
G2 = remove_zero_degree_nodes(G2,verbose = True)
In [3]:
# Load the processed NU follower data.
# NOTE: pickle.load can execute arbitrary code; only load trusted files.
# Binary mode + `with` so the handle is closed deterministically.
with open('processed_data/nu_followers', 'rb') as pickle_file:
    nu_followers = pickle.load(pickle_file)
In [7]:
# Follower dictionaries of the three NU accounts, one pickle per account.
# NOTE: pickle.load can execute arbitrary code; only load trusted files.
# Binary mode + `with` so every handle is closed deterministically.
with open('raw_data/nualumni', 'rb') as pickle_file:
    nu_alumni_followers = pickle.load(pickle_file)
with open('raw_data/NUCareerAdvance', 'rb') as pickle_file:
    nu_career_followers = pickle.load(pickle_file)
with open('raw_data/NorthwesternU', 'rb') as pickle_file:
    nu_univ_followers = pickle.load(pickle_file)
In [9]:
# Union of the university, alumni and career follower dictionaries; when a
# screen name appears in more than one of them the right-most argument wins.
nu_follower_objects = get_all_follower_objects(nu_univ_followers,nu_alumni_followers,nu_career_followers)
# Pickling nu_follower_objects for future calculations.
# `with` guarantees the file is flushed and closed even if pickle.dump raises.
with open("raw_data/nu_follower_objects",'wb') as fileObject:
    pickle.dump(nu_follower_objects,fileObject)
In [5]:
# Reading follower objects from pickle
# NOTE: pickle.load can execute arbitrary code; only load trusted files.
# Binary mode matches the 'wb' mode used when this file is written.
with open('raw_data/nu_follower_objects', 'rb') as pickle_file:
    nu_follower_objects = pickle.load(pickle_file)
In [6]:
# Reverse lookup for the NU followers: follower id -> key of
# nu_follower_objects (the id is read from each follower object's "id" entry).
processed_nu_follower = {
    follower_object["id"]: follower
    for follower, follower_object in nu_follower_objects.items()
}
In [7]:
# Pickling nu_follower_ids for graph calculations.
# `with` guarantees the file is flushed and closed even if pickle.dump raises.
with open("processed_data/nu_follower_id_to_name_mapping",'wb') as fileObject:
    pickle.dump(processed_nu_follower,fileObject)
In [18]:
# Raw friend-id data of the NU followers.
# NOTE: pickle.load can execute arbitrary code; only load trusted files.
# Binary mode + `with` so the handle is closed deterministically.
with open('raw_data/friend_ids_of_nu_followers', 'rb') as pickle_file:
    friend_ids_of_nu_followers = pickle.load(pickle_file)
In [22]:
# Re-key the friend-id mapping: each key of friend_ids_of_nu_followers is
# replaced by the "id" entry of the matching object in nu_follower_objects;
# the values are kept unchanged. A key missing from nu_follower_objects
# raises KeyError.
friend_ids_of_nu_followers_corrected = {}
for key,value in friend_ids_of_nu_followers.items():
    follower_object = nu_follower_objects[key]
    friend_ids_of_nu_followers_corrected[follower_object["id"]] = value
# Persist the re-keyed mapping; `with` closes the file even if pickle.dump raises.
with open("processed_data/friend_ids_of_nu_followers",'wb') as fileObject:
    pickle.dump(friend_ids_of_nu_followers_corrected,fileObject)
In [12]:
# Data required for graph
# NOTE: pickle.load can execute arbitrary code; only load trusted files.
# Binary mode matches the 'wb' mode used when these files are written, and the
# `with` blocks close every handle.
with open('processed_data/friend_ids_of_nu_followers', 'rb') as pickle_file:
    friend_ids_of_nu_followers_corrected = pickle.load(pickle_file)
with open("processed_data/nu_follower_id_to_name_mapping", 'rb') as pickle_file:
    processed_nu_follower = pickle.load(pickle_file)
In [13]:
# Create NU graph
G2 = nx.DiGraph()

# Adding nodes: one per follower id, carrying the mapped name as its `name`
# attribute. Followers whose friend entry is an int/long are left out.
for nu_follower_id, nu_follower_name in processed_nu_follower.items():
    if isinstance(friend_ids_of_nu_followers_corrected[nu_follower_id],(int,long)):
        continue
    G2.add_node(nu_follower_id, name = nu_follower_name)

# Adding edges: from each key of the re-keyed friend mapping to every one of
# its friend ids that is already a node of G2.
for follower_name, friend_ids in friend_ids_of_nu_followers_corrected.items():
    if isinstance(friend_ids,(int,long)):
        continue
    for friend_id in friend_ids:
        if friend_id not in G2:
            continue
        G2.add_edge(follower_name,friend_id)
In [15]:
# Number of nodes and number of edges in G2.
print len(G2.nodes())
print len(G2.edges())
In [9]:
# Draw the top-ranked part of the NU graph; the title is also used in the
# name of the PDF that draw_network saves.
draw_network(G2," North Western University top 20 ranked network")
In [21]:
# Degree distribution of the NU graph; n = 8 limits the first panel to the
# first eighth of the sorted degree values.
plot_degree_distribution(G2,title = "Degree Distribution of NU",n=8)
In [14]:
# Clustering-coefficient histogram for the NU graph, computed on a copy of G2.
plot_clustering_coefficient(G2.copy(),title = "Clustering coefficient of NU")