In [1]:
# Imports needed for this ipython file
from collections import Counter
import matplotlib.pyplot as plt
import networkx as nx
import pickle
import numpy as np
import operator
import math
%matplotlib inline
In [6]:
#Remove nodes having zero degrees
def remove_zero_degree_nodes(G, verbose=False):
    """Remove every node of total degree zero from G, in place.

    Degree is measured with ``G.degree(n)``, which on a directed graph
    counts incoming as well as outgoing edges. Testing ``G.edges(n)``
    instead would only see outgoing edges and would therefore also drop
    nodes that are pointed to by others but point to nobody.

    Args:
        G: graph to prune; it is modified in place.
        verbose: when True, print the list of removed nodes, or
            "No nodes removed" if there were none.

    Returns:
        The same graph object G, with zero-degree nodes removed.
    """
    # Collect first and remove afterwards: the node set must not change
    # while it is being iterated.
    zero_degree_nodes = [n for n in list(G.nodes()) if G.degree(n) == 0]

    if verbose:
        if zero_degree_nodes:
            print(zero_degree_nodes)
        else:
            print("No nodes removed")

    G.remove_nodes_from(zero_degree_nodes)
    return G
# Methods to visualize top 20 ranked nodes of a graph
def get_top_20_ranked_nodes(graph, n=20):
    """Return the n nodes of graph with the highest PageRank scores.

    Args:
        graph: graph handed to ``nx.pagerank`` (called with alpha=0.9).
        n: how many top-ranked nodes to return (default 20).

    Returns:
        A list of at most n nodes in ascending PageRank order, so the
        best-ranked node comes last. Empty when n <= 0.
    """
    # A slice of [-0:] would return *every* node, and a negative n would
    # cut from the wrong end of the ranking, so handle both up front.
    if n <= 0:
        return []
    pr = nx.pagerank(graph, alpha=0.9)
    ranked = sorted(pr, key=pr.get)  # ascending by score; stable on ties
    return ranked[-n:]
def get_color(node, pr):
    """Pick the drawing colour for a node.

    Args:
        node: the node being drawn.
        pr: collection of top-ranked nodes.

    Returns:
        'r' (red) when node is contained in pr, otherwise 'w' (white).
    """
    return 'r' if node in pr else 'w'
def get_label(node, data, pr):
    """Pick the text label for a node.

    Args:
        node: the node being drawn.
        data: label value to show for this node.
        pr: collection of top-ranked nodes.

    Returns:
        data when node is contained in pr, otherwise the empty string.
    """
    return data if node in pr else ''
def get_edge_list(graph, nodelist):
    """Collect the edges reported by ``graph.edges(n)`` for each n in nodelist.

    Only the first endpoint of an edge is required to be in nodelist; the
    second endpoint may be any node. draw_network depends on this: it puts
    the endpoints that are not top-ranked into a separate shell.

    Args:
        graph: graph whose ``edges(n)`` method is queried.
        nodelist: nodes whose edges are wanted, in the order to visit them.

    Returns:
        A list of edge tuples, grouped by the order of nodelist.
    """
    wanted = set(nodelist)  # constant-time membership instead of list scans
    edges = []
    for n in nodelist:
        for edge in graph.edges(n):
            if edge[0] in wanted:
                edges.append(edge)
    return edges
def get_node_list(graph, edgelist):
    """List each node that occurs as an endpoint in edgelist exactly once.

    Args:
        graph: not used by this function.
        edgelist: iterable of edges; the first two items of each edge are
            taken as its endpoints.

    Returns:
        A list of the distinct endpoints (order follows set iteration).
    """
    endpoints = {
        endpoint
        for edge in edgelist
        for endpoint in (edge[0], edge[1])
    }
    return list(endpoints)
def draw_network(graph, title):
    """Draw the top-ranked nodes of graph together with their edges.

    The nodes come from get_top_20_ranked_nodes, the edges from
    get_edge_list, and the drawn node set is every endpoint of those edges.
    Top-ranked nodes are coloured and labelled with their 'name' attribute;
    all other drawn nodes are white and unlabelled. The figure is saved as
    "Output <title>.pdf" and then shown.

    Args:
        graph: graph to draw; node attribute 'name' supplies the labels.
        title: plot title, also used to build the output file name.
    """
    pr = get_top_20_ranked_nodes(graph)
    edgelist = get_edge_list(graph, pr)
    nodelist = get_node_list(graph, edgelist)
    colors = [get_color(node, pr) for node in nodelist]

    # A drawn node may have no 'name' attribute (e.g. if it only entered the
    # graph as an edge endpoint), so look names up with a default rather
    # than indexing, which would raise KeyError.
    data = nx.get_node_attributes(graph, 'name')
    labels = {node: get_label(node, data.get(node, ''), pr)
              for node in nodelist}

    # Two shells for nx.shell_layout: first the drawn nodes outside the
    # top-ranked set, then the top-ranked nodes themselves.
    second_layer = [node for node in nodelist if node not in pr]
    shells = [second_layer, pr]

    plt.figure(figsize=(10, 10))
    nx.draw_networkx(graph,
                     pos=nx.shell_layout(graph, shells),
                     node_color=colors,
                     labels=labels,
                     nodelist=nodelist,
                     edgelist=edgelist,
                     alpha=.5,
                     width=.1,
                     arrows=False,
                     node_size=100)
    plt.axis("off")
    plt.title(title)
    plt.savefig("Output "+title+".pdf")
    plt.show()
def _plot_degree_bars(x_pos, p_k, title, width):
    """Draw one P(k) bar chart on the current axes, ticks labelled by degree."""
    plt.xticks(x_pos, [degree for degree, _ in p_k])
    plt.bar(x_pos, [prob for _, prob in p_k],
            align='center', alpha=0.4, width=width)
    plt.xlabel('$k$')
    plt.ylabel('$P(k)$')
    plt.title(title)


def plot_degree_distribution(G, title, n):
    """Plot the degree distribution P(k) of G as three stacked bar charts.

    The distinct degree values are sorted ascending; P(k) is the fraction
    of nodes having degree k.
        chart 1 : the first 1/n of the distinct degree values
        chart 2 : the 20 smallest distinct degree values
        chart 3 : the 20 largest distinct degree values
    The figure is saved as "Output <title>.pdf" and then shown.

    Args:
        G: graph whose node degrees are counted.
        title: title of the first chart; also used for the file name.
        n: integer divisor selecting the leading 1/n slice for chart 1.
    """
    # dict(...) accepts both a plain dict and an iterable of
    # (node, degree) pairs, whichever nx.degree returns.
    degrees = dict(nx.degree(G))
    degree_counts = Counter(degrees.values())
    total = float(G.number_of_nodes())
    p_k = sorted((degree, count / total)
                 for degree, count in degree_counts.items())
    x_pos = list(range(len(p_k)))

    plt.figure(figsize=(20, 20))

    # Floor division keeps the slice bound an integer.
    head = len(p_k) // n
    plt.subplot(311)
    _plot_degree_bars(x_pos[:head], p_k[:head],
                      title+" (first 1/"+str(n)+" part of graph)", width=1)

    # First 20 distinct degrees
    plt.subplot(312)
    _plot_degree_bars(x_pos[:20], p_k[:20],
                      "Degree Distribution of first 20 degrees", width=0.5)

    # Last 20 distinct degrees
    plt.subplot(313)
    _plot_degree_bars(x_pos[-20:], p_k[-20:],
                      "Degree Distribution of last 20 degrees", width=0.5)

    plt.savefig("Output "+title+".pdf")
    plt.show()
# Method to plot clustering coefficient of a graph
def plot_clustering_coefficient(G, title):
    """Plot a histogram of the per-node clustering coefficients of G.

    The coefficients are computed on an undirected version of G. The
    figure is saved as "Output <title>.pdf" and then shown.

    Args:
        G: graph to analyse.
        title: used as the x-axis label and to build the file name.
    """
    undirected = G.to_undirected()
    coefficients = nx.clustering(undirected).values()
    plt.hist(coefficients)
    plt.xlabel(title)
    plt.ylabel('count')
    plt.savefig("Output "+title+".pdf")
    plt.show()
In [2]:
# Load the pre-processed IIT data. The with-blocks close the file handles,
# which a bare pickle.load(open(...)) leaves open.
# NOTE: pickle.load can execute arbitrary code; only load trusted files.
with open('processed_data/friend_ids_of_iit_followers') as fh:
    friend_ids_of_iit_followers_corrected = pickle.load(fh)
with open('processed_data/iit_follower_id_to_name_mapping') as fh:
    processed_iit_follower = pickle.load(fh)
In [3]:
# Build the directed IIT graph.
G = nx.DiGraph()

# Nodes: one per follower whose friend-list entry is an actual list.
# An integer entry is an error code and the follower is skipped.
for iit_follower_id, iit_follower_name in processed_iit_follower.items():
    if isinstance(friend_ids_of_iit_followers_corrected[iit_follower_id], (int, long)):
        continue
    G.add_node(iit_follower_id, name=iit_follower_name)

# Edges: follower -> friend, kept only when the friend is already in G.
# (Despite its name, follower_name holds the key of the friend-id mapping.)
for follower_name, friend_ids in friend_ids_of_iit_followers_corrected.items():
    if isinstance(friend_ids, (int, long)):  # Ignoring error codes
        continue
    for friend_id in friend_ids:
        if friend_id in G:
            G.add_edge(follower_name, friend_id)
In [4]:
# Size of the IIT graph: node count, then edge count.
print(G.number_of_nodes())
print(G.number_of_edges())
In [6]:
# Prune zero-degree nodes from the IIT graph and report which were removed.
# The helper modifies G in place and hands the same object back.
G = remove_zero_degree_nodes(G, verbose=True)
In [8]:
# Draw the top-ranked IIT nodes; the title also names the saved PDF.
draw_network(G, title=" Illinois tech top 20 ranked network")
In [16]:
# Degree distribution of the IIT graph; n=3 selects the leading third of
# the distinct degree values for the first chart.
plot_degree_distribution(G, title="Degree Distribution of IIT", n=3)
In [33]:
# Histogram of clustering coefficients for the IIT graph (run on a copy).
plot_clustering_coefficient(G.copy(), title="Clustering coefficient of IIT")
Restarting the kernel at this point is recommended to free up RAM. After restarting, re-run the imports cell and the cell that defines the network-processing helper functions before continuing.
In [2]:
# Data required for the NU graph. The with-blocks close the file handles,
# which a bare pickle.load(open(...)) leaves open.
# NOTE: pickle.load can execute arbitrary code; only load trusted files.
with open('processed_data/friend_ids_of_nu_followers') as fh:
    friend_ids_of_nu_followers_corrected = pickle.load(fh)
with open("processed_data/nu_follower_id_to_name_mapping") as fh:
    processed_nu_follower = pickle.load(fh)
In [3]:
# Build the directed NU graph.
G2 = nx.DiGraph()

# Nodes: one per follower whose friend-list entry is an actual list.
# Integer entries are skipped.
for nu_follower_id, nu_follower_name in processed_nu_follower.items():
    if isinstance(friend_ids_of_nu_followers_corrected[nu_follower_id], (int, long)):
        continue
    G2.add_node(nu_follower_id, name=nu_follower_name)

# Edges: follower -> friend, kept only when the friend is already in G2.
# (Despite its name, follower_name holds the key of the friend-id mapping.)
for follower_name, friend_ids in friend_ids_of_nu_followers_corrected.items():
    if isinstance(friend_ids, (int, long)):
        continue
    for friend_id in friend_ids:
        if friend_id in G2:
            G2.add_edge(follower_name, friend_id)
In [4]:
# Size of the NU graph: node count, then edge count.
print(G2.number_of_nodes())
print(G2.number_of_edges())
In [7]:
# Prune zero-degree nodes from the NU graph and report which were removed.
# The helper modifies G2 in place and hands the same object back.
G2 = remove_zero_degree_nodes(G2, verbose=True)
In [8]:
# Draw the top-ranked NU nodes; the title also names the saved PDF.
draw_network(G2, title=" North Western University top 20 ranked network")
In [9]:
# Degree distribution of the NU graph; n=8 selects the leading eighth of
# the distinct degree values for the first chart.
plot_degree_distribution(G2, title="Degree Distribution of NU", n=8)
In [14]:
# Histogram of clustering coefficients for the NU graph (run on a copy).
plot_clustering_coefficient(G2.copy(), title="Clustering coefficient of NU")