Graph Analysis - I

Imports

The new import we are doing in this class is networkx:

http://networkx.github.io/documentation/latest/tutorial/


In [ ]:
import time

import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

import scipy as sp
import scipy.sparse.linalg as linalg
import scipy.cluster.hierarchy as hr
from scipy.spatial.distance import pdist, squareform

import sklearn.datasets as datasets
import sklearn.metrics as metrics
import sklearn.utils as utils
import sklearn.linear_model as linear_model
import sklearn.svm as svm
import sklearn.cross_validation as cross_validation
import sklearn.cluster as cluster
from sklearn.ensemble import AdaBoostClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.decomposition import TruncatedSVD
from sklearn.preprocessing import StandardScaler

import statsmodels.api as sm

from patsy import dmatrices

import networkx as nx

import seaborn as sns
%matplotlib inline

Basic graph concepts in NetworkX

Undirected Graphs


In [ ]:
g = nx.Graph()

Adding to the graph one node at a time


In [ ]:
g.add_node(1)

Adding multiple nodes at a time


In [ ]:
g.add_nodes_from([2,3])

Nodes are objects themselves


In [ ]:
g.add_node('ET')

In [ ]:
print g.nodes()

Nodes can also be removed


In [ ]:
g.remove_node(1)

In [ ]:
print g.nodes()

Adding edges to the graph


In [ ]:
g.add_edge(1,2)
g.add_edge(3,'ET')
g.add_edges_from([(2,3), (1,3)])

In [ ]:
print g.edges()

In [ ]:
print g.nodes()

Removing edges


In [ ]:
g.remove_edge(1,2)

In [ ]:
print g.edges()

In [ ]:
print g.nodes()

Neighbors, degrees etc.


In [ ]:
g.neighbors(1)

In [ ]:
g.degree(1)
Any NetworkX graph behaves like a Python dictionary with nodes as primary keys

In [ ]:
g.add_node(1, time='5pm')

In [ ]:
g.node[1]['time']

In [ ]:
g.node[1] # Python dictionary

The special edge attribute "weight" should always be numeric and holds values used by algorithms requiring weighted edges.


In [ ]:
g.add_edge(1, 2, weight=4.0 )

In [ ]:
g[1][2]['weight'] = 5.0 # edge already added

In [ ]:
g[1][2]
Node and edge iterators

In [ ]:
for node in g:
    print 'nodeid: ', node, '\t degree:', g.degree(node)

In [ ]:
print g.edges(data=True)

Add the nodes from any container (a list, dict, set or even the lines from a file or the nodes from another graph).


In [ ]:
G = nx.DiGraph()
G.add_node(1)
G.add_nodes_from([2,3])
G.add_nodes_from(range(100,110))
H=nx.Graph()
H.add_path([0,1,2,3,4,5,6,7,8,9])
G.add_nodes_from(H)

In [ ]:
print G.nodes()

G can also grow by adding edges


In [ ]:
G.add_edge(1, 2)
G.add_edges_from([(1,2),(1,3)])
G.add_edges_from(H.edges())

In [ ]:
print G.edges()

Attributes:

Each graph, node, and edge can hold key/value attribute pairs in an associated attribute dictionary (the keys must be hashable). By default these are empty, but can be added or changed using add_edge, add_node or direct manipulation of the attribute dictionaries named graph, node and edge respectively.


In [ ]:
G = nx.DiGraph(day="Friday")
print G.graph

Add node attributes using add_node(), add_nodes_from() or G.node


In [ ]:
G.add_node(1, time='5pm')
G.add_nodes_from([3], time='2pm')
print G.node[1]
G.node[1]['room'] = 714
del G.node[1]['room'] # remove attribute
print G.nodes(data=True)

Add edge attributes using add_edge(), add_edges_from(), subscript notation, or G.edge.


In [ ]:
G.add_edge(1, 2, weight=4.7 )
G.add_edges_from([(3,4),(4,5)], color='red')
G.add_edges_from([(1,2,{'color':'blue'}), (2,3,{'weight':8})])
G[1][2]['weight'] = 4.7
G.edge[1][2]['weight'] = 4

print G.edges(data=True)

Many common graph features allow python syntax to speed reporting.


In [ ]:
1 in G     # check if node in graph

In [ ]:
[n for n in G if n<3]   # iterate through nodes

In [ ]:
len(G)  # number of nodes in graph

In [ ]:
print G[1] # adjacency dict keyed by neighbor to edge attributes
...            # Note: you should not change this dict manually!

Iterating over the edges of a graph


In [ ]:
# Walk the adjacency structure of G: each item pairs a node with a dict that
# maps its neighbours to the attribute dict of the connecting edge.
# NOTE(review): adjacency_iter() looks like the NetworkX 1.x spelling; newer
# releases appear to expose G.adjacency() instead - confirm the installed version.
for n,nbrsdict in G.adjacency_iter():
    for nbr,eattr in nbrsdict.items():
        # Only report edges that actually carry a 'weight' attribute
        if 'weight' in eattr:
            print (n,nbr,eattr['weight'])

or


In [ ]:
[ (u,v,edata['weight']) for u,v,edata in G.edges(data=True) if 'weight' in edata ]

Reading .gml files


In [ ]:
Ggml = nx.read_gml('polblogs.gml')

In [ ]:
print len(Ggml.nodes())
print len(Ggml.edges())

In [ ]:
with sns.axes_style('white'):
    fig = plt.subplots(1, figsize=(12,8))
    nx.draw_networkx(Ggml, edge_color='#a4a4a4', node_size=50, with_labels=False, arrows=False)
    plt.axis('off')

Reading graphs as edge lists


In [ ]:
# Load the football network from an edge-list file.
# The file name is handed straight to NetworkX, which opens (and closes) the
# file itself in the mode it expects; a handle opened in text mode 'r' does not
# satisfy read_edgelist's documented requirement of an 'rb' handle.
# Lines starting with '#' are treated as comments, node labels are converted
# to int, and data=False means no edge attributes are read.
football = nx.read_edgelist('football.txt', comments='#', nodetype=int, data=False)

In [ ]:
print len(football.nodes())
print len(football.edges())

In [ ]:
with sns.axes_style('white'):
    fig = plt.subplots(1, figsize=(12,8))
    nx.draw_networkx(football, edge_color='#a4a4a4', node_size=50, with_labels=False)
    plt.axis('off')

Generating graphs using the routines already available in python (for small data)

NetworkX has a wealth of graph-generation routines, which can be found here:

https://networkx.github.io/documentation/latest/reference/generators.html

This is the function that generates Zachary's Karate Club network:


In [ ]:
kn=nx.karate_club_graph()

In [ ]:
num_nodes = kn.number_of_nodes()
print 'number of nodes: ' + str(num_nodes)
num_edges = kn.number_of_edges()
print 'number of edges: ' + str(num_edges)

Drawing the network


In [ ]:
with sns.axes_style('white'):
    fig = plt.subplots(1, figsize=(12,8))
    nx.draw_networkx(kn, edge_color='#a4a4a4', with_labels=True, font_color='#cacaca')
    plt.axis('off')

In [ ]:
fl = nx.florentine_families_graph()
num_nodes = fl.number_of_nodes()
print 'number of nodes: ' + str(num_nodes)
num_edges = fl.number_of_edges()
print 'number of edges: ' + str(num_edges)
with sns.axes_style('white'):
    fig = plt.subplots(1, figsize=(12,8))
    nx.draw_networkx(fl, edge_color='#a4a4a4', node_size=0, with_labels=True)
    plt.axis('off')

Experimenting with different graph models

Experimenting with Erdos-Renyi random graphs


In [ ]:
er=nx.erdos_renyi_graph(1000,0.15)

In [ ]:
print type(er)

In [ ]:
print "Number of nodes in the random graph: ", er.number_of_nodes() 
print "Number of edges in the random graph: ", er.number_of_edges()

In [ ]:
with sns.axes_style('white'):
    fig = plt.subplots(1, figsize=(12,8))
    nx.draw_networkx(er, node_size=15, edge_color='#a4a4a4', with_labels=False, alpha=.4, linewidths=0)
    plt.axis('off')

Degree distribution


In [ ]:
degree_sequence=sorted(nx.degree(er).values(),reverse=True) 
dmax=max(degree_sequence)
print dmax

In [ ]:
h,bins,patches = plt.hist(degree_sequence,bins=dmax)

In [ ]:
# Re-plot the degree histogram as points, one per non-empty bin.
# `h` (bin counts), `bins` (bin edges) and `dmax` (largest degree) come from
# the preceding cells.
hmax=max(h)
plt.axis([1,dmax,1,hmax]) # set ranges
# compress(h) keeps only the positions where the count in h is non-zero,
# so empty bins are dropped from both coordinate arrays.
x=bins.compress(h)  # bin edges of the non-empty bins
y=h.compress(h)     # node counts of the non-empty bins
plt.plot(x,y,'bo')
plt.title("Degree distribution")
plt.xlabel("degree")
plt.ylabel("number of nodes")
plt.show()

Connected Components

Two nodes of a graph belong to the same connected component if there is a path of edges in the graph that connects these two nodes.


In [ ]:
cc= nx.connected_components(er)
print type(cc)
print [len(s) for s in cc]

In [ ]:
def print_cc_sizes(g):
    """Print the sizes of the connected components of a graph.

    Args:
        g: graph passed to ``nx.connected_components``.

    Returns:
        None. The list of component sizes (one entry per component, in the
        order NetworkX yields the components) is written to standard output.
    """
    component_sizes = [len(component) for component in nx.connected_components(g)]
    # A single-argument print(...) produces the same text under Python 2 and
    # Python 3, whereas the bare print statement is a SyntaxError on Python 3.
    print(component_sizes)

Clustering coefficient

The clustering coefficient of a node is defined as the number of triangles that pass through this node, divided by the total number of possible triangles in which this node could participate. Formally, the clustering coefficient of a node $u$ is defined as $$c_u=\frac{2T(u)}{d(u)(d(u)-1)},$$ where $T(u)$ is the number of triangles through node $u$ and $d(u)$ is the degree of node $u$.

For more details for weighted graphs etc see:

http://networkx.lanl.gov/reference/generated/networkx.algorithms.cluster.clustering.html#networkx.algorithms.cluster.clustering

The average clustering coefficient is the average clustering coefficient of all the nodes in the graph.

http://networkx.lanl.gov/reference/generated/networkx.algorithms.cluster.average_clustering.html#networkx.algorithms.cluster.average_clustering


In [ ]:
ccall = nx.clustering(er)
clustering_coefficient = nx.average_clustering(er)

print clustering_coefficient

In [ ]:
h,bins,patches = plt.hist(list(nx.clustering(er).values()))
plt.title('clustering coefficients')

Triangles


In [ ]:
print(nx.triangles(er,0))
#print(nx.triangles(er))
h,bins, patches = plt.hist(list(nx.triangles(er).values()))
plt.title('Triangles')

Diameter and average path length

The diameter of a graph is defined as the length of the longest shortest path between any two nodes in the graph.


In [ ]:
print(nx.diameter(er))

The average shortest path length of a graph is defined as the average of the shortest path lengths over all pairs of nodes in the graph.

http://networkx.lanl.gov/reference/generated/networkx.algorithms.shortest_paths.generic.average_shortest_path_length.html


In [ ]:
print(nx.average_shortest_path_length(er))

In [ ]:
ws=nx.watts_strogatz_graph(500,5,0.1)
print_cc_sizes(ws)

Degree distribution


In [ ]:
degree_sequence=sorted(nx.degree(ws).values(),reverse=True) 
dmax=max(degree_sequence)
print dmax

In [ ]:
h,bins,patches = plt.hist(degree_sequence,bins=dmax)

In [ ]:
# Re-plot the degree histogram as points, one per non-empty bin.
# `h` (bin counts), `bins` (bin edges) and `dmax` (largest degree) come from
# the preceding cells.
hmax=max(h)
plt.axis([1,dmax,1,hmax]) # set ranges
# compress(h) keeps only the positions where the count in h is non-zero,
# so empty bins are dropped from both coordinate arrays.
x=bins.compress(h)  # bin edges of the non-empty bins
y=h.compress(h)     # node counts of the non-empty bins
plt.plot(x,y,'bo')
plt.title("Degree distribution")
plt.xlabel("degree")
plt.ylabel("number of nodes")
plt.show()

Clustering coefficient


In [ ]:
h,bins,patches = plt.hist(list(nx.clustering(ws).values()))
plt.title('clustering coefficients')

Average path length and diameter


In [ ]:
print 'Diameter:', (nx.diameter(ws))
print 'Average shortest path length:', (nx.average_shortest_path_length(ws))
print 'Average clustering coefficient:', (nx.average_clustering(ws))

In [ ]:
# Sweep the Watts-Strogatz neighbourhood size k over r and record, for each
# generated graph, its diameter, average clustering coefficient and average
# shortest path length (500 nodes, rewiring probability 0.1).
r = range(4,7)
d = np.zeros(len(r))
cc = np.zeros(len(r))
pl = np.zeros(len(r))
# enumerate() supplies the result slot for each k. The previous `index=+1`
# assigned the constant +1 rather than incrementing, so every graph after the
# first overwrote slot 1 and the last slot was left at zero.
for index, i in enumerate(r):
    ws=nx.watts_strogatz_graph(500,i,0.1)
    d[index] = nx.diameter(ws)
    cc[index] = nx.average_clustering(ws)
    pl[index] = nx.average_shortest_path_length(ws)
plt.plot(r,d,'r',label='diameter')
plt.plot(r,cc,'b',label='average clustering coefficient')
# The average path length curve is computed above but left unplotted, as before:
#plt.plot(r,pl,'g');
plt.xlabel('k (nearest neighbours per node)')
plt.legend();

In [ ]:
ba=nx.barabasi_albert_graph(500,5)
print_cc_sizes(ba)

Degree distribution


In [ ]:
degree_sequence=sorted(nx.degree(ba).values(),reverse=True) 
dmax=max(degree_sequence)
print dmax

In [ ]:
h,bins,patches = plt.hist(degree_sequence,bins=dmax)

In [ ]:
# Re-plot the degree histogram as points on log-log axes, one per non-empty bin.
# `h` (bin counts), `bins` (bin edges) and `dmax` (largest degree) come from
# the preceding cells.
hmax=max(h)
plt.axis([1,dmax,1,hmax]) # set ranges
# compress(h) keeps only the positions where the count in h is non-zero,
# so empty bins are dropped from both coordinate arrays.
x=bins.compress(h)  # bin edges of the non-empty bins
y=h.compress(h)     # node counts of the non-empty bins
plt.loglog(x,y,'bo')
plt.title("Degree distribution")
plt.xlabel("degree")
plt.ylabel("number of nodes")
plt.show()

In [1]:
# Code for setting the style of the notebook
from IPython.core.display import HTML
def css_styling():
    """Load the notebook's custom stylesheet and wrap it for display.

    Reads ``../theme/custom.css`` (resolved against the current working
    directory) and passes its full text to ``HTML``.

    Returns:
        The ``HTML`` object built from the stylesheet text.

    Raises:
        IOError: propagated from ``open`` if the stylesheet cannot be read.
    """
    # The with-block closes the file handle as soon as the text has been read,
    # instead of leaving it open until garbage collection.
    with open("../theme/custom.css", "r") as css_file:
        styles = css_file.read()
    return HTML(styles)
css_styling()


Out[1]: