The new import we are doing in this class is networkx:
In [ ]:
import time
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import scipy as sp
import scipy.sparse.linalg as linalg
import scipy.cluster.hierarchy as hr
from scipy.spatial.distance import pdist, squareform
import sklearn.datasets as datasets
import sklearn.metrics as metrics
import sklearn.utils as utils
import sklearn.linear_model as linear_model
import sklearn.svm as svm
import sklearn.cross_validation as cross_validation
import sklearn.cluster as cluster
from sklearn.ensemble import AdaBoostClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.decomposition import TruncatedSVD
from sklearn.preprocessing import StandardScaler
import statsmodels.api as sm
from patsy import dmatrices
import networkx as nx
import seaborn as sns
%matplotlib inline
In [ ]:
g = nx.Graph()
Adding to the graph one node at a time
In [ ]:
g.add_node(1)
Adding multiple nodes at a time
In [ ]:
g.add_nodes_from([2,3])
Nodes can be any hashable Python object, for example a string
In [ ]:
g.add_node('ET')
In [ ]:
print g.nodes()
Nodes can also be removed
In [ ]:
g.remove_node(1)
In [ ]:
print g.nodes()
Adding edges to the graph
In [ ]:
g.add_edge(1,2)
g.add_edge(3,'ET')
g.add_edges_from([(2,3), (1,3)])
In [ ]:
print g.edges()
In [ ]:
print g.nodes()
Removing edges
In [ ]:
g.remove_edge(1,2)
In [ ]:
print g.edges()
In [ ]:
print g.nodes()
Neighbors, degrees etc.
In [ ]:
g.neighbors(1)
In [ ]:
g.degree(1)
In [ ]:
g.add_node(1, time='5pm')
In [ ]:
g.node[1]['time']
In [ ]:
g.node[1] # Python dictionary
The special edge attribute "weight" should always be numeric and holds values used by algorithms requiring weighted edges.
In [ ]:
g.add_edge(1, 2, weight=4.0 )
In [ ]:
g[1][2]['weight'] = 5.0 # edge already added
In [ ]:
g[1][2]
In [ ]:
for node in g:
print 'nodeid: ', node, '\t degree:', g.degree(node)
In [ ]:
print g.edges(data=True)
Add the nodes from any container (a list, dict, set or even the lines from a file or the nodes from another graph).
In [ ]:
G = nx.DiGraph()
G.add_node(1)
G.add_nodes_from([2,3])
G.add_nodes_from(range(100,110))
H=nx.Graph()
H.add_path([0,1,2,3,4,5,6,7,8,9])
G.add_nodes_from(H)
In [ ]:
print G.nodes()
G can also grow by adding edges
In [ ]:
G.add_edge(1, 2)
G.add_edges_from([(1,2),(1,3)])
G.add_edges_from(H.edges())
In [ ]:
print G.edges()
Attributes:
Each graph, node, and edge can hold key/value attribute pairs in an associated attribute dictionary (the keys must be hashable). By default these are empty, but can be added or changed using add_edge, add_node or direct manipulation of the attribute dictionaries named graph, node and edge respectively.
In [ ]:
G = nx.DiGraph(day="Friday")
print G.graph
Add node attributes using add_node(), add_nodes_from() or G.node
In [ ]:
G.add_node(1, time='5pm')
G.add_nodes_from([3], time='2pm')
print G.node[1]
G.node[1]['room'] = 714
del G.node[1]['room'] # remove attribute
print G.nodes(data=True)
Add edge attributes using add_edge(), add_edges_from(), subscript notation, or G.edge.
In [ ]:
G.add_edge(1, 2, weight=4.7 )
G.add_edges_from([(3,4),(4,5)], color='red')
G.add_edges_from([(1,2,{'color':'blue'}), (2,3,{'weight':8})])
G[1][2]['weight'] = 4.7
G.edge[1][2]['weight'] = 4
print G.edges(data=True)
Many common graph features allow python syntax to speed reporting.
In [ ]:
1 in G # check if node in graph
In [ ]:
[n for n in G if n<3] # iterate through nodes
In [ ]:
len(G) # number of nodes in graph
In [ ]:
print G[1] # adjacency dict keyed by neighbor to edge attributes
... # Note: you should not change this dict manually!
Iterating over the edges of a graph
In [ ]:
# Walk every (node, neighbour) pair of G and report only the edges whose
# attribute dict carries a 'weight' entry.
for n,nbrsdict in G.adjacency_iter():
    for nbr,eattr in nbrsdict.items():
        if 'weight' in eattr:
            print (n,nbr,eattr['weight'])
or
In [ ]:
[ (u,v,edata['weight']) for u,v,edata in G.edges(data=True) if 'weight' in edata ]
Reading .gml files
In [ ]:
Ggml = nx.read_gml('polblogs.gml')
In [ ]:
print len(Ggml.nodes())
print len(Ggml.edges())
In [ ]:
# Draw Ggml on a white seaborn canvas with the axes hidden.
with sns.axes_style('white'):
    # plt.subplots returns a (figure, axes) pair, so unpack both.
    fig, ax = plt.subplots(1, figsize=(12,8))
    nx.draw_networkx(Ggml, edge_color='#a4a4a4', node_size=50, with_labels=False, arrows=False)
    plt.axis('off')
Reading graphs as edge lists
In [ ]:
# Load the football graph from an edge-list file: lines starting with '#'
# are comments and node ids are converted to int.
with open('football.txt', 'r') as f:
    football = nx.read_edgelist(f, comments='#', nodetype=int, data=False)
In [ ]:
print len(football.nodes())
print len(football.edges())
In [ ]:
# Draw the football graph on a white seaborn canvas with the axes hidden.
with sns.axes_style('white'):
    # plt.subplots returns a (figure, axes) pair, so unpack both.
    fig, ax = plt.subplots(1, figsize=(12,8))
    nx.draw_networkx(football, edge_color='#a4a4a4', node_size=50, with_labels=False)
    plt.axis('off')
Networkx has a wealth of data-generation routines that can be found here:
https://networkx.github.io/documentation/latest/reference/generators.html
This is the function that generates the Zachary's Karate club network data
In [ ]:
kn=nx.karate_club_graph()
In [ ]:
num_nodes = kn.number_of_nodes()
print 'number of nodes: ' + str(num_nodes)
num_edges = kn.number_of_edges()
print 'number of edges: ' + str(num_edges)
Drawing the network
In [ ]:
# Draw the karate club graph with node labels on a white seaborn canvas,
# axes hidden.
with sns.axes_style('white'):
    # plt.subplots returns a (figure, axes) pair, so unpack both.
    fig, ax = plt.subplots(1, figsize=(12,8))
    nx.draw_networkx(kn, edge_color='#a4a4a4', with_labels=True, font_color='#cacaca')
    plt.axis('off')
In [ ]:
# Build the Florentine families graph, report its size, then draw it with
# labels only (node_size=0 hides the node markers).
fl = nx.florentine_families_graph()
num_nodes = fl.number_of_nodes()
print('number of nodes: ' + str(num_nodes))
num_edges = fl.number_of_edges()
print('number of edges: ' + str(num_edges))
with sns.axes_style('white'):
    # plt.subplots returns a (figure, axes) pair, so unpack both.
    fig, ax = plt.subplots(1, figsize=(12,8))
    nx.draw_networkx(fl, edge_color='#a4a4a4', node_size=0, with_labels=True)
    plt.axis('off')
In [ ]:
er=nx.erdos_renyi_graph(1000,0.15)
In [ ]:
print type(er)
In [ ]:
print "Number of nodes in the random graph: ", er.number_of_nodes()
print "Number of edges in the random graph: ", er.number_of_edges()
In [ ]:
# Draw the random graph er with small, semi-transparent nodes on a white
# seaborn canvas, axes hidden.
with sns.axes_style('white'):
    # plt.subplots returns a (figure, axes) pair, so unpack both.
    fig, ax = plt.subplots(1, figsize=(12,8))
    nx.draw_networkx(er, node_size=15, edge_color='#a4a4a4', with_labels=False, alpha=.4, linewidths=0)
    plt.axis('off')
In [ ]:
# Degrees of er, largest first; the head of the list is the maximum degree.
degree_sequence = list(nx.degree(er).values())
degree_sequence.sort(reverse=True)
dmax = degree_sequence[0]
print(dmax)
In [ ]:
h,bins,patches = plt.hist(degree_sequence,bins=dmax)
In [ ]:
# Scatter the non-empty histogram bins: x is the left edge of each bin,
# y the number of nodes that fell into it.
hmax = h.max()
plt.axis([1, dmax, 1, hmax])  # set ranges
occupied = h != 0  # bins with zero entries are dropped
x = bins[:-1][occupied]
y = h[occupied]
plt.plot(x, y, 'bo')
plt.title("Degree distribution")
plt.xlabel("degree")
plt.ylabel("number of nodes")
plt.show()
Two nodes of a graph belong to the same connected component if there is a path of edges of the graph that connects these two nodes.
In [ ]:
cc= nx.connected_components(er)
print type(cc)
print [len(s) for s in cc]
In [ ]:
def print_cc_sizes(g):
    """Print the sizes of the connected components of graph ``g`` as a list."""
    component_sizes = [len(component) for component in nx.connected_components(g)]
    print(component_sizes)
The clustering coefficient of a node is defined as the number of triangles that pass through this node, divided by the total number of possible triangles in which this node can participate. Formally, the clustering coefficient of a node $u$ is defined as $$c_u=\frac{2T(u)}{d(u)(d(u)-1)},$$ where $T(u)$ is the number of triangles through node $u$ and $d(u)$ is the degree of node $u$.
For more details (weighted graphs, etc.) see the NetworkX documentation of `nx.clustering`.
The average clustering coefficient is the average clustering coefficient of all the nodes in the graph.
In [ ]:
ccall = nx.clustering(er)
clustering_coefficient = nx.average_clustering(er)
print clustering_coefficient
In [ ]:
h,bins,patches = plt.hist(list(nx.clustering(er).values()))
plt.title('clustering coefficients')
In [ ]:
print(nx.triangles(er,0))
#print(nx.triangles(er))
h,bins, patches = plt.hist(list(nx.triangles(er).values()))
plt.title('Triangles')
The diameter of a graph is defined as the length of the longest shortest path between any two nodes in the graph.
In [ ]:
print(nx.diameter(er))
The average shortest path length of a graph is defined as the average of the shortest path lengths over all pairs of nodes in the graph.
In [ ]:
print(nx.average_shortest_path_length(er))
In [ ]:
ws=nx.watts_strogatz_graph(500,5,0.1)
print_cc_sizes(ws)
Degree distribution
In [ ]:
# Degrees of ws, largest first; the head of the list is the maximum degree.
degree_sequence = list(nx.degree(ws).values())
degree_sequence.sort(reverse=True)
dmax = degree_sequence[0]
print(dmax)
In [ ]:
h,bins,patches = plt.hist(degree_sequence,bins=dmax)
In [ ]:
# Scatter the non-empty histogram bins: x is the left edge of each bin,
# y the number of nodes that fell into it.
hmax = h.max()
plt.axis([1, dmax, 1, hmax])  # set ranges
occupied = h != 0  # bins with zero entries are dropped
x = bins[:-1][occupied]
y = h[occupied]
plt.plot(x, y, 'bo')
plt.title("Degree distribution")
plt.xlabel("degree")
plt.ylabel("number of nodes")
plt.show()
Clustering coefficient
In [ ]:
h,bins,patches = plt.hist(list(nx.clustering(ws).values()))
plt.title('clustering coefficients')
Average path length and diameter
In [ ]:
print 'Diameter:', (nx.diameter(ws))
print 'Average shortest path length:', (nx.average_shortest_path_length(ws))
print 'Average clustering coefficient:', (nx.average_clustering(ws))
In [ ]:
# Sweep the Watts-Strogatz neighbourhood size k over r and record, for each
# k, the diameter, the average clustering coefficient and the average
# shortest path length of a 500-node graph with rewiring probability 0.1.
r = range(4,7)
d = np.zeros(len(r))
cc = np.zeros(len(r))
pl = np.zeros(len(r))
# enumerate keeps the result slot in step with k, so every entry of the
# three arrays is filled exactly once.
for index, i in enumerate(r):
    ws=nx.watts_strogatz_graph(500,i,0.1)
    d[index] = nx.diameter(ws)
    cc[index] = nx.average_clustering(ws)
    pl[index] = nx.average_shortest_path_length(ws)
# Diameter in red and average clustering coefficient in blue, both against k.
plt.plot(r,d,'r')
plt.plot(r,cc,'b')
#plt.plot(r,pl,'g');
In [ ]:
ba=nx.barabasi_albert_graph(500,5)
print_cc_sizes(ba)
Degree distribution
In [ ]:
# Degrees of ba, largest first; the head of the list is the maximum degree.
degree_sequence = list(nx.degree(ba).values())
degree_sequence.sort(reverse=True)
dmax = degree_sequence[0]
print(dmax)
In [ ]:
h,bins,patches = plt.hist(degree_sequence,bins=dmax)
In [ ]:
# Scatter the non-empty histogram bins on log-log axes: x is the left edge
# of each bin, y the number of nodes that fell into it.
hmax = h.max()
plt.axis([1, dmax, 1, hmax])  # set ranges
occupied = h != 0  # bins with zero entries are dropped
x = bins[:-1][occupied]
y = h[occupied]
plt.loglog(x, y, 'bo')
plt.title("Degree distribution")
plt.xlabel("degree")
plt.ylabel("number of nodes")
plt.show()
In [1]:
# Code for setting the style of the notebook
from IPython.core.display import HTML
def css_styling():
    """Load the notebook's custom stylesheet and wrap it for display.

    Reads ``../theme/custom.css`` (resolved against the current working
    directory) and returns its contents as an IPython ``HTML`` object.
    """
    # The context manager closes the file handle as soon as it has been read.
    with open("../theme/custom.css", "r") as stylesheet:
        styles = stylesheet.read()
    return HTML(styles)
css_styling()
Out[1]: