In [ ]:
import pandas as pd
import numpy as np
import networkx as nx
from copy import deepcopy
import matplotlib.pyplot as plt
%matplotlib inline
from matplotlib.backends.backend_pdf import PdfPages
from glob import glob
# Default data-set name fragment used to locate the input file; the driver
# cell further down rebinds it before loading.
fileName = "article0"
In [ ]:
def getFiles(fileName):
    """Load the data file whose path contains ``fileName`` and clean it.

    The file is located with the glob pattern ``'*' + fileName + '*'``. The
    first column of the CSV becomes the index (as ``DataFrame.from_csv`` did),
    and the result is passed through ``clearSource``.

    Parameters:
        fileName: fragment of the path of the CSV file to load.

    Returns:
        The cleaned ``DataFrame`` returned by ``clearSource``.

    Raises:
        IndexError: if no path matches the pattern.
    """
    # glob returns matches in arbitrary order, so sort for a reproducible pick.
    matches = sorted(glob('*' + fileName + '*'))
    if not matches:
        raise IndexError("No file matching '*%s*' was found" % fileName)
    # getGraph writes '<name>.gml' and '<name>.net' next to the data, and those
    # match the same pattern on a later run, so prefer an actual CSV if any.
    csvMatches = [path for path in matches if path.lower().endswith('.csv')]
    bigFile = (csvMatches or matches)[0]
    # DataFrame.from_csv was removed from pandas; this call is its documented
    # replacement with the same defaults.
    data = pd.read_csv(bigFile, index_col=0, parse_dates=True)
    return clearSource(data)
def clearSource(data):
    """Drop every row that has no value in the 'source' or 'target' column.

    Parameters:
        data: ``DataFrame`` containing at least 'source' and 'target' columns.

    Returns:
        A new ``DataFrame`` holding only the rows where both columns are
        non-null; ``data`` itself is left untouched.

    Raises:
        KeyError: if either column is missing from ``data``.
    """
    columns = ['source', 'target']
    pre = len(data)
    data = data.dropna(subset=columns)
    post = len(data)
    # Single pre-formatted argument so the output is identical on Python 2 and 3.
    print("Filtered %s rows to %s rows by removing rows with blank values in columns %s" % (pre, post, columns))
    return data
#data = getFiles(fileName)
In [ ]:
def getStuff(data, labels):
    """Collect attribute dictionaries for nodes or edges from the edge table.

    For each label, every column whose name contains ``label + '-'`` is
    treated as an attribute column; the attribute name is the column name with
    that prefix removed. Cells whose string form is 'nan' (case-insensitive)
    are skipped.

    When ``labels == ['edge']`` the records are keyed by
    ``'<source> - <edge> - <target>'`` and also carry the 'source' and
    'target' values. Otherwise records are keyed by the value found in the
    column named after the label.

    Parameters:
        data: ``DataFrame`` with one row per edge.
        labels: list of column-name prefixes, e.g. ``['source', 'target']``
            or ``['edge']``.

    Returns:
        dict mapping item name to a dict of ``{attribute: value}``. An item
        only appears if at least one of its relevant cells is not 'nan'.
    """
    forEdges = labels == ['edge']
    columns = list(data.columns.values)
    # Edge records additionally remember their endpoints.
    extra = ['source', 'target'] if forEdges else []

    def itemName(row, label):
        # Edges are identified by the whole triple, nodes by their own name.
        if forEdges:
            return '%s - %s - %s' % (row['source'], row['edge'], row['target'])
        return row[label]

    items = dict()
    for label in labels:
        prefix = label + '-'
        relevant = [col for col in columns if prefix in col] + extra
        print("Extracting %s data from %s" % (label, relevant))
        # iterrows walks rows positionally, so it stays correct when index
        # labels repeat (and replaces the removed ``.ix`` indexer).
        for _, row in data.iterrows():
            for col in relevant:
                if str(row[col]).lower() == 'nan':
                    continue
                name = itemName(row, label)
                items.setdefault(name, dict())[col.replace(prefix, '')] = row[col]
    return items
def getNodes(data):
    """Return the node attribute records for both endpoints of every edge row.

    Delegates to ``getStuff`` with the 'source' and 'target' labels.
    """
    endpointLabels = ['source', 'target']
    return getStuff(data, endpointLabels)
def getEdges(data):
    """Return the edge attribute records of ``data``.

    Delegates to ``getStuff`` with the single 'edge' label, which switches it
    into edge mode.
    """
    edgeLabels = ['edge']
    return getStuff(data, edgeLabels)
#allNodes = getNodes(data); allEdges = getEdges(data)
In [ ]:
def addNodes(graph, nodes):
    """Add every entry of ``nodes`` to ``graph`` as an attributed node.

    Parameters:
        graph: networkx graph to add to (modified in place).
        nodes: dict mapping node name to a dict of node attributes.

    Returns:
        The same ``graph`` object, for chaining.
    """
    # ``.items()`` works on Python 2 and 3 (``iteritems`` is Python-2-only).
    for name, attributes in nodes.items():
        # Keyword expansion is accepted by both networkx 1.x and 2.x, whereas
        # the ``attr_dict=`` parameter no longer exists in 2.x.
        # NOTE(review): an attribute whose name equals one of add_node's own
        # parameter names would collide here - confirm none exist in the data.
        graph.add_node(name, **attributes)
    return graph
def addEdges(graph, edges):
    """Add every entry of ``edges`` to ``graph`` as an attributed edge.

    Each attribute dict must contain 'source' and 'target'. Before the edge is
    added, the dict is extended in place with 'label' (the full key) and
    'edge' (the middle part of the 'source - edge - target' key), so callers
    holding ``edges`` see these two extra entries afterwards.

    Parameters:
        graph: networkx graph to add to (modified in place).
        edges: dict mapping 'source - edge - target' labels to attribute
            dicts.

    Returns:
        The same ``graph`` object, for chaining.
    """
    # ``.items()`` works on Python 2 and 3 (``iteritems`` is Python-2-only).
    for label, attributes in edges.items():
        attributes['label'] = label
        # Recover the edge name by stripping the known endpoints rather than
        # splitting blindly: a source name that itself contains ' - ' would
        # otherwise produce the wrong piece.
        head = '%s - ' % (attributes['source'],)
        tail = ' - %s' % (attributes['target'],)
        if (label.startswith(head) and label.endswith(tail)
                and len(label) >= len(head) + len(tail)):
            attributes['edge'] = label[len(head):len(label) - len(tail)]
        else:
            # Label was not built from these endpoints; keep the old rule.
            attributes['edge'] = label.split(' - ')[1]
        # Keyword expansion is accepted by both networkx 1.x and 2.x, whereas
        # the ``attr_dict=`` parameter no longer exists in 2.x.
        # NOTE(review): an attribute named like one of add_edge's own
        # parameters (e.g. 'key' on a MultiGraph) would collide - confirm.
        graph.add_edge(attributes['source'], attributes['target'], **attributes)
    return graph
#########
def createNetwork(edges, nodes):
    """Build a fresh ``nx.MultiGraph`` from node and edge records.

    Nodes are inserted first (``addNodes``), then edges (``addEdges``).

    Parameters:
        edges: edge records as accepted by ``addEdges``.
        nodes: node records as accepted by ``addNodes``.

    Returns:
        The populated multigraph.
    """
    withNodes = addNodes(nx.MultiGraph(), nodes)
    return addEdges(withNodes, edges)
#fullGraph = createNetwork(allEdges,allNodes)
In [ ]:
def drawIt(graph, what='graph', save_plot=None):
    """Draw ``graph`` with a spring layout, optionally saving the picture.

    Graphs with more than 20 nodes get a new 10x10 figure; graphs with more
    than 40 nodes are additionally drawn with smaller nodes and fonts.

    Parameters:
        graph: networkx graph to draw.
        what: word used in the progress message (e.g. 'graph', 'component').
        save_plot: path to save the figure to, or ``None`` to skip saving.
    """
    layout = nx.spring_layout(graph)
    size = graph.number_of_nodes()
    print("Drawing %s of size %s:" % (what, size))
    if size > 20:
        plt.figure(figsize=(10, 10))
    if size > 40:
        # Shrink markers and text so dense graphs stay readable.
        nx.draw(graph, layout, node_size=60, font_size=8)
    else:
        nx.draw(graph, layout)
    # Save before show(): showing may release the figure.
    if save_plot is not None:
        print('saving: {}'.format(save_plot))
        plt.savefig(save_plot)
    plt.show()
def describeGraph(graph, save_plot=None):
    """Print summary statistics for ``graph`` and draw it and its components.

    Reports the number of nodes, edges, connected components and isolated
    nodes, draws the whole graph, then draws each connected component.

    Parameters:
        graph: networkx graph to describe.
        save_plot: path for the picture of the whole graph, or ``None``.

    NOTE(review): component pictures are always written to
    'component-<idx>.png', regardless of ``save_plot`` - confirm this is
    intended.
    """
    components = list(nx.connected_components(graph))
    # Components may be lists or sets depending on the networkx version, so
    # take the single member without indexing.
    isolated = [next(iter(entry)) for entry in components if len(entry) == 1]
    # Order must follow the message: nodes first, then edges.
    params = (graph.number_of_nodes(), graph.number_of_edges(), len(components), len(isolated))
    print("Graph has %s nodes, %s edges, %s connected components, and %s isolated nodes\n" % params)
    drawIt(graph, save_plot=save_plot)
    for idx, sub in enumerate(components):
        drawIt(graph.subgraph(sub), what='component', save_plot='component-{}.png'.format(idx))
    print("Isolated nodes: %s" % (isolated,))
def getGraph(fileRef, save_plot=None):
    """Load the data for ``fileRef``, build its graph, export and describe it.

    The graph is written next to the data as '<stem>.gml' and '<stem>.net',
    where ``<stem>`` is ``fileRef`` up to its first '.'.

    Parameters:
        fileRef: path fragment identifying the input file (see ``getFiles``).
        save_plot: path for the overview picture, passed on to
            ``describeGraph``; ``None`` skips saving it.

    Returns:
        Tuple ``(graph, nodes, edges)``: the networkx graph and the node and
        edge records it was built from.
    """
    # Use the argument, not the module-level ``fileName`` global, so the
    # function loads what the caller actually asked for.
    data = getFiles(fileRef)
    nodes = getNodes(data)
    edges = getEdges(data)
    graph = createNetwork(edges, nodes)
    stem = fileRef.split('.')[0]
    fileOut = stem + '.gml'
    print("Writing GML file to %s" % fileOut)
    nx.write_gml(graph, fileOut)
    fileOutNet = stem + '.net'
    print("Writing net file to %s" % fileOutNet)
    nx.write_pajek(graph, fileOutNet)
    describeGraph(graph, save_plot)
    return graph, nodes, edges
In [ ]:
# Pick the data set, then build, export and describe its graph, saving the
# overview picture as graph.png.
fileName = 'data/csv/article1'
graph, nodes, edges = getGraph(
    fileName,
    save_plot='graph.png',
)
In [ ]:
# Redraw the whole graph on a 12x12 canvas with green, labelled nodes,
# positioned by a spring layout.
plt.figure(figsize=(12, 12))
nx.draw(
    graph,
    nx.spring_layout(graph),
    node_color='g',
    with_labels=True,
    arrows=True,
)
plt.show()
In [ ]:
# Degree centrality of every node, returned as a {node: score} dictionary.
nx.degree_centrality(G=graph)
In [ ]:
# Confirm the container type of the result: the scores come back as a dict.
type(nx.degree_centrality(G=graph))
In [ ]:
# Mean degree centrality over all nodes.
# The scores are the values of the returned dictionary. They are copied into a
# list first because on Python 3 ``.values()`` is a view object that NumPy
# cannot average directly; on Python 2 the extra ``list`` call is harmless.
np.mean(list(nx.degree_centrality(graph).values()))
Closeness centrality of a node u is the reciprocal of the sum of the shortest-path distances from u to all n-1 other nodes. Since the sum of distances depends on the number of nodes in the graph, closeness is normalized by the sum of the minimum possible distances, n-1. Note that higher values of closeness indicate higher centrality.
In [ ]:
# Closeness centrality of every node, as a {node: score} dictionary.
nx.closeness_centrality(G=graph)
In [ ]:
# Mean betweenness centrality over all nodes.
# Compute the scores once and reuse them: the original ran the computation
# twice and never used the first result. The values are copied into a list
# because a Python 3 ``.values()`` view cannot be averaged by NumPy directly.
betweenness = nx.betweenness_centrality(graph)
np.mean(list(betweenness.values()))
In [ ]:
# Degree assortativity coefficient of the graph.
nx.degree_assortativity_coefficient(G=graph)