conversion, drawing, saving, analysis

  • copy of dan's thing
  • converts a .csv edge list to .gml and Pajek .net graph files
  • draws graph, saves graph.png
  • try to combine into this

In [ ]:
import pandas as pd
import numpy as np
import networkx as nx
from copy import deepcopy

import matplotlib.pyplot as plt
%matplotlib inline
from matplotlib.backends.backend_pdf import PdfPages

from glob import glob
# Base name of the input CSV. getFiles matches it as a substring of the file
# path; a later cell reassigns it before the graph is actually built.
fileName = 'article0'

In [ ]:
def getFiles(fileName):
    """Load the first CSV whose path contains ``fileName`` and clean it.

    Parameters:
        fileName: fragment of the file path; it is wrapped in ``*`` wildcards
            and handed to ``glob``.

    Returns:
        The table read from the first match, after ``clearSource`` has removed
        the rows that lack a source or a target.

    Raises:
        IndexError: if no path matches the pattern.
    """
    matches = glob('*' + fileName + '*')
    if not matches:
        # Same exception type as the bare matches[0] lookup used to raise,
        # but with a message that says what was being searched for.
        raise IndexError("No file matches the pattern '*%s*'" % fileName)
    # glob gives no ordering guarantee, so when several files match, the one
    # picked here depends on the file system.
    bigFile = matches[0]
    # DataFrame.from_csv was removed from pandas; read_csv with these two
    # arguments is its documented replacement (first column becomes the
    # index, date-like index values are parsed).
    data = pd.read_csv(bigFile, index_col=0, parse_dates=True)
    return clearSource(data)
    

def clearSource(data):
    """Drop rows with a blank 'source' or 'target' cell and report the counts.

    Parameters:
        data: table with at least the columns 'source' and 'target'.

    Returns:
        A filtered copy of ``data``; the input table itself is not modified.
    """
    columns = ['source', 'target']
    pre = len(data)
    # One call removes every row in which any of the listed columns is null.
    data = data.dropna(subset=columns)
    post = len(data)
    # Single-argument print(...) behaves the same on Python 2 and Python 3.
    print("Filtered %s rows to %s rows by removing rows with blank values in columns %s" % (pre, post, columns))
    return data
    
    
#data = getFiles(fileName)

In [ ]:
def getStuff(data, labels):
    """Collect one attribute dictionary per node or per edge from a table.

    For every label, the columns whose name contains ``'<label>-'`` are
    scanned row by row. Each cell whose string form is not 'nan' is stored
    under the column name with ``'<label>-'`` removed.

    When ``labels == ['edge']`` the items are edges: they are named
    ``'<source> - <edge> - <target>'`` and the 'source' and 'target' cells are
    stored as well. Otherwise the items are nodes, named by the value found in
    the column called ``label``.

    Parameters:
        data: table holding the 'source', 'target' and 'edge' columns plus
            any '<label>-...' attribute columns.
        labels: list of column prefixes to extract.

    Returns:
        dict mapping item name -> dict of attribute name -> cell value. An
        item only appears if at least one of its scanned cells is non-blank.
    """
    forEdges = labels == ['edge']
    columns = list(data.columns.values)
    items = dict()

    if forEdges:
        def nameFunc(row, label):
            return '%s - %s - %s' % (row['source'], row['edge'], row['target'])
        extra = ['source', 'target']
    else:
        def nameFunc(row, label):
            return row[label]
        extra = []

    for label in labels:
        prefix = label + '-'
        relevant = [col for col in columns if prefix in col] + extra
        print("Extracting %s data from %s" % (label, relevant))
        # iterrows yields exactly one Series per physical row. The removed
        # .ix[label] lookup returned a whole DataFrame when index labels
        # repeat, which broke the per-cell checks below.
        for _, row in data.iterrows():
            for col in relevant:
                if str(row[col]).lower() != 'nan':
                    # The name is built lazily, only for rows that contribute
                    # at least one value.
                    name = nameFunc(row, label)
                    items.setdefault(name, dict())[col.replace(prefix, '')] = row[col]
    return items
    

def getNodes(data):
    """Return the attribute dictionaries of all nodes found in ``data``.

    Nodes are looked up through both endpoint columns, 'source' and 'target'.
    """
    endpointLabels = ['source', 'target']
    return getStuff(data, endpointLabels)


def getEdges(data):
    """Return the attribute dictionaries of all edges found in ``data``.

    Passing exactly ['edge'] is what switches getStuff into its edge mode.
    """
    edgeLabels = ['edge']
    return getStuff(data, edgeLabels)
      
    
#allNodes = getNodes(data); allEdges = getEdges(data)

In [ ]:
def addNodes(graph, nodes):
    """Add every entry of ``nodes`` to ``graph`` as a node with attributes.

    Parameters:
        graph: NetworkX graph, modified in place.
        nodes: dict mapping node name -> dict of attribute name -> value.

    Returns:
        The same ``graph`` object, to allow chaining.
    """
    # .items() exists on both Python 2 and 3 (iteritems is Python 2 only).
    for name, attributes in nodes.items():
        # Attributes are passed as keyword arguments: NetworkX 2.x has no
        # attr_dict parameter and would store the whole dictionary under a
        # single attribute literally called 'attr_dict'.
        graph.add_node(name, **attributes)
    return graph
    
def addEdges(graph, edges):
    """Add every entry of ``edges`` to ``graph`` as an attributed edge.

    Parameters:
        graph: NetworkX graph, modified in place.
        edges: dict mapping '<source> - <edge> - <target>' -> attribute dict.
            Each attribute dict must contain 'source' and 'target'. It is
            updated in place with the keys 'label' (the full name) and 'edge'
            (the middle part of the name).

    Returns:
        The same ``graph`` object, to allow chaining.
    """
    # .items() exists on both Python 2 and 3 (iteritems is Python 2 only).
    for label, attributes in edges.items():
        attributes['label'] = label
        # The middle segment of 'source - edge - target' is the edge type.
        attributes['edge'] = label.split(' - ')[1]
        # Keyword expansion instead of attr_dict=, which NetworkX 2.x does not
        # accept as a parameter.
        # NOTE(review): an attribute literally named 'key' would be taken as
        # the multigraph edge key rather than stored as data - confirm no
        # 'edge-key' column exists in the input.
        graph.add_edge(attributes['source'], attributes['target'], **attributes)
    return graph
    

#########

def createNetwork(edges,nodes):
    """Build a fresh undirected multigraph from node and edge dictionaries.

    Nodes are inserted first, then the edges; the populated graph is returned.
    """
    return addEdges(addNodes(nx.MultiGraph(), nodes), edges)


#fullGraph = createNetwork(allEdges,allNodes)

In [ ]:
def drawIt(graph,what='graph', save_plot=None):
    """Draw ``graph`` with a spring layout, optionally save it, then show it.

    Parameters:
        graph: NetworkX graph to draw.
        what: word used in the progress message to describe what is drawn.
        save_plot: file name for the saved image; nothing is saved when None.

    Graphs with more than 20 nodes get a 10x10 inch figure. Graphs with more
    than 40 nodes are additionally drawn with smaller nodes and fonts.
    """
    layout = nx.spring_layout(graph)
    size = graph.number_of_nodes()
    print("Drawing %s of size %s:" % (what, size))

    if size > 20:
        plt.figure(figsize=(10,10))

    if size > 40:
        nx.draw(graph, layout, node_size=60, font_size=8)
    else:
        nx.draw(graph, layout)

    # Saving has to happen before plt.show(), so it stays between draw and show.
    if save_plot is not None:
        print('saving: {}'.format(save_plot))
        plt.savefig(save_plot)
    plt.show()
    
    
def describeGraph(graph, save_plot=None):
    """Print summary counts for ``graph`` and draw it and each component.

    Parameters:
        graph: undirected NetworkX graph.
        save_plot: file name for the plot of the whole graph; passed through
            to drawIt, which saves nothing when it is None.

    NOTE: every connected component is always saved as 'component-<idx>.png',
    regardless of ``save_plot``.
    """
    components = list(nx.connected_components(graph))
    # Components may be sets (newer NetworkX) or lists, so take the single
    # member through an iterator instead of indexing with [0].
    isolated = [next(iter(entry)) for entry in components if len(entry) == 1]
    # Order must match the message below: nodes first, then edges.
    params = (graph.number_of_nodes(), graph.number_of_edges(), len(components), len(isolated))
    print("Graph has %s nodes, %s edges, %s connected components, and %s isolated nodes\n" % params)
    drawIt(graph, save_plot=save_plot)
    for idx, sub in enumerate(components):
        drawIt(graph.subgraph(sub), what='component', save_plot='{}-{}.png'.format('component', idx))
    print("Isolated nodes: %s" % (isolated,))

def getGraph(fileRef, save_plot=None):
    """Build the graph for one CSV, export it, and describe it.

    Parameters:
        fileRef: path fragment identifying the input CSV (see getFiles). It
            is also the base for the output names: everything after the first
            '.' in its final path component is replaced by '.gml' / '.net'.
        save_plot: file name for the plot of the whole graph, or None.

    Returns:
        Tuple ``(graph, nodes, edges)``: the NetworkX graph and the node and
        edge dictionaries it was built from.
    """
    import os

    # Use the argument, not the module-level fileName, so the function loads
    # the file it was asked for.
    data = getFiles(fileRef)
    nodes = getNodes(data)
    edges = getEdges(data)
    graph = createNetwork(edges,nodes)

    # Strip the extension from the file name only; splitting the whole path
    # on '.' would truncate paths such as './data/article1'.
    directory, baseName = os.path.split(fileRef)
    stem = os.path.join(directory, baseName.split('.')[0])

    fileOut = stem + '.gml'
    print("Writing GML file to %s" % fileOut)
    nx.write_gml(graph, fileOut)

    fileOutNet = stem + '.net'
    print("Writing net file to %s" % fileOutNet)
    nx.write_pajek(graph, fileOutNet)

    describeGraph(graph, save_plot)
    return graph, nodes, edges

In [ ]:
# Select the article1 CSV and run the full pipeline on it: getGraph loads the
# data, writes the .gml and .net exports, and saves the overview plot to
# graph.png.
fileName = 'data/csv/article1'
graph, nodes, edges = getGraph(fileName, save_plot='graph.png')

In [ ]:
# Redraw the whole graph on a 12x12 inch figure with a spring layout, green
# nodes and node labels switched on.
plt.figure(figsize=(12, 12))
nx.draw_spring(graph, node_color='g', with_labels=True, arrows=True)
plt.show()

In [ ]:
# Compute the degree centrality of every node; the result is a dictionary
# with one centrality value per node.
nx.degree_centrality(graph)

degree centrality

The degree centrality of a node v is the fraction of nodes it is connected to.


In [ ]:
# Confirm that degree_centrality returns a plain dictionary.
type(nx.degree_centrality(graph))

In [ ]:
# Mean degree centrality over all nodes:
#   1. take the centrality scores (the values of the dictionary),
#   2. turn them into a NumPy array,
#   3. average the array.
# The values are wrapped in list() because on Python 3 a bare dict view would
# become a 0-d object array, on which .mean() fails.
np.array(list(nx.degree_centrality(graph).values())).mean()

closeness centrality

The closeness centrality of a node u is the reciprocal of the sum of the shortest-path distances from u to all n-1 other nodes. Since the sum of distances depends on the number of nodes in the graph, closeness is normalized by the sum of minimum possible distances, n-1. Note that higher values of closeness indicate higher centrality.


In [ ]:
# Compute the closeness centrality of every node in the graph.
nx.closeness_centrality(graph)

betweenness centrality

The betweenness centrality of a node v is the sum of the fraction of all-pairs shortest paths that pass through v.


In [ ]:
# Betweenness centrality of every node, computed once and reused below.
betweenness = nx.betweenness_centrality(graph)
# Mean betweenness over all nodes. The values are wrapped in list() because on
# Python 3 a bare dict view would become a 0-d object array, on which .mean()
# fails.
np.array(list(betweenness.values())).mean()

degree assortativity coefficient

Assortativity measures the similarity of connections in the graph with respect to the node degree.


In [ ]:
# Compute the degree assortativity coefficient of the graph.
nx.degree_assortativity_coefficient(graph)