In [1]:
import networkx as nx
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import os
from glob import glob
import re
# pandas' 'display.mpl_style' option was removed; the matplotlib 'ggplot'
# style sheet gives a similar look
plt.style.use('ggplot')
pd.set_option('display.width', 5000)
pd.set_option('display.max_columns', 60)
# use a list comprehension to filter the directory listing with a regular expression
# http://stackoverflow.com/questions/2225564/get-a-filtered-list-of-files-in-a-directory
#gml_files = [f for f in os.listdir('.') if re.match(r'(pos|neg|neu)_u*all\.gml', f)]
gml_files = [f for f in os.listdir('.') if re.match(r'(positive|negative|neutral)_all\.gml', f)]
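# glob (imported above) is a regex-free alternative for the same match;
# a commented sketch, assuming the three sentiment prefixes used in the pattern above:
#gml_files = sorted(glob('positive_all.gml') + glob('negative_all.gml') + glob('neutral_all.gml'))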
def calculate_graph_inf(graph):
    # relies on the global `filename` set inside the processing loop below
    graph.name = filename
    # nx.info() was removed in networkx 3.0; printing the graph gives the same summary
    print(graph)
    #plt.figure(figsize=(10,10))
    #nx.draw_spring(graph, arrows=True, with_labels=True)
def highest_centrality(cent_dict):
    """Return a (node, value) tuple for the node with the
    largest value in a centrality dictionary."""
    # build (value, node) pairs so sorting orders by centrality
    cent_items = [(b, a) for (a, b) in cent_dict.items()]
    # sort in descending order of centrality
    cent_items.sort(reverse=True)
    return tuple(reversed(cent_items[0]))
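A quick sanity check of `highest_centrality` on a toy star graph (a sketch; the toy graph is illustrative and not part of the dataset):
In [ ]:
# the hub of a star graph should have the highest degree centrality
toy = nx.star_graph(4)   # node 0 connected to nodes 1-4
highest_centrality(nx.degree_centrality(toy))   # -> (0, 1.0)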
In [2]:
gml_files.sort()
gml_files
Out[2]:
In [11]:
# create empty dataframe with columns
network_data_columns = ['name',
                        'sentiment',
                        'n nodes',
                        'n edges',
                        'avg degree',
                        'density',
                        'avg deg cent',
                        'avg bet cent',
                        'avg clo cent',
                        'highest degc',
                        'highest betc',
                        'highest cloc',
                        'avg node connect',
                        'deg assort coeff',
                        'avg in-deg',
                        'avg out-deg',
                        'n strong comp',
                        'n weak comp',
                        'n conn comp',
                        'Gc size'
                        ]
network_data = pd.DataFrame(columns=network_data_columns)
In [12]:
for graph_num, gml_graph in enumerate(gml_files):
    graph = nx.read_gml(gml_graph)
    # undirected "union" graph: all edges, plus the reciprocated-only edges of U
    ugraph = graph.to_undirected()
    U = graph.to_undirected(reciprocal=True)
    e = U.edges()
    ugraph.add_edges_from(e)
    (filepath, filename) = os.path.split(gml_graph)
    print('-' * 40)
    print(gml_graph)
    calculate_graph_inf(graph)

    # calculate variables
    sent = filename.split('_')[0]
    nodes = nx.number_of_nodes(graph)
    edges = nx.number_of_edges(graph)
    density = float("{0:.4f}".format(nx.density(graph)))
    # compute each centrality dict once and reuse it for both the mean and the max
    deg_cen = nx.degree_centrality(graph)
    bet_cen = nx.betweenness_centrality(graph)
    clo_cen = nx.closeness_centrality(graph)
    avg_deg_cen = np.mean(list(deg_cen.values()))
    avg_bet_cen = np.mean(list(bet_cen.values()))
    avg_clo_cen = np.mean(list(clo_cen.values()))
    in_deg = sum(dict(graph.in_degree()).values()) / float(nodes)
    out_deg = sum(dict(graph.out_degree()).values()) / float(nodes)
    avg_deg = float("{0:.4f}".format(in_deg + out_deg))
    strong_comp = nx.number_strongly_connected_components(graph)
    weak_comp = nx.number_weakly_connected_components(graph)
    avg_node_con = float("{0:.4f}".format(nx.average_node_connectivity(graph)))
    deg_assort_coeff = float("{0:.4f}".format(nx.degree_assortativity_coefficient(graph)))
    conn_comp = nx.number_connected_components(ugraph)
    highest_deg_cen = highest_centrality(deg_cen)
    highest_bet_cen = highest_centrality(bet_cen)
    highest_clo_cen = highest_centrality(clo_cen)
    # giant-component size (connected_component_subgraphs was removed in networkx 2.4)
    Gc = len(max(nx.connected_components(ugraph), key=len))

    # save variables into a row dictionary
    graph_values = {'name': filename,
                    'sentiment': sent,
                    'n nodes': nodes,
                    'n edges': edges,
                    'avg degree': avg_deg,
                    'density': density,
                    'avg deg cent': "%.4f" % avg_deg_cen,
                    'avg bet cent': "%.4f" % avg_bet_cen,
                    'avg clo cent': "%.4f" % avg_clo_cen,
                    'highest degc': highest_deg_cen,
                    'highest betc': highest_bet_cen,
                    'highest cloc': highest_clo_cen,
                    'avg node connect': avg_node_con,
                    'deg assort coeff': deg_assort_coeff,
                    'avg in-deg': "%.4f" % in_deg,
                    'avg out-deg': "%.4f" % out_deg,
                    'n strong comp': strong_comp,
                    'n weak comp': weak_comp,
                    'n conn comp': conn_comp,
                    'Gc size': Gc
                    }
    # DataFrame.append was removed in pandas 2.0; concat a one-row frame instead
    network_data = pd.concat([network_data, pd.DataFrame([graph_values])],
                             ignore_index=True)
    # if graph_num == 1:
    #     break
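With one row per input file, the three sentiment networks can be compared side by side; a minimal sketch (assumes each `*_all.gml` file contributed exactly one row, so 'sentiment' is unique per row):
In [ ]:
# side-by-side comparison of the three sentiment networks
network_data.set_index('sentiment')[['n nodes', 'n edges', 'density', 'Gc size']]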
In [5]:
network_data
Out[5]:
In [ ]:
# save dataframe to csv
network_data.to_csv('network_union_df.csv', encoding='utf-8')
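To resume the analysis later without recomputing the metrics, the table can be read back from disk; a minimal sketch (`index_col=0` assumes the default integer index that `to_csv` writes):
In [ ]:
# reload the saved summary table
network_data = pd.read_csv('network_union_df.csv', index_col=0, encoding='utf-8')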