Full network measures for all sentiment


In [1]:
import networkx as nx
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import os
from glob import glob

plt.style.use('ggplot')
pd.set_option('display.width', 5000) 
pd.set_option('display.max_columns', 60)

In [2]:
def calculate_graph_inf(graph):
    graph.name = filename
    info = nx.info(graph)
    print info

def highest_centrality(cent_dict):
    """Returns a tuple (node,value) with the node
    with largest value from centrality dictionary."""
    # create ordered tuple of centrality data
    cent_items = [(b,a) for (a,b) in cent_dict.iteritems()]
    # sort in descending order
    cent_items.sort()
    cent_items.reverse()
    return tuple(reversed(cent_items[0]))

Calculate network statistics (undirected)


In [3]:
# run undirected
gml_files = glob('../output/network/article_u_*.gml')

# load undirected
#gml_files = glob('../output/network/article_u_pos.gml')
#gml_files = glob('../output/network/article_u_neg.gml')
#gml_files = glob('../output/network/article_u_neu.gml')

In [4]:
gml_files


Out[4]:
['../output/network/article_u_neg.gml',
 '../output/network/article_u_neu.gml',
 '../output/network/article_u_pos.gml']

In [5]:
network_data_columns = ['name',
                    #'sentiment',
                    '# nodes',
                    '# edges',
                    #'avg deg',
                    'density',
                    'deg assort coef', 
                    'avg deg cent',
                    'avg bet cent',
                    'avg clo cent',
                    'high deg cent',
                    'high bet cent',
                    'high clo cent',
                    'avg node conn',
                    '# conn comp',
                    'gc size'
                    ]
network_data = pd.DataFrame(columns = network_data_columns)

In [6]:
for graph_num, gml_graph in enumerate(gml_files):
    graph = nx.read_gml(gml_graph)
    (filepath, filename) = os.path.split(gml_graph)
    print('-' * 10)
    print(gml_graph)
    calculate_graph_inf(graph)
    
    # change
    #sent = ""
    nodes = nx.number_of_nodes(graph)
    edges = nx.number_of_edges(graph)
    density = float("{0:.4f}".format(nx.density(graph)))
    avg_deg_cen = np.array(nx.degree_centrality(graph).values()).mean()
    avg_bet_cen = np.array(nx.betweenness_centrality(graph).values()).mean()
    avg_clo_cen = np.array(nx.closeness_centrality(graph).values()).mean()
    #avg_deg = float("{0:.4f}".format(in_deg + out_deg))
    avg_node_con = float("{0:.4f}".format((nx.average_node_connectivity(graph))))
    deg_assort_coeff = float("{0:.4f}".format((nx.degree_assortativity_coefficient(graph))))
    #conn_comp = nx.number_weakly_connected_components(graph) # not for undirected
    deg_cen = nx.degree_centrality(graph)
    bet_cen = nx.betweenness_centrality(graph)
    clo_cen = nx.closeness_centrality(graph)
    highest_deg_cen = highest_centrality(deg_cen)
    highest_bet_cen = highest_centrality(bet_cen)
    highest_clo_cen = highest_centrality(clo_cen)
    #Gc = len(max(nx.weakly_connected_component_subgraphs(graph), key=len))

    # save variables into list
    graph_values = {'name':filename,
                    #'sentiment':sent,
                    '# nodes':nodes,
                    '# edges':edges,
                    #'avg deg':avg_deg,
                    'density':density,
                    'deg assort coef':deg_assort_coeff,
                    'avg deg cent':"%.4f" % avg_deg_cen,
                    'avg bet cent':"%.4f" % avg_bet_cen,
                    'avg clo cent':"%.4f" % avg_clo_cen,
                    'high deg cent':highest_deg_cen,
                    'high bet cent':highest_bet_cen,
                    'high clo cent':highest_clo_cen,
                    'avg node conn':avg_node_con
                    #'# conn comp':conn_comp,
                    #'gc size':Gc
                    }
    network_data = network_data.append(graph_values, ignore_index=True)


----------
../output/network/article_u_neg.gml
Name: article_u_neg.gml
Type: MultiGraph
Number of nodes: 1257
Number of edges: 1854
Average degree:   2.9499
----------
../output/network/article_u_neu.gml
Name: article_u_neu.gml
Type: MultiGraph
Number of nodes: 201
Number of edges: 236
Average degree:   2.3483
----------
../output/network/article_u_pos.gml
Name: article_u_pos.gml
Type: MultiGraph
Number of nodes: 652
Number of edges: 1094
Average degree:   3.3558

In [7]:
# print network data
network_data


Out[7]:
name # nodes # edges density deg assort coef avg deg cent avg bet cent avg clo cent high deg cent high bet cent high clo cent avg node conn # conn comp gc size
0 article_u_neg.gml 1257.0 1854.0 0.0023 -0.0340 0.0023 0.0025 0.1778 (vaccines, 0.0955414012739) (vaccines, 0.269748793744) (vaccines, 0.324810970236) 0.9735 NaN NaN
1 article_u_neu.gml 201.0 236.0 0.0117 -0.2586 0.0117 0.0210 0.1120 (SB 277, 0.155) (vaccines, 0.414983249581) (vaccines, 0.198489010989) 0.7500 NaN NaN
2 article_u_pos.gml 652.0 1094.0 0.0052 -0.0961 0.0052 0.0043 0.1850 (vaccines, 0.0967741935484) (parents, 0.218725048513) (parents, 0.330742137194) 1.0567 NaN NaN

In [8]:
# save
#network_data.to_csv('../output/df/all-stats-undirected.csv')

Calculate network statistics (directed)


In [11]:
# run directed
gml_files = glob('../output/network/article_*1.gml')

# load directed
#gml_files = glob('../output/network/article_pos1.gml')
#gml_files = glob('../output/network/article_neg1.gml')
#gml_files = glob('../output/network/article_neu1.gml')

In [12]:
gml_files


Out[12]:
['../output/network/article_neg1.gml',
 '../output/network/article_neu1.gml',
 '../output/network/article_pos1.gml']

In [13]:
for graph_num, gml_graph in enumerate(gml_files):
    graph = nx.read_gml(gml_graph)
    (filepath, filename) = os.path.split(gml_graph)
    print('-' * 10)
    print(gml_graph)
    calculate_graph_inf(graph)
    
    # change
    #sent = ""
    nodes = nx.number_of_nodes(graph)
    edges = nx.number_of_edges(graph)
    density = float("{0:.4f}".format(nx.density(graph)))
    avg_deg_cen = np.array(nx.degree_centrality(graph).values()).mean()
    avg_bet_cen = np.array(nx.betweenness_centrality(graph).values()).mean()
    avg_clo_cen = np.array(nx.closeness_centrality(graph).values()).mean()
    #avg_deg = float("{0:.4f}".format(in_deg + out_deg))
    avg_node_con = float("{0:.4f}".format((nx.average_node_connectivity(graph))))
    deg_assort_coeff = float("{0:.4f}".format((nx.degree_assortativity_coefficient(graph))))
    #conn_comp = nx.number_weakly_connected_components(graph) # not for undirected
    deg_cen = nx.degree_centrality(graph)
    bet_cen = nx.betweenness_centrality(graph)
    clo_cen = nx.closeness_centrality(graph)
    highest_deg_cen = highest_centrality(deg_cen)
    highest_bet_cen = highest_centrality(bet_cen)
    highest_clo_cen = highest_centrality(clo_cen)
    #Gc = len(max(nx.weakly_connected_component_subgraphs(graph), key=len))

    # save variables into list
    graph_values = {'name':filename,
                    #'sentiment':sent,
                    '# nodes':nodes,
                    '# edges':edges,
                    #'avg deg':avg_deg,
                    'density':density,
                    'deg assort coef':deg_assort_coeff,
                    'avg deg cent':"%.4f" % avg_deg_cen,
                    'avg bet cent':"%.4f" % avg_bet_cen,
                    'avg clo cent':"%.4f" % avg_clo_cen,
                    'high deg cent':highest_deg_cen,
                    'high bet cent':highest_bet_cen,
                    'high clo cent':highest_clo_cen,
                    'avg node conn':avg_node_con
                    #'# conn comp':conn_comp,
                    #'gc size':Gc
                    }
    network_data = network_data.append(graph_values, ignore_index=True)


----------
../output/network/article_neg1.gml
Name: article_neg1.gml
Type: MultiDiGraph
Number of nodes: 1257
Number of edges: 1898
Average in degree:   1.5099
Average out degree:   1.5099
----------
../output/network/article_neu1.gml
Name: article_neu1.gml
Type: MultiDiGraph
Number of nodes: 201
Number of edges: 241
Average in degree:   1.1990
Average out degree:   1.1990
----------
../output/network/article_pos1.gml
Name: article_pos1.gml
Type: MultiDiGraph
Number of nodes: 652
Number of edges: 1140
Average in degree:   1.7485
Average out degree:   1.7485

In [14]:
# print network data
network_data


Out[14]:
name # nodes # edges density deg assort coef avg deg cent avg bet cent avg clo cent high deg cent high bet cent high clo cent avg node conn # conn comp gc size
0 article_u_neg.gml 1257.0 1854.0 0.0023 -0.0340 0.0023 0.0025 0.1778 (vaccines, 0.0955414012739) (vaccines, 0.269748793744) (vaccines, 0.324810970236) 0.9735 NaN NaN
1 article_u_neu.gml 201.0 236.0 0.0117 -0.2586 0.0117 0.0210 0.1120 (SB 277, 0.155) (vaccines, 0.414983249581) (vaccines, 0.198489010989) 0.7500 NaN NaN
2 article_u_pos.gml 652.0 1094.0 0.0052 -0.0961 0.0052 0.0043 0.1850 (vaccines, 0.0967741935484) (parents, 0.218725048513) (parents, 0.330742137194) 1.0567 NaN NaN
3 article_neg1.gml 1257.0 1898.0 0.0012 0.0012 0.0024 0.0005 0.0329 (vaccines, 0.106687898089) (vaccines, 0.0689197216817) (vaccine industry, 0.158231271073) 0.1757 NaN NaN
4 article_neu1.gml 201.0 241.0 0.0060 -0.1490 0.0120 0.0007 0.0137 (SB 277, 0.165) (children, 0.0163819095477) (children, 0.113061594203) 0.0449 NaN NaN
5 article_pos1.gml 652.0 1140.0 0.0027 -0.0194 0.0054 0.0013 0.0418 (vaccines, 0.1044546851) (vaccines, 0.0639739376272) (parents, 0.211479613435) 0.2395 NaN NaN

In [15]:
# save
#network_data.to_csv('../output/df/all-stats-directed.csv')

all nodes table


In [ ]:
# run directed
#gml_files = glob('../output/network/article_*.gml')

# run undirected
#gml_files = glob('../output/network/article_u_*.gml')


# load directed
#gml_files = glob('../output/network/article_pos1.gml')
#gml_files = glob('../output/network/article_neg1.gml')
#gml_files = glob('../output/network/article_neu1.gml')

# load undirected
#gml_files = glob('../output/network/article_u_pos.gml')
#gml_files = glob('../output/network/article_u_neg.gml')
#gml_files = glob('../output/network/article_u_neu.gml')

In [ ]:
data_columns = ['name'
                ]
data = pd.DataFrame(columns = data_columns)
combined_df = pd.DataFrame()

In [ ]:
for graph_num, gml_graph in enumerate(gml_files):
    graph = nx.read_gml(gml_graph)
    (filepath, filename) = os.path.split(gml_graph)
    print('-' * 10)
    print(gml_graph)
    calculate_graph_inf(graph)

    ## calculate variables and save into list
    #sent = ""    
    deg_cent = nx.degree_centrality(graph)
    bet_cent = nx.betweenness_centrality(graph)
    clo_cent = nx.closeness_centrality(graph)
    graph_values = {'name':filename,
                    'sentiment':sent
                    }
    data = data.append(graph_values, ignore_index=True)

    degree = nx.degree(graph)
    deg_df = pd.DataFrame.from_dict(degree, orient = 'index')
    deg_df.columns = ['degree']
    # degree centrality
    deg_cent = nx.degree_centrality(graph)
    dc_df = pd.DataFrame.from_dict(deg_cent, orient = 'index')
    dc_df.columns = ['deg cent']
    # betweenness centrality
    bet_cent = nx.betweenness_centrality(graph)
    bc_df = pd.DataFrame.from_dict(bet_cent, orient = 'index')
    bc_df.columns = ['bet cent']
    # closeness centrality
    clo_cent = nx.closeness_centrality(graph)
    cc_df = pd.DataFrame.from_dict(clo_cent, orient = 'index')
    cc_df.columns = ['clo cent']
    # concat node frames into node_df
    frames = [deg_df, dc_df, bc_df, cc_df]
    node_df = pd.concat(frames, axis = 1)
    node_df.index.name = 'node'
    node_df = node_df.reset_index()

    values = pd.DataFrame(graph_values, columns = ('name', 'sentiment'), index = [0])
    
    # df = merges graph_values with node_df for single graph and fill NaNs
    df = pd.concat([values, node_df], axis = 1)
    df = df.fillna(method='ffill')
    combined_df = combined_df.append(df)

In [ ]:
# print entire network
combined_df