positive graph


In [1]:
# 1_network_df

import networkx as nx
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import os
from glob import glob

# Plot style and pandas display options for the rest of the notebook.
plt.style.use('ggplot')
pd.set_option('display.width', 5000) 
pd.set_option('display.max_columns', 60)

# Path(s) of the directed GML network to analyse. glob() returns a list, so an
# unmatched path yields an empty list and the loading loop then does nothing.
gml_files = glob('../output/network/article_pos1.gml')

In [2]:
def calculate_graph_inf(graph, name=None):
    """Label a graph and print its networkx summary.

    Parameters
    ----------
    graph : networkx graph
        Graph to describe. Its ``name`` attribute is overwritten.
    name : str, optional
        Label stored in ``graph.name``. When omitted, the notebook-level
        ``filename`` variable (assigned by the loading loops) is used, which
        is what the existing one-argument calls rely on.

    Returns
    -------
    None
        The text produced by ``nx.info`` is printed, not returned.
    """
    # The global is only looked up when no explicit label is passed, so the
    # function no longer *requires* a loading loop to have run first.
    graph.name = filename if name is None else name
    # Single-argument print(...) behaves the same on Python 2 and Python 3.
    print(nx.info(graph))

def highest_centrality(cent_dict):
    """Return the ``(node, value)`` pair with the largest centrality value.

    Parameters
    ----------
    cent_dict : dict
        Mapping of node -> centrality value.

    Returns
    -------
    tuple
        ``(node, value)`` for the highest value. When several nodes share the
        highest value, the largest node key wins (the same tie-break the
        previous sort-based implementation produced).

    Raises
    ------
    ValueError
        If ``cent_dict`` is empty.
    """
    if not cent_dict:
        raise ValueError("highest_centrality() needs a non-empty centrality dictionary")
    # .items() exists on Python 2 and 3 (iteritems() is Python-2-only), and
    # max() finds the winner in one pass instead of sorting every entry.
    node, value = max(cent_dict.items(), key=lambda item: (item[1], item[0]))
    return (node, value)

2. convert to undirected


In [3]:
# Load every GML file, derive its undirected version and print a summary of
# both. After the loop, dgraph / ugraph / filename keep the values of the last
# file processed; later cells read them.
for graph_num, gml_graph in enumerate(gml_files):
    dgraph = nx.read_gml(gml_graph)
    ugraph = dgraph.to_undirected() # to undirected graph
    # Earlier experiment with reciprocal edges, kept disabled:
    #U = dgraph.to_undirected(reciprocal=True)
    #e = U.edges()
    #ugraph.add_edges_from(e)
    # filename must be assigned before calculate_graph_inf(), which reads it
    # as a global to label the graph.
    (filepath, filename) = os.path.split(gml_graph)
    print('-' * 10)
    print(gml_graph)
    calculate_graph_inf(dgraph)
    calculate_graph_inf(ugraph)


----------
../output/network/article_pos1.gml
Name: article_pos1.gml
Type: MultiDiGraph
Number of nodes: 652
Number of edges: 1140
Average in degree:   1.7485
Average out degree:   1.7485
Name: article_pos1.gml
Type: MultiGraph
Number of nodes: 652
Number of edges: 1094
Average degree:   3.3558

In [4]:
# save undirected gml
#nx.write_gml(ugraph, "../output/network/article_u_pos.gml")

dgraph = directed; ugraph = undirected

undirected graph


In [5]:
# Load the saved undirected graph.
# NOTE(review): this path matches the nx.write_gml() call that is commented
# out in the previous cell, so the file presumably has to be written once
# before this glob matches anything; confirm.
gml_files = glob('../output/network/article_u_pos.gml')

In [6]:
# ugraph = undirected; dgraph = directed
# Re-read the undirected graph from disk and print its summary. This rebinds
# the global ugraph (and filename) used by the cells that follow.
for graph_num, gml_graph in enumerate(gml_files):
    ugraph = nx.read_gml(gml_graph)
    # filename is read as a global inside calculate_graph_inf()
    (filepath, filename) = os.path.split(gml_graph)
    print('-' * 10)
    print(gml_graph)
    calculate_graph_inf(ugraph)


----------
../output/network/article_u_pos.gml
Name: article_u_pos.gml
Type: MultiGraph
Number of nodes: 652
Number of edges: 1094
Average degree:   3.3558

3. all nodes table


In [7]:
# 2_node_df: per-node table of degree and centrality measures
# `data` collects one (name, sentiment) row per graph; `combined_df` collects
# the node rows of every graph processed.
data_columns = ['name', 'sentiment']
combined_df = pd.DataFrame()
data = pd.DataFrame(columns=data_columns)

In [8]:
# calculate variables and save into list
sent = "positive"
graph_values = {'name': filename,
                'sentiment': sent
                }
# pd.concat is the replacement for DataFrame.append, which pandas 2.0 removed.
data = pd.concat([data, pd.DataFrame([graph_values], columns=data.columns)],
                 ignore_index=True)

# Each measure below is computed exactly once (betweenness and closeness are
# the expensive ones), then turned into a one-column frame indexed by node.

# degree; dict() also covers networkx versions where nx.degree returns a view
degree = dict(nx.degree(ugraph))
deg_df = pd.DataFrame.from_dict(degree, orient='index')
deg_df.columns = ['degree']

# degree centrality
deg_cent = nx.degree_centrality(ugraph)
dc_df = pd.DataFrame.from_dict(deg_cent, orient='index')
dc_df.columns = ['deg cent']

# betweenness centrality
bet_cent = nx.betweenness_centrality(ugraph)
bc_df = pd.DataFrame.from_dict(bet_cent, orient='index')
bc_df.columns = ['bet cent']

# closeness centrality
clo_cent = nx.closeness_centrality(ugraph)
cc_df = pd.DataFrame.from_dict(clo_cent, orient='index')
cc_df.columns = ['clo cent']

# concat node frames into node_df (aligned on the node index)
frames = [deg_df, dc_df, bc_df, cc_df]
node_df = pd.concat(frames, axis=1)
node_df.index.name = 'node'
node_df = node_df.reset_index()

values = pd.DataFrame(graph_values, columns=('name', 'sentiment'), index=[0])

# df = merges graph_values with node_df for single graph and fill NaNs:
# `values` has a single row, so name/sentiment are NaN from row 1 onwards
# until the forward fill copies them down.
df = pd.concat([values, node_df], axis=1)
df = df.ffill()  # same as fillna(method='ffill'), without the deprecated keyword
combined_df = pd.concat([combined_df, df])

In [9]:
# Display the node table for the entire network (one row per node; pandas
# elides the middle rows in the rendered output).
combined_df


Out[9]:
name sentiment node degree deg cent bet cent clo cent
0 article_u_pos.gml positive neighbors 1 0.001536 0.000000 0.177652
1 article_u_pos.gml positive vitamins 1 0.001536 0.000000 0.011151
2 article_u_pos.gml positive colleges 1 0.001536 0.000000 0.183437
3 article_u_pos.gml positive influenza 2 0.003072 0.000599 0.150718
4 article_u_pos.gml positive parents of autistic children 6 0.009217 0.004474 0.238568
5 article_u_pos.gml positive religious exemption 9 0.013825 0.005750 0.242208
6 article_u_pos.gml positive results 1 0.001536 0.000000 0.193034
7 article_u_pos.gml positive Scott Morrison 1 0.001536 0.000000 0.001536
8 article_u_pos.gml positive repetitive behaviors 1 0.001536 0.000000 0.112424
9 article_u_pos.gml positive Michael Mina 2 0.003072 0.000005 0.003072
10 article_u_pos.gml positive children 26 0.039939 0.066382 0.306372
11 article_u_pos.gml positive Dr. Paul Offit 3 0.004608 0.000014 0.004608
12 article_u_pos.gml positive vaccination schedule 3 0.004608 0.001076 0.233361
13 article_u_pos.gml positive Samantha Page 1 0.001536 0.000000 0.001536
14 article_u_pos.gml positive best-sellers 1 0.001536 0.000000 0.184665
15 article_u_pos.gml positive American Medical Association 5 0.007680 0.000863 0.211846
16 article_u_pos.gml positive Orthodox Jewish communities 1 0.001536 0.000000 0.226795
17 article_u_pos.gml positive fence-sitters 4 0.006144 0.012472 0.248881
18 article_u_pos.gml positive Journal of the American Medical Association 1 0.001536 0.000000 0.189200
19 article_u_pos.gml positive sexually transmitted virus 4 0.006144 0.005859 0.251631
20 article_u_pos.gml positive fear of autism 5 0.007680 0.006918 0.248057
21 article_u_pos.gml positive genetic risk factors for ASD 1 0.001536 0.000000 0.147038
22 article_u_pos.gml positive siblings 1 0.001536 0.000000 0.157090
23 article_u_pos.gml positive resources 2 0.003072 0.001477 0.184730
24 article_u_pos.gml positive risk 1 0.001536 0.000000 0.212361
25 article_u_pos.gml positive vaccine campaign 2 0.003072 0.000000 0.185976
26 article_u_pos.gml positive stiff neck 1 0.001536 0.000000 0.145123
27 article_u_pos.gml positive Faith Assembly 2 0.003072 0.000000 0.005530
28 article_u_pos.gml positive nausea 2 0.003072 0.001356 0.219939
29 article_u_pos.gml positive ill effects 1 0.001536 0.000000 0.175627
... ... ... ... ... ... ... ...
622 article_u_pos.gml positive role 1 0.001536 0.000000 0.175451
623 article_u_pos.gml positive driving factors 3 0.004608 0.000098 0.251510
624 article_u_pos.gml positive sexually active 7 0.010753 0.008428 0.249000
625 article_u_pos.gml positive immunity 6 0.009217 0.003333 0.253457
626 article_u_pos.gml positive expected 1 0.001536 0.000000 0.188587
627 article_u_pos.gml positive sense of urgency 1 0.001536 0.000000 0.241761
628 article_u_pos.gml positive health officials 5 0.007680 0.010986 0.256309
629 article_u_pos.gml positive rubella 13 0.019969 0.024134 0.233882
630 article_u_pos.gml positive former gastroenterologist 1 0.001536 0.000000 0.126575
631 article_u_pos.gml positive varicella vaccine 3 0.004608 0.000005 0.124087
632 article_u_pos.gml positive magnitude of benefits 1 0.001536 0.000000 0.165163
633 article_u_pos.gml positive serogroups 1 0.001536 0.000000 0.198370
634 article_u_pos.gml positive 16 years of age 1 0.001536 0.000000 0.001536
635 article_u_pos.gml positive state vaccination rates 3 0.004608 0.002933 0.157847
636 article_u_pos.gml positive loss of limb 1 0.001536 0.000000 0.167486
637 article_u_pos.gml positive Early Childhood Australia 3 0.004608 0.002756 0.200266
638 article_u_pos.gml positive religious groups 23 0.035330 0.081924 0.288807
639 article_u_pos.gml positive age 26 1 0.001536 0.000000 0.162700
640 article_u_pos.gml positive Robert F. Kennedy Jr. 2 0.003072 0.007541 0.235353
641 article_u_pos.gml positive friends 1 0.001536 0.000000 0.241761
642 article_u_pos.gml positive Catholic Church 1 0.001536 0.000000 0.176634
643 article_u_pos.gml positive Amish 3 0.004608 0.001437 0.220217
644 article_u_pos.gml positive scheduled appointment 1 0.001536 0.000000 0.241761
645 article_u_pos.gml positive meningococcal disease symptoms 4 0.006144 0.004952 0.173074
646 article_u_pos.gml positive Netherlands Reformed Congregation 2 0.003072 0.000000 0.226795
647 article_u_pos.gml positive immune protection 3 0.004608 0.004347 0.237272
648 article_u_pos.gml positive environmental trigger 2 0.003072 0.000505 0.196731
649 article_u_pos.gml positive time 1 0.001536 0.000000 0.213486
650 article_u_pos.gml positive overseas 3 0.004608 0.000870 0.204807
651 article_u_pos.gml positive Tdap vaccine 9 0.013825 0.021138 0.257569

652 rows × 7 columns


In [10]:
# save
#combined_df.to_csv('../output/df/article_u_pos.csv')

4. Draw undirected and directed network


In [11]:
# 7_graph_calculation
def drawIt(graph, what = 'graph'):
    """Draw ``graph`` with a spring layout and show the figure.

    Parameters
    ----------
    graph : networkx graph
        Graph (or subgraph) to draw.
    what : str, optional
        Word used in the printed progress line, e.g. ``'graph'`` or
        ``'component'``.

    Notes
    -----
    Graphs with more than 20 nodes get a dedicated 10x10 inch figure; graphs
    with more than 40 nodes are additionally drawn with small nodes
    (``node_size=70``) and ``font_size=12``. Labels are always shown.
    """
    nsize = graph.number_of_nodes()
    # Single-argument print(...) is valid on Python 2 and Python 3.
    print("Drawing %s of size %s:" % (what, nsize))

    draw_options = {'with_labels': True}
    if nsize > 20:
        plt.figure(figsize=(10, 10))
    if nsize > 40:
        draw_options.update(node_size=70, font_size=12)
    nx.draw_spring(graph, **draw_options)
    plt.show()

def _describe_components(graph, components):
    """Print edge/node/component counts, then draw the graph and each component.

    ``components`` is a list of node collections in the order they should be
    drawn (the callers pass them largest first).
    """
    params = (graph.number_of_edges(), graph.number_of_nodes(), len(components))
    # Single-argument print(...) is valid on Python 2 and Python 3.
    print("Graph has %s edges, %s nodes, %s connected components\n" % params)
    drawIt(graph)
    for nodes in components:
        drawIt(graph.subgraph(nodes), what = 'component')

# for undirected graphs
def describeGraph(graph):
    """Summarise and draw an undirected graph and its connected components.

    Components are drawn from largest to smallest. Nothing is returned; the
    summary is printed and the figures are shown.
    """
    # The previous version also materialised a copy of every component via
    # connected_component_subgraphs() and never used the result; it is gone.
    components = sorted(nx.connected_components(graph), key = len, reverse = True)
    _describe_components(graph, components)

# for directed graphs
def describeGraph_d(graph):
    """Summarise and draw a directed graph and its weakly connected components.

    Components are drawn from largest to smallest. Nothing is returned; the
    summary is printed and the figures are shown.
    """
    components = sorted(nx.weakly_connected_components(graph), key = len, reverse = True)
    _describe_components(graph, components)

In [12]:
# UNDIRECTED network graph: print the edge/node/component counts, then draw
# the whole graph followed by each connected component, largest first.
describeGraph(ugraph)


Graph has 1094 edges, 652 nodes, 21 connected components

Drawing graph of size 652:
Drawing component of size 585:
Drawing component of size 15:
Drawing component of size 7:
Drawing component of size 4:
Drawing component of size 4:
Drawing component of size 3:
Drawing component of size 3:
Drawing component of size 3:
Drawing component of size 3:
Drawing component of size 3:
Drawing component of size 2:
Drawing component of size 2:
Drawing component of size 2:
Drawing component of size 2:
Drawing component of size 2:
Drawing component of size 2:
Drawing component of size 2:
Drawing component of size 2:
Drawing component of size 2:
Drawing component of size 2:
Drawing component of size 2:

In [13]:
# DIRECTED network graph: same report as above, using weakly connected
# components of dgraph.
describeGraph_d(dgraph)


Graph has 1140 edges, 652 nodes, 21 connected components

Drawing graph of size 652:
Drawing component of size 585:
Drawing component of size 15:
Drawing component of size 7:
Drawing component of size 4:
Drawing component of size 4:
Drawing component of size 3:
Drawing component of size 3:
Drawing component of size 3:
Drawing component of size 3:
Drawing component of size 3:
Drawing component of size 2:
Drawing component of size 2:
Drawing component of size 2:
Drawing component of size 2:
Drawing component of size 2:
Drawing component of size 2:
Drawing component of size 2:
Drawing component of size 2:
Drawing component of size 2:
Drawing component of size 2:
Drawing component of size 2:

Undirected graph components


In [14]:
# list of connected component sizes, largest first (undirected graph)
connected_components = sorted((len(c) for c in nx.connected_components(ugraph)), reverse=True)

# generate connected components as subgraphs (undirected graph)
# G.subgraph(nodes).copy() is what connected_component_subgraphs() yielded; that
# helper was removed in networkx 2.4, while this spelling works on old and new versions.
subgraphs = [ugraph.subgraph(c).copy() for c in nx.connected_components(ugraph)]

# greatest component (undirected MultiGraph); picked from the list built above
# instead of copying every component a second time. Note that u_Gc is therefore
# the same object as one entry of `subgraphs`.
u_Gc = max(subgraphs, key=len)
u_Gc.name = "undirected Gc"

In [15]:
# Single-argument print(...) is valid on Python 2 and 3; the two spaces before
# %s reproduce the spacing of the former `print "...= ", value` statement.
print("connected components =  %s" % (connected_components,))
print(nx.info(u_Gc))


connected components =  [585, 15, 7, 4, 4, 3, 3, 3, 3, 3, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2]
Name: undirected Gc
Type: MultiGraph
Number of nodes: 585
Number of edges: 1042
Average degree:   3.5624

Directed graph components


In [16]:
# use directed dgraph: weakly connected components, largest first
components = sorted(nx.weakly_connected_components(dgraph), key = len, reverse = True)
cc = [len(c) for c in components]

# generate connected components as subgraphs
# G.subgraph(nodes).copy() is what weakly_connected_component_subgraphs() yielded;
# that helper was removed in networkx 2.4, while this spelling works on old and new versions.
subgraphs = [dgraph.subgraph(c).copy() for c in nx.weakly_connected_components(dgraph)]

# greatest component; picked from the list built above instead of copying every
# component a second time. d_Gc is therefore the same object as one entry of `subgraphs`.
d_Gc = max(subgraphs, key=len)
d_Gc.name = "directed Gc"

In [17]:
# Single-argument print(...) is valid on Python 2 and 3; the two spaces before
# %s reproduce the spacing of the former `print "...= ", value` statement.
print("connected components =  %s" % (cc,))
print(nx.info(d_Gc))


connected components =  [585, 15, 7, 4, 4, 3, 3, 3, 3, 3, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2]
Name: directed Gc
Type: MultiDiGraph
Number of nodes: 585
Number of edges: 1088
Average in degree:   1.8598
Average out degree:   1.8598

5. Greatest component graph


In [18]:
# finally, greatest components for undirected and directed graphs
# (print(...) with one argument runs on Python 2 and Python 3 alike)
print(nx.info(u_Gc))
print(nx.info(d_Gc))


Name: undirected Gc
Type: MultiGraph
Number of nodes: 585
Number of edges: 1042
Average degree:   3.5624
Name: directed Gc
Type: MultiDiGraph
Number of nodes: 585
Number of edges: 1088
Average in degree:   1.8598
Average out degree:   1.8598

In [19]:
# save Gc
#nx.write_gml(u_Gc, "../output/network/u_Gc_positive2.gml")
#nx.write_gml(d_Gc, "../output/network/d_Gc_positive2.gml")

6. network stats for DIRECTED GC


In [20]:
# load directed Gc
Gc_files = glob('../output/network/d_Gc_positive2.gml')

# Columns of the one-row-per-graph summary table ('avg deg' is currently disabled).
network_data_columns = [
    'name', 'sentiment',
    '# nodes', '# edges',
    'density', 'deg assort coef',
    'avg deg cent', 'avg bet cent', 'avg clo cent',
    'high deg cent', 'high bet cent', 'high clo cent',
    'avg node conn', '# conn comp', 'gc size',
]
network_data = pd.DataFrame(columns=network_data_columns)

In [21]:
# Gc_files: compute one row of network statistics per directed Gc file
network_records = []  # rows are collected here and added to network_data once
for graph_num, gml_graph in enumerate(Gc_files):
    graph = nx.read_gml(gml_graph)
    # filename is read as a global inside calculate_graph_inf()
    (filepath, filename) = os.path.split(gml_graph)
    print('-' * 10)
    print(gml_graph)
    calculate_graph_inf(graph)

    # calculate variables
    sent = "pos"
    nodes = nx.number_of_nodes(graph)
    edges = nx.number_of_edges(graph)
    density = float("{0:.4f}".format(nx.density(graph)))

    # Each centrality is computed once and reused for both the average and the
    # highest-scoring node (previously every measure was computed twice).
    deg_cen = nx.degree_centrality(graph)
    bet_cen = nx.betweenness_centrality(graph)
    clo_cen = nx.closeness_centrality(graph)
    # list(...) is required on Python 3, where dict.values() is a view that
    # numpy cannot turn into a numeric array directly.
    avg_deg_cen = np.array(list(deg_cen.values())).mean()
    avg_bet_cen = np.array(list(bet_cen.values())).mean()
    avg_clo_cen = np.array(list(clo_cen.values())).mean()
    highest_deg_cen = highest_centrality(deg_cen)
    highest_bet_cen = highest_centrality(bet_cen)
    highest_clo_cen = highest_centrality(clo_cen)

    #avg_deg = float("{0:.4f}".format(in_deg + out_deg))
    avg_node_con = float("{0:.4f}".format((nx.average_node_connectivity(graph))))
    deg_assort_coeff = float("{0:.4f}".format((nx.degree_assortativity_coefficient(graph))))
    conn_comp = nx.number_weakly_connected_components(graph) # ugraph
    # Size of the largest weakly connected component; the node sets are enough,
    # so no subgraph copies are built just to take their length.
    Gc = len(max(nx.weakly_connected_components(graph), key=len))

    # save variables into list
    graph_values = {'name':filename,
                    'sentiment':sent,
                    '# nodes':nodes,
                    '# edges':edges,
                    #'avg deg':avg_deg,
                    'density':density,
                    'deg assort coef':deg_assort_coeff,
                    'avg deg cent':"%.4f" % avg_deg_cen,
                    'avg bet cent':"%.4f" % avg_bet_cen,
                    'avg clo cent':"%.4f" % avg_clo_cen,
                    'high deg cent':highest_deg_cen,
                    'high bet cent':highest_bet_cen,
                    'high clo cent':highest_clo_cen,
                    'avg node conn':avg_node_con,
                    '# conn comp':conn_comp,
                    'gc size':Gc
                    }
    network_records.append(graph_values)

# pd.concat is the replacement for DataFrame.append (removed in pandas 2.0);
# doing it once after the loop avoids re-copying the table for every file.
network_data = pd.concat(
    [network_data, pd.DataFrame(network_records, columns=network_data.columns)],
    ignore_index=True)


----------
../output/network/d_Gc_positive2.gml
Name: d_Gc_positive2.gml
Type: MultiDiGraph
Number of nodes: 585
Number of edges: 1088
Average in degree:   1.8598
Average out degree:   1.8598

In [22]:
# Display the summary statistics row for the directed greatest component.
network_data


Out[22]:
name sentiment # nodes # edges density deg assort coef avg deg cent avg bet cent avg clo cent high deg cent high bet cent high clo cent avg node conn # conn comp gc size
0 d_Gc_positive2.gml pos 585.0 1088.0 0.0032 -0.0336 0.0064 0.0018 0.0517 (vaccines, 0.116438356164) (vaccines, 0.0795089514173) (parents, 0.23574182936) 0.2974 1.0 585.0

In [23]:
# save
#network_data.to_csv('../output/df/d_Gc_pos2.csv')

7. network stats for UNDIRECTED GC


In [24]:
# load UNdirected Gc
Gc_files = glob('../output/network/u_Gc_positive2.gml')

# Columns of the one-row-per-graph summary table
# ('avg deg', '# conn comp' and 'gc size' are currently disabled here).
network_data_columns = [
    'name', 'sentiment',
    '# nodes', '# edges',
    'density', 'deg assort coef',
    'avg deg cent', 'avg bet cent', 'avg clo cent',
    'high deg cent', 'high bet cent', 'high clo cent',
    'avg node conn',
]
network_data = pd.DataFrame(columns=network_data_columns)

In [25]:
# Gc_files: compute one row of network statistics per undirected Gc file
network_records = []  # rows are collected here and added to network_data once
for graph_num, gml_graph in enumerate(Gc_files):
    graph = nx.read_gml(gml_graph)
    # filename is read as a global inside calculate_graph_inf()
    (filepath, filename) = os.path.split(gml_graph)
    print('-' * 10)
    print(gml_graph)
    calculate_graph_inf(graph)

    # calculate variables
    sent = "pos"
    nodes = nx.number_of_nodes(graph)
    edges = nx.number_of_edges(graph)
    density = float("{0:.4f}".format(nx.density(graph)))

    # Each centrality is computed once and reused for both the average and the
    # highest-scoring node (previously every measure was computed twice).
    deg_cen = nx.degree_centrality(graph)
    bet_cen = nx.betweenness_centrality(graph)
    clo_cen = nx.closeness_centrality(graph)
    # list(...) is required on Python 3, where dict.values() is a view that
    # numpy cannot turn into a numeric array directly.
    avg_deg_cen = np.array(list(deg_cen.values())).mean()
    avg_bet_cen = np.array(list(bet_cen.values())).mean()
    avg_clo_cen = np.array(list(clo_cen.values())).mean()
    highest_deg_cen = highest_centrality(deg_cen)
    highest_bet_cen = highest_centrality(bet_cen)
    highest_clo_cen = highest_centrality(clo_cen)

    #avg_deg = float("{0:.4f}".format(in_deg + out_deg))
    avg_node_con = float("{0:.4f}".format((nx.average_node_connectivity(graph))))
    deg_assort_coeff = float("{0:.4f}".format((nx.degree_assortativity_coefficient(graph))))
    #conn_comp = nx.number_weakly_connected_components(graph) # ugraph
    #Gc = len(max(nx.weakly_connected_component_subgraphs(graph), key=len))

    # save variables into list
    graph_values = {'name':filename,
                    'sentiment':sent,
                    '# nodes':nodes,
                    '# edges':edges,
                    #'avg deg':avg_deg,
                    'density':density,
                    'deg assort coef':deg_assort_coeff,
                    'avg deg cent':"%.4f" % avg_deg_cen,
                    'avg bet cent':"%.4f" % avg_bet_cen,
                    'avg clo cent':"%.4f" % avg_clo_cen,
                    'high deg cent':highest_deg_cen,
                    'high bet cent':highest_bet_cen,
                    'high clo cent':highest_clo_cen,
                    'avg node conn':avg_node_con
                    #'# conn comp':conn_comp,
                    #'gc size':Gc
                    }
    network_records.append(graph_values)

# pd.concat is the replacement for DataFrame.append (removed in pandas 2.0);
# doing it once after the loop avoids re-copying the table for every file.
network_data = pd.concat(
    [network_data, pd.DataFrame(network_records, columns=network_data.columns)],
    ignore_index=True)


----------
../output/network/u_Gc_positive2.gml
Name: u_Gc_positive2.gml
Type: MultiGraph
Number of nodes: 585
Number of edges: 1042
Average degree:   3.5624

In [26]:
# Display the summary statistics row for the undirected greatest component.
network_data


Out[26]:
name sentiment # nodes # edges density deg assort coef avg deg cent avg bet cent avg clo cent high deg cent high bet cent high clo cent avg node conn
0 u_Gc_positive2.gml pos 585.0 1042.0 0.0061 -0.115 0.0061 0.0060 0.2292 (vaccines, 0.107876712329) (parents, 0.271838812819) (parents, 0.368686868687) 1.3117

In [27]:
# save
#network_data.to_csv('../output/df/u_Gc_pos2.csv')

Gc nodes table (directed & undirected)


In [33]:
# Choose which greatest-component file the node table below is built from:
# the undirected one is active; swap the commented line to use the directed one.
#gml_files = glob('../output/network/d_Gc_positive2.gml')
gml_files = glob('../output/network/u_Gc_positive2.gml')

In [34]:
# 2_node_df: per-node table of degree and centrality measures
# `data` collects one (name, sentiment) row per graph processed.
data_columns = ['name', 'sentiment']
data = pd.DataFrame(columns=data_columns)
# (combined_df is deliberately not reset here)
#combined_df = pd.DataFrame()

In [35]:
# Build the per-node table (degree + three centralities) for each Gc file.
# After the loop, `df` holds the table of the last file processed.
for graph_num, gml_graph in enumerate(gml_files):
    graph = nx.read_gml(gml_graph)
    # filename is read as a global inside calculate_graph_inf()
    (filepath, filename) = os.path.split(gml_graph)
    print('-' * 10)
    print(gml_graph)
    calculate_graph_inf(graph)

    # calculate variables and save into list
    sent = "pos"
    graph_values = {'name':filename,
                    'sentiment':sent
                    }
    # pd.concat is the replacement for DataFrame.append, which pandas 2.0 removed.
    data = pd.concat([data, pd.DataFrame([graph_values], columns=data.columns)],
                     ignore_index=True)

    # Each measure is computed exactly once (previously the three centralities
    # were computed twice per graph).
    # degree; dict() also covers networkx versions where nx.degree returns a view
    degree = dict(nx.degree(graph))
    deg_df = pd.DataFrame.from_dict(degree, orient = 'index')
    deg_df.columns = ['degree']
    # degree centrality
    deg_cent = nx.degree_centrality(graph)
    dc_df = pd.DataFrame.from_dict(deg_cent, orient = 'index')
    dc_df.columns = ['deg cent']
    # betweenness centrality
    bet_cent = nx.betweenness_centrality(graph)
    bc_df = pd.DataFrame.from_dict(bet_cent, orient = 'index')
    bc_df.columns = ['bet cent']
    # closeness centrality
    clo_cent = nx.closeness_centrality(graph)
    cc_df = pd.DataFrame.from_dict(clo_cent, orient = 'index')
    cc_df.columns = ['clo cent']
    # concat node frames into node_df (aligned on the node index)
    frames = [deg_df, dc_df, bc_df, cc_df]
    node_df = pd.concat(frames, axis = 1)
    node_df.index.name = 'node'
    node_df = node_df.reset_index()

    values = pd.DataFrame(graph_values, columns = ('name', 'sentiment'), index = [0])

    # df = merges graph_values with node_df for single graph and fill NaNs:
    # `values` has a single row, so name/sentiment are NaN from row 1 onwards
    # until the forward fill copies them down.
    df = pd.concat([values, node_df], axis = 1)
    df = df.ffill()  # same as fillna(method='ffill'), without the deprecated keyword
    #combined_df = combined_df.append(df)


----------
../output/network/u_Gc_positive2.gml
Name: u_Gc_positive2.gml
Type: MultiGraph
Number of nodes: 585
Number of edges: 1042
Average degree:   3.5624

In [36]:
# Display the node table of the positive greatest component (the `df` left by
# the last iteration of the loop above).
df


Out[36]:
name sentiment node degree deg cent bet cent clo cent
0 u_Gc_positive2.gml pos neighbors 1 0.001712 0.000000 0.198033
1 u_Gc_positive2.gml pos colleges 1 0.001712 0.000000 0.204482
2 u_Gc_positive2.gml pos influenza 2 0.003425 0.000744 0.168009
3 u_Gc_positive2.gml pos parents of autistic children 6 0.010274 0.005560 0.265938
4 u_Gc_positive2.gml pos religious exemption 9 0.015411 0.007146 0.269995
5 u_Gc_positive2.gml pos results 1 0.001712 0.000000 0.215181
6 u_Gc_positive2.gml pos no brainer 1 0.001712 0.000000 0.236725
7 u_Gc_positive2.gml pos repetitive behaviors 1 0.001712 0.000000 0.125322
8 u_Gc_positive2.gml pos children 26 0.044521 0.082502 0.341520
9 u_Gc_positive2.gml pos vaccination schedule 3 0.005137 0.001338 0.260134
10 u_Gc_positive2.gml pos best-sellers 1 0.001712 0.000000 0.205851
11 u_Gc_positive2.gml pos American Medical Association 5 0.008562 0.001073 0.236150
12 u_Gc_positive2.gml pos Orthodox Jewish communities 1 0.001712 0.000000 0.252814
13 u_Gc_positive2.gml pos fence-sitters 4 0.006849 0.015501 0.277435
14 u_Gc_positive2.gml pos Journal of the American Medical Association 1 0.001712 0.000000 0.210906
15 u_Gc_positive2.gml pos sexually transmitted virus 4 0.006849 0.007282 0.280500
16 u_Gc_positive2.gml pos fear of autism 5 0.008562 0.008598 0.276515
17 u_Gc_positive2.gml pos genetic risk factors for ASD 1 0.001712 0.000000 0.163907
18 u_Gc_positive2.gml pos siblings 1 0.001712 0.000000 0.175112
19 u_Gc_positive2.gml pos resources 2 0.003425 0.001836 0.205924
20 u_Gc_positive2.gml pos risk 1 0.001712 0.000000 0.236725
21 u_Gc_positive2.gml pos increased sensitivity to light 1 0.001712 0.000000 0.221128
22 u_Gc_positive2.gml pos choice 1 0.001712 0.000000 0.258407
23 u_Gc_positive2.gml pos stiff neck 1 0.001712 0.000000 0.161773
24 u_Gc_positive2.gml pos nausea 2 0.003425 0.001685 0.245172
25 u_Gc_positive2.gml pos ill effects 1 0.001712 0.000000 0.195776
26 u_Gc_positive2.gml pos cultured cells 3 0.005137 0.002273 0.160307
27 u_Gc_positive2.gml pos spread of infectious diseases 4 0.006849 0.001095 0.261766
28 u_Gc_positive2.gml pos decrease in exemption rates 5 0.008562 0.010310 0.234162
29 u_Gc_positive2.gml pos debunked 1 0.001712 0.000000 0.195776
... ... ... ... ... ... ... ...
555 u_Gc_positive2.gml pos vaccine requirements 7 0.011986 0.003956 0.272897
556 u_Gc_positive2.gml pos role 1 0.001712 0.000000 0.195579
557 u_Gc_positive2.gml pos driving factors 3 0.005137 0.000122 0.280365
558 u_Gc_positive2.gml pos sexually active 7 0.011986 0.010474 0.277567
559 u_Gc_positive2.gml pos immunity 6 0.010274 0.004142 0.282535
560 u_Gc_positive2.gml pos expected 1 0.001712 0.000000 0.210223
561 u_Gc_positive2.gml pos sense of urgency 1 0.001712 0.000000 0.269497
562 u_Gc_positive2.gml pos rubella 13 0.022260 0.029994 0.260714
563 u_Gc_positive2.gml pos former gastroenterologist 1 0.001712 0.000000 0.141097
564 u_Gc_positive2.gml pos varicella vaccine 3 0.005137 0.000006 0.138323
565 u_Gc_positive2.gml pos magnitude of benefits 1 0.001712 0.000000 0.184111
566 u_Gc_positive2.gml pos serogroups 1 0.001712 0.000000 0.221128
567 u_Gc_positive2.gml pos HPV infection 3 0.005137 0.000007 0.233507
568 u_Gc_positive2.gml pos state vaccination rates 3 0.005137 0.003646 0.175957
569 u_Gc_positive2.gml pos loss of limb 1 0.001712 0.000000 0.186701
570 u_Gc_positive2.gml pos Early Childhood Australia 3 0.005137 0.003425 0.223242
571 u_Gc_positive2.gml pos religious groups 23 0.039384 0.101817 0.321940
572 u_Gc_positive2.gml pos age 26 1 0.001712 0.000000 0.181366
573 u_Gc_positive2.gml pos Robert F. Kennedy Jr. 2 0.003425 0.009372 0.262354
574 u_Gc_positive2.gml pos friends 1 0.001712 0.000000 0.269497
575 u_Gc_positive2.gml pos Catholic Church 1 0.001712 0.000000 0.196898
576 u_Gc_positive2.gml pos Amish 3 0.005137 0.001786 0.245481
577 u_Gc_positive2.gml pos scheduled appointment 1 0.001712 0.000000 0.269497
578 u_Gc_positive2.gml pos meningococcal disease symptoms 4 0.006849 0.006155 0.192930
579 u_Gc_positive2.gml pos Netherlands Reformed Congregation 2 0.003425 0.000000 0.252814
580 u_Gc_positive2.gml pos immune protection 3 0.005137 0.005403 0.264493
581 u_Gc_positive2.gml pos environmental trigger 2 0.003425 0.000628 0.219302
582 u_Gc_positive2.gml pos time 1 0.001712 0.000000 0.237979
583 u_Gc_positive2.gml pos overseas 3 0.005137 0.001082 0.228303
584 u_Gc_positive2.gml pos Tdap vaccine 9 0.015411 0.026271 0.287119

585 rows × 7 columns


In [37]:
# save
#df.to_csv('../output/df/d_Gc_nodes_pos2.csv')
#df.to_csv('../output/df/u_Gc_nodes_pos2.csv')

full network node centrality (directed & undirected)


In [46]:
# make sure you're using the right graph
# Single-argument print(...) is valid on Python 2 and 3; the two spaces before
# %s reproduce the spacing of the former `print "...= ", value` statements.
print("gml_files =  %s" % (gml_files,))
print("gml_graph =  %s" % (gml_graph,))


gml_files =  ['../output/network/u_Gc_positive2.gml']
gml_graph =  ../output/network/u_Gc_positive2.gml

In [47]:
# FULL DIRECTED
#graph = nx.read_gml('../output/network/article_pos1.gml')

# FULL UNDIRECTED
graph = nx.read_gml('../output/network/article_u_pos.gml')

# print(...) with one argument runs on Python 2 and Python 3 alike
print(nx.info(graph))


Name: article_pos1.gml
Type: MultiGraph
Number of nodes: 652
Number of edges: 1094
Average degree:   3.3558

In [48]:
# One single-column frame per centrality measure, indexed by node and sorted
# ascending so the most central nodes end up at the bottom.

# degree centrality
dc = nx.degree_centrality(graph)
dc_df = (
    pd.DataFrame.from_dict(dc, orient='index')
    .rename(columns={0: 'degree cent'})
    .sort_values(by='degree cent')
)

# betweenness centrality
bc = nx.betweenness_centrality(graph)
bc_df = (
    pd.DataFrame.from_dict(bc, orient='index')
    .rename(columns={0: 'betweenness cent'})
    .sort_values(by='betweenness cent')
)

# closeness centrality
cc = nx.closeness_centrality(graph)
cc_df = (
    pd.DataFrame.from_dict(cc, orient='index')
    .rename(columns={0: 'closeness cent'})
    .sort_values(by='closeness cent')
)

In [49]:
# Degree centrality per node for the full graph, sorted ascending.
dc_df


Out[49]:
degree cent
neighbors 0.001536
arm 0.001536
elite list 0.001536
sex 0.001536
testing 0.001536
free vaccine 0.001536
Caribbean 0.001536
medical law 0.001536
strong-arm tactics 0.001536
gift from God 0.001536
false concerns 0.001536
Early Childhood Australia's chief executive 0.001536
unvaccinated high-risk children 0.001536
Federal Circuit 0.001536
random cases 0.001536
unethical 0.001536
medical conditions 0.001536
psychiatrist 0.001536
factor 0.001536
rash 0.001536
severe symptoms 0.001536
fear 0.001536
Jewish dietary laws 0.001536
opportunistic infections 0.001536
Sydney, Australia 0.001536
efficacious 0.001536
public schools 0.001536
risk of rubella 0.001536
computer models 0.001536
meningococcal conjugate booster 0.001536
... ...
unvaccinated children 0.016897
vaccine refusal 0.019969
states 0.019969
rubella 0.019969
studies 0.019969
anti-vaccination website 0.021505
herd immunity 0.021505
Gardasil 0.021505
personal belief exemption 0.023041
community 0.024578
Jain study 0.024578
meningococcal vaccine 0.024578
side effects 0.024578
autism risk 0.026114
vaccination exemption 0.026114
disease 0.027650
SB 277 0.030722
measles vaccine 0.033794
religious groups 0.035330
MMR vaccine 0.036866
children 0.039939
anti-vaccination 0.043011
vaccine-autism link 0.047619
meningococcal disease 0.047619
HPV vaccine 0.050691
autism 0.055300
vaccination 0.076805
measles 0.089094
parents 0.089094
vaccines 0.096774

652 rows × 1 columns


In [50]:
# Betweenness centrality per node for the full graph, sorted ascending.
bc_df


Out[50]:
betweenness cent
neighbors 0.000000
public schools 0.000000
behavioral research 0.000000
diarrhea deaths 0.000000
efficacious 0.000000
Early Childhood Australia's chief executive 0.000000
arm 0.000000
elite list 0.000000
sex 0.000000
testing 0.000000
free vaccine 0.000000
Caribbean 0.000000
unconscionable 0.000000
medical law 0.000000
gift from God 0.000000
false concerns 0.000000
unvaccinated high-risk children 0.000000
opportunistic infections 0.000000
Federal Circuit 0.000000
imitation infection 0.000000
random cases 0.000000
unethical 0.000000
opponent of sanity-oriented legislation 0.000000
medical conditions 0.000000
vaccine efficacy 0.000000
strong-arm tactics 0.000000
Department of Public Health Immunization Program 0.000000
MMR vaccine doesn't trigger autism 0.000000
genes 0.000000
argument 0.000000
... ...
CDC 0.019609
vaccinated 0.019861
personal belief exemption 0.020411
studies 0.020874
Tdap vaccine 0.021138
Muslim fundamentalists 0.021258
parents who refuse to vaccinate their children 0.022245
disease 0.022347
rubella 0.024134
vaccination exemption 0.024163
Wakefield study 0.024712
polio vaccine opposition 0.024914
Jain study 0.034253
United States 0.036525
Gardasil 0.039917
side effects 0.041014
measles vaccine 0.042119
SB 277 0.045962
community 0.046205
HPV vaccine 0.058863
autism 0.064317
children 0.066382
meningococcal disease 0.072818
vaccine-autism link 0.073800
religious groups 0.081924
vaccination 0.086563
anti-vaccination 0.101460
measles 0.124398
vaccines 0.175097
parents 0.218725

652 rows × 1 columns


In [51]:
# Closeness centrality per node for the full graph, sorted ascending.
cc_df


Out[51]:
closeness cent
meningococcal conjugate booster 0.001536
autism-linked genes 0.001536
benefit 0.001536
short amount of time 0.001536
critical period 0.001536
16 years of age 0.001536
prenatal development 0.001536
factor 0.001536
government 0.001536
insulin 0.001536
reduced vaccine potency 0.001536
Samantha Page 0.001536
Early Childhood Australia's chief executive 0.001536
peer pressure 0.001536
suboptimal protection 0.001536
Scott Morrison 0.001536
decision to vaccinate 0.001536
no jab, no pay policy 0.001536
compliance 0.001536
Australian social services minister 0.001536
reactions 0.001536
Northern Hemisphere flu vaccine 0.001536
Assembly 0.002048
vaccines are not necessary 0.002048
healing through prayer 0.002048
medical student 0.002048
critics 0.002048
opposition 0.002048
Princeton University 0.002048
Minnesota 0.002048
... ...
Tdap vaccine 0.257569
health care 0.258331
vaccine-preventable diseases 0.259611
infectious disease 0.259998
Jehovah's Witnesses 0.260385
Jews 0.260645
measles vaccine 0.261295
herd immunity 0.262473
vaccine delay 0.262868
developmental disability 0.263794
vaccination exemption 0.264193
religion 0.264727
public health 0.264861
side effects 0.265398
protection 0.269078
vaccine refusal 0.269355
disease 0.271871
personal belief exemption 0.273004
schools 0.273718
vaccination 0.278076
anti-vaccination 0.280008
vaccine-autism link 0.282423
SB 277 0.283340
autism 0.285969
religious groups 0.288807
community 0.289445
measles 0.303356
children 0.306372
vaccines 0.312400
parents 0.330742

652 rows × 1 columns

Gc (greatest component) node centrality (directed & undirected)


In [52]:
# Gc directed
#graph = nx.read_gml('../output/network/d_Gc_positive2.gml')

# Gc undirected
graph = nx.read_gml('../output/network/u_Gc_positive2.gml')  

print nx.info(graph)


Name: undirected Gc
Type: MultiGraph
Number of nodes: 585
Number of edges: 1042
Average degree:   3.5624

In [53]:
def _ranked_centrality(scores, column):
    """Return a one-column DataFrame of centrality scores, sorted ascending.

    scores -- dict mapping node -> centrality value
    column -- label given to the single score column
    """
    frame = pd.DataFrame.from_dict(scores, orient='index')
    frame.columns = [column]
    return frame.sort_values(by=[column])


# Degree centrality of every node in `graph`.
dc = nx.degree_centrality(graph)
dc_df = _ranked_centrality(dc, 'degree cent')

# Betweenness centrality of every node in `graph`.
bc = nx.betweenness_centrality(graph)
bc_df = _ranked_centrality(bc, 'betweenness cent')

# Closeness centrality of every node in `graph`.
cc = nx.closeness_centrality(graph)
cc_df = _ranked_centrality(cc, 'closeness cent')

In [54]:
# Display the degree-centrality table for `graph`, sorted ascending by 'degree cent'.
dc_df


Out[54]:
degree cent
neighbors 0.001712
vaccines cause neurological problems 0.001712
mumps vaccine 0.001712
prophylaxis 0.001712
India 0.001712
Caribbean 0.001712
free vaccine 0.001712
measles experience 0.001712
swelling 0.001712
German measles 0.001712
Muslim leaders 0.001712
revaccinated 0.001712
programs 0.001712
children with family history of autism 0.001712
immune memory cells 0.001712
democrat 0.001712
three-dose course 0.001712
neurological problems 0.001712
childcare benefits 0.001712
worried 0.001712
party lines 0.001712
Sydney, Australia 0.001712
state-level policy 0.001712
bad news 0.001712
testing 0.001712
reduction in HPV 0.001712
women without previous HPV 0.001712
protect the kid next to you 0.001712
sex 0.001712
part of the story 0.001712
... ...
unvaccinated children 0.018836
vaccine delay 0.018836
studies 0.022260
vaccine refusal 0.022260
rubella 0.022260
states 0.022260
herd immunity 0.023973
Gardasil 0.023973
personal belief exemption 0.025685
meningococcal vaccine 0.027397
Jain study 0.027397
community 0.027397
side effects 0.027397
autism risk 0.029110
vaccination exemption 0.029110
disease 0.030822
SB 277 0.034247
measles vaccine 0.037671
religious groups 0.039384
MMR vaccine 0.041096
children 0.044521
anti-vaccination 0.047945
meningococcal disease 0.053082
vaccine-autism link 0.053082
HPV vaccine 0.056507
autism 0.061644
vaccination 0.085616
measles 0.099315
parents 0.099315
vaccines 0.107877

585 rows × 1 columns


In [55]:
# Display the betweenness-centrality table for `graph`, sorted ascending by 'betweenness cent'.
bc_df


Out[55]:
betweenness cent
neighbors 0.000000
threat 0.000000
backfire 0.000000
wealthy regions 0.000000
benefits 0.000000
arm 0.000000
partial protection 0.000000
voluntary 0.000000
members 0.000000
diarrhea deaths 0.000000
caregivers 0.000000
German measles 0.000000
theory 0.000000
11-18 year olds 0.000000
behavioral research 0.000000
Johns Hopkins University 0.000000
state legislatures 0.000000
Prabhupada Village 0.000000
sex 0.000000
mumps vaccine 0.000000
compromise 0.000000
anyone 0.000000
vaccination of pregnant women 0.000000
children with cancer 0.000000
pneumonia deaths 0.000000
small nudges 0.000000
tuition 0.000000
Americans 0.000000
prohibits vaccinating members 0.000000
seizures 0.000000
... ...
CDC 0.024371
vaccinated 0.024684
personal belief exemption 0.025367
studies 0.025942
Tdap vaccine 0.026271
Muslim fundamentalists 0.026421
parents who refuse to vaccinate their children 0.027647
disease 0.027773
rubella 0.029994
vaccination exemption 0.030031
Wakefield study 0.030712
polio vaccine opposition 0.030964
Jain study 0.042571
United States 0.045395
Gardasil 0.049611
side effects 0.050973
measles vaccine 0.052347
SB 277 0.057123
community 0.057426
HPV vaccine 0.073157
autism 0.079935
children 0.082502
meningococcal disease 0.090500
vaccine-autism link 0.091721
religious groups 0.101817
vaccination 0.107583
anti-vaccination 0.126098
measles 0.154606
vaccines 0.217616
parents 0.271839

585 rows × 1 columns


In [56]:
# Display the closeness-centrality table for `graph`, sorted ascending by 'closeness cent'.
cc_df


Out[56]:
closeness cent
unable to speak 0.125322
repetitive behaviors 0.125322
The Lancet 0.133516
Federal Circuit 0.133516
flu vaccine recall 0.137089
rubella vaccine 0.138323
varicella vaccine 0.138323
former gastroenterologist 0.141097
shame 0.141097
children with autism 0.143243
part of the story 0.149667
vaccine information sources 0.151767
MMR vaccine doesn't trigger autism 0.153806
vaccines do not cause autism 0.154049
Department of Public Health Immunization Program 0.156317
reduced effectiveness 0.158653
safety concern 0.158653
GlaxoSmithKline 0.158739
vaccine potency 0.158739
Catholic parents 0.160307
aborted fetuses 0.160307
cultured cells 0.160307
vaccinations should be voluntary 0.161015
stiff neck 0.161773
genetic risk factors for ASD 0.163907
pharmacological interventions 0.163907
behavioral interventions 0.163907
Andrew Wakefield 0.164229
three-dose course 0.166382
one dose of vaccine 0.166382
... ...
Tdap vaccine 0.287119
health care 0.287968
vaccine-preventable diseases 0.289395
infectious disease 0.289826
Jehovah's Witnesses 0.290258
Jews 0.290547
measles vaccine 0.291272
herd immunity 0.292585
vaccine delay 0.293026
developmental disability 0.294058
vaccination exemption 0.294503
religion 0.295099
public health 0.295248
side effects 0.295846
protection 0.299949
vaccine refusal 0.300257
disease 0.303062
personal belief exemption 0.304325
schools 0.305120
vaccination 0.309979
anti-vaccination 0.312133
vaccine-autism link 0.314825
SB 277 0.315846
autism 0.318777
religious groups 0.321940
community 0.322652
measles 0.338159
children 0.341520
vaccines 0.348241
parents 0.368687

585 rows × 1 columns


Cutsets (directed & undirected)


In [57]:
# Gc directed
#graph = nx.read_gml('../output/network/d_Gc_positive2.gml')

# Gc undirected
graph = nx.read_gml('../output/network/u_Gc_positive2.gml')

print nx.info(graph)


Name: undirected Gc
Type: MultiGraph
Number of nodes: 585
Number of edges: 1042
Average degree:   3.5624

In [58]:
print "Greatest component size =", len(graph)


Greatest component size = 585

In [59]:
# returns all minimum k cutsets of an undirected graph
# i.e., the set(s) of nodes of cardinality equal to the node connectivity of G
# thus if removed, would break G into two or more connected components

#cutsets = list(nx.all_node_cuts(graph))  # must be undirected

print "Greatest component size =", len(graph)
#print "# of cutsets =", len(cutsets)

# returns a set of nodes or edges of minimum cardinality that disconnects G
min_ncut = nx.minimum_node_cut(graph)
min_ecut = nx.minimum_edge_cut(graph)

print "Min node cut =", min_ncut
print "Min edge cut =", min_ecut

# min cuts with source and target
print nx.minimum_node_cut(graph, s='vaccines', t='autism')
print nx.minimum_edge_cut(graph, s='vaccines', t='autism')


Greatest component size = 585
Min node cut = set([u'vaccine message'])
Min edge cut = set([(u'anti-vaccination', u'time')])
set([u'protective effect of vaccines', u'families', u'vaccinated children', u'MMR vaccine', u'autism risk', u'anti-vaccination', u'parents', u'genetic predisposition', u'scientists', u'children at higher risk for autism', u'children', u'Jain study'])
set([(u'vaccinated high-risk children', u'autism'), (u'genetic predisposition', u'autism'), (u'children', u'autism'), (u'MMR vaccine', u'autism'), (u'vaccines', u'autism'), (u'scientists', u'autism'), (u'anti-vaccination', u'autism'), (u'children with autistic sibling', u'autism'), (u'parents', u'autism'), (u'vaccinated children and unvaccinated children', u'autism'), (u'families', u'autism'), (u'harmful association', u'autism'), (u'protective effect of vaccines', u'autism'), (u'vaccinated children', u'autism'), (u'children at higher risk for autism', u'autism')])

In [60]:
# read edge labels in min cut for Gc
# change source and target
a = nx.minimum_edge_cut(graph, s='vaccines', t='autism')
#a = nx.minimum_edge_cut(graph)

labels = nx.get_edge_attributes(graph,'edge')
edgelabels = {}
for e in labels.keys():
    e1 = e[0:2]
    edgelabels[e1]=labels[e]

for e in a:
    if edgelabels.has_key(e):
        print e,edgelabels[e]
    else:
        rev_e = e[::-1]
        print rev_e, edgelabels[rev_e]


(u'vaccinated high-risk children', u'autism') are less likely to be diagnosed with
(u'genetic predisposition', u'autism') makes more vulnerable to
(u'children', u'autism') one in 68 kids has some form of
(u'MMR vaccine', u'autism') researchers were unable to find any association with
(u'vaccines', u'autism') cause
(u'scientists', u'autism') remains challenge for
(u'anti-vaccination', u'autism') is driven by fears that shots cause
(u'children with autistic sibling', u'autism') more likely to have
(u'parents', u'autism') who already have a child with autism seem even more concerned
(u'vaccinated children and unvaccinated children', u'autism') severity does not differ between
(u'families', u'autism') remains challenge for
(u'harmful association', u'autism') none between MMR vaccine and
(u'protective effect of vaccines', u'autism') may protect children from
(u'vaccinated children', u'autism') are somewhat less likely to be diagnosed with
(u'children at higher risk for autism', u'autism') actually less likely to receive diagnosis for