positive graph

  1. import full directed network: article_pos1.gml
  2. save as undirected network: u_pos.gml
  • save u_nodes_pos.csv
  • save u_Gc_positive.gml
  • save u_Gc_nodes_pos.csv

1. import full directed network


In [1]:
# 1_network_df

import networkx as nx
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import os
from glob import glob

# Plot style, plus wide pandas display settings for the node/network tables below.
plt.style.use('ggplot')
pd.set_option('display.width', 5000) 
pd.set_option('display.max_columns', 60)

# glob() returns a list of matching paths; the pattern has no wildcard, so the
# list holds at most this one path and is empty if the file does not exist.
gml_files = glob('../output/network/article_pos1.gml')

In [2]:
def calculate_graph_inf(graph, name=None):
    """Label a graph and print its networkx summary.

    Args:
        graph: a networkx graph; its ``name`` attribute is overwritten.
        name: label to assign. When omitted, the notebook-level ``filename``
            variable (set by the most recent load loop) is used, which is the
            original behaviour.

    Side effects:
        Sets ``graph.name`` and prints ``nx.info(graph)``.
    """
    # Fall back to the global only when no explicit name is given, so callers
    # can label a graph without relying on loop state left behind by a cell.
    graph.name = filename if name is None else name
    info = nx.info(graph)
    # Parenthesised single-argument print behaves the same on Python 2 and 3.
    print(info)

def highest_centrality(cent_dict):
    """Return the (node, value) pair with the largest centrality value.

    Args:
        cent_dict: mapping of node -> centrality score, e.g. the dict
            returned by ``nx.degree_centrality``.

    Returns:
        A ``(node, value)`` tuple. Ties on value are broken by the larger
        node key, the same result a descending sort of ``(value, node)``
        pairs would give.

    Raises:
        ValueError: if ``cent_dict`` is empty.
    """
    if not cent_dict:
        raise ValueError("cent_dict is empty; there is no highest-centrality node")
    # items() exists on both Python 2 and 3 (iteritems() is Python 2 only), and
    # max() finds the winner in one pass instead of sorting every entry.
    return max(cent_dict.items(), key=lambda item: (item[1], item[0]))

2. convert to undirected


In [3]:
# Read each directed GML file and build an undirected copy of it.
# dgraph, ugraph and filename stay bound after the loop and are used by later cells.
for graph_num, gml_graph in enumerate(gml_files):
    dgraph = nx.read_gml(gml_graph)
    ugraph = dgraph.to_undirected() # to undirected graph
    # U is a second undirected conversion made with reciprocal=True; its edges
    # are then added to ugraph as bare node pairs (no edge attributes).
    # NOTE(review): presumably this re-adds reciprocal edges that the plain
    # to_undirected() call merged -- confirm against the networkx version in use.
    U = dgraph.to_undirected(reciprocal=True)
    e = U.edges()
    ugraph.add_edges_from(e)
    # calculate_graph_inf reads `filename` as a global to label each graph
    (filepath, filename) = os.path.split(gml_graph)
    print('-' * 10)
    print(gml_graph)
    calculate_graph_inf(dgraph)
    calculate_graph_inf(ugraph)


----------
../output/network/article_pos1.gml
Name: article_pos1.gml
Type: MultiDiGraph
Number of nodes: 652
Number of edges: 1140
Average in degree:   1.7485
Average out degree:   1.7485
Name: article_pos1.gml
Type: MultiGraph
Number of nodes: 652
Number of edges: 1140
Average degree:   3.4969

In [4]:
# save undirected gml (the ugraph built by the conversion loop above)
nx.write_gml(ugraph, "../output/network/u_pos.gml")

dgraph = directed graph; ugraph = undirected graph

undirected graph


In [5]:
# load: point gml_files at the undirected graph file written above
# (no wildcard in the pattern, so the list has at most one entry)
gml_files = glob('../output/network/u_pos.gml')

In [6]:
# ugraph = undirected; dgraph = directed
# Re-read the saved undirected graph. ugraph and filename are rebound here and
# are the values used by the node-table cells that follow.
for graph_num, gml_graph in enumerate(gml_files):
    ugraph = nx.read_gml(gml_graph)
    (filepath, filename) = os.path.split(gml_graph)
    print('-' * 10)
    print(gml_graph)
    calculate_graph_inf(ugraph)


----------
../output/network/u_pos.gml
Name: u_pos.gml
Type: MultiGraph
Number of nodes: 652
Number of edges: 1140
Average degree:   3.4969

3. all nodes table


In [7]:
# 2_node_df: list all nodes and centrality
# `data` receives one (name, sentiment) row per graph; `combined_df` receives the
# per-node rows appended by the next cell.
data_columns = ['name',
                'sentiment'
                ]
data = pd.DataFrame(columns = data_columns)
combined_df = pd.DataFrame()

In [8]:
# Build the per-node table (degree + three centralities) for the full undirected
# network held in `ugraph`, tagged with the graph's file name and sentiment.
# NOTE: `data` and `combined_df` are appended to, so re-running this cell without
# re-running the setup cell above adds the same rows again.
sent = "positive"
graph_values = {'name':filename,
                'sentiment':sent
                }
data = data.append(graph_values, ignore_index=True)

# node degree; dict(...) accepts both a plain dict and a degree view, whichever
# the installed networkx returns
degree = dict(nx.degree(ugraph))
deg_df = pd.DataFrame.from_dict(degree, orient = 'index')
deg_df.columns = ['degree']

# Each centrality is computed exactly once (betweenness in particular is
# expensive on a graph of this size).

# degree centrality
deg_cent = nx.degree_centrality(ugraph)
dc_df = pd.DataFrame.from_dict(deg_cent, orient = 'index')
dc_df.columns = ['deg cent']

# betweenness centrality
bet_cent = nx.betweenness_centrality(ugraph)
bc_df = pd.DataFrame.from_dict(bet_cent, orient = 'index')
bc_df.columns = ['bet cent']

# closeness centrality
clo_cent = nx.closeness_centrality(ugraph)
cc_df = pd.DataFrame.from_dict(clo_cent, orient = 'index')
cc_df.columns = ['clo cent']

# concat node frames into node_df (aligned on the node index), then turn the
# node index into a regular 'node' column
frames = [deg_df, dc_df, bc_df, cc_df]
node_df = pd.concat(frames, axis = 1)
node_df.index.name = 'node'
node_df = node_df.reset_index()

# single-row frame holding the graph-level labels
values = pd.DataFrame(graph_values, columns = ('name', 'sentiment'), index = [0])

# df = merges graph_values with node_df for single graph and fill NaNs:
# only row 0 has name/sentiment after the concat, so forward-fill copies them
# down to every node row
df = pd.concat([values, node_df], axis = 1)
df = df.fillna(method='ffill')
combined_df = combined_df.append(df)

In [9]:
# print entire network
combined_df


Out[9]:
name sentiment node degree deg cent bet cent clo cent
0 u_pos.gml positive neighbors 1 0.001536 0.000000 0.177652
1 u_pos.gml positive vitamins 1 0.001536 0.000000 0.011151
2 u_pos.gml positive colleges 1 0.001536 0.000000 0.183437
3 u_pos.gml positive influenza 2 0.003072 0.000599 0.150718
4 u_pos.gml positive parents of autistic children 6 0.009217 0.004474 0.238568
5 u_pos.gml positive religious exemption 9 0.013825 0.005750 0.242208
6 u_pos.gml positive results 1 0.001536 0.000000 0.193034
7 u_pos.gml positive Scott Morrison 1 0.001536 0.000000 0.001536
8 u_pos.gml positive repetitive behaviors 1 0.001536 0.000000 0.112424
9 u_pos.gml positive Michael Mina 2 0.003072 0.000005 0.003072
10 u_pos.gml positive children 31 0.047619 0.066382 0.306372
11 u_pos.gml positive Dr. Paul Offit 3 0.004608 0.000014 0.004608
12 u_pos.gml positive vaccination schedule 4 0.006144 0.001076 0.233361
13 u_pos.gml positive Samantha Page 1 0.001536 0.000000 0.001536
14 u_pos.gml positive best-sellers 1 0.001536 0.000000 0.184665
15 u_pos.gml positive American Medical Association 5 0.007680 0.000863 0.211846
16 u_pos.gml positive Orthodox Jewish communities 1 0.001536 0.000000 0.226795
17 u_pos.gml positive fence-sitters 4 0.006144 0.012472 0.248881
18 u_pos.gml positive Journal of the American Medical Association 1 0.001536 0.000000 0.189200
19 u_pos.gml positive sexually transmitted virus 4 0.006144 0.005859 0.251631
20 u_pos.gml positive fear of autism 5 0.007680 0.006918 0.248057
21 u_pos.gml positive genetic risk factors for ASD 1 0.001536 0.000000 0.147038
22 u_pos.gml positive siblings 1 0.001536 0.000000 0.157090
23 u_pos.gml positive resources 2 0.003072 0.001477 0.184730
24 u_pos.gml positive risk 1 0.001536 0.000000 0.212361
25 u_pos.gml positive vaccine campaign 2 0.003072 0.000000 0.185976
26 u_pos.gml positive stiff neck 1 0.001536 0.000000 0.145123
27 u_pos.gml positive Faith Assembly 2 0.003072 0.000000 0.005530
28 u_pos.gml positive nausea 2 0.003072 0.001356 0.219939
29 u_pos.gml positive ill effects 1 0.001536 0.000000 0.175627
... ... ... ... ... ... ... ...
622 u_pos.gml positive role 1 0.001536 0.000000 0.175451
623 u_pos.gml positive driving factors 3 0.004608 0.000098 0.251510
624 u_pos.gml positive sexually active 7 0.010753 0.008428 0.249000
625 u_pos.gml positive immunity 6 0.009217 0.003333 0.253457
626 u_pos.gml positive expected 1 0.001536 0.000000 0.188587
627 u_pos.gml positive sense of urgency 1 0.001536 0.000000 0.241761
628 u_pos.gml positive health officials 5 0.007680 0.010986 0.256309
629 u_pos.gml positive rubella 13 0.019969 0.024134 0.233882
630 u_pos.gml positive former gastroenterologist 1 0.001536 0.000000 0.126575
631 u_pos.gml positive varicella vaccine 3 0.004608 0.000005 0.124087
632 u_pos.gml positive magnitude of benefits 1 0.001536 0.000000 0.165163
633 u_pos.gml positive serogroups 1 0.001536 0.000000 0.198370
634 u_pos.gml positive 16 years of age 1 0.001536 0.000000 0.001536
635 u_pos.gml positive state vaccination rates 3 0.004608 0.002933 0.157847
636 u_pos.gml positive loss of limb 1 0.001536 0.000000 0.167486
637 u_pos.gml positive Early Childhood Australia 3 0.004608 0.002756 0.200266
638 u_pos.gml positive religious groups 24 0.036866 0.081924 0.288807
639 u_pos.gml positive age 26 1 0.001536 0.000000 0.162700
640 u_pos.gml positive Robert F. Kennedy Jr. 2 0.003072 0.007541 0.235353
641 u_pos.gml positive friends 1 0.001536 0.000000 0.241761
642 u_pos.gml positive Catholic Church 1 0.001536 0.000000 0.176634
643 u_pos.gml positive Amish 3 0.004608 0.001437 0.220217
644 u_pos.gml positive scheduled appointment 1 0.001536 0.000000 0.241761
645 u_pos.gml positive meningococcal disease symptoms 4 0.006144 0.004952 0.173074
646 u_pos.gml positive Netherlands Reformed Congregation 2 0.003072 0.000000 0.226795
647 u_pos.gml positive immune protection 3 0.004608 0.004347 0.237272
648 u_pos.gml positive environmental trigger 2 0.003072 0.000505 0.196731
649 u_pos.gml positive time 1 0.001536 0.000000 0.213486
650 u_pos.gml positive overseas 3 0.004608 0.000870 0.204807
651 u_pos.gml positive Tdap vaccine 12 0.018433 0.021138 0.257569

652 rows × 7 columns


In [10]:
# save the full-network node table built above
combined_df.to_csv('../output/df/u_nodes_pos.csv')

4. Draw undirected and directed network


In [11]:
# 7_graph_calculation
def drawIt(graph, what = 'graph'):
    """Announce and draw a graph with a spring layout.

    Args:
        graph: networkx graph to draw.
        what: word used in the printed "Drawing ... of size ..." line.

    Graphs with more than 20 nodes get a 10x10 inch figure; graphs with more
    than 40 nodes are also drawn with node size 70 and font size 12.
    """
    node_count = graph.number_of_nodes()
    print("Drawing %s of size %s:" % (what, node_count))

    # a larger canvas only for graphs above 20 nodes
    if node_count > 20:
        plt.figure(figsize=(10, 10))

    # labels are always drawn; explicit node/font sizes only above 40 nodes
    draw_options = {'with_labels': True}
    if node_count > 40:
        draw_options.update(node_size = 70, font_size = 12)

    nx.draw_spring(graph, **draw_options)
    plt.show()

# for undirected graphs
def describeGraph(graph):
    """Print summary counts for an undirected graph, then draw it and each component.

    Args:
        graph: undirected networkx graph.

    Prints the number of edges, nodes and connected components, draws the whole
    graph, then draws every connected component from largest to smallest.
    """
    # Node sets of the components, largest first. The subgraph for each one is
    # built only when it is drawn, so no separate list of subgraph copies is kept.
    components = sorted(nx.connected_components(graph), key = len, reverse = True)
    params = (graph.number_of_edges(), graph.number_of_nodes(), len(components))
    print("Graph has %s edges, %s nodes, %s connected components\n" % params)
    drawIt(graph)
    for sub in components:
        drawIt(graph.subgraph(sub), what = 'component')

# for directed graphs
def describeGraph_d(graph):
    """Print summary counts for a directed graph, then draw it and each component.

    Args:
        graph: directed networkx graph.

    Components are the weakly connected components. Prints the number of edges,
    nodes and components, draws the whole graph, then draws every component
    from largest to smallest.
    """
    # Node sets of the weak components, largest first. The subgraph for each one
    # is built only when it is drawn, so no separate list of copies is kept.
    components = sorted(nx.weakly_connected_components(graph), key = len, reverse = True)
    params = (graph.number_of_edges(), graph.number_of_nodes(), len(components))
    print("Graph has %s edges, %s nodes, %s connected components\n" % params)
    drawIt(graph)
    for sub in components:
        drawIt(graph.subgraph(sub), what = 'component')

In [12]:
# UNDIRECTED network graph: print counts, draw the whole graph, then each connected component
describeGraph(ugraph)


Graph has 1140 edges, 652 nodes, 21 connected components

Drawing graph of size 652:
Drawing component of size 585:
Drawing component of size 15:
Drawing component of size 7:
Drawing component of size 4:
Drawing component of size 4:
Drawing component of size 3:
Drawing component of size 3:
Drawing component of size 3:
Drawing component of size 3:
Drawing component of size 3:
Drawing component of size 2:
Drawing component of size 2:
Drawing component of size 2:
Drawing component of size 2:
Drawing component of size 2:
Drawing component of size 2:
Drawing component of size 2:
Drawing component of size 2:
Drawing component of size 2:
Drawing component of size 2:
Drawing component of size 2:

In [13]:
# DIRECTED network graph: print counts, draw the whole graph, then each weakly connected component
describeGraph_d(dgraph)


Graph has 1140 edges, 652 nodes, 21 connected components

Drawing graph of size 652:
Drawing component of size 585:
Drawing component of size 15:
Drawing component of size 7:
Drawing component of size 4:
Drawing component of size 4:
Drawing component of size 3:
Drawing component of size 3:
Drawing component of size 3:
Drawing component of size 3:
Drawing component of size 3:
Drawing component of size 2:
Drawing component of size 2:
Drawing component of size 2:
Drawing component of size 2:
Drawing component of size 2:
Drawing component of size 2:
Drawing component of size 2:
Drawing component of size 2:
Drawing component of size 2:
Drawing component of size 2:
Drawing component of size 2:

Undirected graph components


In [14]:
# list of connected components by size (undirected graph), largest first
connected_components = [len(c) for c in sorted(nx.connected_components(ugraph), key=len, reverse=True)]

# generate connected components as subgraphs (undirected graph)
subgraphs = list(nx.connected_component_subgraphs(ugraph))

# greatest component (undirected MultiGraph)
# NOTE: connected_component_subgraphs is called a second time here rather than reusing `subgraphs`
u_Gc = max(nx.connected_component_subgraphs(ugraph), key=len)
u_Gc.name = "undirected Gc"

In [15]:
# component sizes (largest first), then the summary of the greatest undirected component
print "connected components = ", connected_components
print nx.info(u_Gc)


connected components =  [585, 15, 7, 4, 4, 3, 3, 3, 3, 3, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2]
Name: undirected Gc
Type: MultiGraph
Number of nodes: 585
Number of edges: 1088
Average degree:   3.7197

Directed graph components


In [16]:
# use directed dgraph: weakly connected components, largest first, and their sizes
components = sorted(nx.weakly_connected_components(dgraph), key = len, reverse = True)
cc = [len(c) for c in components]

# generate connected components as subgraphs 
# NOTE: this rebinds `subgraphs`, replacing the undirected list built earlier
subgraphs = list(nx.weakly_connected_component_subgraphs(dgraph))

# greatest component
# NOTE: weakly_connected_component_subgraphs is called a second time here rather than reusing `subgraphs`
d_Gc = max(nx.weakly_connected_component_subgraphs(dgraph), key=len)
d_Gc.name = "directed Gc"

In [17]:
# weak component sizes (largest first), then the summary of the greatest directed component
print "connected components = ", cc
print nx.info(d_Gc)


connected components =  [585, 15, 7, 4, 4, 3, 3, 3, 3, 3, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2]
Name: directed Gc
Type: MultiDiGraph
Number of nodes: 585
Number of edges: 1088
Average in degree:   1.8598
Average out degree:   1.8598

5. Greatest component graph


In [18]:
# finally, greatest components for undirected and directed graphs (summaries side by side)
print nx.info(u_Gc)
print nx.info(d_Gc)


Name: undirected Gc
Type: MultiGraph
Number of nodes: 585
Number of edges: 1088
Average degree:   3.7197
Name: directed Gc
Type: MultiDiGraph
Number of nodes: 585
Number of edges: 1088
Average in degree:   1.8598
Average out degree:   1.8598

In [19]:
# save Gc: these two files are the inputs of the network-stats sections below
nx.write_gml(u_Gc, "../output/network/u_Gc_positive.gml")
nx.write_gml(d_Gc, "../output/network/d_Gc_positive.gml")

6. network stats for DIRECTED GC


In [ ]:
# load directed Gc
Gc_files = glob('../output/network/d_Gc_positive.gml')

# Column layout of the one-row-per-graph statistics table; the stats loop in the
# next cell fills a dict with these same keys.
network_data_columns = ['name',
                    'sentiment',
                    '# nodes',
                    '# edges',
                    #'avg deg',
                    'density',
                    'deg assort coef', 
                    'avg deg cent',
                    'avg bet cent',
                    'avg clo cent',
                    'high deg cent',
                    'high bet cent',
                    'high clo cent',
                    'avg node conn',
                    '# conn comp',
                    'gc size'
                    ]
# empty table; rows are appended by the stats loop
network_data = pd.DataFrame(columns = network_data_columns)

In [ ]:
# Gc_files
# Compute network-level statistics for each directed greatest-component file and
# append one row per graph to network_data.
for graph_num, gml_graph in enumerate(Gc_files):
    graph = nx.read_gml(gml_graph)
    (filepath, filename) = os.path.split(gml_graph)
    print('-' * 10)
    print(gml_graph)
    calculate_graph_inf(graph)

    # calculate variables
    sent = "pos"
    nodes = nx.number_of_nodes(graph)
    edges = nx.number_of_edges(graph)
    density = float("{0:.4f}".format(nx.density(graph)))

    # Each centrality is computed once and reused for both the average and the
    # highest-scoring node (betweenness in particular is expensive).
    deg_cen = nx.degree_centrality(graph)
    bet_cen = nx.betweenness_centrality(graph)
    clo_cen = nx.closeness_centrality(graph)
    # list(...) so the values become a numeric array on Python 2 and 3 alike
    avg_deg_cen = np.mean(list(deg_cen.values()))
    avg_bet_cen = np.mean(list(bet_cen.values()))
    avg_clo_cen = np.mean(list(clo_cen.values()))
    highest_deg_cen = highest_centrality(deg_cen)
    highest_bet_cen = highest_centrality(bet_cen)
    highest_clo_cen = highest_centrality(clo_cen)

    #avg_deg = float("{0:.4f}".format(in_deg + out_deg))
    avg_node_con = float("{0:.4f}".format((nx.average_node_connectivity(graph))))
    deg_assort_coeff = float("{0:.4f}".format((nx.degree_assortativity_coefficient(graph))))
    conn_comp = nx.number_weakly_connected_components(graph)
    # size of the largest weakly connected component, counted from its node set
    # (no subgraph copies are needed just to take a length)
    Gc = len(max(nx.weakly_connected_components(graph), key=len))

    # save variables into list; keys match network_data_columns
    graph_values = {'name':filename,
                    'sentiment':sent,
                    '# nodes':nodes,
                    '# edges':edges,
                    #'avg deg':avg_deg,
                    'density':density,
                    'deg assort coef':deg_assort_coeff,
                    'avg deg cent':"%.4f" % avg_deg_cen,
                    'avg bet cent':"%.4f" % avg_bet_cen,
                    'avg clo cent':"%.4f" % avg_clo_cen,
                    'high deg cent':highest_deg_cen,
                    'high bet cent':highest_bet_cen,
                    'high clo cent':highest_clo_cen,
                    'avg node conn':avg_node_con,
                    '# conn comp':conn_comp,
                    'gc size':Gc
                    }
    network_data = network_data.append(graph_values, ignore_index=True)

In [ ]:
# print network data for greatest component
network_data

In [ ]:
# save
#network_data.to_csv('../output/df/d_Gc_pos.csv')

7. network stats for UNDIRECTED GC


In [22]:
# load UNdirected Gc
Gc_files = glob('../output/network/u_Gc_positive.gml')

# Column layout of the one-row-per-graph statistics table; the component-count
# and gc-size columns are disabled in this undirected version.
network_data_columns = ['name',
                    'sentiment',
                    '# nodes',
                    '# edges',
                    #'avg deg',
                    'density',
                    'deg assort coef', 
                    'avg deg cent',
                    'avg bet cent',
                    'avg clo cent',
                    'high deg cent',
                    'high bet cent',
                    'high clo cent',
                    'avg node conn'
                    #'# conn comp',
                    #'gc size'
                    ]
# NOTE: this rebinds network_data, discarding any rows from the directed section
network_data = pd.DataFrame(columns = network_data_columns)

In [23]:
# Gc_files
# Compute network-level statistics for each undirected greatest-component file
# and append one row per graph to network_data.
for graph_num, gml_graph in enumerate(Gc_files):
    graph = nx.read_gml(gml_graph)
    (filepath, filename) = os.path.split(gml_graph)
    print('-' * 10)
    print(gml_graph)
    calculate_graph_inf(graph)

    # calculate variables
    sent = "pos"
    nodes = nx.number_of_nodes(graph)
    edges = nx.number_of_edges(graph)
    density = float("{0:.4f}".format(nx.density(graph)))

    # Each centrality is computed once and reused for both the average and the
    # highest-scoring node (betweenness in particular is expensive).
    deg_cen = nx.degree_centrality(graph)
    bet_cen = nx.betweenness_centrality(graph)
    clo_cen = nx.closeness_centrality(graph)
    # list(...) so the values become a numeric array on Python 2 and 3 alike
    avg_deg_cen = np.mean(list(deg_cen.values()))
    avg_bet_cen = np.mean(list(bet_cen.values()))
    avg_clo_cen = np.mean(list(clo_cen.values()))
    highest_deg_cen = highest_centrality(deg_cen)
    highest_bet_cen = highest_centrality(bet_cen)
    highest_clo_cen = highest_centrality(clo_cen)

    #avg_deg = float("{0:.4f}".format(in_deg + out_deg))
    avg_node_con = float("{0:.4f}".format((nx.average_node_connectivity(graph))))
    deg_assort_coeff = float("{0:.4f}".format((nx.degree_assortativity_coefficient(graph))))
    # component count and greatest-component size are not computed in this
    # undirected version (the matching columns are disabled as well)
    #conn_comp = nx.number_weakly_connected_components(graph) # ugraph
    #Gc = len(max(nx.weakly_connected_component_subgraphs(graph), key=len))

    # save variables into list; keys match network_data_columns
    graph_values = {'name':filename,
                    'sentiment':sent,
                    '# nodes':nodes,
                    '# edges':edges,
                    #'avg deg':avg_deg,
                    'density':density,
                    'deg assort coef':deg_assort_coeff,
                    'avg deg cent':"%.4f" % avg_deg_cen,
                    'avg bet cent':"%.4f" % avg_bet_cen,
                    'avg clo cent':"%.4f" % avg_clo_cen,
                    'high deg cent':highest_deg_cen,
                    'high bet cent':highest_bet_cen,
                    'high clo cent':highest_clo_cen,
                    'avg node conn':avg_node_con
                    #'# conn comp':conn_comp,
                    #'gc size':Gc
                    }
    network_data = network_data.append(graph_values, ignore_index=True)


----------
../output/network/u_Gc_positive.gml
Name: u_Gc_positive.gml
Type: MultiGraph
Number of nodes: 585
Number of edges: 1088
Average degree:   3.7197

In [24]:
# print network data for greatest component
network_data


Out[24]:
name sentiment # nodes # edges density deg assort coef avg deg cent avg bet cent avg clo cent high deg cent high bet cent high clo cent avg node conn
0 u_Gc_positive.gml pos 585.0 1088.0 0.0064 -0.099 0.0064 0.0060 0.2292 (vaccines, 0.116438356164) (parents, 0.271838812819) (parents, 0.368686868687) 1.3117

In [25]:
# save
#network_data.to_csv('../output/df/u_Gc_pos.csv')

Gc nodes table


In [26]:
# input for the Gc node table: the undirected greatest component
# (the directed alternative is left disabled on the next line)
#gml_files = glob('../output/network/d_Gc_positive.gml')
gml_files = glob('../output/network/u_Gc_positive.gml')

In [27]:
# 2_node_df: list all nodes and centrality
# `data` is reset to an empty (name, sentiment) frame; combined_df is
# deliberately NOT reset here (line disabled below).
data_columns = ['name',
                'sentiment'
                ]
data = pd.DataFrame(columns = data_columns)
#combined_df = pd.DataFrame()

In [28]:
# Build the per-node table (degree + three centralities) for each file in
# gml_files. `df` holds the table of the last graph processed; it is not appended
# to combined_df (that step is left disabled at the end of the loop).
for graph_num, gml_graph in enumerate(gml_files):
    graph = nx.read_gml(gml_graph)
    (filepath, filename) = os.path.split(gml_graph)
    print('-' * 10)
    print(gml_graph)
    calculate_graph_inf(graph)

    # calculate variables and save into list
    sent = "pos"
    graph_values = {'name':filename,
                    'sentiment':sent
                    }
    data = data.append(graph_values, ignore_index=True)

    # node degree; dict(...) accepts both a plain dict and a degree view,
    # whichever the installed networkx returns
    degree = dict(nx.degree(graph))
    deg_df = pd.DataFrame.from_dict(degree, orient = 'index')
    deg_df.columns = ['degree']
    # Each centrality is computed exactly once (betweenness is expensive).
    # degree centrality
    deg_cent = nx.degree_centrality(graph)
    dc_df = pd.DataFrame.from_dict(deg_cent, orient = 'index')
    dc_df.columns = ['deg cent']
    # betweenness centrality
    bet_cent = nx.betweenness_centrality(graph)
    bc_df = pd.DataFrame.from_dict(bet_cent, orient = 'index')
    bc_df.columns = ['bet cent']
    # closeness centrality
    clo_cent = nx.closeness_centrality(graph)
    cc_df = pd.DataFrame.from_dict(clo_cent, orient = 'index')
    cc_df.columns = ['clo cent']
    # concat node frames into node_df (aligned on the node index), then turn
    # the node index into a regular 'node' column
    frames = [deg_df, dc_df, bc_df, cc_df]
    node_df = pd.concat(frames, axis = 1)
    node_df.index.name = 'node'
    node_df = node_df.reset_index()

    # single-row frame holding the graph-level labels
    values = pd.DataFrame(graph_values, columns = ('name', 'sentiment'), index = [0])

    # df = merges graph_values with node_df for single graph and fill NaNs:
    # only row 0 has name/sentiment after the concat, so forward-fill copies
    # them down to every node row
    df = pd.concat([values, node_df], axis = 1)
    df = df.fillna(method='ffill')
    #combined_df = combined_df.append(df)


----------
../output/network/u_Gc_positive.gml
Name: u_Gc_positive.gml
Type: MultiGraph
Number of nodes: 585
Number of edges: 1088
Average degree:   3.7197

In [29]:
# print positive gc nodes
df


Out[29]:
name sentiment node degree deg cent bet cent clo cent
0 u_Gc_positive.gml pos neighbors 1 0.001712 0.000000 0.198033
1 u_Gc_positive.gml pos colleges 1 0.001712 0.000000 0.204482
2 u_Gc_positive.gml pos influenza 2 0.003425 0.000744 0.168009
3 u_Gc_positive.gml pos parents of autistic children 6 0.010274 0.005560 0.265938
4 u_Gc_positive.gml pos religious exemption 9 0.015411 0.007146 0.269995
5 u_Gc_positive.gml pos results 1 0.001712 0.000000 0.215181
6 u_Gc_positive.gml pos no brainer 1 0.001712 0.000000 0.236725
7 u_Gc_positive.gml pos repetitive behaviors 1 0.001712 0.000000 0.125322
8 u_Gc_positive.gml pos children 31 0.053082 0.082502 0.341520
9 u_Gc_positive.gml pos vaccination schedule 4 0.006849 0.001338 0.260134
10 u_Gc_positive.gml pos best-sellers 1 0.001712 0.000000 0.205851
11 u_Gc_positive.gml pos American Medical Association 5 0.008562 0.001073 0.236150
12 u_Gc_positive.gml pos Orthodox Jewish communities 1 0.001712 0.000000 0.252814
13 u_Gc_positive.gml pos fence-sitters 4 0.006849 0.015501 0.277435
14 u_Gc_positive.gml pos Journal of the American Medical Association 1 0.001712 0.000000 0.210906
15 u_Gc_positive.gml pos sexually transmitted virus 4 0.006849 0.007282 0.280500
16 u_Gc_positive.gml pos fear of autism 5 0.008562 0.008598 0.276515
17 u_Gc_positive.gml pos genetic risk factors for ASD 1 0.001712 0.000000 0.163907
18 u_Gc_positive.gml pos siblings 1 0.001712 0.000000 0.175112
19 u_Gc_positive.gml pos resources 2 0.003425 0.001836 0.205924
20 u_Gc_positive.gml pos risk 1 0.001712 0.000000 0.236725
21 u_Gc_positive.gml pos increased sensitivity to light 1 0.001712 0.000000 0.221128
22 u_Gc_positive.gml pos choice 1 0.001712 0.000000 0.258407
23 u_Gc_positive.gml pos stiff neck 1 0.001712 0.000000 0.161773
24 u_Gc_positive.gml pos nausea 2 0.003425 0.001685 0.245172
25 u_Gc_positive.gml pos ill effects 1 0.001712 0.000000 0.195776
26 u_Gc_positive.gml pos cultured cells 3 0.005137 0.002273 0.160307
27 u_Gc_positive.gml pos spread of infectious diseases 4 0.006849 0.001095 0.261766
28 u_Gc_positive.gml pos decrease in exemption rates 6 0.010274 0.010310 0.234162
29 u_Gc_positive.gml pos debunked 1 0.001712 0.000000 0.195776
... ... ... ... ... ... ... ...
555 u_Gc_positive.gml pos vaccine requirements 7 0.011986 0.003956 0.272897
556 u_Gc_positive.gml pos role 1 0.001712 0.000000 0.195579
557 u_Gc_positive.gml pos driving factors 3 0.005137 0.000122 0.280365
558 u_Gc_positive.gml pos sexually active 7 0.011986 0.010474 0.277567
559 u_Gc_positive.gml pos immunity 6 0.010274 0.004142 0.282535
560 u_Gc_positive.gml pos expected 1 0.001712 0.000000 0.210223
561 u_Gc_positive.gml pos sense of urgency 1 0.001712 0.000000 0.269497
562 u_Gc_positive.gml pos rubella 13 0.022260 0.029994 0.260714
563 u_Gc_positive.gml pos former gastroenterologist 1 0.001712 0.000000 0.141097
564 u_Gc_positive.gml pos varicella vaccine 3 0.005137 0.000006 0.138323
565 u_Gc_positive.gml pos magnitude of benefits 1 0.001712 0.000000 0.184111
566 u_Gc_positive.gml pos serogroups 1 0.001712 0.000000 0.221128
567 u_Gc_positive.gml pos HPV infection 3 0.005137 0.000007 0.233507
568 u_Gc_positive.gml pos state vaccination rates 3 0.005137 0.003646 0.175957
569 u_Gc_positive.gml pos loss of limb 1 0.001712 0.000000 0.186701
570 u_Gc_positive.gml pos Early Childhood Australia 3 0.005137 0.003425 0.223242
571 u_Gc_positive.gml pos religious groups 24 0.041096 0.101817 0.321940
572 u_Gc_positive.gml pos age 26 1 0.001712 0.000000 0.181366
573 u_Gc_positive.gml pos Robert F. Kennedy Jr. 2 0.003425 0.009372 0.262354
574 u_Gc_positive.gml pos friends 1 0.001712 0.000000 0.269497
575 u_Gc_positive.gml pos Catholic Church 1 0.001712 0.000000 0.196898
576 u_Gc_positive.gml pos Amish 3 0.005137 0.001786 0.245481
577 u_Gc_positive.gml pos scheduled appointment 1 0.001712 0.000000 0.269497
578 u_Gc_positive.gml pos meningococcal disease symptoms 4 0.006849 0.006155 0.192930
579 u_Gc_positive.gml pos Netherlands Reformed Congregation 2 0.003425 0.000000 0.252814
580 u_Gc_positive.gml pos immune protection 3 0.005137 0.005403 0.264493
581 u_Gc_positive.gml pos environmental trigger 2 0.003425 0.000628 0.219302
582 u_Gc_positive.gml pos time 1 0.001712 0.000000 0.237979
583 u_Gc_positive.gml pos overseas 3 0.005137 0.001082 0.228303
584 u_Gc_positive.gml pos Tdap vaccine 12 0.020548 0.026271 0.287119

585 rows × 7 columns


In [30]:
# save
##df.to_csv('../output/df/d_Gc_nodes_pos.csv')

#df.to_csv('../output/df/u_Gc_nodes_pos.csv')

Cutsets


In [31]:
# `graph` is whatever the most recent load loop left bound (here the undirected Gc)
print "Greatest component size =", len(graph)


Greatest component size = 585

In [32]:
# returns all minimum k cutsets of an undirected graph
# i.e., the set(s) of nodes of cardinality equal to the node connectivity of G
# thus if removed, would break G into two or more connected components
# (the all_node_cuts call and its count are disabled below)

#cutsets = list(nx.all_node_cuts(graph))  # must be undirected

print "Greatest component size =", len(graph)
#print "# of cutsets =", len(cutsets)

# returns a set of nodes or edges of minimum cardinality that disconnects G
min_ncut = nx.minimum_node_cut(graph)
min_ecut = nx.minimum_edge_cut(graph)

print "Min node cut =", min_ncut
print "Min edge cut =", min_ecut

# min cuts with source and target: the smallest node / edge sets separating
# the 'vaccines' node from the 'autism' node
print nx.minimum_node_cut(graph, s='vaccines', t='autism')
print nx.minimum_edge_cut(graph, s='vaccines', t='autism')


Greatest component size = 585
Min node cut = set([u'vaccine message'])
Min edge cut = set([(u'anti-vaccination', u'time')])
set([u'protective effect of vaccines', u'families', u'vaccinated children', u'MMR vaccine', u'autism risk', u'anti-vaccination', u'parents', u'genetic predisposition', u'scientists', u'children at higher risk for autism', u'children', u'Jain study'])
set([(u'vaccinated high-risk children', u'autism'), (u'genetic predisposition', u'autism'), (u'children', u'autism'), (u'MMR vaccine', u'autism'), (u'vaccines', u'autism'), (u'scientists', u'autism'), (u'anti-vaccination', u'autism'), (u'children with autistic sibling', u'autism'), (u'parents', u'autism'), (u'vaccinated children and unvaccinated children', u'autism'), (u'families', u'autism'), (u'harmful association', u'autism'), (u'protective effect of vaccines', u'autism'), (u'vaccinated children', u'autism'), (u'children at higher risk for autism', u'autism')])

In [33]:
# read edge labels in min cut for Gc
# change source and target
a = nx.minimum_edge_cut(graph, s='vaccines', t='autism')
#a = nx.minimum_edge_cut(graph)

# Map each (u, v) node pair to its 'edge' label. The slice keeps only the first
# two items of each attribute key, so any extra key component is dropped; if
# several keys share the same pair, the label seen last wins.
labels = nx.get_edge_attributes(graph,'edge')
edgelabels = {}
for e in labels.keys():
    e1 = e[0:2]
    edgelabels[e1]=labels[e]

# The cut may list an edge as (v, u) while its label is stored under (u, v), so
# both orientations are tried. `in` replaces dict.has_key(), which exists only on
# Python 2.
for e in a:
    rev_e = e[::-1]
    if e in edgelabels:
        print("%s %s" % (e, edgelabels[e]))
    elif rev_e in edgelabels:
        print("%s %s" % (rev_e, edgelabels[rev_e]))
    else:
        # no 'edge' attribute in either orientation: report it instead of
        # stopping the listing with a KeyError
        print("%s %s" % (e, "<no label>"))


(u'vaccinated high-risk children', u'autism') are less likely to be diagnosed with
(u'genetic predisposition', u'autism') makes more vulnerable to
(u'children', u'autism') one in 68 kids has some form of
(u'MMR vaccine', u'autism') researchers were unable to find any association with
(u'vaccines', u'autism') cause
(u'scientists', u'autism') remains challenge for
(u'anti-vaccination', u'autism') is driven by fears that shots cause
(u'children with autistic sibling', u'autism') more likely to have
(u'parents', u'autism') who already have a child with autism seem even more concerned
(u'vaccinated children and unvaccinated children', u'autism') severity does not differ between
(u'families', u'autism') remains challenge for
(u'harmful association', u'autism') none between MMR vaccine and
(u'protective effect of vaccines', u'autism') may protect children from
(u'vaccinated children', u'autism') are somewhat less likely to be diagnosed with
(u'children at higher risk for autism', u'autism') actually less likely to receive diagnosis for

full network node centrality


In [34]:
# make sure you're using the right graph
# (both names are left over from the most recent load cell / loop)
print "gml_files = ", gml_files
print "gml_graph = ", gml_graph


gml_files =  ['../output/network/u_Gc_positive.gml']
gml_graph =  ../output/network/u_Gc_positive.gml

In [35]:
# FULL UNDIRECTED
# `graph` is rebound here to the full undirected network, so the centrality
# tables below cover every node rather than only the greatest component.
graph = nx.read_gml('../output/network/u_pos.gml')  

## graph = nx.read_gml('../output/network/article_pos1.gml')  # full network directed

print nx.info(graph)


Name: article_pos1.gml
Type: MultiGraph
Number of nodes: 652
Number of edges: 1140
Average degree:   3.4969

In [36]:
# One single-column table per centrality measure for `graph`, indexed by node and
# sorted ascending, so the most central nodes sit at the bottom of each table.

# degree centrality
dc = nx.degree_centrality(graph)
dc_df = (pd.DataFrame.from_dict(dc, orient = 'index')
         .rename(columns = {0: 'degree cent'})
         .sort_values(by = 'degree cent'))

# betweenness centrality
bc = nx.betweenness_centrality(graph)
bc_df = (pd.DataFrame.from_dict(bc, orient = 'index')
         .rename(columns = {0: 'betweenness cent'})
         .sort_values(by = 'betweenness cent'))

# closeness centrality
cc = nx.closeness_centrality(graph)
cc_df = (pd.DataFrame.from_dict(cc, orient = 'index')
         .rename(columns = {0: 'closeness cent'})
         .sort_values(by = 'closeness cent'))

In [37]:
dc_df


Out[37]:
degree cent
neighbors 0.001536
arm 0.001536
elite list 0.001536
sex 0.001536
testing 0.001536
free vaccine 0.001536
Caribbean 0.001536
medical law 0.001536
strong-arm tactics 0.001536
gift from God 0.001536
false concerns 0.001536
Early Childhood Australia's chief executive 0.001536
unvaccinated high-risk children 0.001536
Federal Circuit 0.001536
random cases 0.001536
unethical 0.001536
medical conditions 0.001536
psychiatrist 0.001536
factor 0.001536
rash 0.001536
severe symptoms 0.001536
fear 0.001536
Jewish dietary laws 0.001536
opportunistic infections 0.001536
Sydney, Australia 0.001536
efficacious 0.001536
public schools 0.001536
healthy people 0.001536
risk of rubella 0.001536
computer models 0.001536
... ...
children at higher risk for autism 0.018433
studies 0.019969
rubella 0.019969
vaccine refusal 0.019969
anti-vaccination website 0.021505
Gardasil 0.021505
states 0.023041
personal belief exemption 0.024578
Jain study 0.024578
side effects 0.024578
community 0.026114
vaccination exemption 0.026114
meningococcal vaccine 0.027650
herd immunity 0.029186
disease 0.029186
autism risk 0.030722
SB 277 0.033794
measles vaccine 0.035330
religious groups 0.036866
MMR vaccine 0.039939
anti-vaccination 0.043011
children 0.047619
meningococcal disease 0.049155
vaccine-autism link 0.050691
HPV vaccine 0.052227
autism 0.059908
vaccination 0.078341
parents 0.089094
measles 0.099846
vaccines 0.104455

652 rows × 1 columns


In [38]:
# Betweenness centrality per node for the graph loaded from u_pos.gml.
# Rows are sorted ascending, so the most central nodes are at the bottom.
bc_df


Out[38]:
betweenness cent
neighbors 0.000000
public schools 0.000000
behavioral research 0.000000
diarrhea deaths 0.000000
efficacious 0.000000
Early Childhood Australia's chief executive 0.000000
arm 0.000000
elite list 0.000000
sex 0.000000
testing 0.000000
free vaccine 0.000000
Caribbean 0.000000
unconscionable 0.000000
medical law 0.000000
gift from God 0.000000
false concerns 0.000000
unvaccinated high-risk children 0.000000
opportunistic infections 0.000000
Federal Circuit 0.000000
imitation infection 0.000000
random cases 0.000000
unethical 0.000000
opponent of sanity-oriented legislation 0.000000
medical conditions 0.000000
vaccine efficacy 0.000000
strong-arm tactics 0.000000
Department of Public Health Immunization Program 0.000000
MMR vaccine doesn't trigger autism 0.000000
genes 0.000000
argument 0.000000
... ...
CDC 0.019609
vaccinated 0.019861
personal belief exemption 0.020411
studies 0.020874
Tdap vaccine 0.021138
Muslim fundamentalists 0.021258
parents who refuse to vaccinate their children 0.022245
disease 0.022347
rubella 0.024134
vaccination exemption 0.024163
Wakefield study 0.024712
polio vaccine opposition 0.024914
Jain study 0.034253
United States 0.036525
Gardasil 0.039917
side effects 0.041014
measles vaccine 0.042119
SB 277 0.045962
community 0.046205
HPV vaccine 0.058863
autism 0.064317
children 0.066382
meningococcal disease 0.072818
vaccine-autism link 0.073800
religious groups 0.081924
vaccination 0.086563
anti-vaccination 0.101460
measles 0.124398
vaccines 0.175097
parents 0.218725

652 rows × 1 columns


In [39]:
# Closeness centrality per node for the graph loaded from u_pos.gml.
# Rows are sorted ascending, so the most central nodes are at the bottom.
cc_df


Out[39]:
closeness cent
meningococcal conjugate booster 0.001536
autism-linked genes 0.001536
benefit 0.001536
short amount of time 0.001536
critical period 0.001536
16 years of age 0.001536
prenatal development 0.001536
factor 0.001536
government 0.001536
insulin 0.001536
reduced vaccine potency 0.001536
Samantha Page 0.001536
Early Childhood Australia's chief executive 0.001536
peer pressure 0.001536
suboptimal protection 0.001536
Scott Morrison 0.001536
decision to vaccinate 0.001536
no jab, no pay policy 0.001536
compliance 0.001536
Australian social services minister 0.001536
reactions 0.001536
Northern Hemisphere flu vaccine 0.001536
Assembly 0.002048
vaccines are not necessary 0.002048
healing through prayer 0.002048
medical student 0.002048
critics 0.002048
opposition 0.002048
Princeton University 0.002048
Minnesota 0.002048
... ...
Tdap vaccine 0.257569
health care 0.258331
vaccine-preventable diseases 0.259611
infectious disease 0.259998
Jehovah's Witnesses 0.260385
Jews 0.260645
measles vaccine 0.261295
herd immunity 0.262473
vaccine delay 0.262868
developmental disability 0.263794
vaccination exemption 0.264193
religion 0.264727
public health 0.264861
side effects 0.265398
protection 0.269078
vaccine refusal 0.269355
disease 0.271871
personal belief exemption 0.273004
schools 0.273718
vaccination 0.278076
anti-vaccination 0.280008
vaccine-autism link 0.282423
SB 277 0.283340
autism 0.285969
religious groups 0.288807
community 0.289445
measles 0.303356
children 0.306372
vaccines 0.312400
parents 0.330742

652 rows × 1 columns

Gc node centrality


In [40]:
# GC UNDIRECTED network, read from u_Gc_positive.gml.
# To analyse the directed Gc instead, read
# '../output/network/d_Gc_positive.gml' on the next line.
# NOTE: this rebinds `graph`, so every later cell that uses `graph` now works
# on the Gc network rather than the full network.
graph = nx.read_gml('../output/network/u_Gc_positive.gml')

# Print the graph summary (name, type, node / edge counts, average degree).
# A single-argument print(...) behaves the same on Python 2 and Python 3.
print(nx.info(graph))


Name: undirected Gc
Type: MultiGraph
Number of nodes: 585
Number of edges: 1088
Average degree:   3.7197

In [41]:
# Recompute the three centrality tables for whatever `graph` currently holds.
# This overwrites dc / bc / cc and dc_df / bc_df / cc_df. Each table has one
# row per node and is sorted ascending (largest values last).

# degree centrality
# NOTE(review): nx.info reports this graph as a MultiGraph; networkx counts
# parallel edges towards a node's degree, so these values reflect edge
# multiplicity -- confirm that is what you want.
dc = nx.degree_centrality(graph)
dc_df = (
    pd.DataFrame.from_dict(dc, orient='index')
    .rename(columns={0: 'degree cent'})
    .sort_values(by=['degree cent'])
)

# betweenness centrality
bc = nx.betweenness_centrality(graph)
bc_df = (
    pd.DataFrame.from_dict(bc, orient='index')
    .rename(columns={0: 'betweenness cent'})
    .sort_values(by=['betweenness cent'])
)

# closeness centrality
cc = nx.closeness_centrality(graph)
cc_df = (
    pd.DataFrame.from_dict(cc, orient='index')
    .rename(columns={0: 'closeness cent'})
    .sort_values(by=['closeness cent'])
)

In [42]:
# Degree centrality per node for the graph loaded from u_Gc_positive.gml.
# Rows are sorted ascending, so the most central nodes are at the bottom.
dc_df


Out[42]:
degree cent
neighbors 0.001712
Merck 0.001712
Sydney, Australia 0.001712
false concerns 0.001712
prophylaxis 0.001712
India 0.001712
German measles 0.001712
revaccinated 0.001712
measles experience 0.001712
programs 0.001712
party lines 0.001712
Muslim leaders 0.001712
gift from God 0.001712
bad news 0.001712
children with family history of autism 0.001712
immune memory cells 0.001712
democrat 0.001712
three-dose course 0.001712
neurological problems 0.001712
childcare benefits 0.001712
worried 0.001712
part of the story 0.001712
reduction in HPV 0.001712
state-level policy 0.001712
protect the kid next to you 0.001712
Afghanistan 0.001712
medical law 0.001712
women without previous HPV 0.001712
consequences 0.001712
one dose Gardasil 0.001712
... ...
Tdap vaccine 0.020548
children at higher risk for autism 0.020548
vaccine refusal 0.022260
studies 0.022260
rubella 0.022260
Gardasil 0.023973
states 0.025685
side effects 0.027397
personal belief exemption 0.027397
Jain study 0.027397
community 0.029110
vaccination exemption 0.029110
meningococcal vaccine 0.030822
disease 0.032534
herd immunity 0.032534
autism risk 0.034247
SB 277 0.037671
measles vaccine 0.039384
religious groups 0.041096
MMR vaccine 0.044521
anti-vaccination 0.047945
children 0.053082
meningococcal disease 0.054795
vaccine-autism link 0.056507
HPV vaccine 0.058219
autism 0.066781
vaccination 0.087329
parents 0.099315
measles 0.111301
vaccines 0.116438

585 rows × 1 columns


In [43]:
# Betweenness centrality per node for the graph loaded from u_Gc_positive.gml.
# Rows are sorted ascending, so the most central nodes are at the bottom.
bc_df


Out[43]:
betweenness cent
neighbors 0.000000
threat 0.000000
backfire 0.000000
wealthy regions 0.000000
benefits 0.000000
arm 0.000000
partial protection 0.000000
voluntary 0.000000
members 0.000000
diarrhea deaths 0.000000
caregivers 0.000000
German measles 0.000000
theory 0.000000
11-18 year olds 0.000000
behavioral research 0.000000
Johns Hopkins University 0.000000
state legislatures 0.000000
Prabhupada Village 0.000000
sex 0.000000
mumps vaccine 0.000000
compromise 0.000000
anyone 0.000000
vaccination of pregnant women 0.000000
children with cancer 0.000000
pneumonia deaths 0.000000
small nudges 0.000000
tuition 0.000000
Americans 0.000000
prohibits vaccinating members 0.000000
seizures 0.000000
... ...
CDC 0.024371
vaccinated 0.024684
personal belief exemption 0.025367
studies 0.025942
Tdap vaccine 0.026271
Muslim fundamentalists 0.026421
parents who refuse to vaccinate their children 0.027647
disease 0.027773
rubella 0.029994
vaccination exemption 0.030031
Wakefield study 0.030712
polio vaccine opposition 0.030964
Jain study 0.042571
United States 0.045395
Gardasil 0.049611
side effects 0.050973
measles vaccine 0.052347
SB 277 0.057123
community 0.057426
HPV vaccine 0.073157
autism 0.079935
children 0.082502
meningococcal disease 0.090500
vaccine-autism link 0.091721
religious groups 0.101817
vaccination 0.107583
anti-vaccination 0.126098
measles 0.154606
vaccines 0.217616
parents 0.271839

585 rows × 1 columns


In [44]:
# Closeness centrality per node for the graph loaded from u_Gc_positive.gml.
# Rows are sorted ascending, so the most central nodes are at the bottom.
cc_df


Out[44]:
closeness cent
unable to speak 0.125322
repetitive behaviors 0.125322
The Lancet 0.133516
Federal Circuit 0.133516
flu vaccine recall 0.137089
rubella vaccine 0.138323
varicella vaccine 0.138323
former gastroenterologist 0.141097
shame 0.141097
children with autism 0.143243
part of the story 0.149667
vaccine information sources 0.151767
MMR vaccine doesn't trigger autism 0.153806
vaccines do not cause autism 0.154049
Department of Public Health Immunization Program 0.156317
reduced effectiveness 0.158653
safety concern 0.158653
GlaxoSmithKline 0.158739
vaccine potency 0.158739
Catholic parents 0.160307
aborted fetuses 0.160307
cultured cells 0.160307
vaccinations should be voluntary 0.161015
stiff neck 0.161773
genetic risk factors for ASD 0.163907
pharmacological interventions 0.163907
behavioral interventions 0.163907
Andrew Wakefield 0.164229
three-dose course 0.166382
one dose of vaccine 0.166382
... ...
Tdap vaccine 0.287119
health care 0.287968
vaccine-preventable diseases 0.289395
infectious disease 0.289826
Jehovah's Witnesses 0.290258
Jews 0.290547
measles vaccine 0.291272
herd immunity 0.292585
vaccine delay 0.293026
developmental disability 0.294058
vaccination exemption 0.294503
religion 0.295099
public health 0.295248
side effects 0.295846
protection 0.299949
vaccine refusal 0.300257
disease 0.303062
personal belief exemption 0.304325
schools 0.305120
vaccination 0.309979
anti-vaccination 0.312133
vaccine-autism link 0.314825
SB 277 0.315846
autism 0.318777
religious groups 0.321940
community 0.322652
measles 0.338159
children 0.341520
vaccines 0.348241
parents 0.368687

585 rows × 1 columns


In [ ]: