Gc positive graph

imports "article_pos1.gml"

  • saves "nodes_df_positive.csv"
    • node labels, degrees, and centralities for entire network
  • saves "Gc_positive.gml"

imports "Gc_positive.gml"

  • saves "Gc_df_pos.csv"
    • node labels, degrees, and centralities for greatest component

In [1]:
# 1_network_df

import networkx as nx
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import os
from glob import glob

plt.style.use('ggplot')
pd.set_option('display.width', 5000) 
pd.set_option('display.max_columns', 60)

gml_files = glob('../output/network/article_pos1.gml')

In [2]:
def calculate_graph_inf(graph):
    graph.name = filename
    info = nx.info(graph)
    print info
    #plt.figure(figsize=(10,10))
    #nx.draw_spring(graph, arrows=True, with_labels=True)

def highest_centrality(cent_dict):
    """Returns a tuple (node,value) with the node
    with largest value from centrality dictionary."""
    # create ordered tuple of centrality data
    cent_items = [(b,a) for (a,b) in cent_dict.iteritems()]
    # sort in descending order
    cent_items.sort()
    cent_items.reverse()
    return tuple(reversed(cent_items[0]))

all nodes table


In [57]:
# 2_node_df: list all nodes and centrality
data_columns = ['name',
                'sentiment'
                ]
data = pd.DataFrame(columns = data_columns)
combined_df = pd.DataFrame()

In [4]:
# graph = directed, ugraph = undirected
for graph_num, gml_graph in enumerate(gml_files):
    graph = nx.read_gml(gml_graph)
    ugraph = graph.to_undirected() # to undirected graph
    U = graph.to_undirected(reciprocal=True)
    e = U.edges()
    ugraph.add_edges_from(e)
    (filepath, filename) = os.path.split(gml_graph)
    print('-' * 10)
    print(gml_graph)
    calculate_graph_inf(graph)
    calculate_graph_inf(ugraph)

    ## calculate variables and save into list
    sent = "pos"    
    deg_cent = nx.degree_centrality(graph)
    bet_cent = nx.betweenness_centrality(graph)
    clo_cent = nx.closeness_centrality(graph)
    graph_values = {'name':filename,
                    'sentiment':sent
                    }
    data = data.append(graph_values, ignore_index=True)

    degree = nx.degree(graph)
    deg_df = pd.DataFrame.from_dict(degree, orient = 'index')
    deg_df.columns = ['degree']
    # degree centrality
    deg_cent = nx.degree_centrality(graph)
    dc_df = pd.DataFrame.from_dict(deg_cent, orient = 'index')
    dc_df.columns = ['deg cent']
    # betweenness centrality
    bet_cent = nx.betweenness_centrality(graph)
    bc_df = pd.DataFrame.from_dict(bet_cent, orient = 'index')
    bc_df.columns = ['bet cent']
    # closeness centrality
    clo_cent = nx.closeness_centrality(graph)
    cc_df = pd.DataFrame.from_dict(clo_cent, orient = 'index')
    cc_df.columns = ['clo cent']
    # concat node frames into node_df
    frames = [deg_df, dc_df, bc_df, cc_df]
    node_df = pd.concat(frames, axis = 1)
    node_df.index.name = 'node'
    node_df = node_df.reset_index()

    values = pd.DataFrame(graph_values, columns = ('name', 'sentiment'), index = [0])
    
    # df = merges graph_values with node_df for single graph and fill NaNs
    df = pd.concat([values, node_df], axis = 1)
    df = df.fillna(method='ffill')
    combined_df = combined_df.append(df)


----------
../output/network/article_pos1.gml
Name: article_pos1.gml
Type: MultiDiGraph
Number of nodes: 652
Number of edges: 1140
Average in degree:   1.7485
Average out degree:   1.7485
Name: article_pos1.gml
Type: MultiGraph
Number of nodes: 652
Number of edges: 1140
Average degree:   3.4969

In [5]:
# what the network looks like without adding back edges e = U.edges()

#for graph_num, gml_graph in enumerate(gml_files):
#    graph2 = nx.read_gml(gml_graph)
#    ugraph2 = graph.to_undirected() # to undirected graph
#    U2 = graph.to_undirected(reciprocal=True)
#    (filepath, filename) = os.path.split(gml_graph)
#    print('-' * 10)
#    print(gml_graph)
#    calculate_graph_inf(graph2)
#    calculate_graph_inf(ugraph2)

In [6]:
# print entire network
combined_df


Out[6]:
name sentiment node degree deg cent bet cent clo cent
0 article_pos1.gml pos neighbors 1 0.001536 0.000000 0.000000
1 article_pos1.gml pos vitamins 1 0.001536 0.000000 0.000000
2 article_pos1.gml pos colleges 1 0.001536 0.000000 0.000000
3 article_pos1.gml pos influenza 2 0.003072 0.000000 0.000000
4 article_pos1.gml pos parents of autistic children 6 0.009217 0.000000 0.143817
5 article_pos1.gml pos religious exemption 9 0.013825 0.002346 0.118709
6 article_pos1.gml pos vaccine campaign 2 0.003072 0.000000 0.106358
7 article_pos1.gml pos Scott Morrison 1 0.001536 0.000000 0.001536
8 article_pos1.gml pos repetitive behaviors 1 0.001536 0.000000 0.000000
9 article_pos1.gml pos Michael Mina 2 0.003072 0.000000 0.003072
10 article_pos1.gml pos children 31 0.047619 0.035493 0.164520
11 article_pos1.gml pos Dr. Paul Offit 3 0.004608 0.000000 0.004608
12 article_pos1.gml pos vaccination schedule 4 0.006144 0.000610 0.128124
13 article_pos1.gml pos Samantha Page 1 0.001536 0.000000 0.001536
14 article_pos1.gml pos best-sellers 1 0.001536 0.000000 0.000000
15 article_pos1.gml pos American Medical Association 5 0.007680 0.000000 0.115518
16 article_pos1.gml pos Orthodox Jewish communities 1 0.001536 0.000000 0.000000
17 article_pos1.gml pos fence-sitters 4 0.006144 0.000000 0.136112
18 article_pos1.gml pos Journal of the American Medical Association 1 0.001536 0.000000 0.000000
19 article_pos1.gml pos sexually transmitted virus 4 0.006144 0.000088 0.003200
20 article_pos1.gml pos fear of autism 5 0.007680 0.000405 0.078647
21 article_pos1.gml pos genetic risk factors for ASD 1 0.001536 0.000000 0.000000
22 article_pos1.gml pos siblings 1 0.001536 0.000000 0.000000
23 article_pos1.gml pos resources 2 0.003072 0.000000 0.079812
24 article_pos1.gml pos risk 1 0.001536 0.000000 0.117591
25 article_pos1.gml pos results 1 0.001536 0.000000 0.000000
26 article_pos1.gml pos stiff neck 1 0.001536 0.000000 0.000000
27 article_pos1.gml pos Faith Assembly 2 0.003072 0.000000 0.001536
28 article_pos1.gml pos nausea 2 0.003072 0.000000 0.000000
29 article_pos1.gml pos ill effects 1 0.001536 0.000000 0.000000
... ... ... ... ... ... ... ...
622 article_pos1.gml pos role 1 0.001536 0.000000 0.000000
623 article_pos1.gml pos The Lancet 1 0.001536 0.000000 0.001536
624 article_pos1.gml pos sexually active 7 0.010753 0.000149 0.001536
625 article_pos1.gml pos immunity 6 0.009217 0.000000 0.000000
626 article_pos1.gml pos expected 1 0.001536 0.000000 0.000000
627 article_pos1.gml pos sense of urgency 1 0.001536 0.000000 0.000000
628 article_pos1.gml pos health officials 5 0.007680 0.000000 0.158717
629 article_pos1.gml pos rubella 13 0.019969 0.008155 0.136162
630 article_pos1.gml pos former gastroenterologist 1 0.001536 0.000000 0.000000
631 article_pos1.gml pos varicella vaccine 3 0.004608 0.000002 0.003072
632 article_pos1.gml pos magnitude of benefits 1 0.001536 0.000000 0.086659
633 article_pos1.gml pos time 1 0.001536 0.000000 0.000000
634 article_pos1.gml pos 16 years of age 1 0.001536 0.000000 0.000000
635 article_pos1.gml pos state vaccination rates 3 0.004608 0.000000 0.063875
636 article_pos1.gml pos loss of limb 1 0.001536 0.000000 0.000000
637 article_pos1.gml pos Early Childhood Australia 3 0.004608 0.000555 0.001536
638 article_pos1.gml pos religious groups 24 0.036866 0.029318 0.154134
639 article_pos1.gml pos age 26 1 0.001536 0.000000 0.000000
640 article_pos1.gml pos Robert F. Kennedy Jr. 2 0.003072 0.002668 0.131576
641 article_pos1.gml pos friends 1 0.001536 0.000000 0.155294
642 article_pos1.gml pos Catholic Church 1 0.001536 0.000000 0.090615
643 article_pos1.gml pos Amish 3 0.004608 0.000290 0.114472
644 article_pos1.gml pos scheduled appointment 1 0.001536 0.000000 0.000000
645 article_pos1.gml pos meningococcal disease symptoms 4 0.006144 0.000000 0.006144
646 article_pos1.gml pos prenatal development 1 0.001536 0.000000 0.001536
647 article_pos1.gml pos immune protection 3 0.004608 0.000000 0.000000
648 article_pos1.gml pos environmental trigger 2 0.003072 0.000000 0.000000
649 article_pos1.gml pos serogroups 1 0.001536 0.000000 0.000000
650 article_pos1.gml pos overseas 3 0.004608 0.004154 0.110243
651 article_pos1.gml pos Tdap vaccine 12 0.018433 0.024946 0.160107

652 rows × 7 columns


In [7]:
# save
#combined_df.to_csv('../output/df/nodes_df_positive.csv')

Gc nodes table


In [65]:
gml_files = glob('../output/network/d_Gc_positive.gml')

In [66]:
# 2_node_df: list all nodes and centrality
data_columns = ['name',
                'sentiment'
                ]
data = pd.DataFrame(columns = data_columns)
#combined_df = pd.DataFrame()

In [67]:
for graph_num, gml_graph in enumerate(gml_files):
    graph = nx.read_gml(gml_graph)
    (filepath, filename) = os.path.split(gml_graph)
    print('-' * 10)
    print(gml_graph)
    calculate_graph_inf(graph)

    # calculate variables and save into list
    sent = "pos"    
    deg_cent = nx.degree_centrality(graph)
    bet_cent = nx.betweenness_centrality(graph)
    clo_cent = nx.closeness_centrality(graph)
    graph_values = {'name':filename,
                    'sentiment':sent
                    }
    data = data.append(graph_values, ignore_index=True)

    degree = nx.degree(graph)
    deg_df = pd.DataFrame.from_dict(degree, orient = 'index')
    deg_df.columns = ['degree']
    # degree centrality
    deg_cent = nx.degree_centrality(graph)
    dc_df = pd.DataFrame.from_dict(deg_cent, orient = 'index')
    dc_df.columns = ['deg cent']
    # betweenness centrality
    bet_cent = nx.betweenness_centrality(graph)
    bc_df = pd.DataFrame.from_dict(bet_cent, orient = 'index')
    bc_df.columns = ['bet cent']
    # closeness centrality
    clo_cent = nx.closeness_centrality(graph)
    cc_df = pd.DataFrame.from_dict(clo_cent, orient = 'index')
    cc_df.columns = ['clo cent']
    # concat node frames into node_df
    frames = [deg_df, dc_df, bc_df, cc_df]
    node_df = pd.concat(frames, axis = 1)
    node_df.index.name = 'node'
    node_df = node_df.reset_index()

    values = pd.DataFrame(graph_values, columns = ('name', 'sentiment'), index = [0])
    
    # df = merges graph_values with node_df for single graph and fill NaNs
    df = pd.concat([values, node_df], axis = 1)
    df = df.fillna(method='ffill')
    #combined_df = combined_df.append(df)


----------
../output/network/d_Gc_positive.gml
Name: d_Gc_positive.gml
Type: MultiDiGraph
Number of nodes: 585
Number of edges: 1088
Average in degree:   1.8598
Average out degree:   1.8598

In [68]:
# print positive Gc nodes
df


Out[68]:
name sentiment node degree deg cent bet cent clo cent
0 d_Gc_positive.gml pos neighbors 1 0.001712 0.000000 0.000000
1 d_Gc_positive.gml pos colleges 1 0.001712 0.000000 0.000000
2 d_Gc_positive.gml pos influenza 2 0.003425 0.000000 0.000000
3 d_Gc_positive.gml pos parents of autistic children 6 0.010274 0.000000 0.160317
4 d_Gc_positive.gml pos religious exemption 9 0.015411 0.002916 0.132328
5 d_Gc_positive.gml pos vaccine campaign 2 0.003425 0.000000 0.118560
6 d_Gc_positive.gml pos mild 1 0.001712 0.000000 0.000000
7 d_Gc_positive.gml pos repetitive behaviors 1 0.001712 0.000000 0.000000
8 d_Gc_positive.gml pos children 31 0.053082 0.044112 0.183395
9 d_Gc_positive.gml pos vaccination schedule 4 0.006849 0.000758 0.142823
10 d_Gc_positive.gml pos best-sellers 1 0.001712 0.000000 0.000000
11 d_Gc_positive.gml pos American Medical Association 5 0.008562 0.000000 0.128771
12 d_Gc_positive.gml pos Orthodox Jewish communities 1 0.001712 0.000000 0.000000
13 d_Gc_positive.gml pos fence-sitters 4 0.006849 0.000000 0.151728
14 d_Gc_positive.gml pos Journal of the American Medical Association 1 0.001712 0.000000 0.000000
15 d_Gc_positive.gml pos sexually transmitted virus 4 0.006849 0.000110 0.003567
16 d_Gc_positive.gml pos fear of autism 5 0.008562 0.000503 0.087670
17 d_Gc_positive.gml pos genetic risk factors for ASD 1 0.001712 0.000000 0.000000
18 d_Gc_positive.gml pos siblings 1 0.001712 0.000000 0.000000
19 d_Gc_positive.gml pos resources 2 0.003425 0.000000 0.088969
20 d_Gc_positive.gml pos risk 1 0.001712 0.000000 0.131081
21 d_Gc_positive.gml pos results 1 0.001712 0.000000 0.000000
22 d_Gc_positive.gml pos stiff neck 1 0.001712 0.000000 0.000000
23 d_Gc_positive.gml pos nausea 2 0.003425 0.000000 0.000000
24 d_Gc_positive.gml pos ill effects 1 0.001712 0.000000 0.000000
25 d_Gc_positive.gml pos cultured cells 3 0.005137 0.000000 0.000000
26 d_Gc_positive.gml pos spread of infectious diseases 4 0.006849 0.000000 0.000000
27 d_Gc_positive.gml pos decrease in exemption rates 6 0.010274 0.001107 0.118024
28 d_Gc_positive.gml pos debunked 1 0.001712 0.000000 0.000000
29 d_Gc_positive.gml pos no brainer 1 0.001712 0.000000 0.000000
... ... ... ... ... ... ... ...
555 d_Gc_positive.gml pos vaccine requirements 7 0.011986 0.003169 0.109267
556 d_Gc_positive.gml pos role 1 0.001712 0.000000 0.000000
557 d_Gc_positive.gml pos The Lancet 1 0.001712 0.000000 0.001712
558 d_Gc_positive.gml pos sexually active 7 0.011986 0.000185 0.001712
559 d_Gc_positive.gml pos immunity 6 0.010274 0.000000 0.000000
560 d_Gc_positive.gml pos expected 1 0.001712 0.000000 0.000000
561 d_Gc_positive.gml pos sense of urgency 1 0.001712 0.000000 0.000000
562 d_Gc_positive.gml pos rubella 13 0.022260 0.010135 0.151783
563 d_Gc_positive.gml pos former gastroenterologist 1 0.001712 0.000000 0.000000
564 d_Gc_positive.gml pos varicella vaccine 3 0.005137 0.000002 0.003425
565 d_Gc_positive.gml pos Governor Jerry Brown 2 0.003425 0.000000 0.132597
566 d_Gc_positive.gml pos time 1 0.001712 0.000000 0.000000
567 d_Gc_positive.gml pos HPV infection 3 0.005137 0.000000 0.000000
568 d_Gc_positive.gml pos variation 2 0.003425 0.000003 0.001712
569 d_Gc_positive.gml pos loss of limb 1 0.001712 0.000000 0.000000
570 d_Gc_positive.gml pos Early Childhood Australia 3 0.005137 0.000690 0.001712
571 d_Gc_positive.gml pos parents who refuse to vaccinate their children 11 0.018836 0.007762 0.089538
572 d_Gc_positive.gml pos age 26 1 0.001712 0.000000 0.000000
573 d_Gc_positive.gml pos Robert F. Kennedy Jr. 2 0.003425 0.003316 0.146671
574 d_Gc_positive.gml pos friends 1 0.001712 0.000000 0.173110
575 d_Gc_positive.gml pos Catholic Church 1 0.001712 0.000000 0.101011
576 d_Gc_positive.gml pos Amish 3 0.005137 0.000360 0.127605
577 d_Gc_positive.gml pos scheduled appointment 1 0.001712 0.000000 0.000000
578 d_Gc_positive.gml pos meningococcal disease symptoms 4 0.006849 0.000000 0.006849
579 d_Gc_positive.gml pos Netherlands Reformed Congregation 2 0.003425 0.000000 0.000000
580 d_Gc_positive.gml pos immune protection 3 0.005137 0.000000 0.000000
581 d_Gc_positive.gml pos environmental trigger 2 0.003425 0.000000 0.000000
582 d_Gc_positive.gml pos serogroups 1 0.001712 0.000000 0.000000
583 d_Gc_positive.gml pos overseas 3 0.005137 0.005162 0.122891
584 d_Gc_positive.gml pos Tdap vaccine 12 0.020548 0.031003 0.178475

585 rows × 7 columns


In [69]:
# save
df.to_csv('../output/df/Gc_nodes_pos.csv')

Draw network


In [8]:
# 7_graph_calculation
def drawIt(graph, what = 'graph'):
    nsize = graph.number_of_nodes()
    print "Drawing %s of size %s:" % (what, nsize)
    
    if nsize > 20:
        plt.figure(figsize=(10, 10))
        if nsize > 40:
            nx.draw_spring(graph, with_labels = True, node_size = 70, font_size = 12)
        else:
            nx.draw_spring(graph, with_labels = True)
    else:
        nx.draw_spring(graph, with_labels = True)
    plt.show()

# for undirected graphs
def describeGraph(graph):
    components = sorted(nx.connected_components(graph), key = len, reverse = True)
    cc = [len(c) for c in components]
    subgraphs = list(nx.connected_component_subgraphs(graph))
    params = (graph.number_of_edges(),graph.number_of_nodes(),len(cc))
    print "Graph has %s edges, %s nodes, %s connected components\n" % params
    drawIt(graph)
    for sub in components:
        drawIt(graph.subgraph(sub), what = 'component')

# for directed graphs
def describeGraph_d(graph):
    components = sorted(nx.weakly_connected_components(graph), key = len, reverse = True)
    cc = [len(c) for c in components]
    subgraphs = list(nx.weakly_connected_component_subgraphs(graph))
    params = (graph.number_of_edges(),graph.number_of_nodes(),len(cc))
    print "Graph has %s edges, %s nodes, %s connected components\n" % params
    drawIt(graph)
    for sub in components:
        drawIt(graph.subgraph(sub), what = 'component')

In [9]:
# UNDIRECTED network graph
describeGraph(ugraph)


Graph has 1140 edges, 652 nodes, 21 connected components

Drawing graph of size 652:
Drawing component of size 585:
Drawing component of size 15:
Drawing component of size 7:
Drawing component of size 4:
Drawing component of size 4:
Drawing component of size 3:
Drawing component of size 3:
Drawing component of size 3:
Drawing component of size 3:
Drawing component of size 3:
Drawing component of size 2:
Drawing component of size 2:
Drawing component of size 2:
Drawing component of size 2:
Drawing component of size 2:
Drawing component of size 2:
Drawing component of size 2:
Drawing component of size 2:
Drawing component of size 2:
Drawing component of size 2:
Drawing component of size 2:

In [10]:
# DIRECTED network graph
describeGraph_d(graph)


Graph has 1140 edges, 652 nodes, 21 connected components

Drawing graph of size 652:
Drawing component of size 585:
Drawing component of size 15:
Drawing component of size 7:
Drawing component of size 4:
Drawing component of size 4:
Drawing component of size 3:
Drawing component of size 3:
Drawing component of size 3:
Drawing component of size 3:
Drawing component of size 3:
Drawing component of size 2:
Drawing component of size 2:
Drawing component of size 2:
Drawing component of size 2:
Drawing component of size 2:
Drawing component of size 2:
Drawing component of size 2:
Drawing component of size 2:
Drawing component of size 2:
Drawing component of size 2:
Drawing component of size 2:

Undirected


In [11]:
# list of connected components by size (undirected graph)
connected_components = [len(c) for c in sorted(nx.connected_components(ugraph), key=len, reverse=True)]

# generate connected components as subgraphs (undirected graph)
subgraphs = list(nx.connected_component_subgraphs(ugraph))

# greatest component (undirected MultiGraph)
u_Gc = max(nx.connected_component_subgraphs(ugraph), key=len)
u_Gc.name = "undirected Gc"

In [12]:
print "connected components = ", connected_components
print nx.info(u_Gc)


connected components =  [585, 15, 7, 4, 4, 3, 3, 3, 3, 3, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2]
Name: undirected Gc
Type: MultiGraph
Number of nodes: 585
Number of edges: 1088
Average degree:   3.7197

Directed


In [13]:
# use directed graph
components = sorted(nx.weakly_connected_components(graph), key = len, reverse = True)
cc = [len(c) for c in components]

# generate connected components as subgraphs 
subgraphs = list(nx.weakly_connected_component_subgraphs(graph))

# greatest component
d_Gc = max(nx.weakly_connected_component_subgraphs(graph), key=len)
d_Gc.name = "directed Gc"

In [14]:
print "connected components = ", cc
print nx.info(d_Gc)


connected components =  [585, 15, 7, 4, 4, 3, 3, 3, 3, 3, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2]
Name: directed Gc
Type: MultiDiGraph
Number of nodes: 585
Number of edges: 1088
Average in degree:   1.8598
Average out degree:   1.8598

In [15]:
## understand how direction changes degree ##

print nx.info(graph)  # original directed
print nx.info(ugraph)  # to undirected
temp = ugraph.to_directed()  # back to directed
print nx.info(temp)


Name: article_pos1.gml
Type: MultiDiGraph
Number of nodes: 652
Number of edges: 1140
Average in degree:   1.7485
Average out degree:   1.7485
Name: article_pos1.gml
Type: MultiGraph
Number of nodes: 652
Number of edges: 1140
Average degree:   3.4969
Name: article_pos1.gml
Type: MultiDiGraph
Number of nodes: 652
Number of edges: 2280
Average in degree:   3.4969
Average out degree:   3.4969

In [16]:
# finally, greatest components for undirected and directed graphs
print nx.info(u_Gc)
print nx.info(d_Gc)


Name: undirected Gc
Type: MultiGraph
Number of nodes: 585
Number of edges: 1088
Average degree:   3.7197
Name: directed Gc
Type: MultiDiGraph
Number of nodes: 585
Number of edges: 1088
Average in degree:   1.8598
Average out degree:   1.8598

In [17]:
# save Gc
#nx.write_gml(u_Gc, "../output/network/u_Gc_positive.gml")
#nx.write_gml(d_Gc, "../output/network/d_Gc_positive.gml")

Calculate network statistics (averages) for greatest component.


In [18]:
# load directed Gc
Gc_files = glob('../output/network/d_Gc_positive.gml')

network_data_columns = ['name',
                    'sentiment',
                    '# nodes',
                    '# edges',
                    #'avg deg',
                    'density',
                    'deg assort coef', 
                    'avg deg cent',
                    'avg bet cent',
                    'avg clo cent',
                    'high deg cent',
                    'high bet cent',
                    'high clo cent',
                    'avg node conn',
                    '# conn comp',
                    'gc size'
                    ]
network_data = pd.DataFrame(columns = network_data_columns)

In [19]:
# Gc_files

for graph_num, gml_graph in enumerate(Gc_files):
    graph = nx.read_gml(gml_graph)
    (filepath, filename) = os.path.split(gml_graph)
    print('-' * 10)
    print(gml_graph)
    calculate_graph_inf(graph)
    
    # calculate variables
    sent = "positive"
    nodes = nx.number_of_nodes(graph)
    edges = nx.number_of_edges(graph)
    density = float("{0:.4f}".format(nx.density(graph)))
    avg_deg_cen = np.array(nx.degree_centrality(graph).values()).mean()
    avg_bet_cen = np.array(nx.betweenness_centrality(graph).values()).mean()
    avg_clo_cen = np.array(nx.closeness_centrality(graph).values()).mean()
    #avg_deg = float("{0:.4f}".format(in_deg + out_deg))
    avg_node_con = float("{0:.4f}".format((nx.average_node_connectivity(graph))))
    deg_assort_coeff = float("{0:.4f}".format((nx.degree_assortativity_coefficient(graph))))
    conn_comp = nx.number_weakly_connected_components(graph) # ugraph
    deg_cen = nx.degree_centrality(graph)
    bet_cen = nx.betweenness_centrality(graph)
    clo_cen = nx.closeness_centrality(graph)
    highest_deg_cen = highest_centrality(deg_cen)
    highest_bet_cen = highest_centrality(bet_cen)
    highest_clo_cen = highest_centrality(clo_cen)
    Gc = len(max(nx.weakly_connected_component_subgraphs(graph), key=len))

    # save variables into list
    graph_values = {'name':filename,
                    'sentiment':sent,
                    '# nodes':nodes,
                    '# edges':edges,
                    #'avg deg':avg_deg,
                    'density':density,
                    'deg assort coef':deg_assort_coeff,
                    'avg deg cent':"%.4f" % avg_deg_cen,
                    'avg bet cent':"%.4f" % avg_bet_cen,
                    'avg clo cent':"%.4f" % avg_clo_cen,
                    'high deg cent':highest_deg_cen,
                    'high bet cent':highest_bet_cen,
                    'high clo cent':highest_clo_cen,
                    'avg node conn':avg_node_con,
                    '# conn comp':conn_comp,
                    'gc size':Gc
                    }
    network_data = network_data.append(graph_values, ignore_index=True)


----------
../output/network/d_Gc_positive.gml
Name: d_Gc_positive.gml
Type: MultiDiGraph
Number of nodes: 585
Number of edges: 1088
Average in degree:   1.8598
Average out degree:   1.8598

In [20]:
# print network data for greatest component
network_data


Out[20]:
name sentiment # nodes # edges density deg assort coef avg deg cent avg bet cent avg clo cent high deg cent high bet cent high clo cent avg node conn # conn comp gc size
0 d_Gc_positive.gml positive 585.0 1088.0 0.0032 -0.0336 0.0064 0.0018 0.0517 (vaccines, 0.116438356164) (vaccines, 0.0795089514173) (parents, 0.23574182936) 0.2974 1.0 585.0

In [21]:
# save
#network_data.to_csv('../output/df/Gc_df_pos.csv')

Cutsets


In [22]:
print "Greatest component size =", len(graph)


Greatest component size = 585

In [23]:
# returns all minimum k cutsets of an undirected graph
# i.e., the set(s) of nodes of cardinality equal to the node connectivity of G
# thus if removed, would break G into two or more connected components

#cutsets = list(nx.all_node_cuts(graph))  # must be undirected

print "Greatest component size =", len(graph)
#print "# of cutsets =", len(cutsets)

# returns a set of nodes or edges of minimum cardinality that disconnects G
min_ncut = nx.minimum_node_cut(graph)
min_ecut = nx.minimum_edge_cut(graph)

print "Min node cut =", min_ncut
print "Min edge cut =", min_ecut

# min cuts with source and target
print nx.minimum_node_cut(graph, s='vaccines', t='autism')
print nx.minimum_edge_cut(graph, s='vaccines', t='autism')


Greatest component size = 585
Min node cut = set([])
Min edge cut = set([])
set([u'protective effect of vaccines', u'children at higher risk for autism', u'MMR vaccine', u'children with autistic sibling', u'anti-vaccination', u'parents', u'children'])
set([(u'children at higher risk for autism', u'autism'), (u'genetic predisposition', u'autism'), (u'children', u'autism'), (u'MMR vaccine', u'autism'), (u'vaccines', u'autism'), (u'anti-vaccination', u'autism'), (u'children with autistic sibling', u'autism'), (u'parents', u'autism'), (u'protective effect of vaccines', u'autism')])

In [24]:
# read edge labels in min cut for Gc
# change source and target
a = nx.minimum_edge_cut(graph, s='vaccines', t='autism')
#a = nx.minimum_edge_cut(graph)

labels = nx.get_edge_attributes(graph,'edge')
edgelabels = {}
for e in labels.keys():
    e1 = e[0:2]
    edgelabels[e1]=labels[e]

for e in a:
    if edgelabels.has_key(e):
        print e,edgelabels[e]
    else:
        rev_e = e[::-1]
        print rev_e, edgelabels[rev_e]


(u'children at higher risk for autism', u'autism') were not more likely to develop
(u'genetic predisposition', u'autism') makes more vulnerable to
(u'children', u'autism') one in 68 kids has some form of
(u'MMR vaccine', u'autism') researchers were unable to find any association with
(u'vaccines', u'autism') cause
(u'anti-vaccination', u'autism') is driven by fears that shots cause
(u'children with autistic sibling', u'autism') more likely to have
(u'parents', u'autism') who already have a child with autism seem even more concerned
(u'protective effect of vaccines', u'autism') may protect children from

Centrality tables


In [49]:
# make sure you're using the right graph
print "gml_files = ", gml_files
print "gml_graph = ", gml_graph


gml_files =  ['../output/network/article_pos1.gml']
gml_graph =  ../output/network/d_Gc_positive.gml

In [53]:
#graph = nx.read_gml('../output/network/article_pos1.gml')  # full network
graph = nx.read_gml('../output/network/d_Gc_positive.gml')  # gc network

print nx.info(graph)


Name: directed Gc
Type: MultiDiGraph
Number of nodes: 585
Number of edges: 1088
Average in degree:   1.8598
Average out degree:   1.8598

In [54]:
# degree centrality
dc = nx.degree_centrality(graph)
dc_df = pd.DataFrame.from_dict(dc, orient = 'index')
dc_df.columns = ['degree cent']
dc_df = dc_df.sort_values(by = ['degree cent'])
#dc_df

# betweenness centrality
bc = nx.betweenness_centrality(graph)
bc_df = pd.DataFrame.from_dict(bc, orient = 'index')
bc_df.columns = ['betweenness cent']
bc_df = bc_df.sort_values(by = ['betweenness cent'])
#bc_df

# closeness centrality
cc = nx.closeness_centrality(graph)
cc_df = pd.DataFrame.from_dict(cc, orient = 'index')
cc_df.columns = ['closeness cent']
cc_df = cc_df.sort_values(by = ['closeness cent'])
#cc_df

In [55]:
dc_df


Out[55]:
degree cent
neighbors 0.001712
children with family history of autism 0.001712
immune memory cells 0.001712
democrat 0.001712
three-dose course 0.001712
protect the kid next to you 0.001712
childcare benefits 0.001712
worried 0.001712
state-level policy 0.001712
infant mortality 0.001712
one dose Gardasil 0.001712
vaccines are safe 0.001712
women without previous HPV 0.001712
new transmission 0.001712
theological issues 0.001712
reduction in HPV 0.001712
abnormal cells 0.001712
important message 0.001712
11-18 year olds 0.001712
members 0.001712
three doses Gardasil 0.001712
pharmacological interventions 0.001712
benefits 0.001712
hardness of the skin 0.001712
Prabhupada Village 0.001712
strong-arm tactics 0.001712
social norms 0.001712
daily activities 0.001712
vast amounts of time 0.001712
at-risk unvaccinated children 0.001712
... ...
children at higher risk for autism 0.020548
Tdap vaccine 0.020548
rubella 0.022260
vaccine refusal 0.022260
studies 0.022260
Gardasil 0.023973
states 0.025685
Jain study 0.027397
side effects 0.027397
personal belief exemption 0.027397
community 0.029110
vaccination exemption 0.029110
meningococcal vaccine 0.030822
herd immunity 0.032534
disease 0.032534
autism risk 0.034247
SB 277 0.037671
measles vaccine 0.039384
religious groups 0.041096
MMR vaccine 0.044521
anti-vaccination 0.047945
children 0.053082
meningococcal disease 0.054795
vaccine-autism link 0.056507
HPV vaccine 0.058219
autism 0.066781
vaccination 0.087329
parents 0.099315
measles 0.111301
vaccines 0.116438

585 rows × 1 columns


In [56]:
bc_df


Out[56]:
betweenness cent
neighbors 0.000000
rash 0.000000
severe symptoms 0.000000
mouth 0.000000
delaying Gardasil 0.000000
World Health Organization 0.000000
reduction in mortality 0.000000
fear 0.000000
deafness 0.000000
Jewish dietary laws 0.000000
social network 0.000000
true cause of autism 0.000000
swelling 0.000000
Sydney, Australia 0.000000
mental disorders 0.000000
commitment 0.000000
part of the story 0.000000
one child 0.000000
MMR vaccine safety 0.000000
consequences 0.000000
reduction in child mortality 0.000000
peers 0.000000
religious beliefs 0.000000
risk to pregnant women 0.000000
Americans 0.000000
fast 0.000000
respiratory secretions 0.000000
Dr. Mark Schleiss 0.000000
Cervarix 0.000000
testing 0.000000
... ...
Gardasil 0.009744
rubella 0.010135
health benefits 0.011427
pregnant women 0.011663
disease 0.011924
side effects 0.012069
vaccination exemption 0.012286
MMR vaccine 0.012414
children at higher risk for autism 0.012528
herd immunity 0.014070
measles vaccine 0.014741
families 0.016184
community 0.016301
protection 0.016387
United States 0.017827
vaccine-preventable diseases 0.019262
SB 277 0.020944
vaccine-autism link 0.022396
genital warts 0.023904
meningococcal disease 0.027709
Tdap vaccine 0.031003
parents 0.034612
vaccination 0.035068
religious groups 0.036437
anti-vaccination 0.037666
autism 0.039746
HPV vaccine 0.040091
children 0.044112
measles 0.064198
vaccines 0.079509

585 rows × 1 columns


In [52]:
cc_df


Out[52]:
closeness cent
neighbors 0.000000
arm 0.000000
elite list 0.000000
sex 0.000000
testing 0.000000
free vaccine 0.000000
medical law 0.000000
strong-arm tactics 0.000000
gift from God 0.000000
at risk 0.000000
false concerns 0.000000
Early Childhood Australia's chief executive 0.000000
unvaccinated high-risk children 0.000000
random cases 0.000000
public schools 0.000000
unethical 0.000000
recommended vaccines 0.000000
psychiatrist 0.000000
rash 0.000000
severe symptoms 0.000000
Jewish dietary laws 0.000000
social network 0.000000
Sydney, Australia 0.000000
opportunistic infections 0.000000
efficacious 0.000000
vaccinated children and unvaccinated children 0.000000
unconscionable 0.000000
argument 0.000000
revaccinated 0.000000
Pennsylvania 0.000000
... ...
Disneyland measles outbreak 0.138087
vaccine delay 0.138980
vaccine refusal 0.140802
United States 0.140802
Jehovah's Witnesses 0.140986
Jain study 0.141099
Jews 0.141357
polio 0.141637
parents of autistic children 0.143817
autism 0.145080
vaccination 0.146965
SB 277 0.148489
protection 0.148900
HPV vaccine 0.151740
religious groups 0.154134
anti-vaccination 0.155136
peers 0.155294
friends 0.155294
behavioral research 0.155406
compromise 0.155518
Washington State public-private pilot program 0.157605
Dr. Adams 0.157863
health officials 0.158717
Tdap vaccine 0.160107
driving factors 0.160129
Mina study 0.161436
children 0.164520
measles 0.167079
vaccines 0.169852
parents 0.211480

652 rows × 1 columns


In [ ]: