Gc neutral graph

imports "article_neu1.gml"

  • saves "nodes_df_neutral.csv"
    • node labels, degrees, and centralities for entire network
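  • saves "u_Gc_neutral.gml" and "d_Gc_neutral.gml" (undirected and directed greatest component; the write calls are currently commented out)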

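imports "d_Gc_neutral.gml"

  • saves "Gc_nodes_neu.csv"
    • node labels, degrees, and centralities for greatest component
  • saves "Gc_df_neu.csv" (the save call is currently commented out)
    • network-level statistics for greatest component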

In [29]:
# 1_network_df

import networkx as nx
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import os
from glob import glob

# Plot theme, plus wide pandas display settings so node tables are not wrapped.
plt.style.use('ggplot')
for _option, _value in (('display.width', 5000), ('display.max_columns', 60)):
    pd.set_option(_option, _value)

# Input network for the "all nodes" section. glob() returns an empty list when
# the file is missing, in which case the loading loop below does nothing.
gml_files = glob('../output/network/article_neu1.gml')

In [30]:
def calculate_graph_inf(graph, name=None):
    """Label a graph and print its networkx summary.

    Parameters
    ----------
    graph : networkx graph
        Graph to describe. Its ``name`` attribute is overwritten.
    name : str, optional
        Label stored in ``graph.name``. When omitted, the notebook-level
        ``filename`` variable (set by the loading loops) is used, which is
        the original behaviour.

    Returns
    -------
    None
        The summary produced by ``nx.info`` is printed, not returned.
    """
    if name is None:
        # Backward-compatible fallback: existing cells pass only the graph and
        # rely on the global ``filename`` of the loop that is currently running.
        name = filename
    graph.name = name
    # Single-argument print(...) emits the same text on Python 2 and Python 3.
    print(nx.info(graph))

def highest_centrality(cent_dict):
    """Return the ``(node, value)`` pair with the largest centrality value.

    Ties on the value are broken in favour of the largest node key, which is
    the same result as sorting ``(value, node)`` tuples in descending order
    and taking the first one.

    Parameters
    ----------
    cent_dict : dict
        Mapping of node -> centrality value.

    Returns
    -------
    tuple
        ``(node, value)`` for the highest-ranked node.

    Raises
    ------
    ValueError
        If ``cent_dict`` is empty.
    """
    if not cent_dict:
        raise ValueError("highest_centrality() requires a non-empty centrality dictionary")
    # .items() exists on both Python 2 and 3 (iteritems is Python 2 only), and
    # max() finds the winner in one pass instead of sorting every entry.
    return max(cent_dict.items(), key=lambda item: (item[1], item[0]))

all nodes table


In [31]:
# 2_node_df: list all nodes and centrality
# `data` gets one row per graph file (name + sentiment); `combined_df`
# accumulates the per-node rows of every graph processed by the loop below.
data_columns = ['name', 'sentiment']
data = pd.DataFrame(columns=data_columns)
combined_df = pd.DataFrame()

In [32]:
# graph = directed, ugraph = undirected
# For every input file: load the graph, print a summary of the directed and
# undirected versions, and append one row per node (degree + centralities)
# to combined_df.
for graph_num, gml_graph in enumerate(gml_files):
    graph = nx.read_gml(gml_graph)
    ugraph = graph.to_undirected() # to undirected graph
    # U holds the edges selected by reciprocal=True (networkx documents this as
    # "edges that appear in both directions"); they are added back to ugraph.
    U = graph.to_undirected(reciprocal=True)
    e = U.edges()
    ugraph.add_edges_from(e)
    # filename must be set before calculate_graph_inf, which reads it as a global
    (filepath, filename) = os.path.split(gml_graph)
    print('-' * 10)
    print(gml_graph)
    calculate_graph_inf(graph)
    calculate_graph_inf(ugraph)

    ## graph-level values: one row per file in `data`
    sent = "neutral"
    graph_values = {'name':filename,
                    'sentiment':sent
                    }
    data = data.append(graph_values, ignore_index=True)

    ## node-level values: each measure is computed exactly once
    # (the centralities used to be computed a second time above this point,
    # doubling the cost of betweenness/closeness for no change in result)
    # assumes nx.degree returns a dict-like mapping of node -> degree — TODO confirm
    degree = nx.degree(graph)
    deg_df = pd.DataFrame.from_dict(degree, orient = 'index')
    deg_df.columns = ['degree']
    # degree centrality
    deg_cent = nx.degree_centrality(graph)
    dc_df = pd.DataFrame.from_dict(deg_cent, orient = 'index')
    dc_df.columns = ['deg cent']
    # betweenness centrality
    bet_cent = nx.betweenness_centrality(graph)
    bc_df = pd.DataFrame.from_dict(bet_cent, orient = 'index')
    bc_df.columns = ['bet cent']
    # closeness centrality
    clo_cent = nx.closeness_centrality(graph)
    cc_df = pd.DataFrame.from_dict(clo_cent, orient = 'index')
    cc_df.columns = ['clo cent']
    # concat node frames into node_df (aligned on the node label index)
    frames = [deg_df, dc_df, bc_df, cc_df]
    node_df = pd.concat(frames, axis = 1)
    node_df.index.name = 'node'
    node_df = node_df.reset_index()

    # single-row frame (index 0) carrying the graph-level values
    values = pd.DataFrame(graph_values, columns = ('name', 'sentiment'), index = [0])

    # df = merges graph_values with node_df for single graph and fill NaNs:
    # after the column-wise concat only row 0 has name/sentiment, so the
    # forward fill copies them down to every node row.
    df = pd.concat([values, node_df], axis = 1)
    df = df.fillna(method='ffill')
    combined_df = combined_df.append(df)


----------
../output/network/article_neu1.gml
Name: article_neu1.gml
Type: MultiDiGraph
Number of nodes: 201
Number of edges: 241
Average in degree:   1.1990
Average out degree:   1.1990
Name: article_neu1.gml
Type: MultiGraph
Number of nodes: 201
Number of edges: 241
Average degree:   2.3980

In [33]:
# what the network looks like without adding back edges e = U.edges()

#for graph_num, gml_graph in enumerate(gml_files):
#    graph2 = nx.read_gml(gml_graph)
#    ugraph2 = graph.to_undirected() # to undirected graph
#    U2 = graph.to_undirected(reciprocal=True)
#    (filepath, filename) = os.path.split(gml_graph)
#    print('-' * 10)
#    print(gml_graph)
#    calculate_graph_inf(graph2)
#    calculate_graph_inf(ugraph2)

In [34]:
# print entire network
# combined_df is the per-node table accumulated by the loading loop above;
# the saved output abbreviates the middle rows.
combined_df


Out[34]:
name sentiment node degree deg cent bet cent clo cent
0 article_neu1.gml neutral 7-11 year olds 2 0.010 0.000000 0.033800
1 article_neu1.gml neutral Alfred and Lisa Claire Dwoskin 1 0.005 0.000000 0.000000
2 article_neu1.gml neutral Americans 1 0.005 0.000000 0.000000
3 article_neu1.gml neutral Andrew Wakefield 5 0.025 0.001131 0.044138
4 article_neu1.gml neutral Australia 1 0.005 0.000000 0.000000
5 article_neu1.gml neutral Barry Segal 1 0.005 0.000000 0.032216
6 article_neu1.gml neutral Ben Allen 4 0.020 0.000000 0.061455
7 article_neu1.gml neutral CDC 4 0.020 0.004523 0.023226
8 article_neu1.gml neutral California 1 0.005 0.000000 0.000000
9 article_neu1.gml neutral Chairwoman Carol Liu 2 0.010 0.000000 0.061780
10 article_neu1.gml neutral Children's Hospital of Philadelphia 1 0.005 0.000000 0.000000
11 article_neu1.gml neutral Chris Christie 3 0.015 0.000000 0.015000
12 article_neu1.gml neutral Connie Leyva 3 0.015 0.000000 0.062593
13 article_neu1.gml neutral Disneyland measles outbreak 1 0.005 0.000000 0.000000
14 article_neu1.gml neutral Dr. Anna Acosta 2 0.010 0.000000 0.021778
15 article_neu1.gml neutral Dr. Paul Offit 3 0.015 0.000000 0.070700
16 article_neu1.gml neutral Drew Downing 1 0.005 0.000000 0.005000
17 article_neu1.gml neutral Dwoskin Family Foundation 8 0.040 0.000000 0.040500
18 article_neu1.gml neutral Focus for Health 4 0.020 0.000603 0.040000
19 article_neu1.gml neutral Gambhir study 3 0.015 0.000000 0.015000
20 article_neu1.gml neutral Generation Rescue 4 0.020 0.000905 0.036000
21 article_neu1.gml neutral Immunization Action Coalition 1 0.005 0.000000 0.019205
22 article_neu1.gml neutral Jenny McCarthy 3 0.015 0.000000 0.000000
23 article_neu1.gml neutral Lisa and J.B. Handley 1 0.005 0.000000 0.000000
24 article_neu1.gml neutral MMR vaccine 1 0.005 0.000000 0.011429
25 article_neu1.gml neutral National Vaccine Information Center 2 0.010 0.000025 0.005000
26 article_neu1.gml neutral PLOS Computational Biology 1 0.005 0.000000 0.072727
27 article_neu1.gml neutral Pez dispensers 1 0.005 0.000000 0.000000
28 article_neu1.gml neutral Rand Paul 3 0.015 0.000754 0.010000
29 article_neu1.gml neutral Renee Gentry 1 0.005 0.000000 0.005000
... ... ... ... ... ... ... ...
171 article_neu1.gml neutral teens 1 0.005 0.000000 0.000000
172 article_neu1.gml neutral the sake of being anti-vaccine 1 0.005 0.000000 0.000000
173 article_neu1.gml neutral upward trend 1 0.005 0.000000 0.005000
174 article_neu1.gml neutral vaccinated 1 0.005 0.000000 0.000000
175 article_neu1.gml neutral vaccinated children 2 0.010 0.000553 0.005000
176 article_neu1.gml neutral vaccination 6 0.030 0.004849 0.031842
177 article_neu1.gml neutral vaccination exemption 3 0.015 0.000553 0.005000
178 article_neu1.gml neutral vaccination schedule 1 0.005 0.000000 0.000000
179 article_neu1.gml neutral vaccine allergy 1 0.005 0.000000 0.000000
180 article_neu1.gml neutral vaccine choice 1 0.005 0.000000 0.000000
181 article_neu1.gml neutral vaccine concerns 1 0.005 0.000000 0.000000
182 article_neu1.gml neutral vaccine debate 4 0.020 0.000000 0.022500
183 article_neu1.gml neutral vaccine duration 2 0.010 0.000000 0.000000
184 article_neu1.gml neutral vaccine efficacy 3 0.015 0.000352 0.076296
185 article_neu1.gml neutral vaccine refusal 3 0.015 0.000000 0.006667
186 article_neu1.gml neutral vaccine risk 3 0.015 0.000302 0.005000
187 article_neu1.gml neutral vaccine safety 2 0.010 0.000000 0.000000
188 article_neu1.gml neutral vaccine-autism link 4 0.020 0.000000 0.000000
189 article_neu1.gml neutral vaccine-injured children 3 0.015 0.000000 0.000000
190 article_neu1.gml neutral vaccine-preventable diseases 1 0.005 0.000000 0.000000
191 article_neu1.gml neutral vaccines 19 0.095 0.015427 0.079808
192 article_neu1.gml neutral vaccines cause childhood illnesses 1 0.005 0.000000 0.000000
193 article_neu1.gml neutral voluntary 1 0.005 0.000000 0.000000
194 article_neu1.gml neutral vomiting 1 0.005 0.000000 0.000000
195 article_neu1.gml neutral waning effectiveness 2 0.010 0.000000 0.000000
196 article_neu1.gml neutral waning immunity 1 0.005 0.000000 0.068817
197 article_neu1.gml neutral wealthy family foundations 1 0.005 0.000000 0.000000
198 article_neu1.gml neutral whole-cell vaccine 5 0.025 0.014874 0.090449
199 article_neu1.gml neutral whooping cough 2 0.010 0.000000 0.027500
200 article_neu1.gml neutral young people 1 0.005 0.000000 0.005000

201 rows × 7 columns


In [35]:
# save
# Writes the full-network node table (combined_df) to the df output folder.
combined_df.to_csv('../output/df/nodes_df_neutral.csv')

Gc nodes table


In [23]:
# Rebinds gml_files from the full article network to the directed
# greatest-component file, so later cells that read gml_files see this value.
gml_files = glob('../output/network/d_Gc_neutral.gml')

In [24]:
# 2_node_df: list all nodes and centrality
# Fresh per-graph table for the greatest-component pass. combined_df is
# deliberately left untouched here (its reset stays disabled).
data_columns = ['name', 'sentiment']
data = pd.DataFrame(columns=data_columns)
#combined_df = pd.DataFrame()

In [25]:
# graph = directed, ugraph = undirected
# For every greatest-component file: load the graph, print its summary and
# build `df`, one row per node with degree and centralities. Note that this
# rebinds the notebook-level `graph` and `filename`.
for graph_num, gml_graph in enumerate(gml_files):
    graph = nx.read_gml(gml_graph)
    # filename must be set before calculate_graph_inf, which reads it as a global
    (filepath, filename) = os.path.split(gml_graph)
    print('-' * 10)
    print(gml_graph)
    calculate_graph_inf(graph)

    ## graph-level values: one row per file in `data`
    sent = "neu"
    graph_values = {'name':filename,
                    'sentiment':sent
                    }
    data = data.append(graph_values, ignore_index=True)

    ## node-level values: each measure is computed exactly once
    # (the centralities used to be computed a second time above this point,
    # doubling the cost of betweenness/closeness for no change in result)
    # assumes nx.degree returns a dict-like mapping of node -> degree — TODO confirm
    degree = nx.degree(graph)
    deg_df = pd.DataFrame.from_dict(degree, orient = 'index')
    deg_df.columns = ['degree']
    # degree centrality
    deg_cent = nx.degree_centrality(graph)
    dc_df = pd.DataFrame.from_dict(deg_cent, orient = 'index')
    dc_df.columns = ['deg cent']
    # betweenness centrality
    bet_cent = nx.betweenness_centrality(graph)
    bc_df = pd.DataFrame.from_dict(bet_cent, orient = 'index')
    bc_df.columns = ['bet cent']
    # closeness centrality
    clo_cent = nx.closeness_centrality(graph)
    cc_df = pd.DataFrame.from_dict(clo_cent, orient = 'index')
    cc_df.columns = ['clo cent']
    # concat node frames into node_df (aligned on the node label index)
    frames = [deg_df, dc_df, bc_df, cc_df]
    node_df = pd.concat(frames, axis = 1)
    node_df.index.name = 'node'
    node_df = node_df.reset_index()

    # single-row frame (index 0) carrying the graph-level values
    values = pd.DataFrame(graph_values, columns = ('name', 'sentiment'), index = [0])

    # df = merges graph_values with node_df for single graph and fill NaNs:
    # after the column-wise concat only row 0 has name/sentiment, so the
    # forward fill copies them down to every node row.
    df = pd.concat([values, node_df], axis = 1)
    df = df.fillna(method='ffill')
    #combined_df = combined_df.append(df)


----------
../output/network/d_Gc_neutral.gml
Name: d_Gc_neutral.gml
Type: MultiDiGraph
Number of nodes: 171
Number of edges: 221
Average in degree:   1.2924
Average out degree:   1.2924

In [26]:
# print neutral Gc nodes
# df is the table left behind by the last iteration of the loop above.
df


Out[26]:
name sentiment node degree deg cent bet cent clo cent
0 d_Gc_neutral.gml neu 7-11 year olds 2 0.011765 0.000000 0.039765
1 d_Gc_neutral.gml neu Alfred and Lisa Claire Dwoskin 1 0.005882 0.000000 0.000000
2 d_Gc_neutral.gml neu Americans 1 0.005882 0.000000 0.000000
3 d_Gc_neutral.gml neu Andrew Wakefield 5 0.029412 0.001566 0.051927
4 d_Gc_neutral.gml neu Australia 1 0.005882 0.000000 0.000000
5 d_Gc_neutral.gml neu Barry Segal 1 0.005882 0.000000 0.037902
6 d_Gc_neutral.gml neu Ben Allen 4 0.023529 0.000000 0.072299
7 d_Gc_neutral.gml neu CDC 4 0.023529 0.006265 0.027324
8 d_Gc_neutral.gml neu California 1 0.005882 0.000000 0.000000
9 d_Gc_neutral.gml neu Chairwoman Carol Liu 2 0.011765 0.000000 0.072682
10 d_Gc_neutral.gml neu Children's Hospital of Philadelphia 1 0.005882 0.000000 0.000000
11 d_Gc_neutral.gml neu Chris Christie 3 0.017647 0.000000 0.017647
12 d_Gc_neutral.gml neu Connie Leyva 3 0.017647 0.000000 0.073638
13 d_Gc_neutral.gml neu Disneyland measles outbreak 1 0.005882 0.000000 0.000000
14 d_Gc_neutral.gml neu Dr. Anna Acosta 2 0.011765 0.000000 0.025621
15 d_Gc_neutral.gml neu Dr. Paul Offit 3 0.017647 0.000000 0.083177
16 d_Gc_neutral.gml neu Dwoskin Family Foundation 8 0.047059 0.000000 0.047647
17 d_Gc_neutral.gml neu Focus for Health 4 0.023529 0.000835 0.047059
18 d_Gc_neutral.gml neu Generation Rescue 4 0.023529 0.001253 0.042353
19 d_Gc_neutral.gml neu Immunization Action Coalition 1 0.005882 0.000000 0.022594
20 d_Gc_neutral.gml neu Jenny McCarthy 3 0.017647 0.000000 0.000000
21 d_Gc_neutral.gml neu Lisa and J.B. Handley 1 0.005882 0.000000 0.000000
22 d_Gc_neutral.gml neu MMR vaccine 1 0.005882 0.000000 0.013445
23 d_Gc_neutral.gml neu National Vaccine Information Center 2 0.011765 0.000035 0.005882
24 d_Gc_neutral.gml neu PLOS Computational Biology 1 0.005882 0.000000 0.085561
25 d_Gc_neutral.gml neu Pez dispensers 1 0.005882 0.000000 0.000000
26 d_Gc_neutral.gml neu Rand Paul 3 0.017647 0.001044 0.011765
27 d_Gc_neutral.gml neu Republican 2 0.011765 0.000000 0.000000
28 d_Gc_neutral.gml neu Richard Pan 2 0.011765 0.000000 0.013235
29 d_Gc_neutral.gml neu SB 277 33 0.194118 0.016429 0.114890
... ... ... ... ... ... ... ...
141 d_Gc_neutral.gml neu state-required vaccinations 1 0.005882 0.000000 0.000000
142 d_Gc_neutral.gml neu students 1 0.005882 0.000000 0.000000
143 d_Gc_neutral.gml neu teens 1 0.005882 0.000000 0.000000
144 d_Gc_neutral.gml neu the sake of being anti-vaccine 1 0.005882 0.000000 0.000000
145 d_Gc_neutral.gml neu upward trend 1 0.005882 0.000000 0.005882
146 d_Gc_neutral.gml neu vaccinated 1 0.005882 0.000000 0.000000
147 d_Gc_neutral.gml neu vaccinated children 2 0.011765 0.000766 0.005882
148 d_Gc_neutral.gml neu vaccination 6 0.035294 0.006718 0.037461
149 d_Gc_neutral.gml neu vaccination exemption 3 0.017647 0.000766 0.005882
150 d_Gc_neutral.gml neu vaccine choice 1 0.005882 0.000000 0.000000
151 d_Gc_neutral.gml neu vaccine concerns 1 0.005882 0.000000 0.000000
152 d_Gc_neutral.gml neu vaccine debate 4 0.023529 0.000000 0.026471
153 d_Gc_neutral.gml neu vaccine duration 2 0.011765 0.000000 0.000000
154 d_Gc_neutral.gml neu vaccine efficacy 3 0.017647 0.000487 0.089760
155 d_Gc_neutral.gml neu vaccine refusal 3 0.017647 0.000000 0.007843
156 d_Gc_neutral.gml neu vaccine risk 3 0.017647 0.000418 0.005882
157 d_Gc_neutral.gml neu vaccine safety 2 0.011765 0.000000 0.000000
158 d_Gc_neutral.gml neu vaccine-autism link 4 0.023529 0.000000 0.000000
159 d_Gc_neutral.gml neu vaccine-injured children 3 0.017647 0.000000 0.000000
160 d_Gc_neutral.gml neu vaccine-preventable diseases 1 0.005882 0.000000 0.000000
161 d_Gc_neutral.gml neu vaccines 19 0.111765 0.021371 0.093892
162 d_Gc_neutral.gml neu vaccines cause childhood illnesses 1 0.005882 0.000000 0.000000
163 d_Gc_neutral.gml neu voluntary 1 0.005882 0.000000 0.000000
164 d_Gc_neutral.gml neu vomiting 1 0.005882 0.000000 0.000000
165 d_Gc_neutral.gml neu waning effectiveness 2 0.011765 0.000000 0.000000
166 d_Gc_neutral.gml neu waning immunity 1 0.005882 0.000000 0.080961
167 d_Gc_neutral.gml neu wealthy family foundations 1 0.005882 0.000000 0.000000
168 d_Gc_neutral.gml neu whole-cell vaccine 5 0.029412 0.020606 0.106411
169 d_Gc_neutral.gml neu whooping cough 2 0.011765 0.000000 0.032353
170 d_Gc_neutral.gml neu young people 1 0.005882 0.000000 0.005882

171 rows × 7 columns


In [27]:
# save
# Writes the greatest-component node table. This uses `df` from the loop above,
# not combined_df, whose update is commented out in this section.
df.to_csv('../output/df/Gc_nodes_neu.csv')

Draw network


In [8]:
# 7_graph_calculation
def drawIt(graph, what = 'graph'):
    """Draw ``graph`` with a spring layout and announce its size.

    Graphs with more than 20 nodes get a 10x10 inch figure; graphs with more
    than 40 nodes are additionally drawn with small nodes (size 70) and a
    12pt label font. Labels are always shown.

    Parameters
    ----------
    graph : networkx graph
        Graph (or component subgraph) to draw.
    what : str
        Word used in the printed progress line, e.g. 'graph' or 'component'.
    """
    nsize = graph.number_of_nodes()
    print("Drawing %s of size %s:" % (what, nsize))

    draw_options = {'with_labels': True}
    if nsize > 20:
        plt.figure(figsize=(10, 10))
    if nsize > 40:
        # large graphs: shrink the markers so the labels stay readable
        draw_options.update(node_size = 70, font_size = 12)
    nx.draw_spring(graph, **draw_options)
    plt.show()

# for undirected graphs
def describeGraph(graph):
    """Print edge/node/component counts of an undirected graph and draw it.

    The whole graph is drawn first, then each connected component in order of
    decreasing size.

    Parameters
    ----------
    graph : networkx graph
        Undirected graph; ``nx.connected_components`` is applied to it.
    """
    components = sorted(nx.connected_components(graph), key = len, reverse = True)
    # The component subgraphs were previously all built into an unused list;
    # each one is now created only when it is drawn.
    params = (graph.number_of_edges(), graph.number_of_nodes(), len(components))
    print("Graph has %s edges, %s nodes, %s connected components\n" % params)
    drawIt(graph)
    for sub in components:
        drawIt(graph.subgraph(sub), what = 'component')

# for directed graphs
def describeGraph_d(graph):
    """Print edge/node/component counts of a directed graph and draw it.

    Components are the weakly connected components. The whole graph is drawn
    first, then each component in order of decreasing size.

    Parameters
    ----------
    graph : networkx graph
        Directed graph; ``nx.weakly_connected_components`` is applied to it.
    """
    components = sorted(nx.weakly_connected_components(graph), key = len, reverse = True)
    # The component subgraphs were previously all built into an unused list;
    # each one is now created only when it is drawn.
    params = (graph.number_of_edges(), graph.number_of_nodes(), len(components))
    print("Graph has %s edges, %s nodes, %s connected components\n" % params)
    drawIt(graph)
    for sub in components:
        drawIt(graph.subgraph(sub), what = 'component')

In [9]:
# UNDIRECTED network graph
# ugraph is the undirected copy built by the full-network loading loop.
describeGraph(ugraph)


Graph has 241 edges, 201 nodes, 12 connected components

Drawing graph of size 201:
Drawing component of size 171:
Drawing component of size 4:
Drawing component of size 4:
Drawing component of size 4:
Drawing component of size 3:
Drawing component of size 3:
Drawing component of size 2:
Drawing component of size 2:
Drawing component of size 2:
Drawing component of size 2:
Drawing component of size 2:
Drawing component of size 2:

In [10]:
# DIRECTED network graph
# NOTE(review): the saved output (201 nodes, 12 components) is the full article
# network, but `graph` is rebound to the greatest-component file by the
# "Gc nodes table" loop earlier in the notebook. On a top-to-bottom run this
# call would describe that graph instead — confirm which one is intended.
describeGraph_d(graph)


Graph has 241 edges, 201 nodes, 12 connected components

Drawing graph of size 201:
Drawing component of size 171:
Drawing component of size 4:
Drawing component of size 4:
Drawing component of size 4:
Drawing component of size 3:
Drawing component of size 3:
Drawing component of size 2:
Drawing component of size 2:
Drawing component of size 2:
Drawing component of size 2:
Drawing component of size 2:
Drawing component of size 2:

Undirected


In [11]:
# sizes of the connected components of the undirected graph, largest first
connected_components = sorted((len(c) for c in nx.connected_components(ugraph)), reverse=True)

# every connected component of the undirected graph as its own subgraph
subgraphs = list(nx.connected_component_subgraphs(ugraph))

# greatest component = the subgraph with the most nodes (undirected MultiGraph)
u_Gc = max(nx.connected_component_subgraphs(ugraph), key=len)
u_Gc.name = "undirected Gc"

In [12]:
# component sizes, then the summary of the undirected greatest component
print("%s %s" % ("connected components = ", connected_components))
print(nx.info(u_Gc))


connected components =  [171, 4, 4, 4, 3, 3, 2, 2, 2, 2, 2, 2]
Name: undirected Gc
Type: MultiGraph
Number of nodes: 171
Number of edges: 221
Average degree:   2.5848

Directed


In [13]:
# weakly connected components of the directed graph, largest first
# NOTE(review): `graph` is whatever the most recently run loading cell left
# behind; the saved output below (12 components) matches the full article network.
components = sorted(nx.weakly_connected_components(graph), key = len, reverse = True)
cc = list(map(len, components))

# every weakly connected component as its own subgraph
subgraphs = list(nx.weakly_connected_component_subgraphs(graph))

# greatest component = the subgraph with the most nodes
d_Gc = max(nx.weakly_connected_component_subgraphs(graph), key=len)
d_Gc.name = "directed Gc"

In [14]:
# component sizes, then the summary of the directed greatest component
print("%s %s" % ("connected components = ", cc))
print(nx.info(d_Gc))


connected components =  [171, 4, 4, 4, 3, 3, 2, 2, 2, 2, 2, 2]
Name: directed Gc
Type: MultiDiGraph
Number of nodes: 171
Number of edges: 221
Average in degree:   1.2924
Average out degree:   1.2924

In [15]:
## understand how direction changes degree ##
# Summaries are printed in this order: original directed graph, its
# undirected version, and the undirected version converted back to directed.
temp = ugraph.to_directed()  # back to directed
for stage_graph in (graph, ugraph, temp):
    print(nx.info(stage_graph))


Name: article_neu1.gml
Type: MultiDiGraph
Number of nodes: 201
Number of edges: 241
Average in degree:   1.1990
Average out degree:   1.1990
Name: article_neu1.gml
Type: MultiGraph
Number of nodes: 201
Number of edges: 241
Average degree:   2.3980
Name: article_neu1.gml
Type: MultiDiGraph
Number of nodes: 201
Number of edges: 482
Average in degree:   2.3980
Average out degree:   2.3980

In [16]:
# finally, greatest components for undirected and directed graphs
# (both summaries are emitted on consecutive lines, undirected first)
print("\n".join([nx.info(u_Gc), nx.info(d_Gc)]))


Name: undirected Gc
Type: MultiGraph
Number of nodes: 171
Number of edges: 221
Average degree:   2.5848
Name: directed Gc
Type: MultiDiGraph
Number of nodes: 171
Number of edges: 221
Average in degree:   1.2924
Average out degree:   1.2924

In [17]:
# save Gc
#nx.write_gml(u_Gc, "../output/network/u_Gc_neutral.gml")
#nx.write_gml(d_Gc, "../output/network/d_Gc_neutral.gml")

Calculate network statistics (averages) for greatest component.


In [18]:
# load directed Gc
Gc_files = glob('../output/network/d_Gc_neutral.gml')

# Column order of the network-level summary table, grouped by theme.
# ('avg deg' is intentionally not included; it is disabled in the loop as well.)
network_data_columns = (
    ['name', 'sentiment']                                        # identification
    + ['# nodes', '# edges', 'density', 'deg assort coef']       # size and shape
    + ['avg deg cent', 'avg bet cent', 'avg clo cent']           # mean centralities
    + ['high deg cent', 'high bet cent', 'high clo cent']        # top node per centrality
    + ['avg node conn', '# conn comp', 'gc size']                # connectivity
)
network_data = pd.DataFrame(columns = network_data_columns)

In [19]:
# Gc_files
# For every greatest-component file: load the graph, print its summary and
# append one row of network-level statistics to network_data.

for graph_num, gml_graph in enumerate(Gc_files):
    graph = nx.read_gml(gml_graph)
    # filename must be set before calculate_graph_inf, which reads it as a global
    (filepath, filename) = os.path.split(gml_graph)
    print('-' * 10)
    print(gml_graph)
    calculate_graph_inf(graph)

    # calculate variables
    sent = "neutral"
    nodes = nx.number_of_nodes(graph)
    edges = nx.number_of_edges(graph)
    density = float("{0:.4f}".format(nx.density(graph)))
    # Each centrality is computed once and reused for both its average and its
    # top-ranked node (previously every measure was computed twice).
    deg_cen = nx.degree_centrality(graph)
    bet_cen = nx.betweenness_centrality(graph)
    clo_cen = nx.closeness_centrality(graph)
    # list(...) keeps this working when .values() returns a view (Python 3)
    avg_deg_cen = np.array(list(deg_cen.values())).mean()
    avg_bet_cen = np.array(list(bet_cen.values())).mean()
    avg_clo_cen = np.array(list(clo_cen.values())).mean()
    #avg_deg = float("{0:.4f}".format(in_deg + out_deg))
    avg_node_con = float("{0:.4f}".format((nx.average_node_connectivity(graph))))
    deg_assort_coeff = float("{0:.4f}".format((nx.degree_assortativity_coefficient(graph))))
    conn_comp = nx.number_weakly_connected_components(graph) # ugraph
    highest_deg_cen = highest_centrality(deg_cen)
    highest_bet_cen = highest_centrality(bet_cen)
    highest_clo_cen = highest_centrality(clo_cen)
    # number of nodes in the largest weakly connected component
    Gc = len(max(nx.weakly_connected_component_subgraphs(graph), key=len))

    # save variables into list
    graph_values = {'name':filename,
                    'sentiment':sent,
                    '# nodes':nodes,
                    '# edges':edges,
                    #'avg deg':avg_deg,
                    'density':density,
                    'deg assort coef':deg_assort_coeff,
                    'avg deg cent':"%.4f" % avg_deg_cen,
                    'avg bet cent':"%.4f" % avg_bet_cen,
                    'avg clo cent':"%.4f" % avg_clo_cen,
                    'high deg cent':highest_deg_cen,
                    'high bet cent':highest_bet_cen,
                    'high clo cent':highest_clo_cen,
                    'avg node conn':avg_node_con,
                    '# conn comp':conn_comp,
                    'gc size':Gc
                    }
    network_data = network_data.append(graph_values, ignore_index=True)


----------
../output/network/d_Gc_neutral.gml
Name: d_Gc_neutral.gml
Type: MultiDiGraph
Number of nodes: 171
Number of edges: 221
Average in degree:   1.2924
Average out degree:   1.2924

In [20]:
# print network data for greatest component
# one row per file processed by the statistics loop above
network_data


Out[20]:
name sentiment # nodes # edges density deg assort coef avg deg cent avg bet cent avg clo cent high deg cent high bet cent high clo cent avg node conn # conn comp gc size
0 d_Gc_neutral.gml neutral 171.0 221.0 0.0076 -0.194 0.0152 0.0011 0.0183 (SB 277, 0.194117647059) (children, 0.0226940480334) (children, 0.133013640239) 0.0614 1.0 171.0

In [21]:
# save
#network_data.to_csv('../output/df/Gc_df_neu.csv')

Cutsets


In [22]:
# node count of the graph currently bound to `graph`
print("%s %s" % ("Greatest component size =", len(graph)))


Greatest component size = 171

In [23]:
# Minimum cuts of the graph currently bound to `graph`.
#
# nx.all_node_cuts would list every minimum cutset, i.e. every set of nodes
# whose size equals the node connectivity of G and whose removal breaks G into
# two or more connected components. It stays disabled because it needs an
# undirected graph:
#cutsets = list(nx.all_node_cuts(graph))  # must be undirected
#print "# of cutsets =", len(cutsets)

print("%s %s" % ("Greatest component size =", len(graph)))

# a set of nodes / edges of minimum cardinality that disconnects G
min_ncut = nx.minimum_node_cut(graph)
min_ecut = nx.minimum_edge_cut(graph)
print("%s %s" % ("Min node cut =", min_ncut))
print("%s %s" % ("Min edge cut =", min_ecut))

# the same cuts restricted to one source/target pair
print(nx.minimum_node_cut(graph, s='vaccines', t='autism'))
print(nx.minimum_edge_cut(graph, s='vaccines', t='autism'))


Greatest component size = 171
Min node cut = set([])
Min edge cut = set([])
set([])
set([])

In [24]:
# read edge labels in min cut for Gc
# change source and target
a = nx.minimum_edge_cut(graph, s='vaccines', t='autism')
#a = nx.minimum_edge_cut(graph)

# Map (source, target) -> 'edge' attribute. Keys are cut down to their first
# two entries so that any extra multigraph edge key is dropped.
labels = nx.get_edge_attributes(graph,'edge')
edgelabels = {e[0:2]: labels[e] for e in labels}

for e in a:
    rev_e = e[::-1]
    if e in edgelabels:  # `in` replaces the Python-2-only dict.has_key
        print("%s %s" % (e, edgelabels[e]))
    elif rev_e in edgelabels:
        # the cut edge may be stored in the opposite orientation
        print("%s %s" % (rev_e, edgelabels[rev_e]))
    else:
        # previously a KeyError: report the unlabelled edge and keep going
        print("%s %s" % (e, "<no 'edge' label>"))

Centrality tables


In [5]:
# make sure you're using the right graph
# NOTE(review): `gml_graph` is only bound as the loop variable of the loading
# loops, so this cell raises NameError (as in the saved output) when it is run
# before any of them. Run a loading loop first.
print "gml_files = ", gml_files
print "gml_graph = ", gml_graph


gml_files =  ['../output/network/article_neu1.gml']
gml_graph = 
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-5-33e68186301e> in <module>()
      1 # make sure you're using the right graph
      2 print "gml_files = ", gml_files
----> 3 print "gml_graph = ", gml_graph

NameError: name 'gml_graph' is not defined

In [15]:
# Choose the network used for the centrality tables below by switching the
# active line: full network (disabled) or greatest component (active).
#graph = nx.read_gml('../output/network/article_neu1.gml')  # full network
graph = nx.read_gml('../output/network/d_Gc_neutral.gml')  # gc network

# summary of the graph that was just loaded
print(nx.info(graph))


Name: directed Gc
Type: MultiDiGraph
Number of nodes: 171
Number of edges: 221
Average in degree:   1.2924
Average out degree:   1.2924

In [16]:
# One single-column table per centrality measure, indexed by node label and
# sorted ascending so the most central nodes appear at the bottom.

# degree centrality
dc = nx.degree_centrality(graph)
dc_df = (pd.DataFrame.from_dict(dc, orient = 'index')
           .rename(columns = {0: 'degree cent'})
           .sort_values(by = 'degree cent'))
#dc_df

# betweenness centrality
bc = nx.betweenness_centrality(graph)
bc_df = (pd.DataFrame.from_dict(bc, orient = 'index')
           .rename(columns = {0: 'betweenness cent'})
           .sort_values(by = 'betweenness cent'))
#bc_df

# closeness centrality
cc = nx.closeness_centrality(graph)
cc_df = (pd.DataFrame.from_dict(cc, orient = 'index')
           .rename(columns = {0: 'closeness cent'})
           .sort_values(by = 'closeness cent'))
#cc_df

In [17]:
# degree-centrality table, ascending (most central nodes last)
dc_df


Out[17]:
degree cent
miracles 0.005882
diagnosis 0.005882
Wakefield study 0.005882
young people 0.005882
vomiting 0.005882
vaccine choice 0.005882
childhood illnesses 0.005882
pregnant women 0.005882
minimal 0.005882
Pez dispensers 0.005882
adults 0.005882
protested 0.005882
ages 65-84 0.005882
Lisa and J.B. Handley 0.005882
MMR vaccine 0.005882
debate 0.005882
waning immunity 0.005882
alzheimer's 0.005882
voluntary 0.005882
medical belief 0.005882
California 0.005882
teens 0.005882
maternal inoculation 0.005882
diagnoses 0.005882
the sake of being anti-vaccine 0.005882
Alfred and Lisa Claire Dwoskin 0.005882
infectious disease 0.005882
vaccine concerns 0.005882
misrepresentation 0.005882
mixed findings 0.005882
... ...
vaccination exemption 0.017647
pertussis increase 0.017647
United States 0.017647
vaccine risk 0.017647
education 0.017647
parents with vaccine-injured children 0.023529
proposed restrictions 0.023529
Ben Allen 0.023529
vaccine-autism link 0.023529
vaccine debate 0.023529
Generation Rescue 0.023529
CDC 0.023529
elderly 0.023529
Focus for Health 0.023529
Andrew Wakefield 0.029412
disease 0.029412
whole-cell vaccine 0.029412
autism 0.029412
home-school 0.029412
vaccination 0.035294
children 0.041176
acellular pertussis vaccine 0.047059
Dwoskin Family Foundation 0.047059
parents 0.058824
high-dose flu vaccine 0.064706
pertussis vaccine 0.082353
pertussis 0.094118
anti-vaccination 0.111765
vaccines 0.111765
SB 277 0.194118

171 rows × 1 columns


In [13]:
# betweenness-centrality table, ascending (most central nodes last)
# NOTE(review): the saved output below has 201 rows while the degree table has
# 171, so it appears to come from an earlier run on the full network. Re-run
# after the centrality cell to refresh it.
bc_df


Out[13]:
betweenness cent
influenza 0.000000
Saron Runner 0.000000
pertussis booster shot 0.000000
Chairwoman Carol Liu 0.000000
Senate committee 0.000000
preteens 0.000000
imperfect 0.000000
rubella vaccination is safe 0.000000
severe coughing 0.000000
Connie Leyva 0.000000
cases 0.000000
decreasing effectiveness 0.000000
mixed findings 0.000000
vaccine-preventable diseases 0.000000
Tina Kimmel 0.000000
Jenny McCarthy 0.000000
Richard Pan 0.000000
immunization waivers 0.000000
medical science 0.000000
required vaccinations 0.000000
Senate Education Committee 0.000000
California 0.000000
cognitive disorders 0.000000
vaccine-autism link 0.000000
for-profit 0.000000
measles vaccine is safe 0.000000
adolescents 0.000000
options 0.000000
miracles 0.000000
low-income family 0.000000
... ...
autism 0.000226
vaccine risk 0.000302
anti-vaxxers 0.000327
vaccine efficacy 0.000352
disease 0.000427
vaccination exemption 0.000553
vaccinated children 0.000553
personal belief exemption 0.000553
fair balance 0.000553
proposed restrictions 0.000578
Focus for Health 0.000603
parents 0.000704
pertussis increase 0.000729
Rand Paul 0.000754
Generation Rescue 0.000905
Andrew Wakefield 0.001131
artificial vaccine 0.001131
infants 0.002010
anti-vaccination 0.002990
CDC 0.004523
vaccination 0.004849
pertussis 0.004975
side effects 0.010754
protection 0.010779
SB 277 0.011859
acellular pertussis vaccine 0.014422
whole-cell vaccine 0.014874
vaccines 0.015427
pertussis vaccine 0.016106
children 0.016382

201 rows × 1 columns


In [14]:
# closeness-centrality table, ascending (most central nodes last)
# NOTE(review): the saved output below has 201 rows while the degree table has
# 171, so it appears to come from an earlier run on the full network. Re-run
# after the centrality cell to refresh it.
cc_df


Out[14]:
closeness cent
influenza 0.000000
adolescents 0.000000
misrepresentation 0.000000
measles vaccine is safe 0.000000
medical belief 0.000000
for-profit 0.000000
vaccine-autism link 0.000000
the sake of being anti-vaccine 0.000000
age 85 and older 0.000000
diagnoses 0.000000
crazy people 0.000000
immune response 0.000000
diagnosis 0.000000
pertussis in teens 0.000000
options 0.000000
vaccine-injured children 0.000000
United States 0.000000
vomiting 0.000000
severe disease 0.000000
childhood illnesses 0.000000
chronic illness 0.000000
minimal 0.000000
Pez dispensers 0.000000
Republican 0.000000
Senate Education Committee 0.000000
ages 65-84 0.000000
Lisa and J.B. Handley 0.000000
effectiveness 0.000000
immunity 0.000000
concerns 0.000000
... ...
high-dose flu vaccine 0.035556
Generation Rescue 0.036000
anti-vaxxers 0.038793
Focus for Health 0.040000
Dwoskin Family Foundation 0.040500
Andrew Wakefield 0.044138
pertussis 0.050417
Saron Runner 0.058276
anti-vaccination 0.060357
Ben Allen 0.061455
Chairwoman Carol Liu 0.061780
Tina Kimmel 0.061780
parents 0.062500
Connie Leyva 0.062593
side effects 0.068282
pregnant women 0.068817
waning immunity 0.068817
Dr. Paul Offit 0.070700
maternal inoculation 0.071161
medical science 0.071991
protection 0.072067
PLOS Computational Biology 0.072727
vaccine efficacy 0.076296
vaccines 0.079808
pertussis vaccine 0.081052
infants 0.084211
acellular pertussis vaccine 0.086681
whole-cell vaccine 0.090449
SB 277 0.097656
children 0.113062

201 rows × 1 columns


In [ ]: