negative graph


In [1]:
# 1_network_df

import networkx as nx
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import os
from glob import glob

# Plot styling and wide DataFrame display used by every later cell.
plt.style.use('ggplot')
for option_name, option_value in (('display.width', 5000),
                                  ('display.max_columns', 60)):
    pd.set_option(option_name, option_value)

# Directed article network (GML); glob() returns a list of matching paths.
gml_files = glob('../output/network/article_neg1.gml')

In [2]:
def calculate_graph_inf(graph, name=None):
    """Label ``graph`` and print its networkx summary.

    Parameters
    ----------
    graph : networkx graph
        Graph to describe. Its ``name`` attribute is overwritten.
    name : str, optional
        Label stored in ``graph.name``. When omitted, the notebook-level
        ``filename`` variable is used, which is what every existing
        one-argument call relies on.

    Returns
    -------
    None
        The summary produced by ``nx.info`` is printed, not returned.
    """
    # Falling back to the global keeps the existing call sites working while
    # letting new callers pass the label explicitly.
    graph.name = filename if name is None else name
    # Single-argument call form prints the same text under Python 2 and 3.
    print(nx.info(graph))

def highest_centrality(cent_dict):
    """Return the ``(node, value)`` pair with the largest centrality value.

    Parameters
    ----------
    cent_dict : dict
        Mapping of node -> centrality score.

    Returns
    -------
    tuple
        ``(node, value)`` for the highest score. When several nodes share the
        highest score, the greatest node key wins, which matches the result of
        sorting ``(value, node)`` pairs and taking the last one.

    Raises
    ------
    ValueError
        If ``cent_dict`` is empty.
    """
    if not cent_dict:
        raise ValueError("cent_dict is empty; no highest centrality to report")
    # dict.items() exists on both Python 2 and 3 (iteritems() is Python 2 only).
    # Ranking by (value, node) finds the winner without sorting the whole dict.
    node, value = max(cent_dict.items(), key=lambda item: (item[1], item[0]))
    return (node, value)

2. convert to undirected


In [3]:
for graph_num, gml_graph in enumerate(gml_files):
    # calculate_graph_inf() labels graphs with the module-level `filename`
    filepath, filename = os.path.split(gml_graph)
    dgraph = nx.read_gml(gml_graph)   # directed graph as stored on disk
    ugraph = dgraph.to_undirected()   # undirected counterpart
    # unused alternative kept for reference:
    #U = dgraph.to_undirected(reciprocal=True)
    #ugraph.add_edges_from(U.edges())
    print('-' * 10)
    print(gml_graph)
    for current_graph in (dgraph, ugraph):
        calculate_graph_inf(current_graph)


----------
../output/network/article_neg1.gml
Name: article_neg1.gml
Type: MultiDiGraph
Number of nodes: 1257
Number of edges: 1898
Average in degree:   1.5099
Average out degree:   1.5099
Name: article_neg1.gml
Type: MultiGraph
Number of nodes: 1257
Number of edges: 1854
Average degree:   2.9499

In [4]:
# save undirected gml
#nx.write_gml(ugraph, "../output/network/article_u_neg.gml")

dgraph = directed graph; ugraph = undirected graph

undirected graph


In [5]:
# load the undirected graph saved at this path
# NOTE(review): the nx.write_gml call that writes this file (the "save
# undirected gml" cell) is commented out, so the file must already exist on
# disk; otherwise glob() returns [] and the loop in the next cell is skipped.
gml_files = glob('../output/network/article_u_neg.gml')

In [6]:
# Reload the saved undirected network (ugraph = undirected; dgraph = directed)
for graph_num, gml_graph in enumerate(gml_files):
    # set `filename` first: calculate_graph_inf() uses it as the graph label
    filepath, filename = os.path.split(gml_graph)
    ugraph = nx.read_gml(gml_graph)
    print('-' * 10)
    print(gml_graph)
    calculate_graph_inf(ugraph)


----------
../output/network/article_u_neg.gml
Name: article_u_neg.gml
Type: MultiGraph
Number of nodes: 1257
Number of edges: 1854
Average degree:   2.9499

3. all nodes table


In [7]:
# 2_node_df: containers for the per-node centrality table
data_columns = ['name', 'sentiment']         # graph-level label columns
combined_df = pd.DataFrame()                 # collects the node rows of each graph
data = pd.DataFrame(columns=data_columns)    # one label row per processed graph

In [8]:
# calculate variables and save into list
# NOTE: this cell extends `data` and `combined_df`; re-run the cell that
# creates them first, otherwise the rows are added a second time.
sent = "negative"
graph_values = {'name':filename,
                'sentiment':sent
                }
# one-row frame with the graph-level labels; used for `data` and for `df`
values = pd.DataFrame(graph_values, columns = ('name', 'sentiment'), index = [0])
data = pd.concat([data, values], ignore_index=True)

# node degree; dict() accepts a plain dict as well as an iterable of
# (node, degree) pairs
degree = dict(nx.degree(ugraph))
deg_df = pd.DataFrame.from_dict(degree, orient = 'index')
deg_df.columns = ['degree']

# Each centrality below is computed exactly once (betweenness in particular
# is slow on a graph of this size).

# degree centrality
deg_cent = nx.degree_centrality(ugraph)
dc_df = pd.DataFrame.from_dict(deg_cent, orient = 'index')
dc_df.columns = ['deg cent']

# betweenness centrality
bet_cent = nx.betweenness_centrality(ugraph)
bc_df = pd.DataFrame.from_dict(bet_cent, orient = 'index')
bc_df.columns = ['bet cent']

# closeness centrality
clo_cent = nx.closeness_centrality(ugraph)
cc_df = pd.DataFrame.from_dict(clo_cent, orient = 'index')
cc_df.columns = ['clo cent']

# concat node frames into node_df (aligned on the node index)
frames = [deg_df, dc_df, bc_df, cc_df]
node_df = pd.concat(frames, axis = 1)
node_df.index.name = 'node'
node_df = node_df.reset_index()

# df = merges graph_values with node_df for single graph and fill NaNs:
# `values` only has row 0, so forward-filling copies name/sentiment down to
# every node row
df = pd.concat([values, node_df], axis = 1)
df = df.ffill()
combined_df = pd.concat([combined_df, df])

In [9]:
# print entire network
combined_df


Out[9]:
name sentiment node degree deg cent bet cent clo cent
0 article_u_neg.gml negative ACIP 1 0.000796 0.000000e+00 0.149349
1 article_u_neg.gml negative ACIP's rotavirus use recommendation 1 0.000796 0.000000e+00 0.187493
2 article_u_neg.gml negative ADHD 1 0.000796 0.000000e+00 0.228872
3 article_u_neg.gml negative AIDS 1 0.000796 0.000000e+00 0.177383
4 article_u_neg.gml negative African American males 2 0.001592 0.000000e+00 0.222944
5 article_u_neg.gml negative African women 3 0.002389 2.158333e-03 0.241219
6 article_u_neg.gml negative African-American children 1 0.000796 0.000000e+00 0.206704
7 article_u_neg.gml negative Alysia Osoff 6 0.004777 5.157534e-03 0.190783
8 article_u_neg.gml negative America 2 0.001592 2.885274e-03 0.166489
9 article_u_neg.gml negative American Academy of Pediatrics 2 0.001592 4.519712e-04 0.234217
10 article_u_neg.gml negative American Nursing Association's Code of Ethics 2 0.001592 8.038547e-04 0.203367
11 article_u_neg.gml negative Americans 7 0.005573 4.718502e-03 0.251436
12 article_u_neg.gml negative Amish 2 0.001592 1.443906e-03 0.153204
13 article_u_neg.gml negative Andrew Wakefield 1 0.000796 0.000000e+00 0.210839
14 article_u_neg.gml negative Apartheid 1 0.000796 0.000000e+00 0.159324
15 article_u_neg.gml negative Attkisson's website 2 0.001592 7.213185e-04 0.166731
16 article_u_neg.gml negative Australia 1 0.000796 0.000000e+00 0.196968
17 article_u_neg.gml negative Baby Boom 2 0.001592 0.000000e+00 0.219766
18 article_u_neg.gml negative Baby Boomers 5 0.003981 7.314888e-03 0.230249
19 article_u_neg.gml negative Baker College nursing school 3 0.002389 2.641888e-03 0.172380
20 article_u_neg.gml negative Baker College nursing school instructors 6 0.004777 3.977791e-03 0.205675
21 article_u_neg.gml negative Bell's Palsy 1 0.000796 0.000000e+00 0.196968
22 article_u_neg.gml negative Big Pharma 12 0.009554 1.170330e-02 0.261493
23 article_u_neg.gml negative Big Tobacco 2 0.001592 1.984357e-04 0.225721
24 article_u_neg.gml negative Bill of Rights 1 0.000796 0.000000e+00 0.159324
25 article_u_neg.gml negative Brian Hooker 1 0.000796 0.000000e+00 0.189766
26 article_u_neg.gml negative Bruesewitz v. Wyeth 2 0.001592 2.885274e-03 0.223959
27 article_u_neg.gml negative Bush Administration 2 0.001592 5.187031e-05 0.204940
28 article_u_neg.gml negative CDC 60 0.047771 1.047685e-01 0.289896
29 article_u_neg.gml negative CDC and Big Pharma 3 0.002389 9.561462e-05 0.202410
... ... ... ... ... ... ... ...
1227 article_u_neg.gml negative violation of basic human rights 2 0.001592 5.059144e-03 0.210152
1228 article_u_neg.gml negative violation of law 1 0.000796 0.000000e+00 0.159324
1229 article_u_neg.gml negative viral replication 2 0.001592 6.344051e-07 0.001791
1230 article_u_neg.gml negative vitamin A supplements 2 0.001592 1.443906e-03 0.171892
1231 article_u_neg.gml negative vitamin B12 2 0.001592 6.344051e-07 0.001791
1232 article_u_neg.gml negative vitamin C 2 0.001592 6.344051e-07 0.001791
1233 article_u_neg.gml negative vitamin D 1 0.000796 0.000000e+00 0.193535
1234 article_u_neg.gml negative vitamin D deficiency 1 0.000796 0.000000e+00 0.000796
1235 article_u_neg.gml negative vitamin supplements 1 0.000796 0.000000e+00 0.124521
1236 article_u_neg.gml negative vulnerable 1 0.000796 0.000000e+00 0.223765
1237 article_u_neg.gml negative wander 1 0.000796 0.000000e+00 0.131697
1238 article_u_neg.gml negative wander off 1 0.000796 0.000000e+00 0.131697
1239 article_u_neg.gml negative war propaganda 2 0.001592 6.344051e-07 0.001791
1240 article_u_neg.gml negative water 1 0.000796 0.000000e+00 0.149306
1241 article_u_neg.gml negative whistle 1 0.000796 0.000000e+00 0.189314
1242 article_u_neg.gml negative whistleblower 3 0.002389 5.864966e-04 0.187493
1243 article_u_neg.gml negative whistleblowers 1 0.000796 0.000000e+00 0.189314
1244 article_u_neg.gml negative whooping cough 1 0.000796 0.000000e+00 0.213057
1245 article_u_neg.gml negative whooping cough outbreaks 1 0.000796 0.000000e+00 0.169355
1246 article_u_neg.gml negative widespread 1 0.000796 0.000000e+00 0.000796
1247 article_u_neg.gml negative widespread fear 2 0.001592 3.252148e-03 0.239652
1248 article_u_neg.gml negative widespread health problems 2 0.001592 0.000000e+00 0.255226
1249 article_u_neg.gml negative words 2 0.001592 0.000000e+00 0.249734
1250 article_u_neg.gml negative world 1 0.000796 0.000000e+00 0.166731
1251 article_u_neg.gml negative world's healthiest children 1 0.000796 0.000000e+00 0.196257
1252 article_u_neg.gml negative wrong doing 3 0.002389 1.692245e-03 0.212312
1253 article_u_neg.gml negative years 1 0.000796 0.000000e+00 0.182491
1254 article_u_neg.gml negative you 10 0.007962 5.704791e-03 0.235070
1255 article_u_neg.gml negative young adults 1 0.000796 0.000000e+00 0.182911
1256 article_u_neg.gml negative young doctors 7 0.005573 5.104453e-03 0.190186

1257 rows × 7 columns


In [10]:
# save
#combined_df.to_csv('../output/df/article_u_neg.csv')

4. Draw undirected and directed network


In [11]:
# 7_graph_calculation
def drawIt(graph, what = 'graph'):
    """Draw ``graph`` with a spring layout and announce its size.

    Parameters
    ----------
    graph : networkx graph
        Graph (or component subgraph) to draw.
    what : str, optional
        Word used in the printed announcement, e.g. ``'graph'`` or
        ``'component'``.

    Notes
    -----
    Graphs with more than 20 nodes get a 10x10 inch figure; graphs with more
    than 40 nodes are additionally drawn with ``node_size=70`` and
    ``font_size=12``. Smaller graphs use the networkx defaults.
    """
    nsize = graph.number_of_nodes()
    # call-form print with one argument gives the same text on Python 2 and 3
    print("Drawing %s of size %s:" % (what, nsize))

    if nsize > 20:
        plt.figure(figsize=(10, 10))

    draw_options = {'with_labels': True}
    if nsize > 40:
        draw_options.update(node_size=70, font_size=12)

    nx.draw_spring(graph, **draw_options)
    plt.show()

# for undirected graphs
def describeGraph(graph):
    """Print a size summary of an undirected graph and draw it.

    The whole graph is drawn first, then every connected component from
    largest to smallest.

    Parameters
    ----------
    graph : networkx graph (undirected)
        Graph to summarise; it is passed to ``nx.connected_components``.
    """
    components = sorted(nx.connected_components(graph), key = len, reverse = True)
    # Only the number of components is reported, so no per-component size list
    # or copied subgraph list is built here.
    params = (graph.number_of_edges(), graph.number_of_nodes(), len(components))
    print("Graph has %s edges, %s nodes, %s connected components\n" % params)
    drawIt(graph)
    for sub in components:
        drawIt(graph.subgraph(sub), what = 'component')

# for directed graphs
def describeGraph_d(graph):
    """Print a size summary of a directed graph and draw it.

    The whole graph is drawn first, then every weakly connected component
    from largest to smallest.

    Parameters
    ----------
    graph : networkx graph (directed)
        Graph to summarise; it is passed to
        ``nx.weakly_connected_components``.
    """
    components = sorted(nx.weakly_connected_components(graph), key = len, reverse = True)
    # Only the number of components is reported, so no per-component size list
    # or copied subgraph list is built here.
    params = (graph.number_of_edges(), graph.number_of_nodes(), len(components))
    print("Graph has %s edges, %s nodes, %s connected components\n" % params)
    drawIt(graph)
    for sub in components:
        drawIt(graph.subgraph(sub), what = 'component')

In [12]:
# UNDIRECTED network graph
describeGraph(ugraph)


Graph has 1854 edges, 1257 nodes, 49 connected components

Drawing graph of size 1257:
Drawing component of size 1140:
Drawing component of size 7:
Drawing component of size 4:
Drawing component of size 4:
Drawing component of size 4:
Drawing component of size 3:
Drawing component of size 3:
Drawing component of size 3:
Drawing component of size 3:
Drawing component of size 3:
Drawing component of size 3:
Drawing component of size 3:
Drawing component of size 3:
Drawing component of size 3:
Drawing component of size 3:
Drawing component of size 2:
Drawing component of size 2:
Drawing component of size 2:
Drawing component of size 2:
Drawing component of size 2:
Drawing component of size 2:
Drawing component of size 2:
Drawing component of size 2:
Drawing component of size 2:
Drawing component of size 2:
Drawing component of size 2:
Drawing component of size 2:
Drawing component of size 2:
Drawing component of size 2:
Drawing component of size 2:
Drawing component of size 2:
Drawing component of size 2:
Drawing component of size 2:
Drawing component of size 2:
Drawing component of size 2:
Drawing component of size 2:
Drawing component of size 2:
Drawing component of size 2:
Drawing component of size 2:
Drawing component of size 2:
Drawing component of size 2:
Drawing component of size 2:
Drawing component of size 2:
Drawing component of size 2:
Drawing component of size 2:
Drawing component of size 2:
Drawing component of size 2:
Drawing component of size 2:
Drawing component of size 2:

In [13]:
# DIRECTED network graph
describeGraph_d(dgraph)


Graph has 1898 edges, 1257 nodes, 49 connected components

Drawing graph of size 1257:
Drawing component of size 1140:
Drawing component of size 7:
Drawing component of size 4:
Drawing component of size 4:
Drawing component of size 4:
Drawing component of size 3:
Drawing component of size 3:
Drawing component of size 3:
Drawing component of size 3:
Drawing component of size 3:
Drawing component of size 3:
Drawing component of size 3:
Drawing component of size 3:
Drawing component of size 3:
Drawing component of size 3:
Drawing component of size 2:
Drawing component of size 2:
Drawing component of size 2:
Drawing component of size 2:
Drawing component of size 2:
Drawing component of size 2:
Drawing component of size 2:
Drawing component of size 2:
Drawing component of size 2:
Drawing component of size 2:
Drawing component of size 2:
Drawing component of size 2:
Drawing component of size 2:
Drawing component of size 2:
Drawing component of size 2:
Drawing component of size 2:
Drawing component of size 2:
Drawing component of size 2:
Drawing component of size 2:
Drawing component of size 2:
Drawing component of size 2:
Drawing component of size 2:
Drawing component of size 2:
Drawing component of size 2:
Drawing component of size 2:
Drawing component of size 2:
Drawing component of size 2:
Drawing component of size 2:
Drawing component of size 2:
Drawing component of size 2:
Drawing component of size 2:
Drawing component of size 2:
Drawing component of size 2:
Drawing component of size 2:

Undirected graph components


In [14]:
# list of connected components by size (undirected graph)
connected_components = [len(c) for c in sorted(nx.connected_components(ugraph), key=len, reverse=True)]

# generate connected components as subgraphs (undirected graph)
subgraphs = list(nx.connected_component_subgraphs(ugraph))

# greatest component (undirected MultiGraph), picked from the subgraphs built
# above instead of generating every component subgraph a second time.
# u_Gc is therefore the same object as one entry of `subgraphs`.
u_Gc = max(subgraphs, key=len)
u_Gc.name = "undirected Gc"

In [15]:
# Single-argument print calls produce the same text on Python 2 and 3; the
# two spaces after "=" reproduce the separator of the original statement.
print("connected components =  %s" % (connected_components,))
print(nx.info(u_Gc))


connected components =  [1140, 7, 4, 4, 4, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2]
Name: undirected Gc
Type: MultiGraph
Number of nodes: 1140
Number of edges: 1783
Average degree:   3.1281

Directed graph components


In [16]:
# use directed dgraph
components = sorted(nx.weakly_connected_components(dgraph), key = len, reverse = True)
cc = [len(c) for c in components]

# generate connected components as subgraphs
subgraphs = list(nx.weakly_connected_component_subgraphs(dgraph))

# greatest component, picked from the subgraphs built above instead of
# generating every component subgraph a second time.
# d_Gc is therefore the same object as one entry of `subgraphs`.
d_Gc = max(subgraphs, key=len)
d_Gc.name = "directed Gc"

In [17]:
# Single-argument print calls produce the same text on Python 2 and 3; the
# two spaces after "=" reproduce the separator of the original statement.
print("connected components =  %s" % (cc,))
print(nx.info(d_Gc))


connected components =  [1140, 7, 4, 4, 4, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2]
Name: directed Gc
Type: MultiDiGraph
Number of nodes: 1140
Number of edges: 1826
Average in degree:   1.6018
Average out degree:   1.6018

5. Greatest component graph


In [18]:
# finally, greatest components for undirected and directed graphs
# (call-form print so the cell runs on Python 2 and 3)
print(nx.info(u_Gc))
print(nx.info(d_Gc))


Name: undirected Gc
Type: MultiGraph
Number of nodes: 1140
Number of edges: 1783
Average degree:   3.1281
Name: directed Gc
Type: MultiDiGraph
Number of nodes: 1140
Number of edges: 1826
Average in degree:   1.6018
Average out degree:   1.6018

In [19]:
# save Gc
#nx.write_gml(u_Gc, "../output/network/u_Gc_negative2.gml")
#nx.write_gml(d_Gc, "../output/network/d_Gc_negative2.gml")

6. network stats for DIRECTED GC


In [20]:
# load directed Gc
# NOTE(review): the nx.write_gml call for this path in the "save Gc" cell is
# commented out, so the file is expected to exist on disk already.
Gc_files = glob('../output/network/d_Gc_negative2.gml')

# column order of the network statistics table ('avg deg' is disabled)
network_data_columns = [
    'name', 'sentiment',
    '# nodes', '# edges',
    'density', 'deg assort coef',
    'avg deg cent', 'avg bet cent', 'avg clo cent',
    'high deg cent', 'high bet cent', 'high clo cent',
    'avg node conn', '# conn comp', 'gc size',
]
network_data = pd.DataFrame(columns=network_data_columns)

In [21]:
# Gc_files
# Rows are collected in a list and turned into a DataFrame once, after the
# loop, so re-running this cell does not add duplicate rows.
network_records = []
for graph_num, gml_graph in enumerate(Gc_files):
    graph = nx.read_gml(gml_graph)
    (filepath, filename) = os.path.split(gml_graph)
    print('-' * 10)
    print(gml_graph)
    calculate_graph_inf(graph)

    # calculate variables
    sent = "neg"
    nodes = nx.number_of_nodes(graph)
    edges = nx.number_of_edges(graph)
    density = float("{0:.4f}".format(nx.density(graph)))

    # Each centrality is computed once and reused for both its mean and its
    # top-ranked node.
    deg_cen = nx.degree_centrality(graph)
    bet_cen = nx.betweenness_centrality(graph)
    clo_cen = nx.closeness_centrality(graph)
    # list(...) so the mean also works when dict.values() is a view
    avg_deg_cen = np.mean(list(deg_cen.values()))
    avg_bet_cen = np.mean(list(bet_cen.values()))
    avg_clo_cen = np.mean(list(clo_cen.values()))
    #avg_deg = float("{0:.4f}".format(in_deg + out_deg))
    avg_node_con = float("{0:.4f}".format((nx.average_node_connectivity(graph))))
    deg_assort_coeff = float("{0:.4f}".format((nx.degree_assortativity_coefficient(graph))))
    conn_comp = nx.number_weakly_connected_components(graph)
    highest_deg_cen = highest_centrality(deg_cen)
    highest_bet_cen = highest_centrality(bet_cen)
    highest_clo_cen = highest_centrality(clo_cen)
    # size of the largest weakly connected component; the node collections are
    # enough for a length, so no subgraphs are built
    Gc = len(max(nx.weakly_connected_components(graph), key=len))

    # save variables into list
    graph_values = {'name':filename,
                    'sentiment':sent,
                    '# nodes':nodes,
                    '# edges':edges,
                    #'avg deg':avg_deg,
                    'density':density,
                    'deg assort coef':deg_assort_coeff,
                    'avg deg cent':"%.4f" % avg_deg_cen,
                    'avg bet cent':"%.4f" % avg_bet_cen,
                    'avg clo cent':"%.4f" % avg_clo_cen,
                    'high deg cent':highest_deg_cen,
                    'high bet cent':highest_bet_cen,
                    'high clo cent':highest_clo_cen,
                    'avg node conn':avg_node_con,
                    '# conn comp':conn_comp,
                    'gc size':Gc
                    }
    network_records.append(graph_values)

# one row per graph file, columns in the order of network_data_columns
network_data = pd.DataFrame(network_records, columns = network_data_columns)


----------
../output/network/d_Gc_negative2.gml
Name: d_Gc_negative2.gml
Type: MultiDiGraph
Number of nodes: 1140
Number of edges: 1826
Average in degree:   1.6018
Average out degree:   1.6018

In [22]:
# print network data for greatest component
network_data


Out[22]:
name sentiment # nodes # edges density deg assort coef avg deg cent avg bet cent avg clo cent high deg cent high bet cent high clo cent avg node conn # conn comp gc size
0 d_Gc_negative2.gml neg 1140.0 1826.0 0.0014 -0.0122 0.0028 0.0007 0.0399 (vaccines, 0.117647058824) (vaccines, 0.0838129050492) (vaccine industry, 0.174485053966) 0.2135 1.0 1140.0

In [23]:
# save
#network_data.to_csv('../output/df/d_Gc_neg2.csv')

7. network stats for UNDIRECTED GC


In [24]:
# load UNdirected Gc
# NOTE(review): the nx.write_gml call for this path in the "save Gc" cell is
# commented out, so the file is expected to exist on disk already.
Gc_files = glob('../output/network/u_Gc_negative2.gml')

# column order of the network statistics table
# ('avg deg', '# conn comp' and 'gc size' are disabled for this table)
network_data_columns = [
    'name', 'sentiment',
    '# nodes', '# edges',
    'density', 'deg assort coef',
    'avg deg cent', 'avg bet cent', 'avg clo cent',
    'high deg cent', 'high bet cent', 'high clo cent',
    'avg node conn',
]
network_data = pd.DataFrame(columns=network_data_columns)

In [25]:
# Gc_files
# Rows are collected in a list and turned into a DataFrame once, after the
# loop, so re-running this cell does not add duplicate rows.
network_records = []
for graph_num, gml_graph in enumerate(Gc_files):
    graph = nx.read_gml(gml_graph)
    (filepath, filename) = os.path.split(gml_graph)
    print('-' * 10)
    print(gml_graph)
    calculate_graph_inf(graph)

    # calculate variables
    sent = "neg"
    nodes = nx.number_of_nodes(graph)
    edges = nx.number_of_edges(graph)
    density = float("{0:.4f}".format(nx.density(graph)))

    # Each centrality is computed once and reused for both its mean and its
    # top-ranked node.
    deg_cen = nx.degree_centrality(graph)
    bet_cen = nx.betweenness_centrality(graph)
    clo_cen = nx.closeness_centrality(graph)
    # list(...) so the mean also works when dict.values() is a view
    avg_deg_cen = np.mean(list(deg_cen.values()))
    avg_bet_cen = np.mean(list(bet_cen.values()))
    avg_clo_cen = np.mean(list(clo_cen.values()))
    #avg_deg = float("{0:.4f}".format(in_deg + out_deg))
    avg_node_con = float("{0:.4f}".format((nx.average_node_connectivity(graph))))
    deg_assort_coeff = float("{0:.4f}".format((nx.degree_assortativity_coefficient(graph))))
    #conn_comp = nx.number_weakly_connected_components(graph)
    highest_deg_cen = highest_centrality(deg_cen)
    highest_bet_cen = highest_centrality(bet_cen)
    highest_clo_cen = highest_centrality(clo_cen)
    #Gc = len(max(nx.weakly_connected_component_subgraphs(graph), key=len))

    # save variables into list
    graph_values = {'name':filename,
                    'sentiment':sent,
                    '# nodes':nodes,
                    '# edges':edges,
                    #'avg deg':avg_deg,
                    'density':density,
                    'deg assort coef':deg_assort_coeff,
                    'avg deg cent':"%.4f" % avg_deg_cen,
                    'avg bet cent':"%.4f" % avg_bet_cen,
                    'avg clo cent':"%.4f" % avg_clo_cen,
                    'high deg cent':highest_deg_cen,
                    'high bet cent':highest_bet_cen,
                    'high clo cent':highest_clo_cen,
                    'avg node conn':avg_node_con
                    #'# conn comp':conn_comp,
                    #'gc size':Gc
                    }
    network_records.append(graph_values)

# one row per graph file, columns in the order of network_data_columns
network_data = pd.DataFrame(network_records, columns = network_data_columns)


----------
../output/network/u_Gc_negative2.gml
Name: u_Gc_negative2.gml
Type: MultiGraph
Number of nodes: 1140
Number of edges: 1783
Average degree:   3.1281

In [26]:
# print network data for greatest component
network_data


Out[26]:
name sentiment # nodes # edges density deg assort coef avg deg cent avg bet cent avg clo cent high deg cent high bet cent high clo cent avg node conn
0 u_Gc_negative2.gml neg 1140.0 1783.0 0.0027 -0.0482 0.0027 0.0033 0.2161 (vaccines, 0.105355575066) (vaccines, 0.328040065826) (vaccines, 0.358176100629) 1.1835

In [27]:
# save
#network_data.to_csv('../output/df/u_Gc_neg2.csv')

Gc nodes table (directed & undirected)


In [33]:
# Select which greatest-component file feeds the node table built below.
# The undirected file is active; swap the comment marker to use the directed one.
# Note that this rebinds `gml_files`, which earlier cells used for the full graphs.
#gml_files = glob('../output/network/d_Gc_negative2.gml')
gml_files = glob('../output/network/u_Gc_negative2.gml')

In [34]:
# 2_node_df: reset the graph-label table used by the node loop below
data_columns = ['name', 'sentiment']
data = pd.DataFrame(columns=data_columns)
#combined_df = pd.DataFrame()

In [35]:
for graph_num, gml_graph in enumerate(gml_files):
    graph = nx.read_gml(gml_graph)
    (filepath, filename) = os.path.split(gml_graph)
    print('-' * 10)
    print(gml_graph)
    calculate_graph_inf(graph)

    # calculate variables and save into list
    sent = "neg"
    graph_values = {'name':filename,
                    'sentiment':sent
                    }
    # one-row frame with the graph-level labels; used for `data` and for `df`
    values = pd.DataFrame(graph_values, columns = ('name', 'sentiment'), index = [0])
    data = pd.concat([data, values], ignore_index=True)

    # node degree; dict() accepts a plain dict as well as an iterable of
    # (node, degree) pairs
    degree = dict(nx.degree(graph))
    deg_df = pd.DataFrame.from_dict(degree, orient = 'index')
    deg_df.columns = ['degree']
    # Each centrality below is computed exactly once per graph.
    # degree centrality
    deg_cent = nx.degree_centrality(graph)
    dc_df = pd.DataFrame.from_dict(deg_cent, orient = 'index')
    dc_df.columns = ['deg cent']
    # betweenness centrality
    bet_cent = nx.betweenness_centrality(graph)
    bc_df = pd.DataFrame.from_dict(bet_cent, orient = 'index')
    bc_df.columns = ['bet cent']
    # closeness centrality
    clo_cent = nx.closeness_centrality(graph)
    cc_df = pd.DataFrame.from_dict(clo_cent, orient = 'index')
    cc_df.columns = ['clo cent']
    # concat node frames into node_df (aligned on the node index)
    frames = [deg_df, dc_df, bc_df, cc_df]
    node_df = pd.concat(frames, axis = 1)
    node_df.index.name = 'node'
    node_df = node_df.reset_index()

    # df = merges graph_values with node_df for single graph and fill NaNs:
    # `values` only has row 0, so forward-filling copies name/sentiment down
    # to every node row
    df = pd.concat([values, node_df], axis = 1)
    df = df.ffill()
    #combined_df = combined_df.append(df)


----------
../output/network/u_Gc_negative2.gml
Name: u_Gc_negative2.gml
Type: MultiGraph
Number of nodes: 1140
Number of edges: 1783
Average degree:   3.1281

In [36]:
# print negative gc nodes
df


Out[36]:
name sentiment node degree deg cent bet cent clo cent
0 u_Gc_negative2.gml neg ACIP 1 0.000878 0.000000 0.164691
1 u_Gc_negative2.gml neg ACIP's rotavirus use recommendation 1 0.000878 0.000000 0.206753
2 u_Gc_negative2.gml neg ADHD 1 0.000878 0.000000 0.252382
3 u_Gc_negative2.gml neg AIDS 1 0.000878 0.000000 0.195604
4 u_Gc_negative2.gml neg African American males 2 0.001756 0.000000 0.245845
5 u_Gc_negative2.gml neg African women 3 0.002634 0.002625 0.265997
6 u_Gc_negative2.gml neg African-American children 1 0.000878 0.000000 0.227937
7 u_Gc_negative2.gml neg Alysia Osoff 6 0.005268 0.006272 0.210380
8 u_Gc_negative2.gml neg America 2 0.001756 0.003509 0.183591
9 u_Gc_negative2.gml neg American Academy of Pediatrics 2 0.001756 0.000550 0.258277
10 u_Gc_negative2.gml neg American Nursing Association's Code of Ethics 2 0.001756 0.000978 0.224257
11 u_Gc_negative2.gml neg Americans 7 0.006146 0.005738 0.277264
12 u_Gc_negative2.gml neg Amish 2 0.001756 0.001756 0.168941
13 u_Gc_negative2.gml neg Andrew Wakefield 1 0.000878 0.000000 0.232496
14 u_Gc_negative2.gml neg Apartheid 1 0.000878 0.000000 0.175690
15 u_Gc_negative2.gml neg Attkisson's website 2 0.001756 0.000877 0.183858
16 u_Gc_negative2.gml neg Australia 1 0.000878 0.000000 0.217201
17 u_Gc_negative2.gml neg Baby Boom 2 0.001756 0.000000 0.242340
18 u_Gc_negative2.gml neg Baby Boomers 5 0.004390 0.008896 0.253901
19 u_Gc_negative2.gml neg Baker College nursing school 3 0.002634 0.003213 0.190087
20 u_Gc_negative2.gml neg Baker College nursing school instructors 6 0.005268 0.004837 0.226802
21 u_Gc_negative2.gml neg Bell's Palsy 1 0.000878 0.000000 0.217201
22 u_Gc_negative2.gml neg Big Pharma 12 0.010536 0.014232 0.288354
23 u_Gc_negative2.gml neg Big Tobacco 2 0.001756 0.000241 0.248907
24 u_Gc_negative2.gml neg Bill of Rights 1 0.000878 0.000000 0.175690
25 u_Gc_negative2.gml neg Brian Hooker 1 0.000878 0.000000 0.209260
26 u_Gc_negative2.gml neg Bruesewitz v. Wyeth 2 0.001756 0.003509 0.246964
27 u_Gc_negative2.gml neg Bush Administration 2 0.001756 0.000063 0.225992
28 u_Gc_negative2.gml neg CDC 60 0.052678 0.127408 0.319674
29 u_Gc_negative2.gml neg CDC and Big Pharma 3 0.002634 0.000116 0.223202
... ... ... ... ... ... ... ...
1110 u_Gc_negative2.gml neg vaccines spread disease 1 0.000878 0.000000 0.219123
1111 u_Gc_negative2.gml neg vaccinia virus-naive subjects 2 0.001756 0.001756 0.155537
1112 u_Gc_negative2.gml neg value 1 0.000878 0.000000 0.171407
1113 u_Gc_negative2.gml neg variant genotypes 1 0.000878 0.000000 0.202850
1114 u_Gc_negative2.gml neg victims 4 0.003512 0.002634 0.257111
1115 u_Gc_negative2.gml neg violation of Hippocratic Oath 1 0.000878 0.000000 0.228991
1116 u_Gc_negative2.gml neg violation of basic human rights 2 0.001756 0.006152 0.231740
1117 u_Gc_negative2.gml neg violation of law 1 0.000878 0.000000 0.175690
1118 u_Gc_negative2.gml neg vitamin A supplements 2 0.001756 0.001756 0.189549
1119 u_Gc_negative2.gml neg vitamin D 1 0.000878 0.000000 0.213416
1120 u_Gc_negative2.gml neg vitamin supplements 1 0.000878 0.000000 0.137312
1121 u_Gc_negative2.gml neg vulnerable 1 0.000878 0.000000 0.246750
1122 u_Gc_negative2.gml neg wander 1 0.000878 0.000000 0.145225
1123 u_Gc_negative2.gml neg wander off 1 0.000878 0.000000 0.145225
1124 u_Gc_negative2.gml neg water 1 0.000878 0.000000 0.164643
1125 u_Gc_negative2.gml neg whistle 1 0.000878 0.000000 0.208761
1126 u_Gc_negative2.gml neg whistleblower 3 0.002634 0.000713 0.206753
1127 u_Gc_negative2.gml neg whistleblowers 1 0.000878 0.000000 0.208761
1128 u_Gc_negative2.gml neg whooping cough 1 0.000878 0.000000 0.234942
1129 u_Gc_negative2.gml neg whooping cough outbreaks 1 0.000878 0.000000 0.186752
1130 u_Gc_negative2.gml neg widespread fear 2 0.001756 0.003955 0.264269
1131 u_Gc_negative2.gml neg widespread health problems 2 0.001756 0.000000 0.281443
1132 u_Gc_negative2.gml neg words 2 0.001756 0.000000 0.275387
1133 u_Gc_negative2.gml neg world 1 0.000878 0.000000 0.183858
1134 u_Gc_negative2.gml neg world's healthiest children 1 0.000878 0.000000 0.216416
1135 u_Gc_negative2.gml neg wrong doing 3 0.002634 0.002058 0.234121
1136 u_Gc_negative2.gml neg years 1 0.000878 0.000000 0.201237
1137 u_Gc_negative2.gml neg you 10 0.008780 0.006938 0.259217
1138 u_Gc_negative2.gml neg young adults 1 0.000878 0.000000 0.201700
1139 u_Gc_negative2.gml neg young doctors 7 0.006146 0.006207 0.209722

1140 rows × 7 columns


In [37]:
# save
#df.to_csv('../output/df/d_Gc_nodes_neg2.csv')
#df.to_csv('../output/df/u_Gc_nodes_neg2.csv')

full network node centrality (directed & undirected)


In [38]:
# make sure you're using the right graph
# (single-argument print calls give the same text on Python 2 and 3; the two
# spaces after "=" reproduce the separator of the original statements)
print("gml_files =  %s" % (gml_files,))
print("gml_graph =  %s" % (gml_graph,))


gml_files =  ['../output/network/u_Gc_negative2.gml']
gml_graph =  ../output/network/u_Gc_negative2.gml

In [39]:
# FULL DIRECTED
#graph = nx.read_gml('../output/network/article_neg1.gml')

# FULL UNDIRECTED
graph = nx.read_gml('../output/network/article_u_neg.gml')

# NOTE(review): the recorded output reports "Name: article_neg1.gml" although
# article_u_neg.gml is loaded; presumably the name is stored in the GML file.
# Check the "Type:" line to confirm which graph is in use.
print(nx.info(graph))


Name: article_neg1.gml
Type: MultiGraph
Number of nodes: 1257
Number of edges: 1854
Average degree:   2.9499

In [40]:
def _sorted_centrality_frame(scores, column):
    """Turn a node -> score mapping into a one-column frame sorted ascending."""
    frame = pd.DataFrame.from_dict(scores, orient = 'index')
    frame.columns = [column]
    return frame.sort_values(by = [column])

# degree centrality
dc = nx.degree_centrality(graph)
dc_df = _sorted_centrality_frame(dc, 'degree cent')

# betweenness centrality
bc = nx.betweenness_centrality(graph)
bc_df = _sorted_centrality_frame(bc, 'betweenness cent')

# closeness centrality
cc = nx.closeness_centrality(graph)
cc_df = _sorted_centrality_frame(cc, 'closeness cent')

In [41]:
dc_df


Out[41]:
degree cent
state 0.000796
omitting deaths 0.000796
gene products 0.000796
humanity 0.000796
nobody 0.000796
lupus 0.000796
variant genotypes 0.000796
demand for justice 0.000796
corporate-funded make-believe science tabloids 0.000796
labor and delivery floor 0.000796
pediatrics instructor 0.000796
skin reactions 0.000796
obedient 0.000796
human muscle tissue 0.000796
rational approach 0.000796
mental illness 0.000796
target 0.000796
anaphylactic shock 0.000796
solid marks 0.000796
government healthcare reform 0.000796
Italian descent 0.000796
secret gathering 0.000796
swelling of the brain 0.000796
teratogen 0.000796
gene variation 0.000796
shocking revelations 0.000796
Vitamin B12 0.000796
AIDS 0.000796
medical fascism propaganda 0.000796
surveys 0.000796
... ...
National Vaccine Injury Compensation Program 0.013535
scientific fraud 0.013535
Nichole Rolfe 0.014331
vaccine-autism link 0.015127
parents 0.015127
hepatitis B vaccine 0.015127
SV40 0.015127
adverse effects 0.015924
people 0.016720
Merck 0.016720
vaccine ingredients 0.016720
pandemic H1N1 swine flu vaccine 0.016720
measles mortality 0.016720
informed consent 0.016720
measles 0.017516
United States 0.018312
SB 277 0.020701
vaccination 0.021497
mandatory vaccines 0.023089
pharmaceutical companies 0.023885
flu shots 0.023885
mercury 0.030255
mainstream media 0.031847
doctors 0.031847
autism 0.035032
vaccine industry 0.046975
CDC 0.047771
thimerosal 0.053344
children 0.056529
vaccines 0.095541

1257 rows × 1 columns


In [42]:
bc_df


Out[42]:
betweenness cent
marketing vaccines to children 0.000000
great records 0.000000
compensation 0.000000
self-insurance policy 0.000000
First do no harm 0.000000
herd immunity 0.000000
Pandemrix-narcolepsy link 0.000000
troubling 0.000000
eye pain 0.000000
U.S. public 0.000000
public hygiene improvements 0.000000
medical police state 0.000000
injury claims 0.000000
harm's way 0.000000
medical professionals 0.000000
flu shot toxins 0.000000
coma 0.000000
article 0.000000
neonatal infection 0.000000
normal behavior 0.000000
death from congenital malformation 0.000000
risk of cancer 0.000000
Herpes Zoster 0.000000
New Jersey law 0.000000
vaccine laws 0.000000
informal 0.000000
ignorance of scientific facts 0.000000
sell vaccines to minors 0.000000
Flinstones 0.000000
pharmaceutical profits 0.000000
... ...
parents 0.022253
vaccine-autism link 0.022450
Nichole Rolfe 0.022723
Gardasil 0.022748
National Vaccine Injury Compensation Program 0.024565
science 0.024960
drug companies 0.025317
Merck 0.025324
lobbying 0.026660
SB 277 0.026841
adverse effects 0.029113
vaccine safety 0.032844
vaccination 0.036054
measles 0.036488
United States 0.036930
vaccine ingredients 0.037273
people 0.038971
informed consent 0.039844
pharmaceutical companies 0.045431
flu shots 0.047372
mandatory vaccines 0.047960
mainstream media 0.051301
mercury 0.051695
doctors 0.070929
thimerosal 0.071427
autism 0.084571
vaccine industry 0.099709
CDC 0.104768
children 0.155314
vaccines 0.269749

1257 rows × 1 columns


In [43]:
# Show the closeness-centrality table (sorted ascending by 'closeness cent');
# the highest-scoring nodes appear at the bottom of the output.
cc_df


Out[43]:
closeness cent
short lived 0.000796
false consent 0.000796
parental right 0.000796
human right 0.000796
recent increase of autism 0.000796
War on Poverty 0.000796
California's brain-damaged lawmakers 0.000796
unvaccinated health care workers 0.000796
medical treatment 0.000796
cancer cells 0.000796
fabrication of vaccine successes 0.000796
widespread 0.000796
vitamin D deficiency 0.000796
children with severe autism 0.000796
vaccine abuse 0.000796
health care sector 0.000796
governor of California 0.000796
fascist 0.000796
bacillus thuringiensis bacteria 0.000796
Saddam's imaginary WMDs 0.000796
mutated genes 0.000796
too much of a jump 0.000796
uninformed consent 0.000796
genetically weakened children 0.000796
medical fascism propaganda 0.000796
dissenters 0.000796
Iraq war 0.000796
pertussis vaccine booster 0.000796
federal government representatives 0.000796
article 0.000796
... ...
aluminum 0.256048
clinical trials 0.257324
chronic disease 0.257388
vaccine efficacy 0.257774
informed consent 0.258031
quackery 0.258096
vaccination 0.258677
public health 0.258742
vaccine damage 0.261097
Big Pharma 0.261493
vaccine-injured children 0.261692
pharmaceutical companies 0.261958
vaccines are safe 0.262490
intelligent questions 0.263427
vaccine safety 0.266005
vaccine-autism link 0.267660
toxic chemical ingredients 0.268285
mandatory vaccines 0.269265
vaccine ingredients 0.271102
doctors 0.274634
flu shots 0.275440
mainstream media 0.278409
SB 277 0.278560
CDC 0.289896
thimerosal 0.291039
mercury 0.294273
autism 0.294609
vaccine industry 0.296981
children 0.306044
vaccines 0.324811

1257 rows × 1 columns

Gc node centrality (directed & undirected)


In [44]:
# Load the Gc graph of the negative network for the centrality analysis.
# The undirected version is read here; to analyse the directed version,
# read '../output/network/d_Gc_negative2.gml' instead.
graph = nx.read_gml('../output/network/u_Gc_negative2.gml')

# Summary: name, type, node/edge counts, average degree.
print(nx.info(graph))


Name: undirected Gc
Type: MultiGraph
Number of nodes: 1140
Number of edges: 1783
Average degree:   3.1281

In [45]:
def _sorted_centrality_frame(scores, column):
    """Return a one-column DataFrame of centrality scores, sorted ascending.

    scores -- dict of centrality values keyed by node (the keys become
              the index of the frame)
    column -- label given to the single value column
    """
    frame = pd.DataFrame.from_dict(scores, orient='index')
    frame.columns = [column]
    return frame.sort_values(by=[column])


# degree centrality
dc = nx.degree_centrality(graph)
dc_df = _sorted_centrality_frame(dc, 'degree cent')

# betweenness centrality
bc = nx.betweenness_centrality(graph)
bc_df = _sorted_centrality_frame(bc, 'betweenness cent')

# closeness centrality
cc = nx.closeness_centrality(graph)
cc_df = _sorted_centrality_frame(cc, 'closeness cent')

In [46]:
# Degree centrality on the Gc graph, sorted ascending by 'degree cent':
# the highest-scoring nodes appear at the bottom of the output.
dc_df


Out[46]:
degree cent
myelin 0.000878
skin reactions 0.000878
obedient 0.000878
diabetes 0.000878
immunization schedule 0.000878
lavish gifts 0.000878
rational approach 0.000878
medical associations 0.000878
top health officials 0.000878
swelling of the brain 0.000878
rest of the world 0.000878
gene products 0.000878
nobody 0.000878
lupus 0.000878
cellular degeneration 0.000878
toxic heavy metal 0.000878
labor and delivery floor 0.000878
Italian descent 0.000878
human carcinogen 0.000878
big profit centers 0.000878
corporate-funded make-believe science tabloids 0.000878
sick 0.000878
compensation claims 0.000878
vaccine messages 0.000878
environmental issues 0.000878
real 0.000878
humanity 0.000878
gene variation 0.000878
pediatrics instructor 0.000878
shocking revelations 0.000878
... ...
vaccine safety 0.014925
scientific fraud 0.014925
Nichole Rolfe 0.015803
vaccine-autism link 0.016681
hepatitis B vaccine 0.016681
SV40 0.016681
parents 0.016681
adverse effects 0.017559
vaccine ingredients 0.018437
people 0.018437
informed consent 0.018437
pandemic H1N1 swine flu vaccine 0.018437
measles mortality 0.018437
Merck 0.018437
measles 0.019315
United States 0.020193
SB 277 0.022827
vaccination 0.023705
mandatory vaccines 0.025461
flu shots 0.026339
pharmaceutical companies 0.026339
mercury 0.033363
doctors 0.035119
mainstream media 0.035119
autism 0.038630
vaccine industry 0.051800
CDC 0.052678
thimerosal 0.058824
children 0.062335
vaccines 0.105356

1140 rows × 1 columns


In [47]:
# Betweenness centrality on the Gc graph, sorted ascending by
# 'betweenness cent': the highest-scoring nodes appear at the bottom.
bc_df


Out[47]:
betweenness cent
lifelong immunity 0.000000
labor and delivery floor 0.000000
Civil War 0.000000
monster under the bed 0.000000
mindless 0.000000
good bacteria 0.000000
VAERS hepatitis B reports 0.000000
target 0.000000
global population 0.000000
Paul Offit 0.000000
car 0.000000
government healthcare reform 0.000000
lives of children 0.000000
outrageous 0.000000
learning disabilities 0.000000
compensation 0.000000
guilt 0.000000
iron 0.000000
sanity 0.000000
acute respiratory illness 0.000000
genetically-modified human albumin 0.000000
lupus 0.000000
single dose vaccines 0.000000
unexposed 0.000000
big profit centers 0.000000
top health officials 0.000000
environmental chemicals 0.000000
multi-dose vaccine 0.000000
death from congenital disease 0.000000
immunization schedule 0.000000
... ...
parents 0.027062
vaccine-autism link 0.027301
Nichole Rolfe 0.027634
Gardasil 0.027664
National Vaccine Injury Compensation Program 0.029873
science 0.030354
drug companies 0.030788
Merck 0.030797
lobbying 0.032421
SB 277 0.032641
adverse effects 0.035404
vaccine safety 0.039941
vaccination 0.043845
measles 0.044373
United States 0.044910
vaccine ingredients 0.045328
people 0.047392
informed consent 0.048455
pharmaceutical companies 0.055248
flu shots 0.057609
mandatory vaccines 0.058324
mainstream media 0.062387
mercury 0.062866
doctors 0.086256
thimerosal 0.086862
autism 0.102846
vaccine industry 0.121255
CDC 0.127408
children 0.188876
vaccines 0.328040

1140 rows × 1 columns


In [48]:
# Closeness centrality on the Gc graph, sorted ascending by
# 'closeness cent': the highest-scoring nodes appear at the bottom.
cc_df


Out[48]:
closeness cent
respiratory system 0.107850
digestive tract 0.107850
injecting any substance 0.120874
lavish gifts 0.121831
extensive medication 0.126471
endangered children 0.128934
stay awake during the school day 0.132611
revisionist stance 0.133497
radicalism 0.134348
truth seekers 0.134522
pock formation 0.134618
flu shot campaign 0.134777
travel 0.134777
higher education 0.134777
serious problems 0.136407
drugs 0.136407
hospital 0.136407
shocking dangers 0.137047
vitamin supplements 0.137312
folate 0.137312
toxic 0.137411
Eli Lily drug company reps 0.138716
pig skin 0.143704
pig bones 0.143704
pre-emptive strike 0.144068
national security 0.144068
to vaccinate children 0.144543
incurable condition 0.144727
lifelong condition 0.144727
peer bullying 0.145225
... ...
aluminum 0.282350
clinical trials 0.283757
chronic disease 0.283828
vaccine efficacy 0.284253
informed consent 0.284537
quackery 0.284608
vaccination 0.285249
public health 0.285321
vaccine damage 0.287917
Big Pharma 0.288354
vaccine-injured children 0.288574
pharmaceutical companies 0.288866
vaccines are safe 0.289454
intelligent questions 0.290487
vaccine safety 0.293330
vaccine-autism link 0.295154
toxic chemical ingredients 0.295844
mandatory vaccines 0.296924
vaccine ingredients 0.298950
doctors 0.302845
flu shots 0.303733
mainstream media 0.307008
SB 277 0.307174
CDC 0.319674
thimerosal 0.320935
mercury 0.324501
autism 0.324872
vaccine industry 0.327487
children 0.337481
vaccines 0.358176

1140 rows × 1 columns


Cutsets (directed & undirected)


In [49]:
# Load the Gc graph used for the cutset analysis.
# The undirected version is read here; to analyse the directed version,
# read '../output/network/d_Gc_positive2.gml' instead.
# NOTE(review): this reads the *positive* Gc, although the notebook is titled
# "negative graph" and the centrality section loads u_Gc_negative2.gml --
# confirm that the positive network is really the intended input here.
graph = nx.read_gml('../output/network/u_Gc_positive2.gml')

# Summary: name, type, node/edge counts, average degree.
print(nx.info(graph))


Name: undirected Gc
Type: MultiGraph
Number of nodes: 585
Number of edges: 1042
Average degree:   3.5624

In [50]:
# Number of nodes in the loaded graph.
print("Greatest component size = %d" % len(graph))


Greatest component size = 585

In [51]:
# Enumerating *all* minimum node cutsets (every node set whose size equals the
# node connectivity of G and whose removal splits G into two or more connected
# components) is left disabled; it requires an undirected graph:
#cutsets = list(nx.all_node_cuts(graph))

print("Greatest component size = %d" % len(graph))
#print "# of cutsets =", len(cutsets)

# One minimum-cardinality set of nodes / edges whose removal disconnects G.
min_ncut = nx.minimum_node_cut(graph)
min_ecut = nx.minimum_edge_cut(graph)

print("Min node cut = %s" % (min_ncut,))
print("Min edge cut = %s" % (min_ecut,))

# Minimum cuts separating a specific source node from a specific target node.
cut_endpoints = {'s': 'vaccines', 't': 'autism'}
print(nx.minimum_node_cut(graph, **cut_endpoints))
print(nx.minimum_edge_cut(graph, **cut_endpoints))


Greatest component size = 585
Min node cut = set([u'vaccine message'])
Min edge cut = set([(u'anti-vaccination', u'time')])
set([u'protective effect of vaccines', u'families', u'vaccinated children', u'MMR vaccine', u'autism risk', u'anti-vaccination', u'parents', u'genetic predisposition', u'scientists', u'children at higher risk for autism', u'children', u'Jain study'])
set([(u'vaccinated high-risk children', u'autism'), (u'genetic predisposition', u'autism'), (u'children', u'autism'), (u'MMR vaccine', u'autism'), (u'vaccines', u'autism'), (u'scientists', u'autism'), (u'anti-vaccination', u'autism'), (u'children with autistic sibling', u'autism'), (u'parents', u'autism'), (u'vaccinated children and unvaccinated children', u'autism'), (u'families', u'autism'), (u'harmful association', u'autism'), (u'protective effect of vaccines', u'autism'), (u'vaccinated children', u'autism'), (u'children at higher risk for autism', u'autism')])