u_neg stats


In [1]:
# 1_network_df

import networkx as nx
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import os
from glob import glob

plt.style.use('ggplot')
pd.set_option('display.width', 5000) 
pd.set_option('display.max_columns', 60)

In [2]:
def calculate_graph_inf(graph):
    graph.name = filename
    info = nx.info(graph)
    print info

def highest_centrality(cent_dict):
    """Returns a tuple (node,value) with the node
    with largest value from centrality dictionary."""
    # create ordered tuple of centrality data
    cent_items = [(b,a) for (a,b) in cent_dict.iteritems()]
    # sort in descending order
    cent_items.sort()
    cent_items.reverse()
    return tuple(reversed(cent_items[0]))

Calculate network statistics


In [3]:
# load undirected
gml_files = glob('../output/network/u_neg.gml')

network_data_columns = ['name',
                    'sentiment',
                    '# nodes',
                    '# edges',
                    #'avg deg',
                    'density',
                    'deg assort coef', 
                    'avg deg cent',
                    'avg bet cent',
                    'avg clo cent',
                    'high deg cent',
                    'high bet cent',
                    'high clo cent',
                    'avg node conn',
                    '# conn comp',
                    'gc size'
                    ]
network_data = pd.DataFrame(columns = network_data_columns)

In [4]:
for graph_num, gml_graph in enumerate(gml_files):
    graph = nx.read_gml(gml_graph)
    (filepath, filename) = os.path.split(gml_graph)
    print('-' * 10)
    print(gml_graph)
    calculate_graph_inf(graph)
    
    # calculate variables
    sent = "negative"
    nodes = nx.number_of_nodes(graph)
    edges = nx.number_of_edges(graph)
    density = float("{0:.4f}".format(nx.density(graph)))
    avg_deg_cen = np.array(nx.degree_centrality(graph).values()).mean()
    avg_bet_cen = np.array(nx.betweenness_centrality(graph).values()).mean()
    avg_clo_cen = np.array(nx.closeness_centrality(graph).values()).mean()
    #avg_deg = float("{0:.4f}".format(in_deg + out_deg))
    avg_node_con = float("{0:.4f}".format((nx.average_node_connectivity(graph))))
    deg_assort_coeff = float("{0:.4f}".format((nx.degree_assortativity_coefficient(graph))))
    #conn_comp = nx.number_weakly_connected_components(graph) # not for undirected
    deg_cen = nx.degree_centrality(graph)
    bet_cen = nx.betweenness_centrality(graph)
    clo_cen = nx.closeness_centrality(graph)
    highest_deg_cen = highest_centrality(deg_cen)
    highest_bet_cen = highest_centrality(bet_cen)
    highest_clo_cen = highest_centrality(clo_cen)
    #Gc = len(max(nx.weakly_connected_component_subgraphs(graph), key=len))

    # save variables into list
    graph_values = {'name':filename,
                    'sentiment':sent,
                    '# nodes':nodes,
                    '# edges':edges,
                    #'avg deg':avg_deg,
                    'density':density,
                    'deg assort coef':deg_assort_coeff,
                    'avg deg cent':"%.4f" % avg_deg_cen,
                    'avg bet cent':"%.4f" % avg_bet_cen,
                    'avg clo cent':"%.4f" % avg_clo_cen,
                    'high deg cent':highest_deg_cen,
                    'high bet cent':highest_bet_cen,
                    'high clo cent':highest_clo_cen,
                    'avg node conn':avg_node_con
                    #'# conn comp':conn_comp,
                    #'gc size':Gc
                    }
    network_data = network_data.append(graph_values, ignore_index=True)


----------
../output/network/u_neg.gml
Name: u_neg.gml
Type: MultiGraph
Number of nodes: 1257
Number of edges: 1898
Average degree:   3.0199

In [5]:
# print network data for greatest component
network_data


Out[5]:
name sentiment # nodes # edges density deg assort coef avg deg cent avg bet cent avg clo cent high deg cent high bet cent high clo cent avg node conn # conn comp gc size
0 u_neg.gml negative 1257.0 1898.0 0.0024 -0.0064 0.0024 0.0025 0.1778 (vaccines, 0.106687898089) (vaccines, 0.269748793744) (vaccines, 0.324810970236) 0.9735 NaN NaN

In [6]:
# save
#network_data.to_csv('../output/df/u_neg.csv')

all nodes table


In [7]:
# 2_node_df: list all nodes and centrality
data_columns = ['name',
                'sentiment'
                ]
data = pd.DataFrame(columns = data_columns)
combined_df = pd.DataFrame()

In [8]:
for graph_num, gml_graph in enumerate(gml_files):
    graph = nx.read_gml(gml_graph)
    (filepath, filename) = os.path.split(gml_graph)
    print('-' * 10)
    print(gml_graph)
    calculate_graph_inf(graph)

    ## calculate variables and save into list
    sent = "negative"    
    deg_cent = nx.degree_centrality(graph)
    bet_cent = nx.betweenness_centrality(graph)
    clo_cent = nx.closeness_centrality(graph)
    graph_values = {'name':filename,
                    'sentiment':sent
                    }
    data = data.append(graph_values, ignore_index=True)

    degree = nx.degree(graph)
    deg_df = pd.DataFrame.from_dict(degree, orient = 'index')
    deg_df.columns = ['degree']
    # degree centrality
    deg_cent = nx.degree_centrality(graph)
    dc_df = pd.DataFrame.from_dict(deg_cent, orient = 'index')
    dc_df.columns = ['deg cent']
    # betweenness centrality
    bet_cent = nx.betweenness_centrality(graph)
    bc_df = pd.DataFrame.from_dict(bet_cent, orient = 'index')
    bc_df.columns = ['bet cent']
    # closeness centrality
    clo_cent = nx.closeness_centrality(graph)
    cc_df = pd.DataFrame.from_dict(clo_cent, orient = 'index')
    cc_df.columns = ['clo cent']
    # concat node frames into node_df
    frames = [deg_df, dc_df, bc_df, cc_df]
    node_df = pd.concat(frames, axis = 1)
    node_df.index.name = 'node'
    node_df = node_df.reset_index()

    values = pd.DataFrame(graph_values, columns = ('name', 'sentiment'), index = [0])
    
    # df = merges graph_values with node_df for single graph and fill NaNs
    df = pd.concat([values, node_df], axis = 1)
    df = df.fillna(method='ffill')
    combined_df = combined_df.append(df)


----------
../output/network/u_neg.gml
Name: u_neg.gml
Type: MultiGraph
Number of nodes: 1257
Number of edges: 1898
Average degree:   3.0199

In [9]:
# print entire network
combined_df


Out[9]:
name sentiment node degree deg cent bet cent clo cent
0 u_neg.gml negative ACIP 1 0.000796 0.000000e+00 0.149349
1 u_neg.gml negative ACIP's rotavirus use recommendation 1 0.000796 0.000000e+00 0.187493
2 u_neg.gml negative ADHD 1 0.000796 0.000000e+00 0.228872
3 u_neg.gml negative AIDS 1 0.000796 0.000000e+00 0.177383
4 u_neg.gml negative African American males 2 0.001592 0.000000e+00 0.222944
5 u_neg.gml negative African women 3 0.002389 2.158333e-03 0.241219
6 u_neg.gml negative African-American children 1 0.000796 0.000000e+00 0.206704
7 u_neg.gml negative Alysia Osoff 6 0.004777 5.157534e-03 0.190783
8 u_neg.gml negative America 2 0.001592 2.885274e-03 0.166489
9 u_neg.gml negative American Academy of Pediatrics 2 0.001592 4.519712e-04 0.234217
10 u_neg.gml negative American Nursing Association's Code of Ethics 2 0.001592 8.038547e-04 0.203367
11 u_neg.gml negative Americans 7 0.005573 4.718502e-03 0.251436
12 u_neg.gml negative Amish 2 0.001592 1.443906e-03 0.153204
13 u_neg.gml negative Andrew Wakefield 1 0.000796 0.000000e+00 0.210839
14 u_neg.gml negative Apartheid 1 0.000796 0.000000e+00 0.159324
15 u_neg.gml negative Attkisson's website 2 0.001592 7.213185e-04 0.166731
16 u_neg.gml negative Australia 1 0.000796 0.000000e+00 0.196968
17 u_neg.gml negative Baby Boom 2 0.001592 0.000000e+00 0.219766
18 u_neg.gml negative Baby Boomers 5 0.003981 7.314888e-03 0.230249
19 u_neg.gml negative Baker College nursing school 3 0.002389 2.641888e-03 0.172380
20 u_neg.gml negative Baker College nursing school instructors 6 0.004777 3.977791e-03 0.205675
21 u_neg.gml negative Bell's Palsy 1 0.000796 0.000000e+00 0.196968
22 u_neg.gml negative Big Pharma 13 0.010350 1.170330e-02 0.261493
23 u_neg.gml negative Big Tobacco 2 0.001592 1.984357e-04 0.225721
24 u_neg.gml negative Bill of Rights 1 0.000796 0.000000e+00 0.159324
25 u_neg.gml negative Brian Hooker 1 0.000796 0.000000e+00 0.189766
26 u_neg.gml negative Bruesewitz v. Wyeth 2 0.001592 2.885274e-03 0.223959
27 u_neg.gml negative Bush Administration 2 0.001592 5.187031e-05 0.204940
28 u_neg.gml negative CDC 62 0.049363 1.047685e-01 0.289896
29 u_neg.gml negative CDC and Big Pharma 3 0.002389 9.561462e-05 0.202410
... ... ... ... ... ... ... ...
1227 u_neg.gml negative violation of basic human rights 2 0.001592 5.059144e-03 0.210152
1228 u_neg.gml negative violation of law 1 0.000796 0.000000e+00 0.159324
1229 u_neg.gml negative viral replication 2 0.001592 6.344051e-07 0.001791
1230 u_neg.gml negative vitamin A supplements 2 0.001592 1.443906e-03 0.171892
1231 u_neg.gml negative vitamin B12 2 0.001592 6.344051e-07 0.001791
1232 u_neg.gml negative vitamin C 2 0.001592 6.344051e-07 0.001791
1233 u_neg.gml negative vitamin D 1 0.000796 0.000000e+00 0.193535
1234 u_neg.gml negative vitamin D deficiency 1 0.000796 0.000000e+00 0.000796
1235 u_neg.gml negative vitamin supplements 1 0.000796 0.000000e+00 0.124521
1236 u_neg.gml negative vulnerable 1 0.000796 0.000000e+00 0.223765
1237 u_neg.gml negative wander 1 0.000796 0.000000e+00 0.131697
1238 u_neg.gml negative wander off 1 0.000796 0.000000e+00 0.131697
1239 u_neg.gml negative war propaganda 2 0.001592 6.344051e-07 0.001791
1240 u_neg.gml negative water 1 0.000796 0.000000e+00 0.149306
1241 u_neg.gml negative whistle 1 0.000796 0.000000e+00 0.189314
1242 u_neg.gml negative whistleblower 3 0.002389 5.864966e-04 0.187493
1243 u_neg.gml negative whistleblowers 1 0.000796 0.000000e+00 0.189314
1244 u_neg.gml negative whooping cough 1 0.000796 0.000000e+00 0.213057
1245 u_neg.gml negative whooping cough outbreaks 1 0.000796 0.000000e+00 0.169355
1246 u_neg.gml negative widespread 1 0.000796 0.000000e+00 0.000796
1247 u_neg.gml negative widespread fear 2 0.001592 3.252148e-03 0.239652
1248 u_neg.gml negative widespread health problems 2 0.001592 0.000000e+00 0.255226
1249 u_neg.gml negative words 2 0.001592 0.000000e+00 0.249734
1250 u_neg.gml negative world 1 0.000796 0.000000e+00 0.166731
1251 u_neg.gml negative world's healthiest children 1 0.000796 0.000000e+00 0.196257
1252 u_neg.gml negative wrong doing 3 0.002389 1.692245e-03 0.212312
1253 u_neg.gml negative years 1 0.000796 0.000000e+00 0.182491
1254 u_neg.gml negative you 10 0.007962 5.704791e-03 0.235070
1255 u_neg.gml negative young adults 1 0.000796 0.000000e+00 0.182911
1256 u_neg.gml negative young doctors 7 0.005573 5.104453e-03 0.190186

1257 rows × 7 columns


Undirected cc


In [10]:
# list of connected components by size (undirected graph)
connected_components = [len(c) for c in sorted(nx.connected_components(graph), key=len, reverse=True)]

# generate connected components as subgraphs (undirected graph)
subgraphs = list(nx.connected_component_subgraphs(graph))

# greatest component (undirected MultiGraph)
u_Gc = max(nx.connected_component_subgraphs(graph), key=len)
u_Gc.name = "undirected Gc"

In [11]:
print "connected components = ", connected_components
print nx.info(u_Gc)


connected components =  [1140, 7, 4, 4, 4, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2]
Name: undirected Gc
Type: MultiGraph
Number of nodes: 1140
Number of edges: 1826
Average degree:   3.2035

Centrality tables


In [12]:
# make sure you're using the right graph
print gml_files
print gml_graph
print graph


['../output/network/u_neg.gml']
../output/network/u_neg.gml
u_neg.gml

In [13]:
# degree centrality
dc = nx.degree_centrality(graph)
dc_df = pd.DataFrame.from_dict(dc, orient = 'index')
dc_df.columns = ['degree cent']
dc_df = dc_df.sort_values(by = ['degree cent'])
#dc_df

# betweenness centrality
bc = nx.betweenness_centrality(graph)
bc_df = pd.DataFrame.from_dict(bc, orient = 'index')
bc_df.columns = ['betweenness cent']
bc_df = bc_df.sort_values(by = ['betweenness cent'])
#bc_df

# closeness centrality
cc = nx.closeness_centrality(graph)
cc_df = pd.DataFrame.from_dict(cc, orient = 'index')
cc_df.columns = ['closeness cent']
cc_df = cc_df.sort_values(by = ['closeness cent'])
#cc_df

In [14]:
dc_df


Out[14]:
degree cent
state 0.000796
Italian descent 0.000796
omitting deaths 0.000796
gene products 0.000796
variant genotypes 0.000796
nobody 0.000796
lupus 0.000796
demand for justice 0.000796
corporate-funded make-believe science tabloids 0.000796
pediatrics instructor 0.000796
skin reactions 0.000796
labor and delivery floor 0.000796
obedient 0.000796
rational approach 0.000796
human muscle tissue 0.000796
mental illness 0.000796
rest of the world 0.000796
target 0.000796
anaphylactic shock 0.000796
solid marks 0.000796
swelling of the brain 0.000796
government healthcare reform 0.000796
lifelong immunity 0.000796
lavish gifts 0.000796
Vitamin B12 0.000796
medical fascism propaganda 0.000796
gene variation 0.000796
shocking revelations 0.000796
AIDS 0.000796
surveys 0.000796
... ...
vaccine safety 0.013535
Nichole Rolfe 0.014331
scientific fraud 0.014331
vaccine-autism link 0.015127
SV40 0.015127
hepatitis B vaccine 0.015924
adverse effects 0.015924
parents 0.015924
Merck 0.016720
informed consent 0.016720
vaccine ingredients 0.016720
people 0.017516
pandemic H1N1 swine flu vaccine 0.017516
measles mortality 0.017516
measles 0.018312
United States 0.018312
SB 277 0.020701
vaccination 0.021497
mandatory vaccines 0.023089
pharmaceutical companies 0.024682
flu shots 0.026274
doctors 0.031847
mainstream media 0.032643
mercury 0.032643
autism 0.037420
CDC 0.049363
vaccine industry 0.051752
thimerosal 0.057325
children 0.060510
vaccines 0.106688

1257 rows × 1 columns


In [15]:
bc_df


Out[15]:
betweenness cent
marketing vaccines to children 0.000000
great records 0.000000
compensation 0.000000
self-insurance policy 0.000000
First do no harm 0.000000
herd immunity 0.000000
Pandemrix-narcolepsy link 0.000000
troubling 0.000000
eye pain 0.000000
U.S. public 0.000000
public hygiene improvements 0.000000
medical police state 0.000000
injury claims 0.000000
harm's way 0.000000
medical professionals 0.000000
flu shot toxins 0.000000
coma 0.000000
article 0.000000
neonatal infection 0.000000
normal behavior 0.000000
death from congenital malformation 0.000000
risk of cancer 0.000000
Herpes Zoster 0.000000
New Jersey law 0.000000
vaccine laws 0.000000
informal 0.000000
ignorance of scientific facts 0.000000
sell vaccines to minors 0.000000
Flinstones 0.000000
pharmaceutical profits 0.000000
... ...
parents 0.022253
vaccine-autism link 0.022450
Nichole Rolfe 0.022723
Gardasil 0.022748
National Vaccine Injury Compensation Program 0.024565
science 0.024960
drug companies 0.025317
Merck 0.025324
lobbying 0.026660
SB 277 0.026841
adverse effects 0.029113
vaccine safety 0.032844
vaccination 0.036054
measles 0.036488
United States 0.036930
vaccine ingredients 0.037273
people 0.038971
informed consent 0.039844
pharmaceutical companies 0.045431
flu shots 0.047372
mandatory vaccines 0.047960
mainstream media 0.051301
mercury 0.051695
doctors 0.070929
thimerosal 0.071427
autism 0.084571
vaccine industry 0.099709
CDC 0.104768
children 0.155314
vaccines 0.269749

1257 rows × 1 columns


In [16]:
cc_df


Out[16]:
closeness cent
short lived 0.000796
false consent 0.000796
parental right 0.000796
human right 0.000796
recent increase of autism 0.000796
War on Poverty 0.000796
California's brain-damaged lawmakers 0.000796
unvaccinated health care workers 0.000796
medical treatment 0.000796
cancer cells 0.000796
fabrication of vaccine successes 0.000796
widespread 0.000796
vitamin D deficiency 0.000796
children with severe autism 0.000796
vaccine abuse 0.000796
health care sector 0.000796
governor of California 0.000796
fascist 0.000796
bacillus thuringiensis bacteria 0.000796
Saddam's imaginary WMDs 0.000796
mutated genes 0.000796
too much of a jump 0.000796
uninformed consent 0.000796
genetically weakened children 0.000796
medical fascism propaganda 0.000796
dissenters 0.000796
Iraq war 0.000796
pertussis vaccine booster 0.000796
federal government representatives 0.000796
article 0.000796
... ...
aluminum 0.256048
clinical trials 0.257324
chronic disease 0.257388
vaccine efficacy 0.257774
informed consent 0.258031
quackery 0.258096
vaccination 0.258677
public health 0.258742
vaccine damage 0.261097
Big Pharma 0.261493
vaccine-injured children 0.261692
pharmaceutical companies 0.261958
vaccines are safe 0.262490
intelligent questions 0.263427
vaccine safety 0.266005
vaccine-autism link 0.267660
toxic chemical ingredients 0.268285
mandatory vaccines 0.269265
vaccine ingredients 0.271102
doctors 0.274634
flu shots 0.275440
mainstream media 0.278409
SB 277 0.278560
CDC 0.289896
thimerosal 0.291039
mercury 0.294273
autism 0.294609
vaccine industry 0.296981
children 0.306044
vaccines 0.324811

1257 rows × 1 columns


In [ ]: