u_pos stats


In [1]:
# 1_network_df

import networkx as nx
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import os
from glob import glob

plt.style.use('ggplot')
pd.set_option('display.width', 5000) 
pd.set_option('display.max_columns', 60)

In [2]:
def calculate_graph_inf(graph):
    graph.name = filename
    info = nx.info(graph)
    print info

def highest_centrality(cent_dict):
    """Returns a tuple (node,value) with the node
    with largest value from centrality dictionary."""
    # create ordered tuple of centrality data
    cent_items = [(b,a) for (a,b) in cent_dict.iteritems()]
    # sort in descending order
    cent_items.sort()
    cent_items.reverse()
    return tuple(reversed(cent_items[0]))

Calculate network statistics


In [3]:
# load undirected
gml_files = glob('../output/network/u_pos.gml')

network_data_columns = ['name',
                    'sentiment',
                    '# nodes',
                    '# edges',
                    #'avg deg',
                    'density',
                    'deg assort coef', 
                    'avg deg cent',
                    'avg bet cent',
                    'avg clo cent',
                    'high deg cent',
                    'high bet cent',
                    'high clo cent',
                    'avg node conn',
                    '# conn comp',
                    'gc size'
                    ]
network_data = pd.DataFrame(columns = network_data_columns)

In [4]:
for graph_num, gml_graph in enumerate(gml_files):
    graph = nx.read_gml(gml_graph)
    (filepath, filename) = os.path.split(gml_graph)
    print('-' * 10)
    print(gml_graph)
    calculate_graph_inf(graph)
    
    # calculate variables
    sent = "positive"
    nodes = nx.number_of_nodes(graph)
    edges = nx.number_of_edges(graph)
    density = float("{0:.4f}".format(nx.density(graph)))
    avg_deg_cen = np.array(nx.degree_centrality(graph).values()).mean()
    avg_bet_cen = np.array(nx.betweenness_centrality(graph).values()).mean()
    avg_clo_cen = np.array(nx.closeness_centrality(graph).values()).mean()
    #avg_deg = float("{0:.4f}".format(in_deg + out_deg))
    avg_node_con = float("{0:.4f}".format((nx.average_node_connectivity(graph))))
    deg_assort_coeff = float("{0:.4f}".format((nx.degree_assortativity_coefficient(graph))))
    #conn_comp = nx.number_weakly_connected_components(graph) # not for undirected
    deg_cen = nx.degree_centrality(graph)
    bet_cen = nx.betweenness_centrality(graph)
    clo_cen = nx.closeness_centrality(graph)
    highest_deg_cen = highest_centrality(deg_cen)
    highest_bet_cen = highest_centrality(bet_cen)
    highest_clo_cen = highest_centrality(clo_cen)
    #Gc = len(max(nx.weakly_connected_component_subgraphs(graph), key=len))

    # save variables into list
    graph_values = {'name':filename,
                    'sentiment':sent,
                    '# nodes':nodes,
                    '# edges':edges,
                    #'avg deg':avg_deg,
                    'density':density,
                    'deg assort coef':deg_assort_coeff,
                    'avg deg cent':"%.4f" % avg_deg_cen,
                    'avg bet cent':"%.4f" % avg_bet_cen,
                    'avg clo cent':"%.4f" % avg_clo_cen,
                    'high deg cent':highest_deg_cen,
                    'high bet cent':highest_bet_cen,
                    'high clo cent':highest_clo_cen,
                    'avg node conn':avg_node_con
                    #'# conn comp':conn_comp,
                    #'gc size':Gc
                    }
    network_data = network_data.append(graph_values, ignore_index=True)


----------
../output/network/u_pos.gml
Name: u_pos.gml
Type: MultiGraph
Number of nodes: 652
Number of edges: 1140
Average degree:   3.4969

In [5]:
# print network data for greatest component
network_data


Out[5]:
name sentiment # nodes # edges density deg assort coef avg deg cent avg bet cent avg clo cent high deg cent high bet cent high clo cent avg node conn # conn comp gc size
0 u_pos.gml positive 652.0 1140.0 0.0054 -0.0799 0.0054 0.0043 0.1850 (vaccines, 0.1044546851) (parents, 0.218725048513) (parents, 0.330742137194) 1.0567 NaN NaN

In [7]:
# save
#network_data.to_csv('../output/df/u_pos.csv')

all nodes table


In [8]:
# 2_node_df: list all nodes and centrality
data_columns = ['name',
                'sentiment'
                ]
data = pd.DataFrame(columns = data_columns)
combined_df = pd.DataFrame()

In [9]:
for graph_num, gml_graph in enumerate(gml_files):
    graph = nx.read_gml(gml_graph)
    (filepath, filename) = os.path.split(gml_graph)
    print('-' * 10)
    print(gml_graph)
    calculate_graph_inf(graph)

    ## calculate variables and save into list
    sent = "positive"    
    deg_cent = nx.degree_centrality(graph)
    bet_cent = nx.betweenness_centrality(graph)
    clo_cent = nx.closeness_centrality(graph)
    graph_values = {'name':filename,
                    'sentiment':sent
                    }
    data = data.append(graph_values, ignore_index=True)

    degree = nx.degree(graph)
    deg_df = pd.DataFrame.from_dict(degree, orient = 'index')
    deg_df.columns = ['degree']
    # degree centrality
    deg_cent = nx.degree_centrality(graph)
    dc_df = pd.DataFrame.from_dict(deg_cent, orient = 'index')
    dc_df.columns = ['deg cent']
    # betweenness centrality
    bet_cent = nx.betweenness_centrality(graph)
    bc_df = pd.DataFrame.from_dict(bet_cent, orient = 'index')
    bc_df.columns = ['bet cent']
    # closeness centrality
    clo_cent = nx.closeness_centrality(graph)
    cc_df = pd.DataFrame.from_dict(clo_cent, orient = 'index')
    cc_df.columns = ['clo cent']
    # concat node frames into node_df
    frames = [deg_df, dc_df, bc_df, cc_df]
    node_df = pd.concat(frames, axis = 1)
    node_df.index.name = 'node'
    node_df = node_df.reset_index()

    values = pd.DataFrame(graph_values, columns = ('name', 'sentiment'), index = [0])
    
    # df = merges graph_values with node_df for single graph and fill NaNs
    df = pd.concat([values, node_df], axis = 1)
    df = df.fillna(method='ffill')
    combined_df = combined_df.append(df)


----------
../output/network/u_pos.gml
Name: u_pos.gml
Type: MultiGraph
Number of nodes: 652
Number of edges: 1140
Average degree:   3.4969

In [10]:
# print entire network
combined_df


Out[10]:
name sentiment node degree deg cent bet cent clo cent
0 u_pos.gml positive neighbors 1 0.001536 0.000000 0.177652
1 u_pos.gml positive vitamins 1 0.001536 0.000000 0.011151
2 u_pos.gml positive colleges 1 0.001536 0.000000 0.183437
3 u_pos.gml positive influenza 2 0.003072 0.000599 0.150718
4 u_pos.gml positive parents of autistic children 6 0.009217 0.004474 0.238568
5 u_pos.gml positive religious exemption 9 0.013825 0.005750 0.242208
6 u_pos.gml positive results 1 0.001536 0.000000 0.193034
7 u_pos.gml positive Scott Morrison 1 0.001536 0.000000 0.001536
8 u_pos.gml positive repetitive behaviors 1 0.001536 0.000000 0.112424
9 u_pos.gml positive Michael Mina 2 0.003072 0.000005 0.003072
10 u_pos.gml positive children 31 0.047619 0.066382 0.306372
11 u_pos.gml positive Dr. Paul Offit 3 0.004608 0.000014 0.004608
12 u_pos.gml positive vaccination schedule 4 0.006144 0.001076 0.233361
13 u_pos.gml positive Samantha Page 1 0.001536 0.000000 0.001536
14 u_pos.gml positive best-sellers 1 0.001536 0.000000 0.184665
15 u_pos.gml positive American Medical Association 5 0.007680 0.000863 0.211846
16 u_pos.gml positive Orthodox Jewish communities 1 0.001536 0.000000 0.226795
17 u_pos.gml positive fence-sitters 4 0.006144 0.012472 0.248881
18 u_pos.gml positive Journal of the American Medical Association 1 0.001536 0.000000 0.189200
19 u_pos.gml positive sexually transmitted virus 4 0.006144 0.005859 0.251631
20 u_pos.gml positive fear of autism 5 0.007680 0.006918 0.248057
21 u_pos.gml positive genetic risk factors for ASD 1 0.001536 0.000000 0.147038
22 u_pos.gml positive siblings 1 0.001536 0.000000 0.157090
23 u_pos.gml positive resources 2 0.003072 0.001477 0.184730
24 u_pos.gml positive risk 1 0.001536 0.000000 0.212361
25 u_pos.gml positive vaccine campaign 2 0.003072 0.000000 0.185976
26 u_pos.gml positive stiff neck 1 0.001536 0.000000 0.145123
27 u_pos.gml positive Faith Assembly 2 0.003072 0.000000 0.005530
28 u_pos.gml positive nausea 2 0.003072 0.001356 0.219939
29 u_pos.gml positive ill effects 1 0.001536 0.000000 0.175627
... ... ... ... ... ... ... ...
622 u_pos.gml positive role 1 0.001536 0.000000 0.175451
623 u_pos.gml positive driving factors 3 0.004608 0.000098 0.251510
624 u_pos.gml positive sexually active 7 0.010753 0.008428 0.249000
625 u_pos.gml positive immunity 6 0.009217 0.003333 0.253457
626 u_pos.gml positive expected 1 0.001536 0.000000 0.188587
627 u_pos.gml positive sense of urgency 1 0.001536 0.000000 0.241761
628 u_pos.gml positive health officials 5 0.007680 0.010986 0.256309
629 u_pos.gml positive rubella 13 0.019969 0.024134 0.233882
630 u_pos.gml positive former gastroenterologist 1 0.001536 0.000000 0.126575
631 u_pos.gml positive varicella vaccine 3 0.004608 0.000005 0.124087
632 u_pos.gml positive magnitude of benefits 1 0.001536 0.000000 0.165163
633 u_pos.gml positive serogroups 1 0.001536 0.000000 0.198370
634 u_pos.gml positive 16 years of age 1 0.001536 0.000000 0.001536
635 u_pos.gml positive state vaccination rates 3 0.004608 0.002933 0.157847
636 u_pos.gml positive loss of limb 1 0.001536 0.000000 0.167486
637 u_pos.gml positive Early Childhood Australia 3 0.004608 0.002756 0.200266
638 u_pos.gml positive religious groups 24 0.036866 0.081924 0.288807
639 u_pos.gml positive age 26 1 0.001536 0.000000 0.162700
640 u_pos.gml positive Robert F. Kennedy Jr. 2 0.003072 0.007541 0.235353
641 u_pos.gml positive friends 1 0.001536 0.000000 0.241761
642 u_pos.gml positive Catholic Church 1 0.001536 0.000000 0.176634
643 u_pos.gml positive Amish 3 0.004608 0.001437 0.220217
644 u_pos.gml positive scheduled appointment 1 0.001536 0.000000 0.241761
645 u_pos.gml positive meningococcal disease symptoms 4 0.006144 0.004952 0.173074
646 u_pos.gml positive Netherlands Reformed Congregation 2 0.003072 0.000000 0.226795
647 u_pos.gml positive immune protection 3 0.004608 0.004347 0.237272
648 u_pos.gml positive environmental trigger 2 0.003072 0.000505 0.196731
649 u_pos.gml positive time 1 0.001536 0.000000 0.213486
650 u_pos.gml positive overseas 3 0.004608 0.000870 0.204807
651 u_pos.gml positive Tdap vaccine 12 0.018433 0.021138 0.257569

652 rows × 7 columns


Undirected cc


In [11]:
# list of connected components by size (undirected graph)
connected_components = [len(c) for c in sorted(nx.connected_components(graph), key=len, reverse=True)]

# generate connected components as subgraphs (undirected graph)
subgraphs = list(nx.connected_component_subgraphs(graph))

# greatest component (undirected MultiGraph)
u_Gc = max(nx.connected_component_subgraphs(graph), key=len)
u_Gc.name = "undirected Gc"

In [12]:
print "connected components = ", connected_components
print nx.info(u_Gc)


connected components =  [585, 15, 7, 4, 4, 3, 3, 3, 3, 3, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2]
Name: undirected Gc
Type: MultiGraph
Number of nodes: 585
Number of edges: 1088
Average degree:   3.7197

Centrality tables


In [13]:
# make sure you're using the right graph
print gml_files
print gml_graph
print graph


['../output/network/u_pos.gml']
../output/network/u_pos.gml
u_pos.gml

In [14]:
# degree centrality
dc = nx.degree_centrality(graph)
dc_df = pd.DataFrame.from_dict(dc, orient = 'index')
dc_df.columns = ['degree cent']
dc_df = dc_df.sort_values(by = ['degree cent'])
#dc_df

# betweenness centrality
bc = nx.betweenness_centrality(graph)
bc_df = pd.DataFrame.from_dict(bc, orient = 'index')
bc_df.columns = ['betweenness cent']
bc_df = bc_df.sort_values(by = ['betweenness cent'])
#bc_df

# closeness centrality
cc = nx.closeness_centrality(graph)
cc_df = pd.DataFrame.from_dict(cc, orient = 'index')
cc_df.columns = ['closeness cent']
cc_df = cc_df.sort_values(by = ['closeness cent'])
#cc_df

In [15]:
dc_df


Out[15]:
degree cent
neighbors 0.001536
arm 0.001536
elite list 0.001536
sex 0.001536
testing 0.001536
free vaccine 0.001536
Caribbean 0.001536
medical law 0.001536
strong-arm tactics 0.001536
gift from God 0.001536
false concerns 0.001536
Early Childhood Australia's chief executive 0.001536
unvaccinated high-risk children 0.001536
Federal Circuit 0.001536
random cases 0.001536
unethical 0.001536
medical conditions 0.001536
psychiatrist 0.001536
factor 0.001536
rash 0.001536
severe symptoms 0.001536
fear 0.001536
Jewish dietary laws 0.001536
opportunistic infections 0.001536
Sydney, Australia 0.001536
efficacious 0.001536
public schools 0.001536
healthy people 0.001536
risk of rubella 0.001536
computer models 0.001536
... ...
children at higher risk for autism 0.018433
studies 0.019969
rubella 0.019969
vaccine refusal 0.019969
anti-vaccination website 0.021505
Gardasil 0.021505
states 0.023041
personal belief exemption 0.024578
Jain study 0.024578
side effects 0.024578
community 0.026114
vaccination exemption 0.026114
meningococcal vaccine 0.027650
herd immunity 0.029186
disease 0.029186
autism risk 0.030722
SB 277 0.033794
measles vaccine 0.035330
religious groups 0.036866
MMR vaccine 0.039939
anti-vaccination 0.043011
children 0.047619
meningococcal disease 0.049155
vaccine-autism link 0.050691
HPV vaccine 0.052227
autism 0.059908
vaccination 0.078341
parents 0.089094
measles 0.099846
vaccines 0.104455

652 rows × 1 columns


In [16]:
bc_df


Out[16]:
betweenness cent
neighbors 0.000000
public schools 0.000000
behavioral research 0.000000
diarrhea deaths 0.000000
efficacious 0.000000
Early Childhood Australia's chief executive 0.000000
arm 0.000000
elite list 0.000000
sex 0.000000
testing 0.000000
free vaccine 0.000000
Caribbean 0.000000
unconscionable 0.000000
medical law 0.000000
gift from God 0.000000
false concerns 0.000000
unvaccinated high-risk children 0.000000
opportunistic infections 0.000000
Federal Circuit 0.000000
imitation infection 0.000000
random cases 0.000000
unethical 0.000000
opponent of sanity-oriented legislation 0.000000
medical conditions 0.000000
vaccine efficacy 0.000000
strong-arm tactics 0.000000
Department of Public Health Immunization Program 0.000000
MMR vaccine doesn't trigger autism 0.000000
genes 0.000000
argument 0.000000
... ...
CDC 0.019609
vaccinated 0.019861
personal belief exemption 0.020411
studies 0.020874
Tdap vaccine 0.021138
Muslim fundamentalists 0.021258
parents who refuse to vaccinate their children 0.022245
disease 0.022347
rubella 0.024134
vaccination exemption 0.024163
Wakefield study 0.024712
polio vaccine opposition 0.024914
Jain study 0.034253
United States 0.036525
Gardasil 0.039917
side effects 0.041014
measles vaccine 0.042119
SB 277 0.045962
community 0.046205
HPV vaccine 0.058863
autism 0.064317
children 0.066382
meningococcal disease 0.072818
vaccine-autism link 0.073800
religious groups 0.081924
vaccination 0.086563
anti-vaccination 0.101460
measles 0.124398
vaccines 0.175097
parents 0.218725

652 rows × 1 columns


In [17]:
cc_df


Out[17]:
closeness cent
meningococcal conjugate booster 0.001536
autism-linked genes 0.001536
benefit 0.001536
short amount of time 0.001536
critical period 0.001536
16 years of age 0.001536
prenatal development 0.001536
factor 0.001536
government 0.001536
insulin 0.001536
reduced vaccine potency 0.001536
Samantha Page 0.001536
Early Childhood Australia's chief executive 0.001536
peer pressure 0.001536
suboptimal protection 0.001536
Scott Morrison 0.001536
decision to vaccinate 0.001536
no jab, no pay policy 0.001536
compliance 0.001536
Australian social services minister 0.001536
reactions 0.001536
Northern Hemisphere flu vaccine 0.001536
Assembly 0.002048
vaccines are not necessary 0.002048
healing through prayer 0.002048
medical student 0.002048
critics 0.002048
opposition 0.002048
Princeton University 0.002048
Minnesota 0.002048
... ...
Tdap vaccine 0.257569
health care 0.258331
vaccine-preventable diseases 0.259611
infectious disease 0.259998
Jehovah's Witnesses 0.260385
Jews 0.260645
measles vaccine 0.261295
herd immunity 0.262473
vaccine delay 0.262868
developmental disability 0.263794
vaccination exemption 0.264193
religion 0.264727
public health 0.264861
side effects 0.265398
protection 0.269078
vaccine refusal 0.269355
disease 0.271871
personal belief exemption 0.273004
schools 0.273718
vaccination 0.278076
anti-vaccination 0.280008
vaccine-autism link 0.282423
SB 277 0.283340
autism 0.285969
religious groups 0.288807
community 0.289445
measles 0.303356
children 0.306372
vaccines 0.312400
parents 0.330742

652 rows × 1 columns


In [ ]: