node_union_df

  1. create node-level dataframe from combined graphs (directed)
  2. out = node_union_df.csv

to do: 3 dataframes (by sentiment)


In [1]:
import networkx as nx
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import os
from glob import glob
import re

pd.set_option('display.mpl_style', 'default') 
pd.set_option('display.width', 5000) 
pd.set_option('display.max_columns', 60)

# use a list comprehension to create a list that matches files in a directory using regular expressions
# http://stackoverflow.com/questions/2225564/get-a-filtered-list-of-files-in-a-directory
#gml_files = [f for f in os.listdir('.') if re.match(r'(pos|neg|neu)_u*all\.gml', f)]

gml_files = [f for f in os.listdir('.') if re.match(r'(positive|negative|neutral)_all\.gml', f)]


def calculate_graph_inf(graph):
    graph.name = filename
    info = nx.info(graph)
    print info
    #plt.figure(figsize=(10,10))
    #nx.draw_spring(graph, arrows=True, with_labels=True)

def highest_centrality(cent_dict):
    """Returns a tuple (node,value) with the node
    with largest value from centrality dictionary."""
    # create ordered tuple of centrality data
    cent_items = [(b,a) for (a,b) in cent_dict.iteritems()]
    # sort in descending order
    cent_items.sort()
    cent_items.reverse()
    return tuple(reversed(cent_items[0]))

In [2]:
gml_files.sort()
gml_files


Out[2]:
['negative_all.gml', 'neutral_all.gml', 'positive_all.gml']

In [3]:
# create empty dataframe with columns
data_columns = ['name',
                'sentiment'
                ]
data = pd.DataFrame(columns = data_columns)
combined_df = pd.DataFrame()

In [4]:
for graph_num, gml_graph in enumerate(gml_files):
    graph = nx.read_gml(gml_graph)
    (filepath, filename) = os.path.split(gml_graph)
    print('-' * 40)
    print(gml_graph)
    calculate_graph_inf(graph)

    # calculate variables and save into list
    sent = filename.split('_')[0]
    deg_cent = nx.degree_centrality(graph)
    bet_cent = nx.betweenness_centrality(graph)
    clo_cent = nx.closeness_centrality(graph)
    
    graph_values = {'name':filename,
                    'sentiment':sent,
                    }

    data = data.append(graph_values, ignore_index=True)

    #
    degree = nx.degree(graph)
    deg_df = pd.DataFrame.from_dict(degree, orient = 'index')
    deg_df.columns = ['degree']
    # degree centrality
    deg_cent = nx.degree_centrality(graph)
    dc_df = pd.DataFrame.from_dict(deg_cent, orient = 'index')
    dc_df.columns = ['degree centrality']
    # betweenness centrality
    bet_cent = nx.betweenness_centrality(graph)
    bc_df = pd.DataFrame.from_dict(bet_cent, orient = 'index')
    bc_df.columns = ['betweenness centrality']
    # closeness centrality
    clo_cent = nx.closeness_centrality(graph)
    cc_df = pd.DataFrame.from_dict(clo_cent, orient = 'index')
    cc_df.columns = ['closeness centrality']
    
    # concat node frames into node_df
    frames = [deg_df, dc_df, bc_df, cc_df]
    node_df = pd.concat(frames, axis = 1)
    node_df.index.name = 'node'
    node_df = node_df.reset_index()
    
    values = pd.DataFrame(graph_values, columns = ('name', 'sentiment'), index = [0])
    
    # df = merges graph_values with node_df for single graph and fill NaNs
    df = pd.concat([values, node_df], axis = 1)
    df = df.fillna(method='ffill')
    
    # append to combined_df
    combined_df = combined_df.append(df)
    
    
#    if graph_num == 0:
#        break


----------------------------------------
negative_all.gml
Name: negative_all.gml
Type: MultiDiGraph
Number of nodes: 1563
Number of edges: 1879
Average in degree:   1.2022
Average out degree:   1.2022
----------------------------------------
neutral_all.gml
Name: neutral_all.gml
Type: MultiDiGraph
Number of nodes: 250
Number of edges: 238
Average in degree:   0.9520
Average out degree:   0.9520
----------------------------------------
positive_all.gml
Name: positive_all.gml
Type: MultiDiGraph
Number of nodes: 948
Number of edges: 1114
Average in degree:   1.1751
Average out degree:   1.1751

In [5]:
combined_df


Out[5]:
name sentiment node degree degree centrality betweenness centrality closeness centrality
0 negative_all.gml negative religious exemption 1 0.000640 0.000000e+00 0.000000
1 negative_all.gml negative adverse effects 15 0.009603 3.773148e-05 0.008963
2 negative_all.gml negative flu shot campaign 1 0.000640 0.000000e+00 0.001463
3 negative_all.gml negative toxic adjuvants 2 0.001280 0.000000e+00 0.000000
4 negative_all.gml negative government health agencies 1 0.000640 0.000000e+00 0.001152
5 negative_all.gml negative controversial 1 0.000640 0.000000e+00 0.000000
6 negative_all.gml negative swine-flu-related drugs 1 0.000640 0.000000e+00 0.000000
7 negative_all.gml negative safe level of exposure 1 0.000640 0.000000e+00 0.000000
8 negative_all.gml negative heightened emotion 2 0.001280 1.320602e-04 0.000640
9 negative_all.gml negative Dr. Paul Offit 16 0.010243 0.000000e+00 0.009428
10 negative_all.gml negative tremors 1 0.000640 0.000000e+00 0.000000
11 negative_all.gml negative disability 3 0.001921 0.000000e+00 0.062657
12 negative_all.gml negative proper protocol 1 0.000640 0.000000e+00 0.000000
13 negative_all.gml negative flu shot while pregnant 1 0.000640 0.000000e+00 0.000640
14 negative_all.gml negative genetic tests 2 0.001280 0.000000e+00 0.001280
15 negative_all.gml negative partial facial paralysis 1 0.000640 0.000000e+00 0.000000
16 negative_all.gml negative Merck's RotaTeq vaccine 2 0.001280 0.000000e+00 0.000000
17 negative_all.gml negative safe form of mercury 2 0.001280 1.320602e-04 0.000640
18 negative_all.gml negative financial compensation 7 0.004481 1.608783e-03 0.054305
19 negative_all.gml negative legislators 2 0.001280 8.202497e-07 0.000960
20 negative_all.gml negative risk 1 0.000640 0.000000e+00 0.000000
21 negative_all.gml negative compassion 1 0.000640 0.000000e+00 0.000000
22 negative_all.gml negative vaccine additives 9 0.005762 3.274041e-03 0.080053
23 negative_all.gml negative Hilleman 2 0.001280 0.000000e+00 0.052162
24 negative_all.gml negative medical industry 1 0.000640 0.000000e+00 0.058508
25 negative_all.gml negative medical ethics 1 0.000640 0.000000e+00 0.000000
26 negative_all.gml negative financial conlicts of interest 1 0.000640 0.000000e+00 0.000000
27 negative_all.gml negative school 1 0.000640 0.000000e+00 0.000000
28 negative_all.gml negative irritable bowels 1 0.000640 0.000000e+00 0.000000
29 negative_all.gml negative sicker children 1 0.000640 0.000000e+00 0.000000
... ... ... ... ... ... ... ...
918 positive_all.gml positive vague choice 1 0.001056 0.000000e+00 0.000000
919 positive_all.gml positive variation 2 0.002112 1.116243e-06 0.001056
920 positive_all.gml positive varicella vaccine 3 0.003168 7.441622e-07 0.002112
921 positive_all.gml positive variety of medical conditions 2 0.002112 0.000000e+00 0.001877
922 positive_all.gml positive vast amounts of money 1 0.001056 0.000000e+00 0.001056
923 positive_all.gml positive vast amounts of time 1 0.001056 0.000000e+00 0.001056
924 positive_all.gml positive victims 1 0.001056 0.000000e+00 0.000000
925 positive_all.gml positive video 1 0.001056 0.000000e+00 0.000000
926 positive_all.gml positive virus 1 0.001056 0.000000e+00 0.000000
927 positive_all.gml positive vitamins 1 0.001056 0.000000e+00 0.000000
928 positive_all.gml positive voluntary 1 0.001056 0.000000e+00 0.000000
929 positive_all.gml positive vomiting 1 0.001056 0.000000e+00 0.000000
930 positive_all.gml positive waiting longer 2 0.002112 0.000000e+00 0.054127
931 positive_all.gml positive we 3 0.003168 0.000000e+00 0.003168
932 positive_all.gml positive wealthier regions 2 0.002112 0.000000e+00 0.000000
933 positive_all.gml positive what we already knew 3 0.003168 2.232487e-06 0.002112
934 positive_all.gml positive whooping couch vaccine 1 0.001056 0.000000e+00 0.000000
935 positive_all.gml positive whooping cough 6 0.006336 7.149539e-04 0.050172
936 positive_all.gml positive withholding vaccines 4 0.004224 2.636567e-03 0.062149
937 positive_all.gml positive women 3 0.003168 3.616628e-04 0.002112
938 positive_all.gml positive women previously exposed to HPV 5 0.005280 0.000000e+00 0.000000
939 positive_all.gml positive women without previous HPV 1 0.001056 0.000000e+00 0.000000
940 positive_all.gml positive worried 1 0.001056 0.000000e+00 0.000000
941 positive_all.gml positive written down 1 0.001056 0.000000e+00 0.000000
942 positive_all.gml positive young adults 2 0.002112 0.000000e+00 0.011664
943 positive_all.gml positive young age group 2 0.002112 0.000000e+00 0.000000
944 positive_all.gml positive young children 3 0.003168 0.000000e+00 0.000000
945 positive_all.gml positive young women 1 0.001056 0.000000e+00 0.001056
946 positive_all.gml positive younger sibling of child with autism 1 0.001056 0.000000e+00 0.002200
947 positive_all.gml positive younger siblings of children with ASD 1 0.001056 0.000000e+00 0.000000

2761 rows × 7 columns


In [6]:
# save dataframe to csv
combined_df.to_csv('node_union_df', encoding = 'utf-8')


In [8]:
# split into sub-tables
negative_union_df = combined_df[combined_df['sentiment'] == 'negative']
positive_union_df = combined_df[combined_df['sentiment'] == 'positive']
neutral_union_df = combined_df[combined_df['sentiment'] == 'neutral']

In [16]:
negative_union_df.to_csv('negative_union_df.csv', encoding = 'utf-8', index = False)
positive_union_df.to_csv('positive_union_df.csv', encoding = 'utf-8', index = False)
neutral_union_df.to_csv('neutral_union_df.csv', encoding = 'utf-8', index = False)

In [ ]:


#### read the thing ####

In [17]:
# df = pd.read_csv('df')
df = pd.read_csv('positive_union_df.csv')
df


Out[17]:
name sentiment node degree degree centrality betweenness centrality closeness centrality
0 positive_all.gml positive 11-12 years of age 1 0.001056 0.000000e+00 0.000000
1 positive_all.gml positive 11-18 year olds 1 0.001056 0.000000e+00 0.000000
2 positive_all.gml positive 16 years of age 1 0.001056 0.000000e+00 0.000000
3 positive_all.gml positive 2014-2015 FLULAVAL QUADRIVALENT flu vaccine 7 0.007392 1.227868e-05 0.004224
4 positive_all.gml positive 86% vaccination coverage 3 0.003168 7.155120e-04 0.050354
5 positive_all.gml positive 92 to 94 percent vaccination coverage 4 0.004224 0.000000e+00 0.000000
6 positive_all.gml positive ASD 10 0.010560 1.416247e-03 0.012681
7 positive_all.gml positive ASD rate 1 0.001056 0.000000e+00 0.001901
8 positive_all.gml positive ASD risk 3 0.003168 1.629715e-04 0.012013
9 positive_all.gml positive Afghanistan 1 0.001056 0.000000e+00 0.000000
10 positive_all.gml positive Age of Autism 2 0.002112 0.000000e+00 0.047874
11 positive_all.gml positive American Medical Association 5 0.005280 0.000000e+00 0.004224
12 positive_all.gml positive Americans 1 0.001056 0.000000e+00 0.009541
13 positive_all.gml positive Amish 3 0.003168 4.922633e-04 0.058304
14 positive_all.gml positive Andrew Wakefield 3 0.003168 0.000000e+00 0.003168
15 positive_all.gml positive Arizona Deparment of Health Services 1 0.001056 0.000000e+00 0.003456
16 positive_all.gml positive Arizona Department of Health Services 1 0.001056 0.000000e+00 0.000000
17 positive_all.gml positive Assembly 1 0.001056 0.000000e+00 0.000000
18 positive_all.gml positive Australian social services minister 1 0.001056 0.000000e+00 0.000000
19 positive_all.gml positive Babies 1 0.001056 0.000000e+00 0.001056
20 positive_all.gml positive Babies at 2 months 1 0.001056 0.000000e+00 0.001056
21 positive_all.gml positive CDC 4 0.004224 0.000000e+00 0.014223
22 positive_all.gml positive California 5 0.005280 1.167963e-03 0.059149
23 positive_all.gml positive California Coalition for Health Choice 3 0.003168 3.560816e-04 0.043347
24 positive_all.gml positive California Senate 1 0.001056 0.000000e+00 0.000000
25 positive_all.gml positive California governor 2 0.002112 0.000000e+00 0.047488
26 positive_all.gml positive Caregivers 1 0.001056 0.000000e+00 0.061725
27 positive_all.gml positive Carribean 1 0.001056 0.000000e+00 0.001408
28 positive_all.gml positive Catholic Church 1 0.001056 0.000000e+00 0.001056
29 positive_all.gml positive Catholic parents 3 0.003168 0.000000e+00 0.003771
... ... ... ... ... ... ... ...
918 positive_all.gml positive vague choice 1 0.001056 0.000000e+00 0.000000
919 positive_all.gml positive variation 2 0.002112 1.116243e-06 0.001056
920 positive_all.gml positive varicella vaccine 3 0.003168 7.441622e-07 0.002112
921 positive_all.gml positive variety of medical conditions 2 0.002112 0.000000e+00 0.001877
922 positive_all.gml positive vast amounts of money 1 0.001056 0.000000e+00 0.001056
923 positive_all.gml positive vast amounts of time 1 0.001056 0.000000e+00 0.001056
924 positive_all.gml positive victims 1 0.001056 0.000000e+00 0.000000
925 positive_all.gml positive video 1 0.001056 0.000000e+00 0.000000
926 positive_all.gml positive virus 1 0.001056 0.000000e+00 0.000000
927 positive_all.gml positive vitamins 1 0.001056 0.000000e+00 0.000000
928 positive_all.gml positive voluntary 1 0.001056 0.000000e+00 0.000000
929 positive_all.gml positive vomiting 1 0.001056 0.000000e+00 0.000000
930 positive_all.gml positive waiting longer 2 0.002112 0.000000e+00 0.054127
931 positive_all.gml positive we 3 0.003168 0.000000e+00 0.003168
932 positive_all.gml positive wealthier regions 2 0.002112 0.000000e+00 0.000000
933 positive_all.gml positive what we already knew 3 0.003168 2.232487e-06 0.002112
934 positive_all.gml positive whooping couch vaccine 1 0.001056 0.000000e+00 0.000000
935 positive_all.gml positive whooping cough 6 0.006336 7.149539e-04 0.050172
936 positive_all.gml positive withholding vaccines 4 0.004224 2.636567e-03 0.062149
937 positive_all.gml positive women 3 0.003168 3.616628e-04 0.002112
938 positive_all.gml positive women previously exposed to HPV 5 0.005280 0.000000e+00 0.000000
939 positive_all.gml positive women without previous HPV 1 0.001056 0.000000e+00 0.000000
940 positive_all.gml positive worried 1 0.001056 0.000000e+00 0.000000
941 positive_all.gml positive written down 1 0.001056 0.000000e+00 0.000000
942 positive_all.gml positive young adults 2 0.002112 0.000000e+00 0.011664
943 positive_all.gml positive young age group 2 0.002112 0.000000e+00 0.000000
944 positive_all.gml positive young children 3 0.003168 0.000000e+00 0.000000
945 positive_all.gml positive young women 1 0.001056 0.000000e+00 0.001056
946 positive_all.gml positive younger sibling of child with autism 1 0.001056 0.000000e+00 0.002200
947 positive_all.gml positive younger siblings of children with ASD 1 0.001056 0.000000e+00 0.000000

948 rows × 7 columns


In [ ]:


In [ ]: