node_df

  1. node-level data (all articles)
  2. network data + centrality data = combined_df
  3. remove NaN values
  4. out = node_df.csv

In [1]:
import networkx as nx
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import os
from glob import glob
pd.set_option('display.mpl_style', 'default') 
# display all the columns
pd.set_option('display.width', 5000) 
pd.set_option('display.max_columns', 60)


gml_files = glob('../output_join/article_pos1.gml')


def calculate_graph_inf(graph):
    graph.name = filename
    info = nx.info(graph)
    print info
    ## plot spring layout
    #plt.figure(figsize=(10,10))
    #nx.draw_spring(graph, arrows=True, with_labels=True)

def highest_centrality(cent_dict):
    """Returns a tuple (node,value) with the node
    with largest value from centrality dictionary."""
    # create ordered tuple of centrality data
    cent_items = [(b,a) for (a,b) in cent_dict.iteritems()]
    # sort in descending order
    cent_items.sort()
    cent_items.reverse()
    return tuple(reversed(cent_items[0]))

In [2]:
# create empty dataframe with columns

data_columns = ['name',
                'sentiment'
                ]
data = pd.DataFrame(columns = data_columns)
combined_df = pd.DataFrame()

In [3]:
# graph = directed, ugraph = undirected

for graph_num, gml_graph in enumerate(gml_files):
    graph = nx.read_gml(gml_graph)
    ugraph = graph.to_undirected()
    # adding missing edges back
    U = graph.to_undirected(reciprocal=True)
    e = U.edges()
    ugraph.add_edges_from(e)
    (filepath, filename) = os.path.split(gml_graph)
    print('-' * 40)
    print(gml_graph)
    #calculate_graph_inf(graph)
    #calculate_graph_inf(ugraph)

    # calculate variables and save into list
    sent = filepath.split('/')[-1]
    deg_cent = nx.degree_centrality(graph)
    bet_cent = nx.betweenness_centrality(graph)
    clo_cent = nx.closeness_centrality(graph)
    
    graph_values = {'name':filename,
                    'sentiment':sent,
                    }

    data = data.append(graph_values, ignore_index=True)
    
    #
    degree = nx.degree(graph)
    deg_df = pd.DataFrame.from_dict(degree, orient = 'index')
    deg_df.columns = ['degree']
    # degree centrality
    deg_cent = nx.degree_centrality(graph)
    dc_df = pd.DataFrame.from_dict(deg_cent, orient = 'index')
    dc_df.columns = ['degree centrality']
    # betweenness centrality
    bet_cent = nx.betweenness_centrality(graph)
    bc_df = pd.DataFrame.from_dict(bet_cent, orient = 'index')
    bc_df.columns = ['betweenness centrality']
    # closeness centrality
    clo_cent = nx.closeness_centrality(graph)
    cc_df = pd.DataFrame.from_dict(clo_cent, orient = 'index')
    cc_df.columns = ['closeness centrality']
    
    # concat node frames into node_df
    frames = [deg_df, dc_df, bc_df, cc_df]
    node_df = pd.concat(frames, axis = 1)
    node_df.index.name = 'node'
    node_df = node_df.reset_index()
    
    values = pd.DataFrame(graph_values, columns = ('name', 'sentiment'), index = [0])
    
    # df = merges graph_values with node_df for single graph and fill NaNs
    df = pd.concat([values, node_df], axis = 1)
    df = df.fillna(method='ffill')
    
    # append to combined_df
    combined_df = combined_df.append(df)
    
#    if graph_num == 2:
#        break


----------------------------------------
../output_join/article_neg1.gml

In [4]:
combined_df


Out[4]:
name sentiment node degree degree centrality betweenness centrality closeness centrality
0 article_neg1.gml output_join ACIP 1 0.000796 0.000000e+00 0.077104
1 article_neg1.gml output_join ACIP's rotavirus use recommendation 1 0.000796 0.000000e+00 0.000000
2 article_neg1.gml output_join ADHD 1 0.000796 0.000000e+00 0.102970
3 article_neg1.gml output_join AIDS 1 0.000796 0.000000e+00 0.000000
4 article_neg1.gml output_join African American males 2 0.001592 0.000000e+00 0.105347
5 article_neg1.gml output_join African women 3 0.002389 2.008949e-06 0.000796
6 article_neg1.gml output_join African-American children 1 0.000796 0.000000e+00 0.000000
7 article_neg1.gml output_join Alysia Osoff 6 0.004777 4.250514e-04 0.097385
8 article_neg1.gml output_join America 2 0.001592 1.903215e-06 0.001433
9 article_neg1.gml output_join American Academy of Pediatrics 2 0.001592 2.170723e-04 0.095888
10 article_neg1.gml output_join American Nursing Association's Code of Ethics 2 0.001592 0.000000e+00 0.000000
11 article_neg1.gml output_join Americans 7 0.005573 4.896067e-03 0.102908
12 article_neg1.gml output_join Amish 2 0.001592 0.000000e+00 0.001791
13 article_neg1.gml output_join Andrew Wakefield 1 0.000796 0.000000e+00 0.000000
14 article_neg1.gml output_join Apartheid 1 0.000796 0.000000e+00 0.000000
15 article_neg1.gml output_join Attkisson's website 2 0.001592 4.104601e-04 0.088995
16 article_neg1.gml output_join Australia 1 0.000796 0.000000e+00 0.096528
17 article_neg1.gml output_join Baby Boom 2 0.001592 0.000000e+00 0.000000
18 article_neg1.gml output_join Baby Boomers 5 0.003981 0.000000e+00 0.104643
19 article_neg1.gml output_join Baker College nursing school 3 0.002389 2.537620e-06 0.000796
20 article_neg1.gml output_join Baker College nursing school instructors 6 0.004777 4.387104e-04 0.092638
21 article_neg1.gml output_join Bell's Palsy 1 0.000796 0.000000e+00 0.000000
22 article_neg1.gml output_join Big Pharma 13 0.010350 7.582102e-03 0.127289
23 article_neg1.gml output_join Big Tobacco 2 0.001592 0.000000e+00 0.123915
24 article_neg1.gml output_join Bill of Rights 1 0.000796 0.000000e+00 0.000000
25 article_neg1.gml output_join Brian Hooker 1 0.000796 0.000000e+00 0.096415
26 article_neg1.gml output_join Bruesewitz v. Wyeth 2 0.001592 4.066536e-04 0.121071
27 article_neg1.gml output_join Bush Administration 2 0.001592 0.000000e+00 0.107416
28 article_neg1.gml output_join CDC 62 0.049363 2.134902e-02 0.154337
29 article_neg1.gml output_join CDC and Big Pharma 3 0.002389 0.000000e+00 0.097188
... ... ... ... ... ... ... ...
1227 article_neg1.gml output_join violation of basic human rights 2 0.001592 0.000000e+00 0.000000
1228 article_neg1.gml output_join violation of law 1 0.000796 0.000000e+00 0.000000
1229 article_neg1.gml output_join viral replication 2 0.001592 0.000000e+00 0.000000
1230 article_neg1.gml output_join vitamin A supplements 2 0.001592 0.000000e+00 0.004485
1231 article_neg1.gml output_join vitamin B12 2 0.001592 3.172025e-07 0.000796
1232 article_neg1.gml output_join vitamin C 2 0.001592 3.172025e-07 0.000796
1233 article_neg1.gml output_join vitamin D 1 0.000796 0.000000e+00 0.001062
1234 article_neg1.gml output_join vitamin D deficiency 1 0.000796 0.000000e+00 0.000796
1235 article_neg1.gml output_join vitamin supplements 1 0.000796 0.000000e+00 0.000000
1236 article_neg1.gml output_join vulnerable 1 0.000796 0.000000e+00 0.000000
1237 article_neg1.gml output_join wander 1 0.000796 0.000000e+00 0.000000
1238 article_neg1.gml output_join wander off 1 0.000796 0.000000e+00 0.000000
1239 article_neg1.gml output_join war propaganda 2 0.001592 0.000000e+00 0.000000
1240 article_neg1.gml output_join water 1 0.000796 0.000000e+00 0.000000
1241 article_neg1.gml output_join whistle 1 0.000796 0.000000e+00 0.000000
1242 article_neg1.gml output_join whistleblower 3 0.002389 4.136321e-04 0.101282
1243 article_neg1.gml output_join whistleblowers 1 0.000796 0.000000e+00 0.000000
1244 article_neg1.gml output_join whooping cough 1 0.000796 0.000000e+00 0.000000
1245 article_neg1.gml output_join whooping cough outbreaks 1 0.000796 0.000000e+00 0.080278
1246 article_neg1.gml output_join widespread 1 0.000796 0.000000e+00 0.000000
1247 article_neg1.gml output_join widespread fear 2 0.001592 1.209176e-03 0.114824
1248 article_neg1.gml output_join widespread health problems 2 0.001592 0.000000e+00 0.102746
1249 article_neg1.gml output_join words 2 0.001592 0.000000e+00 0.114627
1250 article_neg1.gml output_join world 1 0.000796 0.000000e+00 0.000000
1251 article_neg1.gml output_join world's healthiest children 1 0.000796 0.000000e+00 0.000000
1252 article_neg1.gml output_join wrong doing 3 0.002389 0.000000e+00 0.000000
1253 article_neg1.gml output_join years 1 0.000796 0.000000e+00 0.000000
1254 article_neg1.gml output_join you 10 0.007962 3.966962e-03 0.120027
1255 article_neg1.gml output_join young adults 1 0.000796 0.000000e+00 0.000000
1256 article_neg1.gml output_join young doctors 7 0.005573 4.091913e-04 0.092548

1257 rows × 7 columns


In [5]:
# save dataframe to csv
combined_df.to_csv('neg1_df.csv', encoding = 'utf-8')


In [ ]:
# split into sub-tables
neg_node_df = combined_df[combined_df['sentiment'] == 'negative']
pos_node_df = combined_df[combined_df['sentiment'] == 'positive']
neu_node_df = combined_df[combined_df['sentiment'] == 'neutral']

In [ ]:
neg_node_df.to_csv('negative_node_df.csv', encoding = 'utf-8')
pos_node_df.to_csv('positive_node_df.csv', encoding = 'utf-8')
neu_node_df.to_csv('neutral_node_df.csv', encoding = 'utf-8')

#### read the thing ####

In [6]:
df = pd.read_csv('neg1_df.csv')
df


Out[6]:
Unnamed: 0 name sentiment node degree degree centrality betweenness centrality closeness centrality
0 0 article_neg1.gml output_join ACIP 1 0.000796 0.000000e+00 0.077104
1 1 article_neg1.gml output_join ACIP's rotavirus use recommendation 1 0.000796 0.000000e+00 0.000000
2 2 article_neg1.gml output_join ADHD 1 0.000796 0.000000e+00 0.102970
3 3 article_neg1.gml output_join AIDS 1 0.000796 0.000000e+00 0.000000
4 4 article_neg1.gml output_join African American males 2 0.001592 0.000000e+00 0.105347
5 5 article_neg1.gml output_join African women 3 0.002389 2.008949e-06 0.000796
6 6 article_neg1.gml output_join African-American children 1 0.000796 0.000000e+00 0.000000
7 7 article_neg1.gml output_join Alysia Osoff 6 0.004777 4.250514e-04 0.097385
8 8 article_neg1.gml output_join America 2 0.001592 1.903215e-06 0.001433
9 9 article_neg1.gml output_join American Academy of Pediatrics 2 0.001592 2.170723e-04 0.095888
10 10 article_neg1.gml output_join American Nursing Association's Code of Ethics 2 0.001592 0.000000e+00 0.000000
11 11 article_neg1.gml output_join Americans 7 0.005573 4.896067e-03 0.102908
12 12 article_neg1.gml output_join Amish 2 0.001592 0.000000e+00 0.001791
13 13 article_neg1.gml output_join Andrew Wakefield 1 0.000796 0.000000e+00 0.000000
14 14 article_neg1.gml output_join Apartheid 1 0.000796 0.000000e+00 0.000000
15 15 article_neg1.gml output_join Attkisson's website 2 0.001592 4.104601e-04 0.088995
16 16 article_neg1.gml output_join Australia 1 0.000796 0.000000e+00 0.096528
17 17 article_neg1.gml output_join Baby Boom 2 0.001592 0.000000e+00 0.000000
18 18 article_neg1.gml output_join Baby Boomers 5 0.003981 0.000000e+00 0.104643
19 19 article_neg1.gml output_join Baker College nursing school 3 0.002389 2.537620e-06 0.000796
20 20 article_neg1.gml output_join Baker College nursing school instructors 6 0.004777 4.387104e-04 0.092638
21 21 article_neg1.gml output_join Bell's Palsy 1 0.000796 0.000000e+00 0.000000
22 22 article_neg1.gml output_join Big Pharma 13 0.010350 7.582102e-03 0.127289
23 23 article_neg1.gml output_join Big Tobacco 2 0.001592 0.000000e+00 0.123915
24 24 article_neg1.gml output_join Bill of Rights 1 0.000796 0.000000e+00 0.000000
25 25 article_neg1.gml output_join Brian Hooker 1 0.000796 0.000000e+00 0.096415
26 26 article_neg1.gml output_join Bruesewitz v. Wyeth 2 0.001592 4.066536e-04 0.121071
27 27 article_neg1.gml output_join Bush Administration 2 0.001592 0.000000e+00 0.107416
28 28 article_neg1.gml output_join CDC 62 0.049363 2.134902e-02 0.154337
29 29 article_neg1.gml output_join CDC and Big Pharma 3 0.002389 0.000000e+00 0.097188
... ... ... ... ... ... ... ... ...
1227 1227 article_neg1.gml output_join violation of basic human rights 2 0.001592 0.000000e+00 0.000000
1228 1228 article_neg1.gml output_join violation of law 1 0.000796 0.000000e+00 0.000000
1229 1229 article_neg1.gml output_join viral replication 2 0.001592 0.000000e+00 0.000000
1230 1230 article_neg1.gml output_join vitamin A supplements 2 0.001592 0.000000e+00 0.004485
1231 1231 article_neg1.gml output_join vitamin B12 2 0.001592 3.172025e-07 0.000796
1232 1232 article_neg1.gml output_join vitamin C 2 0.001592 3.172025e-07 0.000796
1233 1233 article_neg1.gml output_join vitamin D 1 0.000796 0.000000e+00 0.001062
1234 1234 article_neg1.gml output_join vitamin D deficiency 1 0.000796 0.000000e+00 0.000796
1235 1235 article_neg1.gml output_join vitamin supplements 1 0.000796 0.000000e+00 0.000000
1236 1236 article_neg1.gml output_join vulnerable 1 0.000796 0.000000e+00 0.000000
1237 1237 article_neg1.gml output_join wander 1 0.000796 0.000000e+00 0.000000
1238 1238 article_neg1.gml output_join wander off 1 0.000796 0.000000e+00 0.000000
1239 1239 article_neg1.gml output_join war propaganda 2 0.001592 0.000000e+00 0.000000
1240 1240 article_neg1.gml output_join water 1 0.000796 0.000000e+00 0.000000
1241 1241 article_neg1.gml output_join whistle 1 0.000796 0.000000e+00 0.000000
1242 1242 article_neg1.gml output_join whistleblower 3 0.002389 4.136321e-04 0.101282
1243 1243 article_neg1.gml output_join whistleblowers 1 0.000796 0.000000e+00 0.000000
1244 1244 article_neg1.gml output_join whooping cough 1 0.000796 0.000000e+00 0.000000
1245 1245 article_neg1.gml output_join whooping cough outbreaks 1 0.000796 0.000000e+00 0.080278
1246 1246 article_neg1.gml output_join widespread 1 0.000796 0.000000e+00 0.000000
1247 1247 article_neg1.gml output_join widespread fear 2 0.001592 1.209176e-03 0.114824
1248 1248 article_neg1.gml output_join widespread health problems 2 0.001592 0.000000e+00 0.102746
1249 1249 article_neg1.gml output_join words 2 0.001592 0.000000e+00 0.114627
1250 1250 article_neg1.gml output_join world 1 0.000796 0.000000e+00 0.000000
1251 1251 article_neg1.gml output_join world's healthiest children 1 0.000796 0.000000e+00 0.000000
1252 1252 article_neg1.gml output_join wrong doing 3 0.002389 0.000000e+00 0.000000
1253 1253 article_neg1.gml output_join years 1 0.000796 0.000000e+00 0.000000
1254 1254 article_neg1.gml output_join you 10 0.007962 3.966962e-03 0.120027
1255 1255 article_neg1.gml output_join young adults 1 0.000796 0.000000e+00 0.000000
1256 1256 article_neg1.gml output_join young doctors 7 0.005573 4.091913e-04 0.092548

1257 rows × 8 columns


In [ ]:


In [ ]: