In [1]:
import networkx as nx
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import os
from glob import glob
import re
# Notebook-wide pandas display settings.
try:
    pd.set_option('display.mpl_style', 'default')
except KeyError:
    # pandas signals an unknown option name with OptionError, which derives
    # from KeyError.  Skipping the styling option keeps the remaining
    # display settings (and the rest of this cell) working on pandas
    # releases that no longer provide it.
    print("pandas option 'display.mpl_style' is unavailable; skipping it")
pd.set_option('display.width', 5000)
pd.set_option('display.max_columns', 60)
# Build the list of graph files in the working directory whose names match a
# regular expression.
# http://stackoverflow.com/questions/2225564/get-a-filtered-list-of-files-in-a-directory
# Alternative naming scheme used previously: r'(pos|neg|neu)_u*all\.gml'
#
# The pattern is anchored at the end with `$`: re.match only anchors the start,
# so without it names such as 'positive_all.gml.bak' would be accepted too.
GML_NAME_PATTERN = re.compile(r'(positive|negative|neutral)_all\.gml$')
# sorted() makes the processing order independent of the order in which the
# operating system happens to list the directory.
gml_files = sorted(f for f in os.listdir('.') if GML_NAME_PATTERN.match(f))
def calculate_graph_inf(graph, name=None):
    """Label ``graph`` and print the NetworkX summary of it.

    Parameters
    ----------
    graph : networkx graph
        Graph to describe.  Its ``name`` attribute is overwritten.
    name : str, optional
        Label stored in ``graph.name``.  When omitted, the notebook-level
        variable ``filename`` is used, so that variable must already exist
        (the graph-loading loop assigns it before calling this function).

    Returns
    -------
    None
        The summary text produced by ``nx.info`` is printed, not returned.
    """
    if name is None:
        # Backward-compatible fallback for existing one-argument calls that
        # rely on the global `filename`.
        name = filename
    graph.name = name
    info = nx.info(graph)
    # Function-call form of print, matching the other print calls in this
    # notebook; it produces the same output under Python 2 and Python 3.
    print(info)
    # Optional drawing of the graph (disabled):
    #plt.figure(figsize=(10,10))
    #nx.draw_spring(graph, arrows=True, with_labels=True)
def highest_centrality(cent_dict):
    """Return the ``(node, value)`` pair with the largest centrality value.

    Parameters
    ----------
    cent_dict : dict
        Mapping of node -> centrality value.

    Returns
    -------
    tuple
        ``(node, value)`` for the largest value.  When several nodes share
        the largest value, the one with the largest node key is returned.

    Raises
    ------
    IndexError
        If ``cent_dict`` is empty.
    """
    if not cent_dict:
        raise IndexError('cent_dict is empty; there is no highest centrality')
    # Compare (value, node) pairs so the value decides first and the node key
    # breaks ties.  `items()` works on both Python 2 and Python 3, and max()
    # avoids sorting the whole dictionary just to read one element.
    best_value, best_node = max(
        (value, node) for (node, value) in cent_dict.items()
    )
    return (best_node, best_value)
In [2]:
# Put the graph file names in ascending order, then show the list.
gml_files = sorted(gml_files)
gml_files
Out[2]:
In [3]:
# Start with two empty tables:
#   data        - one row per graph file, with the columns listed below
#   combined_df - per-node rows collected from every graph
data_columns = ['name', 'sentiment']
data = pd.DataFrame(columns=data_columns)
combined_df = pd.DataFrame()
In [4]:
def _node_metrics_frame(graph):
    """Return one row per node with its degree and three centrality scores.

    Columns, in order: 'node', 'degree', 'degree centrality',
    'betweenness centrality', 'closeness centrality'.
    """
    # Each measure is computed exactly once per graph.
    # dict() accepts both a plain dict and an iterable of (node, degree)
    # pairs, so it works whichever of the two nx.degree returns.
    measures = [
        ('degree', dict(nx.degree(graph))),
        ('degree centrality', nx.degree_centrality(graph)),
        ('betweenness centrality', nx.betweenness_centrality(graph)),
        ('closeness centrality', nx.closeness_centrality(graph)),
    ]
    frames = []
    for label, per_node in measures:
        measure_df = pd.DataFrame.from_dict(per_node, orient='index')
        measure_df.columns = [label]
        frames.append(measure_df)
    # Align the four single-column frames on the node index.
    node_df = pd.concat(frames, axis=1)
    node_df.index.name = 'node'
    return node_df.reset_index()


# Per-graph results are collected in plain lists and turned into DataFrames
# once after the loop.  This avoids growing a DataFrame row by row and makes
# the cell idempotent: re-running it rebuilds `data` and `combined_df` instead
# of appending duplicate rows to them.
graph_records = []
node_frames = []
for graph_num, gml_graph in enumerate(gml_files):
    graph = nx.read_gml(gml_graph)
    # `filename` stays a notebook-level variable: calculate_graph_inf reads it.
    (filepath, filename) = os.path.split(gml_graph)
    print('-' * 40)
    print(gml_graph)
    calculate_graph_inf(graph)
    # The sentiment is the part of the file name before the first underscore.
    sent = filename.split('_')[0]
    graph_values = {'name': filename,
                    'sentiment': sent,
                    }
    graph_records.append(graph_values)
    node_df = _node_metrics_frame(graph)
    # Put the graph-level values on every node row explicitly.  Forward-filling
    # the whole frame would also overwrite genuine NaNs in the metric columns.
    df = node_df.copy()
    df.insert(0, 'sentiment', sent)
    df.insert(0, 'name', filename)
    node_frames.append(df)
    # Debug aid: uncomment to stop after the first graph.
    # if graph_num == 0:
    #     break

data = pd.DataFrame(graph_records, columns=data_columns)
# pd.concat raises on an empty list, so fall back to an empty frame when no
# graph files were found.  The per-graph row index is kept, as before.
combined_df = pd.concat(node_frames) if node_frames else pd.DataFrame()
In [5]:
# Show the combined node table (one row per node for each graph file).
combined_df
Out[5]:
In [6]:
# Write the combined node table to disk as UTF-8 CSV text.
# NOTE(review): unlike the per-sentiment exports, this file name has no
# '.csv' extension and the row index is written - confirm that is intended.
node_union_path = 'node_union_df'
combined_df.to_csv(node_union_path, encoding='utf-8')
In [8]:
# Carve the combined table into one sub-table per sentiment label.
sentiment_labels = combined_df['sentiment']
negative_union_df = combined_df[sentiment_labels == 'negative']
positive_union_df = combined_df[sentiment_labels == 'positive']
neutral_union_df = combined_df[sentiment_labels == 'neutral']
In [16]:
# Export each sentiment sub-table to its own UTF-8 CSV file, without the
# row index, in the order negative, positive, neutral.
for sentiment_frame, csv_path in (
    (negative_union_df, 'negative_union_df.csv'),
    (positive_union_df, 'positive_union_df.csv'),
    (neutral_union_df, 'neutral_union_df.csv'),
):
    sentiment_frame.to_csv(csv_path, encoding='utf-8', index=False)
In [ ]:
In [17]:
# Read the exported positive-sentiment table back from disk and show it.
# (An earlier variant loaded a file named 'df' instead.)
positive_csv_path = 'positive_union_df.csv'
df = pd.read_csv(positive_csv_path)
df
Out[17]:
In [ ]:
In [ ]: