In [1]:
# 1_network_df
import networkx as nx
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import os
from glob import glob
# Notebook-wide display configuration: ggplot plot style, and a very wide
# pandas console width / column limit so wide frames are shown in full.
plt.style.use('ggplot')
pd.set_option('display.width', 5000)
pd.set_option('display.max_columns', 60)
# glob() returns a list of matching paths; it is empty when the file is absent,
# in which case the loading loop that iterates gml_files does nothing.
gml_files = glob('../output/network/article_pos1.gml')
In [2]:
def calculate_graph_inf(graph, name=None):
    """Label ``graph`` and print its NetworkX summary.

    Parameters
    ----------
    graph : networkx graph
        Graph to describe. Its ``name`` attribute is overwritten.
    name : str, optional
        Label stored in ``graph.name``. When omitted, the notebook-level
        variable ``filename`` (assigned by the loading loops) is used, so
        existing one-argument calls keep working unchanged.

    Returns
    -------
    None
        The text produced by ``nx.info`` is printed, not returned.

    Raises
    ------
    NameError
        If ``name`` is omitted and no global ``filename`` has been set yet.
    """
    # Prefer the explicit label; fall back to the global only for old callers.
    graph.name = filename if name is None else name
    info = nx.info(graph)
    # Single-argument print(...) gives the same output on Python 2 and 3.
    print(info)
def highest_centrality(cent_dict):
    """Return the ``(node, value)`` pair with the largest centrality value.

    Parameters
    ----------
    cent_dict : dict
        Mapping of node -> centrality value.

    Returns
    -------
    tuple
        ``(node, value)`` for the highest value. When several nodes share
        the highest value, the one whose key compares largest is returned.

    Raises
    ------
    IndexError
        If ``cent_dict`` is empty.
    """
    if not cent_dict:
        # Same exception type an empty input has always produced here,
        # but with a message that names the actual problem.
        raise IndexError("highest_centrality() needs a non-empty centrality dict")
    # Rank by (value, node): a single pass instead of sort + reverse, and
    # .items() works on both Python 2 and 3 (iteritems() is Python 2 only).
    return max(cent_dict.items(), key=lambda item: (item[1], item[0]))
In [3]:
# Load every matched GML file. After the loop, dgraph / ugraph / filename
# refer to the LAST file read (later cells rely on these globals).
for graph_num, gml_graph in enumerate(gml_files):
    dgraph = nx.read_gml(gml_graph)
    # Undirected version of the same network.
    # Disabled alternative: build U = dgraph.to_undirected(reciprocal=True)
    # and add U.edges() to ugraph.
    ugraph = dgraph.to_undirected()

    # calculate_graph_inf() labels graphs with the global `filename`,
    # so it has to be assigned before the calls below.
    filepath, filename = os.path.split(gml_graph)

    print('-' * 10)
    print(gml_graph)
    calculate_graph_inf(dgraph)
    calculate_graph_inf(ugraph)
In [4]:
# save undirected gml
#nx.write_gml(ugraph, "../output/network/article_u_pos.gml")
dgraph = directed; ugraph = undirected
In [5]:
# load
# Re-point gml_files at the saved undirected network. glob() yields an empty
# list if that file does not exist, so the loop over gml_files would load nothing.
gml_files = glob('../output/network/article_u_pos.gml')
In [6]:
# ugraph = undirected; dgraph = directed
# Reload the undirected network from disk. ugraph and filename keep the
# values from the last file processed.
for graph_num, gml_graph in enumerate(gml_files):
    ugraph = nx.read_gml(gml_graph)
    # `filename` must be set before calculate_graph_inf(), which reads it.
    filepath, filename = os.path.split(gml_graph)

    print('-' * 10)
    print(gml_graph)
    calculate_graph_inf(ugraph)
In [7]:
# 2_node_df: list all nodes and centrality
# Graph-level identifier columns that are attached to the node rows.
data_columns = ['name', 'sentiment']
data = pd.DataFrame(columns=data_columns)

# Accumulator for node tables; starts with no rows and no columns.
combined_df = pd.DataFrame()
In [8]:
# Build the per-node table (degree + three centralities) for `ugraph`,
# tag it with the graph's name/sentiment, and add it to combined_df.
# NOTE: re-running this cell adds the same rows to data/combined_df again.
sent = "positive"

# Each centrality is computed exactly once (betweenness in particular is
# expensive) and reused below.
deg_cent = nx.degree_centrality(ugraph)
bet_cent = nx.betweenness_centrality(ugraph)
clo_cent = nx.closeness_centrality(ugraph)

graph_values = {'name': filename,
                'sentiment': sent
                }
# pd.concat instead of DataFrame.append, which pandas 2.0 removed.
data = pd.concat([data, pd.DataFrame([graph_values], columns=data.columns)],
                 ignore_index=True)

# dict(...) accepts both the plain dict returned by old NetworkX and the
# degree view returned by NetworkX >= 2.
degree = dict(nx.degree(ugraph))
deg_df = pd.DataFrame.from_dict(degree, orient='index')
deg_df.columns = ['degree']

# degree centrality
dc_df = pd.DataFrame.from_dict(deg_cent, orient='index')
dc_df.columns = ['deg cent']

# betweenness centrality
bc_df = pd.DataFrame.from_dict(bet_cent, orient='index')
bc_df.columns = ['bet cent']

# closeness centrality
cc_df = pd.DataFrame.from_dict(clo_cent, orient='index')
cc_df.columns = ['clo cent']

# concat node frames into node_df (one row per node, aligned on node label)
frames = [deg_df, dc_df, bc_df, cc_df]
node_df = pd.concat(frames, axis=1)
node_df.index.name = 'node'
node_df = node_df.reset_index()

values = pd.DataFrame(graph_values, columns=('name', 'sentiment'), index=[0])

# `values` has a single row, so after the side-by-side concat only row 0 has
# name/sentiment; forward-fill copies them down to every node row.
df = pd.concat([values, node_df], axis=1)
df = df.ffill()

combined_df = pd.concat([combined_df, df])
In [9]:
# print entire network
# Bare expression: the notebook renders combined_df as this cell's output.
combined_df
Out[9]:
In [10]:
# save
#combined_df.to_csv('../output/df/article_u_pos.csv')
In [11]:
# 7_graph_calculation
def drawIt(graph, what='graph'):
    """Draw ``graph`` with a spring layout, scaling the figure to its size.

    Parameters
    ----------
    graph : networkx graph
        Graph to draw.
    what : str, optional
        Label used only in the printed progress line (default ``'graph'``).

    Notes
    -----
    * more than 20 nodes: a 10x10 inch figure is opened first;
    * more than 40 nodes: additionally uses small nodes (``node_size=70``)
      and ``font_size=12``;
    * otherwise the default figure and drawing options are used.
    Node labels are always shown, and ``plt.show()`` is called at the end.
    """
    nsize = graph.number_of_nodes()
    # %-formatted single argument: identical output on Python 2 and 3.
    print("Drawing %s of size %s:" % (what, nsize))

    draw_options = {'with_labels': True}
    if nsize > 20:
        plt.figure(figsize=(10, 10))
        if nsize > 40:
            draw_options.update(node_size=70, font_size=12)

    nx.draw_spring(graph, **draw_options)
    plt.show()
# for undirected graphs
def describeGraph(graph):
    """Print edge/node/component counts of an undirected graph and draw it.

    The whole graph is drawn first, then every connected component is drawn
    separately, largest component first.

    Parameters
    ----------
    graph : networkx graph
        Undirected graph (``nx.connected_components`` is used).
    """
    # Node sets of the components, largest first.
    components = sorted(nx.connected_components(graph), key=len, reverse=True)

    # No component subgraphs are pre-built here: they were never used, and
    # nx.connected_component_subgraphs no longer exists in current NetworkX.
    params = (graph.number_of_edges(), graph.number_of_nodes(), len(components))
    print("Graph has %s edges, %s nodes, %s connected components\n" % params)

    drawIt(graph)
    for sub in components:
        drawIt(graph.subgraph(sub), what='component')
# for directed graphs
def describeGraph_d(graph):
    """Print edge/node/component counts of a directed graph and draw it.

    Components are the weakly connected components. The whole graph is drawn
    first, then every component separately, largest first.

    Parameters
    ----------
    graph : networkx graph
        Directed graph (``nx.weakly_connected_components`` is used).
    """
    # Node sets of the weakly connected components, largest first.
    components = sorted(nx.weakly_connected_components(graph), key=len, reverse=True)

    # No component subgraphs are pre-built here: they were never used, and
    # nx.weakly_connected_component_subgraphs no longer exists in current NetworkX.
    params = (graph.number_of_edges(), graph.number_of_nodes(), len(components))
    print("Graph has %s edges, %s nodes, %s connected components\n" % params)

    drawIt(graph)
    for sub in components:
        drawIt(graph.subgraph(sub), what='component')
In [12]:
# UNDIRECTED network graph
# Prints the counts and draws the full graph plus each connected component.
# `ugraph` is whatever the most recent loading loop left in the kernel.
describeGraph(ugraph)
In [13]:
# DIRECTED network graph
# Prints the counts and draws the full graph plus each weakly connected component.
# `dgraph` is only assigned by the loop that reads the directed GML file.
describeGraph_d(dgraph)
In [14]:
# list of connected components by size (undirected graph), largest first
connected_components = [len(c) for c in sorted(nx.connected_components(ugraph), key=len, reverse=True)]

# generate connected components as subgraphs (undirected graph)
# subgraph(c).copy() is what nx.connected_component_subgraphs did, and it
# works on NetworkX versions where that helper has been removed.
subgraphs = [ugraph.subgraph(c).copy() for c in nx.connected_components(ugraph)]

# greatest component (undirected MultiGraph), taken from the list built
# above instead of constructing every component subgraph a second time.
u_Gc = max(subgraphs, key=len)
u_Gc.name = "undirected Gc"
In [15]:
# Component sizes and a summary of the greatest undirected component.
# print(...) with one pre-formatted argument runs on Python 2 and 3; the two
# spaces after '=' keep the output identical to `print "... = ", value`.
print("connected components =  %s" % (connected_components,))
print(nx.info(u_Gc))
In [16]:
# use directed dgraph
# node sets of the weakly connected components, largest first, and their sizes
components = sorted(nx.weakly_connected_components(dgraph), key=len, reverse=True)
cc = [len(c) for c in components]

# generate connected components as subgraphs
# subgraph(c).copy() is what nx.weakly_connected_component_subgraphs did, and
# it works on NetworkX versions where that helper has been removed.
subgraphs = [dgraph.subgraph(c).copy() for c in nx.weakly_connected_components(dgraph)]

# greatest component, taken from the list built above instead of
# constructing every component subgraph a second time
d_Gc = max(subgraphs, key=len)
d_Gc.name = "directed Gc"
In [17]:
# Weakly-connected component sizes and a summary of the greatest directed component.
# print(...) with one pre-formatted argument runs on Python 2 and 3; the two
# spaces after '=' keep the output identical to `print "... = ", value`.
print("connected components =  %s" % (cc,))
print(nx.info(d_Gc))
In [18]:
# finally, greatest components for undirected and directed graphs
# (function-call form of print so the cell runs on Python 2 and 3)
print(nx.info(u_Gc))
print(nx.info(d_Gc))
In [19]:
# save Gc
#nx.write_gml(u_Gc, "../output/network/u_Gc_positive2.gml")
#nx.write_gml(d_Gc, "../output/network/d_Gc_positive2.gml")
In [20]:
# load directed Gc
Gc_files = glob('../output/network/d_Gc_positive2.gml')

# Columns of the graph-level summary table (one row per graph).
# NOTE: an 'avg deg' column is not collected at present.
network_data_columns = [
    'name', 'sentiment',
    '# nodes', '# edges',
    'density', 'deg assort coef',
    'avg deg cent', 'avg bet cent', 'avg clo cent',
    'high deg cent', 'high bet cent', 'high clo cent',
    'avg node conn',
    '# conn comp', 'gc size',
]

# Empty table with the columns in the order above.
network_data = pd.DataFrame(columns=network_data_columns)
In [21]:
# Gc_files
# Compute one row of graph-level statistics per file and add the rows to
# network_data once, after the loop (avoids growing the frame row by row).
graph_rows = []
for graph_num, gml_graph in enumerate(Gc_files):
    graph = nx.read_gml(gml_graph)
    (filepath, filename) = os.path.split(gml_graph)
    print('-' * 10)
    print(gml_graph)
    calculate_graph_inf(graph)

    # calculate variables
    sent = "pos"
    nodes = nx.number_of_nodes(graph)
    edges = nx.number_of_edges(graph)
    density = float("{0:.4f}".format(nx.density(graph)))

    # Each centrality is computed once and reused for both the average and
    # the top-ranked node (betweenness is the expensive one).
    deg_cen = nx.degree_centrality(graph)
    bet_cen = nx.betweenness_centrality(graph)
    clo_cen = nx.closeness_centrality(graph)

    # list(...) is required: on Python 3 dict.values() is a view, which numpy
    # does not turn into a numeric array.
    avg_deg_cen = np.mean(list(deg_cen.values()))
    avg_bet_cen = np.mean(list(bet_cen.values()))
    avg_clo_cen = np.mean(list(clo_cen.values()))

    #avg_deg = float("{0:.4f}".format(in_deg + out_deg))
    avg_node_con = float("{0:.4f}".format((nx.average_node_connectivity(graph))))
    deg_assort_coeff = float("{0:.4f}".format((nx.degree_assortativity_coefficient(graph))))
    conn_comp = nx.number_weakly_connected_components(graph)

    # (node, value) pairs with the highest centrality
    highest_deg_cen = highest_centrality(deg_cen)
    highest_bet_cen = highest_centrality(bet_cen)
    highest_clo_cen = highest_centrality(clo_cen)

    # Size of the largest weakly connected component: the node set is enough,
    # so no component subgraphs are built.
    Gc = len(max(nx.weakly_connected_components(graph), key=len))

    # save variables into a row record
    graph_values = {'name': filename,
                    'sentiment': sent,
                    '# nodes': nodes,
                    '# edges': edges,
                    #'avg deg':avg_deg,
                    'density': density,
                    'deg assort coef': deg_assort_coeff,
                    'avg deg cent': "%.4f" % avg_deg_cen,
                    'avg bet cent': "%.4f" % avg_bet_cen,
                    'avg clo cent': "%.4f" % avg_clo_cen,
                    'high deg cent': highest_deg_cen,
                    'high bet cent': highest_bet_cen,
                    'high clo cent': highest_clo_cen,
                    'avg node conn': avg_node_con,
                    '# conn comp': conn_comp,
                    'gc size': Gc
                    }
    graph_rows.append(graph_values)

# pd.concat instead of DataFrame.append, which pandas 2.0 removed.
network_data = pd.concat(
    [network_data, pd.DataFrame(graph_rows, columns=network_data.columns)],
    ignore_index=True)
In [22]:
# print network data for greatest component
# Bare expression: the notebook renders network_data as this cell's output.
network_data
Out[22]:
In [23]:
# save
#network_data.to_csv('../output/df/d_Gc_pos2.csv')
In [24]:
# load UNdirected Gc
Gc_files = glob('../output/network/u_Gc_positive2.gml')

# Columns of the graph-level summary table (one row per graph).
# NOTE: 'avg deg', '# conn comp' and 'gc size' are not collected here.
network_data_columns = [
    'name', 'sentiment',
    '# nodes', '# edges',
    'density', 'deg assort coef',
    'avg deg cent', 'avg bet cent', 'avg clo cent',
    'high deg cent', 'high bet cent', 'high clo cent',
    'avg node conn',
]

# Empty table with the columns in the order above (replaces any earlier
# network_data held in the kernel).
network_data = pd.DataFrame(columns=network_data_columns)
In [25]:
# Gc_files
# Compute one row of graph-level statistics per file and add the rows to
# network_data once, after the loop (avoids growing the frame row by row).
graph_rows = []
for graph_num, gml_graph in enumerate(Gc_files):
    graph = nx.read_gml(gml_graph)
    (filepath, filename) = os.path.split(gml_graph)
    print('-' * 10)
    print(gml_graph)
    calculate_graph_inf(graph)

    # calculate variables
    sent = "pos"
    nodes = nx.number_of_nodes(graph)
    edges = nx.number_of_edges(graph)
    density = float("{0:.4f}".format(nx.density(graph)))

    # Each centrality is computed once and reused for both the average and
    # the top-ranked node (betweenness is the expensive one).
    deg_cen = nx.degree_centrality(graph)
    bet_cen = nx.betweenness_centrality(graph)
    clo_cen = nx.closeness_centrality(graph)

    # list(...) is required: on Python 3 dict.values() is a view, which numpy
    # does not turn into a numeric array.
    avg_deg_cen = np.mean(list(deg_cen.values()))
    avg_bet_cen = np.mean(list(bet_cen.values()))
    avg_clo_cen = np.mean(list(clo_cen.values()))

    #avg_deg = float("{0:.4f}".format(in_deg + out_deg))
    avg_node_con = float("{0:.4f}".format((nx.average_node_connectivity(graph))))
    deg_assort_coeff = float("{0:.4f}".format((nx.degree_assortativity_coefficient(graph))))
    # The weakly-connected component count and greatest-component size are
    # not computed in this cell.

    # (node, value) pairs with the highest centrality
    highest_deg_cen = highest_centrality(deg_cen)
    highest_bet_cen = highest_centrality(bet_cen)
    highest_clo_cen = highest_centrality(clo_cen)

    # save variables into a row record
    graph_values = {'name': filename,
                    'sentiment': sent,
                    '# nodes': nodes,
                    '# edges': edges,
                    #'avg deg':avg_deg,
                    'density': density,
                    'deg assort coef': deg_assort_coeff,
                    'avg deg cent': "%.4f" % avg_deg_cen,
                    'avg bet cent': "%.4f" % avg_bet_cen,
                    'avg clo cent': "%.4f" % avg_clo_cen,
                    'high deg cent': highest_deg_cen,
                    'high bet cent': highest_bet_cen,
                    'high clo cent': highest_clo_cen,
                    'avg node conn': avg_node_con
                    #'# conn comp':conn_comp,
                    #'gc size':Gc
                    }
    graph_rows.append(graph_values)

# pd.concat instead of DataFrame.append, which pandas 2.0 removed.
network_data = pd.concat(
    [network_data, pd.DataFrame(graph_rows, columns=network_data.columns)],
    ignore_index=True)
In [26]:
# print network data for greatest component
# Bare expression: the notebook renders network_data as this cell's output.
network_data
Out[26]:
In [27]:
# save
#network_data.to_csv('../output/df/u_Gc_pos2.csv')
In [33]:
# Choose which greatest-component file the node-table loop reads:
# swap the commented line to use the directed variant instead.
#gml_files = glob('../output/network/d_Gc_positive2.gml')
gml_files = glob('../output/network/u_Gc_positive2.gml')
In [34]:
# 2_node_df: list all nodes and centrality
# Graph-level identifier columns that are attached to the node rows.
data_columns = ['name', 'sentiment']

# Fresh, empty identifier table (replaces any earlier `data` in the kernel).
data = pd.DataFrame(columns=data_columns)
# combined_df is deliberately not reset in this cell.
#combined_df = pd.DataFrame()
In [35]:
# Build the per-node table (degree + three centralities) for each file in
# gml_files. `df` is overwritten on every iteration, so after the loop it
# holds the table of the last graph only.
for graph_num, gml_graph in enumerate(gml_files):
    graph = nx.read_gml(gml_graph)
    (filepath, filename) = os.path.split(gml_graph)
    print('-' * 10)
    print(gml_graph)
    calculate_graph_inf(graph)

    # calculate variables and save into list
    sent = "pos"

    # Each centrality is computed exactly once (betweenness in particular is
    # expensive) and reused below.
    deg_cent = nx.degree_centrality(graph)
    bet_cent = nx.betweenness_centrality(graph)
    clo_cent = nx.closeness_centrality(graph)

    graph_values = {'name': filename,
                    'sentiment': sent
                    }
    # pd.concat instead of DataFrame.append, which pandas 2.0 removed.
    data = pd.concat([data, pd.DataFrame([graph_values], columns=data.columns)],
                     ignore_index=True)

    # dict(...) accepts both the plain dict returned by old NetworkX and the
    # degree view returned by NetworkX >= 2.
    degree = dict(nx.degree(graph))
    deg_df = pd.DataFrame.from_dict(degree, orient='index')
    deg_df.columns = ['degree']

    # degree centrality
    dc_df = pd.DataFrame.from_dict(deg_cent, orient='index')
    dc_df.columns = ['deg cent']

    # betweenness centrality
    bc_df = pd.DataFrame.from_dict(bet_cent, orient='index')
    bc_df.columns = ['bet cent']

    # closeness centrality
    cc_df = pd.DataFrame.from_dict(clo_cent, orient='index')
    cc_df.columns = ['clo cent']

    # concat node frames into node_df (one row per node, aligned on node label)
    frames = [deg_df, dc_df, bc_df, cc_df]
    node_df = pd.concat(frames, axis=1)
    node_df.index.name = 'node'
    node_df = node_df.reset_index()

    values = pd.DataFrame(graph_values, columns=('name', 'sentiment'), index=[0])

    # `values` has a single row, so after the side-by-side concat only row 0
    # has name/sentiment; forward-fill copies them down to every node row.
    df = pd.concat([values, node_df], axis=1)
    df = df.ffill()

    # Accumulating into combined_df is disabled in this cell.
    #combined_df = combined_df.append(df)
In [36]:
# print positive gc nodes
# Bare expression: the notebook renders df (the node table built by the
# preceding loop) as this cell's output.
df
Out[36]:
In [37]:
# save
#df.to_csv('../output/df/d_Gc_nodes_pos2.csv')
#df.to_csv('../output/df/u_Gc_nodes_pos2.csv')
In [46]:
# make sure you're using the right graph
# print(...) with one pre-formatted argument runs on Python 2 and 3; the two
# spaces after '=' keep the output identical to `print "... = ", value`.
print("gml_files =  %s" % (gml_files,))
print("gml_graph =  %s" % (gml_graph,))
In [47]:
# Pick the full network to analyse; swap the commented line for the directed one.
# FULL DIRECTED
#graph = nx.read_gml('../output/network/article_pos1.gml')
# FULL UNDIRECTED
graph = nx.read_gml('../output/network/article_u_pos.gml')
# function-call form of print so the cell runs on Python 2 and 3
print(nx.info(graph))
In [48]:
# Node-level centralities of `graph`, each turned into a one-column table
# indexed by node and sorted from lowest to highest value.
dc = nx.degree_centrality(graph)
bc = nx.betweenness_centrality(graph)
cc = nx.closeness_centrality(graph)

# degree centrality
dc_df = pd.DataFrame.from_dict(dc, orient='index')
dc_df.columns = ['degree cent']
dc_df = dc_df.sort_values(by='degree cent')

# betweenness centrality
bc_df = pd.DataFrame.from_dict(bc, orient='index')
bc_df.columns = ['betweenness cent']
bc_df = bc_df.sort_values(by='betweenness cent')

# closeness centrality
cc_df = pd.DataFrame.from_dict(cc, orient='index')
cc_df.columns = ['closeness cent']
cc_df = cc_df.sort_values(by='closeness cent')
In [49]:
# Degree centrality per node, as sorted by the preceding cell.
dc_df
Out[49]:
In [50]:
# Betweenness centrality per node, as sorted by the cell that built bc_df.
bc_df
Out[50]:
In [51]:
# Closeness centrality per node, as sorted by the cell that built cc_df.
cc_df
Out[51]:
In [52]:
# Pick the greatest component to analyse; swap the commented line for the directed one.
# Gc directed
#graph = nx.read_gml('../output/network/d_Gc_positive2.gml')
# Gc undirected
graph = nx.read_gml('../output/network/u_Gc_positive2.gml')
# function-call form of print so the cell runs on Python 2 and 3
print(nx.info(graph))
In [53]:
# Node-level centralities of `graph`, each turned into a one-column table
# indexed by node and sorted from lowest to highest value.
dc = nx.degree_centrality(graph)
bc = nx.betweenness_centrality(graph)
cc = nx.closeness_centrality(graph)

# degree centrality
dc_df = pd.DataFrame.from_dict(dc, orient='index')
dc_df.columns = ['degree cent']
dc_df = dc_df.sort_values(by='degree cent')

# betweenness centrality
bc_df = pd.DataFrame.from_dict(bc, orient='index')
bc_df.columns = ['betweenness cent']
bc_df = bc_df.sort_values(by='betweenness cent')

# closeness centrality
cc_df = pd.DataFrame.from_dict(cc, orient='index')
cc_df.columns = ['closeness cent']
cc_df = cc_df.sort_values(by='closeness cent')
In [54]:
# Degree centrality per node, as sorted by the preceding cell.
dc_df
Out[54]:
In [55]:
# Betweenness centrality per node, as sorted by the cell that built bc_df.
bc_df
Out[55]:
In [56]:
# Closeness centrality per node, as sorted by the cell that built cc_df.
cc_df
Out[56]:
In [57]:
# Reload the greatest component for the cut analysis; swap the commented line
# for the directed one.
# Gc directed
#graph = nx.read_gml('../output/network/d_Gc_positive2.gml')
# Gc undirected
graph = nx.read_gml('../output/network/u_Gc_positive2.gml')
# function-call form of print so the cell runs on Python 2 and 3
print(nx.info(graph))
In [58]:
# len(graph) is the number of nodes; single pre-formatted argument so the
# output is the same on Python 2 and 3.
print("Greatest component size = %s" % len(graph))
In [59]:
# returns all minimum k cutsets of an undirected graph
# i.e., the set(s) of nodes of cardinality equal to the node connectivity of G
# thus if removed, would break G into two or more connected components
#cutsets = list(nx.all_node_cuts(graph)) # must be undirected

# All prints use one pre-formatted argument so the cell runs, with identical
# output, on Python 2 and 3.
print("Greatest component size = %s" % len(graph))
#print "# of cutsets =", len(cutsets)

# returns a set of nodes or edges of minimum cardinality that disconnects G
min_ncut = nx.minimum_node_cut(graph)
min_ecut = nx.minimum_edge_cut(graph)
# (value,) keeps %-formatting correct whatever container type is returned
print("Min node cut = %s" % (min_ncut,))
print("Min edge cut = %s" % (min_ecut,))

# min cuts with source and target
print(nx.minimum_node_cut(graph, s='vaccines', t='autism'))
print(nx.minimum_edge_cut(graph, s='vaccines', t='autism'))
In [60]:
# read edge labels in min cut for Gc
# change source and target
a = nx.minimum_edge_cut(graph, s='vaccines', t='autism')
#a = nx.minimum_edge_cut(graph)
labels = nx.get_edge_attributes(graph, 'edge')

# Index the labels by their first two key fields (the endpoints), dropping
# anything after them, so they can be looked up with the pairs in `a`.
edgelabels = {}
for edge_key, label in labels.items():
    edgelabels[edge_key[0:2]] = label

for e in a:
    # `in` replaces dict.has_key(), which exists only on Python 2.
    if e in edgelabels:
        print("%s %s" % (e, edgelabels[e]))
    else:
        # The label may be stored under the opposite endpoint order.
        # A KeyError here means the edge has no 'edge' attribute either way.
        rev_e = e[::-1]
        print("%s %s" % (rev_e, edgelabels[rev_e]))