In [15]:
import pickle
import text_utils
import networkx as nx
Load the original graph.
In [8]:
# Load the pickled naming graph.
# NOTE(review): pickle.load can execute arbitrary code while unpickling; only
# load this file if it comes from a trusted source.
# The context manager closes the file handle as soon as the graph is loaded.
with open('testimony/text/annual-reports/pickles/named-graph.p', 'rb') as pickle_file:
    named_graph = pickle.load(pickle_file)
In [13]:
In [9]:
# Every name that appears in the graph: the dict keys plus every name listed
# in the dict values.
keys = named_graph.keys()

# Flatten the per-key name lists into a single list.
vals = []
for namelist in named_graph.values():
    vals.extend(namelist)

# Lower-case everything and drop duplicates in one pass.
names = list(set(name.lower() for name in keys + vals))
In [10]:
# Chunk the flat list of lower-cased names with the project helper.
# NOTE(review): presumably each chunk groups the spellings/variants that refer
# to the same person -- confirm against text_utils.chunk_list.
disambiguated_names = text_utils.chunk_list(names)
In [11]:
print "# original names:", len(names)
print "# chunked names:", len(disambiguated_names)
In [12]:
def get_key(mention, l):
    """Return the index of the first chunk in ``l`` that contains ``mention``.

    The mention is lower-cased before the lookup, and containment is tested
    with ``in`` against each chunk in order.

    Parameters:
        mention: the name to look up (a string).
        l: the chunked list, i.e. a sequence of chunks.

    Returns:
        The position of the first matching chunk, or ``None`` when no chunk
        contains the mention.
    """
    mention = mention.lower()
    # enumerate yields the position directly, so there is no need to re-scan
    # the list with l.index() once the matching chunk has been found.
    for position, chunk in enumerate(l):
        if mention in chunk:
            return position
    return None
In [17]:
# Re-key the naming graph by chunk index: every name (accuser and accused) is
# replaced by the index returned by get_key for the chunked name list.
disambiguated_naming_graph = {}
for snitch, accused in named_graph.items():
    snitch_key = get_key(snitch, disambiguated_names)
    accused_keys = [get_key(a, disambiguated_names) for a in accused]
    # Different raw names can map to the same key. Merge their accused lists
    # instead of assigning, otherwise the last name seen silently overwrites
    # the edges contributed by the earlier ones.
    disambiguated_naming_graph.setdefault(snitch_key, []).extend(accused_keys)
In [41]:
# Build the directed graph: one edge from each accuser key to every key it named.
G = nx.DiGraph()
G.add_edges_from(
    (snitch, a)
    for snitch, accused in disambiguated_naming_graph.items()
    for a in accused
)

# Persist the graph in GML format.
nx.write_gml(G, 'graphs/final/disambiguated_naming_graph.gml')
NetworkX has no measure of reciprocity, so let's make our own!
In [118]:
def reciprocity(D):
    """Compute the proportion of reciprocated edges to all edges in ``D``.

    Edges are counted per unordered node pair: the denominator is the number
    of edges in the undirected version of ``D``, and the numerator is the
    number of those pairs that are linked in both directions in ``D``.

    Parameters:
        D: a directed graph exposing ``edges()``, ``has_edge(u, v)`` and
           ``to_undirected()`` (e.g. a networkx ``DiGraph``).

    Returns:
        A float between 0.0 and 1.0. An edgeless graph yields 0.0 rather
        than raising ZeroDivisionError.
    """
    total_pairs = D.to_undirected().number_of_edges()
    if total_pairs == 0:
        return 0.0
    # A mutual pair shows up twice in D.edges(), as (u, v) and (v, u); the
    # frozenset collapses both directions into a single entry.
    mutual_pairs = set(
        frozenset((u, v)) for (u, v) in D.edges() if D.has_edge(v, u)
    )
    return float(len(mutual_pairs)) / total_pairs
def run_statistics(G):
outdegrees = [G.out_degree(n) for n in G.node if G.out_degree(n) > 0]
indegrees = [G.in_degree(n) for n in G.node if G.in_degree(n) > 0]
naming_nodes = [n for n in G.node if G.out_degree(n) > 0]
named_nodes = [n for n in G.node if G.in_degree(n) > 0]
print '# of naming nodes:', len(naming_nodes)
print '# of named nodes:', len(named_nodes)
print "# of nodes:", len(G.node)
print "# of edges:", G.number_of_edges()
print "average outdegree for nodes with outdegree > 0:", float(sum(outdegrees))/len(outdegrees)
print "average indegree for nodes with outdegree > 0:", float(sum(indegrees))/len(indegrees)
giant_component = max(list(nx.weakly_connected_component_subgraphs(G)), key=len)
print "reciprocity for naming graph:", reciprocity(giant_component)
# take the avg. shortest path length for the giant component.
print "average shortest path length:", nx.average_shortest_path_length(giant_component)
gnt = giant_component.to_undirected()
print "max eccentricity/diameter:", nx.diameter(gnt)
print "min eccentricity/radius:", nx.radius(gnt)
print "# in periphery (at diameter):", len(nx.periphery(gnt))
print "# in center (at radius):", len(
print "transitivity (fraction of all possible triangles in graph)", nx.transitivity(gnt)
print '# triangles:', sum(nx.triangles(gnt).values())/3
Some network statistics
In [122]:
In [130]:
# Largest weakly connected component of G, measured by number of nodes.
giant_component = max(nx.weakly_connected_component_subgraphs(G), key=len)
In [ ]:
In [83]:
# Read the mentions graph back from its GML file on disk.
mentions_graph = nx.read_gml('graphs/final/gml/dominant_categories.gml')
In [121]:
In [ ]: