In [15]:
    
import pickle
import text_utils
import networkx as nx
    
Load the original graph.
In [8]:
    
# Load the pickled naming graph. The cells below treat it as a dict whose keys
# are names and whose values are lists of names.
# NOTE(review): pickle.load can execute arbitrary code; only load trusted files.
# The context manager closes the file handle, which the bare open() call leaked.
with open('testimony/text/annual-reports/pickles/named-graph.p', 'rb') as pickle_file:
    named_graph = pickle.load(pickle_file)
    
In [13]:
    
# Peek at the first five (key, value) pairs of the loaded graph.
list(named_graph.items())[:5]
    
    Out[13]:
In [9]:
    
# Collect every name that occurs in the graph: the dict keys plus each entry
# of the value lists, flattened into a single list.
keys = list(named_graph.keys())
vals = []
for namelist in named_graph.values():
    vals.extend(namelist)
# Lower-case everything and de-duplicate through a set (order is not preserved).
names = list(set([name.lower() for name in keys + vals]))
    
In [10]:
    
# Group the names with text_utils.chunk_list. The cells below treat the result
# as a list of chunks and use a name's chunk index as its node id.
# NOTE(review): presumably each chunk collects variant spellings of one person;
# confirm against text_utils.
disambiguated_names = text_utils.chunk_list(names)
    
In [11]:
    
# Report how many distinct names exist before and after chunking.
print("%s %s" % ("# original names:", len(names)))
print("%s %s" % ("# chunked names:", len(disambiguated_names)))
    
    
In [12]:
    
def get_key(mention, l):
    """Return the index of the first chunk in ``l`` that contains ``mention``.

    The mention is lower-cased before the lookup.

    Args:
        mention: the name to look up (any case).
        l: the chunked list; every chunk must support the ``in`` operator.

    Returns:
        The position of the first matching chunk, or ``None`` when no chunk
        contains the mention.
    """
    mention = mention.lower()
    # enumerate yields the position directly, so no second scan via l.index().
    for position, chunk in enumerate(l):
        if mention in chunk:
            return position
    return None
    
In [17]:
    
# Re-key the naming graph by chunk index, so that every name sharing a chunk
# collapses onto one node id.
disambiguated_naming_graph = {}
for snitch, accused in named_graph.items():
    snitch_key = get_key(snitch, disambiguated_names)
    accused_keys = [get_key(a, disambiguated_names) for a in accused]
    # Several original names can share one chunk index. Extend the existing
    # list instead of overwriting it, otherwise the edges collected for an
    # earlier alias of the same person are silently lost.
    disambiguated_naming_graph.setdefault(snitch_key, []).extend(accused_keys)
    
In [41]:
    
# Directed graph with one edge from each naming node to every node it named.
G = nx.DiGraph()
G.add_edges_from(
    (source, target)
    for source, targets in disambiguated_naming_graph.items()
    for target in targets
)

# Persist the graph as GML.
nx.write_gml(G, 'graphs/final/disambiguated_naming_graph.gml')
    
NetworkX has no measure of reciprocity... so let's make our own!
In [118]:
    
def reciprocity(D):
    """Compute the proportion of reciprocated edges to all edges.

    Both counts are taken on the undirected version of ``D``: an undirected
    edge {u, v} counts as reciprocated when ``D`` contains both u -> v and
    v -> u.

    Args:
        D: a directed graph providing ``to_undirected()`` and ``has_edge()``.

    Returns:
        A float between 0.0 and 1.0. A graph without edges yields 0.0 instead
        of raising ZeroDivisionError.
    """
    undirected = D.to_undirected()
    total = undirected.number_of_edges()
    if total == 0:
        return 0.0
    # Count mutual pairs directly rather than deleting one-way edges from a copy.
    mutual = sum(
        1 for u, v in undirected.edges()
        if D.has_edge(u, v) and D.has_edge(v, u)
    )
    return float(mutual) / total
    
def _report(label, value):
    """Print one ``label value`` line (same text as ``print label, value``)."""
    print("%s %s" % (label, value))


def run_statistics(G):
    """Print summary statistics for the directed graph ``G``.

    Reports node and edge counts and mean degrees for the whole graph, then
    reciprocity, average shortest path length, diameter, radius, periphery and
    center sizes, transitivity and triangle count for the largest weakly
    connected component.

    Args:
        G: a networkx directed graph.
    """
    # Keep only positive degrees, computed once per node. Each list therefore
    # has one entry per naming node / named node.
    outdegrees = [d for d in (G.out_degree(n) for n in G) if d > 0]
    indegrees = [d for d in (G.in_degree(n) for n in G) if d > 0]

    _report('# of naming nodes:', len(outdegrees))
    _report('# of named nodes:', len(indegrees))

    _report("# of nodes:", len(G))
    _report("# of edges:", G.number_of_edges())
    # Guard the means so a graph without edges reports 0.0 instead of raising
    # ZeroDivisionError.
    _report("average outdegree for nodes with outdegree > 0:",
            float(sum(outdegrees)) / len(outdegrees) if outdegrees else 0.0)
    # Label corrected: this mean is taken over nodes with indegree > 0.
    _report("average indegree for nodes with indegree > 0:",
            float(sum(indegrees)) / len(indegrees) if indegrees else 0.0)

    # NOTE(review): weakly_connected_component_subgraphs is not available in
    # recent networkx releases; confirm the pinned version.
    giant_component = max(list(nx.weakly_connected_component_subgraphs(G)), key=len)
    _report("reciprocity for naming graph:", reciprocity(giant_component))
    # take the avg. shortest path length for the giant component.
    _report("average shortest path length:", nx.average_shortest_path_length(giant_component))

    # The remaining measures are computed on the undirected giant component.
    gnt = giant_component.to_undirected()
    _report("max eccentricity/diameter:", nx.diameter(gnt))
    _report("min eccentricity/radius:", nx.radius(gnt))
    _report("# in periphery (at diameter):", len(nx.periphery(gnt)))
    _report("# in center (at radius):", len(nx.center(gnt)))
    _report("transitivity (fraction of all possible triangles in graph)", nx.transitivity(gnt))

    # nx.triangles counts each triangle once per member node, hence the division by 3.
    _report('# triangles:', sum(nx.triangles(gnt).values()) // 3)
    
Some network statistics
In [122]:
    
# Print the summary statistics for the disambiguated naming graph G.
run_statistics(G)
    
    
In [130]:
    
# Largest weakly connected component of the naming graph.
giant_component = max(list(nx.weakly_connected_component_subgraphs(G)), key=len)
# Shortest-path lengths from one node of that component to every node reachable
# from it. Subscripting the graph (giant_component[0]) yields the adjacency
# mapping of node 0, not a node, so it cannot serve as the source. Use the
# first node the component yields instead.
# NOTE(review): if a specific starting node was intended, pass it explicitly.
source_node = next(iter(giant_component))
nx.single_source_shortest_path_length(giant_component, source_node)
    
    
In [ ]:
    
    
In [83]:
    
# Read a second graph from a GML file.
# NOTE(review): nothing in the visible cells uses mentions_graph yet; confirm
# it is still needed.
mentions_graph = nx.read_gml('graphs/final/gml/dominant_categories.gml')
    
In [121]:
    
    
    
In [ ]:
    
# NOTE(review): unfinished cell. `eccentricity` is not defined anywhere in the
# visible code (only the `nx` module is imported), so running this raises
# NameError. Presumably nx.eccentricity(<graph>) was intended; confirm or delete.
eccentricity