pure naming graph



In [15]:

    
import pickle
import text_utils
import networkx as nx

load the original graph



In [8]:

    
named_graph = pickle.load(open('testimony/text/annual-reports/pickles/named-graph.p', 'rb'))



In [13]:

    
named_graph.items()[:5]









    Out[13]:





[('john issacs', [u'wayne b. salisbury', u"richard f. o'hair"]),
 ('william e. oliver', [u'alice bennett', u'urcel daniel']),
 ('charles gainor', [u'dr. gainor']),
 ('john morgan', [u'charles d. blodgett']),
 ('lydia mates', [u'mrs. dave mates', u'bereniece baldwin'])]

create a disambiguated name list



In [9]:

    
keys = named_graph.keys()

# flatten values
vals = [name for namelist in named_graph.values() for name in namelist]

# append together
names = keys + vals

names = [name.lower() for name in names]

# remove duplicates
names = list(set(names))



In [10]:

    
disambiguated_names = text_utils.chunk_list(names)



In [11]:

    
print "# original names:", len(names)
print "# chunked names:", len(disambiguated_names)









    



# original names: 1917
# chunked names: 1864



In [12]:

    
def get_key(mention, l):
    "second argument is the chunked list."
    mention = mention.lower()
    for chunk in l:
        if mention in chunk:
            return l.index(chunk)

create the new graph with the disambiguated names



In [17]:

    
disambiguated_naming_graph = {}
for snitch, accused in named_graph.items():
    snitch_key = get_key(snitch, disambiguated_names)
    accused = [get_key(a, disambiguated_names) for a in accused]
    disambiguated_naming_graph[snitch_key] = accused

move it to a gml/networkx graph



In [41]:

    
G = nx.DiGraph()

for snitch, accused in disambiguated_naming_graph.items():
    for a in accused:
      G.add_edge(snitch, a)
        
nx.write_gml(G, 'graphs/final/disambiguated_naming_graph.gml')

Networkx has no measure of reciprocity...so let's make our own!



In [118]:

    
def reciprocity(D):
        "computes the proportion of reciprocated edges to all edges"
        G=D.to_undirected()
        for (u,v) in D.edges():
            if not D.has_edge(v,u):
                    G.remove_edge(u,v)
        return float(len(G.edges()))/len(D.to_undirected().edges())
    
def run_statistics(G):
    outdegrees = [G.out_degree(n) for n in G.node if G.out_degree(n) > 0]
    indegrees = [G.in_degree(n) for n in G.node if G.in_degree(n) > 0]
    
    naming_nodes = [n for n in G.node if G.out_degree(n) > 0]
    named_nodes = [n for n in G.node if G.in_degree(n) > 0]
    
    print '# of naming nodes:', len(naming_nodes)
    print '# of named nodes:', len(named_nodes)
    
    print "# of nodes:", len(G.node)
    print "# of edges:", G.number_of_edges()

    print "average outdegree for nodes with outdegree > 0:", float(sum(outdegrees))/len(outdegrees)
    print "average indegree for nodes with outdegree > 0:", float(sum(indegrees))/len(indegrees)
    
    giant_component = max(list(nx.weakly_connected_component_subgraphs(G)), key=len)

    print "reciprocity for naming graph:", reciprocity(giant_component)
    # take the avg. shortest path length for the giant component.
    print "average shortest path length:", nx.average_shortest_path_length(giant_component)
    
    gnt = giant_component.to_undirected()
    print "max eccentricity/diameter:", nx.diameter(gnt)
    print "min eccentricity/radius:", nx.radius(gnt)
    print "# in periphery (at diameter):", len(nx.periphery(gnt))
    print "# in center (at radius):", len(nx.center(gnt))
    print "transitivity (fraction of all possible triangles in graph)", nx.transitivity(gnt)
    
    print '# triangles:', sum(nx.triangles(gnt).values())/3

Some network statistics



In [122]:

    
run_statistics(G)









    



# of naming nodes: 1628
# of named nodes: 348
# of nodes: 1832
# of edges: 2839
average outdegree for nodes with outdegree > 0: 1.74385749386
average indegree for nodes with outdegree > 0: 8.15804597701
reciprocity for naming graph: 0.0118449389806
average shortest path length: 0.002001130494
max eccentricity/diameter: 12
min eccentricity/radius: 7
# in periphery (at diameter): 2
# in center (at radius): 100
transitivity (fraction of all possible triangles in graph) 0.00356820605365
# triangles: 87



In [130]:

    
giant_component = max(list(nx.weakly_connected_component_subgraphs(G)), key=len)
nx.single_source_shortest_path_length(giant_component,giant_component[0])









    



---------------------------------------------------------------------------
TypeError                                 Traceback (most recent call last)
<ipython-input-130-d1ca5c9fe9c9> in <module>()
      1 giant_component = max(list(nx.weakly_connected_component_subgraphs(G)), key=len)
      2 
----> 3 nx.single_source_shortest_path_length(giant_component,giant_component[0])

/usr/local/Cellar/python/2.7.5/Frameworks/Python.framework/Versions/2.7/lib/python2.7/site-packages/networkx/algorithms/shortest_paths/unweighted.pyc in single_source_shortest_path_length(G, source, cutoff)
     54     seen={}                  # level (number of hops) when seen in BFS
     55     level=0                  # the current level
---> 56     nextlevel={source:1}  # dict of nodes to check at next level
     57     while nextlevel:
     58         thislevel=nextlevel  # advance to next level

TypeError: unhashable type: 'dict'



In [ ]:

mentions graph



In [83]:

    
mentions_graph = nx.read_gml('graphs/final/gml/dominant_categories.gml')



In [121]:









    



# of naming nodes: 47
# of named nodes: 871
# of nodes: 904
# of edges: 1255
average outdegree for nodes with outdegree > 0: 26.7021276596
average indegree for nodes with outdegree > 0: 1.44087256028
reciprocity for naming graph: 0.0
average shortest path length: 0.00265462225228
max eccentricity/diameter: 7
min eccentricity/radius: 4
# in periphery (at diameter): 24
# in center (at radius): 20
transitivity (fraction of all possible triangles in graph) 0.00346400323307
# triangles: 40



In [ ]:

    
eccentricity