Constructing the graph of mentions from the hearings.


In [77]:
import networkx as nx
import pandas as pd
import numpy as np
import text_utils
import difflib
import os
%load_ext autoreload
%autoreload 2

# Fresh, empty directed graph that the mention edges are added to.
G = nx.DiGraph()

# Directory whose file names are listed later (via os.listdir) to get interviewee names.
transcript_dir = 'testimony/text/hearings'

# NOTE: read_pickle unpickles arbitrary objects -- only load files you trust.
df = pd.read_pickle('pickles/with_corenlp_mentions.p')

In [21]:
# Build the raw mention graph: one directed edge from the row's speaker to
# every mention recorded for that row.
for n, row in df.iterrows():
    if n % 500 == 0:
        # Single-argument print(...) produces the same text on Python 2 and 3.
        print("%s rows analyzed." % n)

    speaker = row['speaker']
    # Named `row_mentions` so it cannot be confused with the notebook-level
    # `mentions` list used in later cells.
    row_mentions = row['corenlp_mentions']
    # Only list-valued cells carry mentions; anything else is skipped.
    if isinstance(row_mentions, list):
        for mention in row_mentions:
            G.add_edge(speaker, mention)

0 rows analyzed.
12500 rows analyzed.

Let's see if the node labels are reasonable.

In [ ]:
# Draw 50 node labels at random (with replacement) for a visual sanity check.
# `G.nodes()` and `.loc` are used instead of `G.node` / `.ix`, which were
# removed from later networkx / pandas releases.
speakers = pd.Series(list(G.nodes()))
sample_labels = np.random.choice(speakers.index.values, 50)
sample = speakers.loc[sample_labels]
# Last expression of the cell, so the sample is actually displayed.
sample

It actually looks pretty reasonable.

What won't be reasonable is the list of mentions. We should chunk the list and map each mention to an ID.

Chunking the mention list

In [85]:
# Flatten every row's mention list into one flat list of lower-cased strings.
all_mentions = []
for row_mentions in df['corenlp_mentions'].dropna():
    for mention in row_mentions:
        all_mentions.append(mention.lower())

# Chunk the flat list with text_utils.chunk_list, then de-duplicate each chunk.
# The position of a chunk in `mentions` is what get_key returns as its key.
mentions = [list(set(chunk)) for chunk in text_utils.chunk_list(all_mentions)]

def get_key(mention, l):
    """Return the index of the first chunk in `l` that contains `mention`.

    The mention is lower-cased before the lookup, so a match requires the
    chunks to hold lower-cased strings.

    Parameters
    ----------
    mention : str
        The mention to look up.
    l : list
        List of chunks; each chunk is a collection supporting `in`.

    Returns
    -------
    int or None
        Position of the first matching chunk, or None when no chunk
        contains the mention. Note that 0 is a valid key, so callers must
        test the result with `is None`, not truthiness.
    """
    needle = mention.lower()
    # enumerate gives the position directly; no second scan via l.index().
    for key, chunk in enumerate(l):
        if needle in chunk:
            return key
    return None

Now we can use `get_key` to retrieve a disambiguated key that refers to a single entity in the mention list.

Add a column on the dataframe for the disambiguated mentions

In [86]:
# Translate each row's raw mentions into their chunk keys.
# NOTE(review): the body of the inner loop was missing from this cell (a `for`
# without a body is a syntax error, and the result list was never filled).
# It is reconstructed here from how the 'disambiguated_mentions' column is
# consumed when the graph is built (a list of keys per row, otherwise a
# non-list value) -- confirm against the original notebook.
disambiguated_mentions = []

for n, row in df.iterrows():
    corenlp_mentions = row['corenlp_mentions']
    if type(corenlp_mentions) == list:
        keys = [get_key(mention, mentions) for mention in corenlp_mentions]
        disambiguated_mentions.append(keys)
    else:
        # Keep exactly one entry per row so the new column lines up with df.
        disambiguated_mentions.append(None)

df['disambiguated_mentions'] = disambiguated_mentions

Time to create the mention graph with disambiguated mentions.

We need to translate the speakers into their disambiguated mention keys first, though (and add it to the dataframe)

In [87]:
# Start from a copy of the mention chunks so speakers and mentions share one
# key space (existing mention keys stay valid because new chunks are appended).
disambiguated_names = list(mentions)

# Give every speaker that is not already part of a chunk a chunk of its own.
# NOTE(review): the original loop only contained the `continue` branch and the
# second loop never stored its key, so the column assignment below could not
# match the dataframe length. The missing steps are reconstructed from how
# `disambiguated_names[speaker]` is indexed when the graph is built -- confirm.
for speaker in set(df['speaker']):
    # `is None`, not truthiness: key 0 is a valid key.
    if get_key(speaker, disambiguated_names) is None:
        # get_key lower-cases its argument, so store the lower-cased name.
        disambiguated_names.append([speaker.lower()])

disambiguated_speakers = [
    get_key(speaker, disambiguated_names) for speaker in df['speaker']
]

df['disambiguated_speaker'] = disambiguated_speakers

When we create the graph, we don't want to include interviewer names.

In [88]:
# Interviewee names come from the transcript file names: drop ".txt" and turn
# hyphens into spaces.
# Built as a real list rather than with `map`: on Python 3 `map` returns a
# one-shot iterator, which the first difflib.get_close_matches call would
# exhaust, leaving every later lookup without candidates.
interviewee_names = [
    f.replace(".txt", "").replace("-", " ")
    for f in os.listdir(transcript_dir)
]

def is_interviewer(name):
    """Tell whether `name` has no close match among `interviewee_names`.

    Returns True when difflib.get_close_matches finds no candidate for
    `name`, and False as soon as there is at least one.
    """
    close_matches = difflib.get_close_matches(name, interviewee_names)
    return len(close_matches) == 0

In [90]:
# Rebuild the mention graph on disambiguated keys, leaving out speech acts
# whose speaker is an interviewer.
skipped = 0
G = nx.DiGraph()
for n, row in df.iterrows():
    if n % 500 == 0:
        # Single-argument print(...) produces the same text on Python 2 and 3.
        print("%s rows analyzed." % n)

    speaker = row['disambiguated_speaker']
    # The speaker key indexes a chunk of names; treat the speaker as an
    # interviewer if any name in that chunk is classified as one.
    if any(is_interviewer(name) for name in disambiguated_names[speaker]):
        skipped += 1
        # Actually skip the row: previously it was only counted and its edges
        # were still added to the graph.
        continue

    # Named `row_mentions` so the notebook-level `mentions` chunk list is not
    # overwritten by this loop.
    row_mentions = row['disambiguated_mentions']
    if isinstance(row_mentions, list):
        for mention in row_mentions:
            G.add_edge(speaker, mention)
print("skipped %s speechacts." % skipped)

0 rows analyzed.
skipped 7853 speechacts.

In [95]:
# Count distinct speakers and how many of them are classified as interviewers.
unique_speakers = set(df['speaker'])
# A list comprehension instead of len(filter(...)): filter has no len() on Python 3.
interviewers = [name for name in unique_speakers if is_interviewer(name)]

print("# interviewers: %d" % len(interviewers))

print("# speakers: %d" % len(unique_speakers))

# interviewerrs: 18
# speakers: 68

In [91]:
# Save the filtered mention graph in GML format.
gml_path = 'graphs/mention_graph.gml'
nx.write_gml(G, gml_path)

In [123]:
import pickle

# Persist the name chunks so keys in the graph can be mapped back to names.
# The `with` block closes the file handle even if pickle.dump raises.
with open('pickles/disambiguated_names_for_corenlp_mentions.p', 'wb') as f:
    pickle.dump(disambiguated_names, f)