Constructing the graph of mentions from the hearings.

Setup


In [77]:
import networkx as nx
import pandas as pd
import numpy as np
import text_utils
import difflib
import os
%load_ext autoreload
%autoreload 2

transcript_dir = os.path.join('testimony/text/hearings')

df = pd.read_pickle('pickles/with_corenlp_mentions.p')
G = nx.DiGraph()


The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload

In [21]:
for n, row in df.iterrows():
    if n % 500 == 0:
        print n, "rows analyzed."

    # test run
    # if n > 2000:
    #     break

    speaker = row['speaker']
    mentions = row['corenlp_mentions']
    if type(mentions) == list:
        for mention in mentions:
            G.add_edge(speaker, mention)


0 rows analyzed.
500 rows analyzed.
1000 rows analyzed.
1500 rows analyzed.
2000 rows analyzed.
2500 rows analyzed.
3000 rows analyzed.
3500 rows analyzed.
4000 rows analyzed.
4500 rows analyzed.
5000 rows analyzed.
5500 rows analyzed.
6000 rows analyzed.
6500 rows analyzed.
7000 rows analyzed.
7500 rows analyzed.
8000 rows analyzed.
8500 rows analyzed.
9000 rows analyzed.
9500 rows analyzed.
10000 rows analyzed.
10500 rows analyzed.
11000 rows analyzed.
11500 rows analyzed.
12000 rows analyzed.
12500 rows analyzed.

Let's see if the node labels are reasonable.


In [ ]:
speakers = pd.Series(G.node.keys())
sample = np.random.choice(speakers.index.values, 50)
sample = speakers.ix[sample]
sample

It actually looks pretty reasonable.

What won't be reasonable is the list of mentions. We should chunk the list and turn each mention into an I.D.

Chunking the mention list


In [85]:
mentions = df['corenlp_mentions'].dropna()
mentions = [mention for sublist in mentions for mention in sublist]
mentions = [mention.lower() for mention in mentions]

mentions = [list(set(mentions)) for mentions in text_utils.chunk_list(mentions)]



def get_key(mention, l):
    mention = mention.lower()
    for chunk in l:
        if mention in chunk:
            return l.index(chunk)

Now, we can use 'get_mention_key' to retrieve a disambiguated key that refers to a single entity in the mention list.

Add a column on the dataframe for the disambiguated mentions


In [86]:
disambiguated_mentions = []

for n, row in df.iterrows():
    corenlp_mentions = row['corenlp_mentions']
    if type(corenlp_mentions) == list:
        keys = []
        for mention in corenlp_mentions:
            keys.append(get_mention_key(mention))
        disambiguated_mentions.append(keys)
    else:
        disambiguated_mentions.append(np.nan)

df['disambiguated_mentions'] = disambiguated_mentions

Time to create the mention graph with disambiguated mentions.

We need to translate the speakers into their disambiguated mention keys first, though (and add it to the dataframe)


In [87]:
disambiguated_names = list(mentions)

for speaker in list(set(df['speaker'])):
    if get_key(speaker, disambiguated_names):
        continue # we have a key
    else:
        disambiguated_names.append(speaker)

disambiguated_speakers = []
for speaker in df['speaker']:
    key = get_key(speaker, disambiguated_names)
    disambiguated_speakers.append(key)

df['disambiguated_speaker'] = disambiguated_speakers

When we create the graph, we don't want to include interviewer names.


In [88]:
interviewee_names = [f.replace(".txt", "") for f in os.listdir(transcript_dir)]
interviewee_names = map(lambda s: s.replace("-", " "), interviewee_names)

def is_interviewer(name):
    return not difflib.get_close_matches(name, interviewee_names)

In [90]:
skipped = 0
G = nx.DiGraph()
for n, row in df.iterrows():
    if n % 500 == 0:
        print n, "rows analyzed."

    # test run
    # if n > 2000:
    #     break
    
    
    speaker = row['disambiguated_speaker']
    # if the speaker is an interviewer
    if any(map(is_interviewer, disambiguated_names[speaker])):
        skipped += 1
        continue

    mentions = row['disambiguated_mentions']
    if type(mentions) == list:
        for mention in mentions:
            G.add_edge(speaker, mention)
print "skipped", skipped, "speechacts."


0 rows analyzed.
500 rows analyzed.
1000 rows analyzed.
1500 rows analyzed.
2000 rows analyzed.
2500 rows analyzed.
3000 rows analyzed.
3500 rows analyzed.
4000 rows analyzed.
4500 rows analyzed.
5000 rows analyzed.
5500 rows analyzed.
6000 rows analyzed.
6500 rows analyzed.
7000 rows analyzed.
7500 rows analyzed.
8000 rows analyzed.
8500 rows analyzed.
9000 rows analyzed.
9500 rows analyzed.
10000 rows analyzed.
10500 rows analyzed.
11000 rows analyzed.
11500 rows analyzed.
12000 rows analyzed.
12500 rows analyzed.
skipped 7853 speechacts.

In [95]:
print "# interviewerrs:", len(filter(is_interviewer, list(set(df['speaker']))))

print "# speakers:", len(list(set(df['speaker'])))


# interviewerrs: 18
# speakers: 68

In [91]:
nx.write_gml(G, 'graphs/mention_graph.gml')

In [92]:
len(disambiguated_mentions)


Out[92]:
12954

In [123]:
import pickle
pickle.dump(disambiguated_names, open('pickles/disambiguated_names_for_corenlp_mentions.p', 'wb'))