In [77]:
import networkx as nx
import pandas as pd
import numpy as np
import text_utils
import difflib
import os
%load_ext autoreload
%autoreload 2
# Where the hearing transcript text files live (os.path.join with a
# single argument returned the string unchanged, so use it directly).
transcript_dir = 'testimony/text/hearings'
# Speech-act dataframe with CoreNLP-extracted mentions per row.
df = pd.read_pickle('pickles/with_corenlp_mentions.p')
# Directed graph that will hold speaker -> mention edges.
G = nx.DiGraph()
In [21]:
for n, row in df.iterrows():
if n % 500 == 0:
print n, "rows analyzed."
# test run
# if n > 2000:
# break
speaker = row['speaker']
mentions = row['corenlp_mentions']
if type(mentions) == list:
for mention in mentions:
G.add_edge(speaker, mention)
Let's see if the node labels are reasonable.
In [ ]:
# Pull 50 random node labels to eyeball whether the names look sane.
# NOTE(review): G.node is the pre-2.x networkx attribute dict — confirm
# the installed version before upgrading this to G.nodes.
speakers = pd.Series(G.node.keys())
sample_labels = np.random.choice(speakers.index.values, 50)
# .loc replaces the deprecated .ix; the labels here are the default
# integer index, so label-based lookup matches the original behavior.
sample = speakers.loc[sample_labels]
sample
In [85]:
# Flatten every CoreNLP mention into one lowercase list, then let
# text_utils.chunk_list group variant spellings of the same person;
# each chunk is de-duplicated. The final `mentions` is a list of chunks.
mention_lists = df['corenlp_mentions'].dropna()
flat_mentions = [m for sublist in mention_lists for m in sublist]
lowered = [m.lower() for m in flat_mentions]
mentions = [list(set(chunk)) for chunk in text_utils.chunk_list(lowered)]
def get_key(mention, l):
    """Return the index of the first chunk in `l` containing `mention`
    (lowercased), or None when no chunk contains it.

    `l` is a list of chunks; each chunk is a list of lowercased name
    variants referring to the same person.
    """
    mention = mention.lower()
    # enumerate gives the index directly, avoiding the original's
    # redundant O(n) l.index(chunk) re-scan on every hit.
    for i, chunk in enumerate(l):
        if mention in chunk:
            return i
    return None  # explicit: no chunk matched
In [86]:
# Translate each row's raw mentions into disambiguated chunk keys.
disambiguated_mentions = []
for n, row in df.iterrows():
    corenlp_mentions = row['corenlp_mentions']
    if type(corenlp_mentions) == list:
        # BUG FIX: the original called get_mention_key(), which is never
        # defined (NameError). The helper defined above is
        # get_key(mention, l), looked up against the chunked `mentions`.
        keys = [get_key(mention, mentions) for mention in corenlp_mentions]
        disambiguated_mentions.append(keys)
    else:
        # Keep NaN for rows that had no mention list.
        disambiguated_mentions.append(np.nan)
df['disambiguated_mentions'] = disambiguated_mentions
Time to create the mention graph with disambiguated mentions.
We need to translate the speakers into their disambiguated mention keys first, though (and add them to the dataframe).
In [87]:
# Extend the disambiguated name chunks with any speaker that never
# appears as a mention, then map every speaker to its chunk key.
disambiguated_names = list(mentions)
for speaker in set(df['speaker']):
    # BUG FIX: get_key returns a chunk *index*, and index 0 is falsy —
    # the original truthiness test treated speakers found in chunk 0 as
    # missing and re-appended them. Compare against None instead.
    if get_key(speaker, disambiguated_names) is None:
        # BUG FIX: append a one-element lowercase chunk rather than the
        # raw string; get_key lowercases its argument, so a raw
        # mixed-case speaker string could never match itself, and a bare
        # string chunk triggers substring matching in `mention in chunk`.
        disambiguated_names.append([speaker.lower()])
disambiguated_speakers = []
for speaker in df['speaker']:
    disambiguated_speakers.append(get_key(speaker, disambiguated_names))
df['disambiguated_speaker'] = disambiguated_speakers
When we create the graph, we don't want to include interviewer names.
In [88]:
# Interviewee names come from the transcript filenames:
# "john-smith.txt" -> "john smith".
interviewee_names = [
    fname.replace(".txt", "").replace("-", " ")
    for fname in os.listdir(transcript_dir)
]

def is_interviewer(name):
    """True when `name` fuzzy-matches none of the interviewee names."""
    return not difflib.get_close_matches(name, interviewee_names)
In [90]:
skipped = 0
G = nx.DiGraph()
for n, row in df.iterrows():
if n % 500 == 0:
print n, "rows analyzed."
# test run
# if n > 2000:
# break
speaker = row['disambiguated_speaker']
# if the speaker is an interviewer
if any(map(is_interviewer, disambiguated_names[speaker])):
skipped += 1
continue
mentions = row['disambiguated_mentions']
if type(mentions) == list:
for mention in mentions:
G.add_edge(speaker, mention)
print "skipped", skipped, "speechacts."
In [95]:
print "# interviewerrs:", len(filter(is_interviewer, list(set(df['speaker']))))
print "# speakers:", len(list(set(df['speaker'])))
In [91]:
nx.write_gml(G, 'graphs/mention_graph.gml')
In [92]:
len(disambiguated_mentions)
Out[92]:
In [123]:
import pickle

# Persist the disambiguated name chunks so later notebooks can reuse the
# mapping. BUG FIX: use a context manager so the handle opened by open()
# is flushed and closed (the original leaked it).
with open('pickles/disambiguated_names_for_corenlp_mentions.p', 'wb') as f:
    pickle.dump(disambiguated_names, f)