In [1]:
import testimony.graph as graph
import testimony.nameutils as nameutils
import testimony.testimony_utils as testimony_utils
# Shared handle on the hearing transcripts. Later cells call
# transcripts.get_closest_name(...) and
# transcripts.get_speech_acts_from_testimony(...) on it.
transcripts = testimony_utils.Transcripts()
In [2]:
import networkx as nx
from classifier import Sentence
import nltk
import ner
# Graph loaded from a GML file; the path is relative to the working directory.
# NOTE(review): this appears to be the 'who named whom' graph referred to
# further down -- confirm.
G = nx.read_gml('graphs/unweighted.gml')
# Sentence tokenizer, used below to split a speechact into sentences.
tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
# Client for an NER service at localhost:8080; tagger.get_entities(...) is
# called once per sentence below.
# NOTE(review): presumably the server must already be running -- confirm.
tagger = ner.SocketNER(host='localhost', port=8080)
# --------------------------------------------------------
def people_mentioned_in_single_speechact(speechact):
    """Return the people mentioned in a single speechact.

    The speechact is split into sentences with the module-level
    ``tokenizer``; each sentence is handed to the module-level ``tagger``
    and everything it files under the ``'PERSON'`` key is collected.

    Parameters:
        speechact: the text of one speechact.

    Returns:
        A list of the 'PERSON' entities, in sentence order. Duplicates
        are kept.
    """
    people = []
    for sentence in tokenizer.tokenize(speechact):
        entities = tagger.get_entities(sentence)
        # Membership test instead of dict.has_key(), which no longer exists
        # in Python 3; `in` behaves the same on Python 2.
        if 'PERSON' in entities:
            people.extend(entities['PERSON'])
    return people
def people_mentioned_in_speechacts(speechacts):
    """Collect the people mentioned across a list of speechacts.

    The per-speechact results of people_mentioned_in_single_speechact are
    concatenated in input order; duplicates are kept.
    """
    return [
        person
        for speechact in speechacts
        for person in people_mentioned_in_single_speechact(speechact)
    ]
def naming_data_from_speechacts(speechact_dict):
    """Map each speaker to the distinct people mentioned in their speechacts.

    Parameters:
        speechact_dict: dict of speaker -> list of speechacts.

    Returns:
        dict of speaker -> list of mentioned people with duplicates removed
        (for now). The lists come from a set, so their order is not defined.
    """
    return {
        speaker: list(set(people_mentioned_in_speechacts(speechacts)))
        for speaker, speechacts in speechact_dict.items()
    }
def naming_data_for_node(node):
    """Naming data from the transcripts for one graph node.

    The node's 'label' is resolved through transcripts.get_closest_name;
    when that yields nothing, an empty dict is returned. Otherwise the
    speech acts for that name (spaces replaced by hyphens) are fetched and
    turned into naming data.
    """
    closest_name = transcripts.get_closest_name(node['label'], True)
    if closest_name:
        testimony_key = closest_name.replace(" ", "-")
        speechacts_by_speaker = transcripts.get_speech_acts_from_testimony(testimony_key)
        return naming_data_from_speechacts(speechacts_by_speaker)
    return {}
# Naming data per graph node, keyed by the node's label. The values come from
# the transcripts, but the keys are graph labels, so the variable name is
# somewhat misleading.
naming_data_from_transcripts = {}
# Iterate (node id, attribute dict) pairs directly. This avoids binding a
# module-level variable called `id`, which would shadow the builtin id() for
# every later cell.
for _node_id, node_attrs in G.nodes(data=True):
    naming_data_from_transcripts[node_attrs['label']] = naming_data_for_node(node_attrs)
In [41]:
from pandas import DataFrame
def is_close_to_name_in_transcripts(graph_name):
    """Look up the transcript name closest to a graph name.

    Returns the result of transcripts.get_closest_name when it is truthy,
    otherwise False.
    """
    # Disabled fallback, kept for reference:
    # for transcript_name in naming_data_from_transcripts.keys():
    #     if nameutils.are_close_tokens(transcript_name, graph_name, 0.3):
    #         return transcript_name
    return transcripts.get_closest_name(graph_name) or False
# Names from the graph, each paired with its closest transcript name
# (or False when there is none).
# Built with comprehensions over (node id, attribute dict) pairs so that no
# module-level `id` variable is left behind to shadow the builtin id().
graph_names = [attrs['label'] for _node_id, attrs in G.nodes(data=True)]
in_transcripts = [is_close_to_name_in_transcripts(label) for label in graph_names]
data = {'in transcripts' : in_transcripts}
df = DataFrame(data, index=graph_names)
df.head()
Out[41]:
In [131]:
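# problem: `name` is taken from the graph labels (the keys of
# naming_data_from_transcripts), not from the transcripts themselves.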
def all_names_from_hearing(name):
    """Return the distinct people mentioned anywhere in `name`'s hearing.

    `name` is used directly as a key of naming_data_from_transcripts (whose
    keys are graph labels); no closest-name lookup is done here. The lists
    stored under that key are merged and de-duplicated.

    Parameters:
        name: a key of naming_data_from_transcripts.

    Returns:
        A list of unique names in no defined order, or [] when `name` is
        falsy or has no entry.
    """
    if not name:
        return []
    mentioned = set()
    # .get(...) keeps an unknown name consistent with the falsy-name case
    # above instead of raising KeyError.
    for names in naming_data_from_transcripts.get(name, {}).values():
        mentioned.update(names)
    return list(mentioned)
def node_by_label(name):
    """Return the id of the first node in G whose 'label' equals `name`.

    The node id is returned, not the node's attribute dict. Returns None
    when no node carries that label.
    """
    matching_ids = (
        node_id for node_id in G.nodes_iter()
        if G.node[node_id]['label'] == name
    )
    return next(matching_ids, None)
def all_named_in_graph(name):
    """Return the labels of the successors of the node labelled `name` in G."""
    source_id = node_by_label(name)
    return [
        G.node[successor_id]['label']
        for successor_id in G.successors_iter(source_id)
    ]
# Extend a copy of the per-name table with, for every graph name:
#   - the people mentioned in that name's hearing transcripts,
#   - the people that name points to in the graph,
#   - flags for "mentions in transcripts but none in graph" and the reverse.
df_new = df.copy()
n_in_g_not_t = []
n_in_t_not_g = []
in_transcripts, in_graph = [], []
t_not_g, g_not_t = [], []
for label in df.index:
    mentioned = all_names_from_hearing(label)
    named = all_named_in_graph(label)
    in_transcripts.append(mentioned)
    in_graph.append(named)
    # Both values are lists, so truthiness is the same as comparing to [].
    t_not_g.append(bool(mentioned) and not named)
    g_not_t.append(not mentioned and bool(named))
df_new['mentioned in transcripts'] = in_transcripts
df_new['named in graph'] = in_graph
df_new['t_not_g'] = t_not_g
df_new['g_not_t'] = g_not_t
In [182]:
# Graph names whose hearing mentions at least one person.
indices = [
    index
    for index, row in df_new.iterrows()
    if row['mentioned in transcripts']
]
# Show both counts together: a bare expression that is not the last line of a
# cell is evaluated but never displayed.
len(indices), len(naming_data_from_transcripts)
Out[182]:
In [167]:
# People whose transcripts mention people, but who name no one in the
# 'who named whom' graph:
# df_new[df_new['t_not_g']]

# Constructing a 'mention' graph, unweighted.
G_mention = nx.Graph()
# name -> list of people mentioned in that name's hearing. Only names with at
# least one mention get a key.
mention_graph_dict = {}
for index, row in df_new.iterrows():
    for name in row['mentioned in transcripts']:
        # setdefault replaces the has_key()/else branch; dict.has_key() no
        # longer exists in Python 3.
        mention_graph_dict.setdefault(index, []).append(name)
        #G_mention.add_edge(index, name)
In [ ]:
# Computed from the raw mention graph and passed to fix_graph below.
# NOTE(review): presumably the name-occurrence distribution that fix_graph's
# docstring refers to -- confirm against nameutils.
distribution = nameutils.name_distribution_with_tokens(mention_graph_dict)
def fix_graph(graph, distribution):
    """
    Fixes misspelled names in the graph using name chunking and the
    name occurrence distribution.

    Parameters:
        graph: dict mapping a name to an iterable of informer names.
        distribution: passed unchanged to nameutils.most_likely_name.

    Returns:
        A new dict mapping each key's most likely name to a set of the most
        likely names of its informers. Keys that resolve to the same most
        likely name are merged. Informers that are not strings are skipped.
    """
    try:
        string_types = basestring  # Python 2: covers both str and unicode
    except NameError:
        string_types = str  # Python 3 has no basestring

    new_graph = {}
    for name, raw_informers in graph.items():
        mln = nameutils.most_likely_name(name, distribution)
        # I treat `informers` as a set because from looking at the
        # data, it seems that multiple occurrences is usually an
        # error.
        informers = set(
            nameutils.most_likely_name(informer, distribution)
            for informer in raw_informers
            if isinstance(informer, string_types)
        )
        # setdefault replaces the has_key()/else branch; dict.has_key() no
        # longer exists in Python 3.
        new_graph.setdefault(mln, set()).update(informers)
    return new_graph
In [175]:
mention_graph_with_nodes_from_summaries = fix_graph(mention_graph_dict, distribution)

# nx.write_gml expects a NetworkX graph, but fix_graph returns a plain dict of
# name -> set of names, so build a graph from it before writing.
# NOTE(review): undirected, matching the nx.Graph() used for G_mention; switch
# to nx.DiGraph() if the direction of a mention should be preserved.
G_mention_fixed = nx.Graph()
for mentioner, mentioned_names in mention_graph_with_nodes_from_summaries.items():
    # Add the key explicitly so it survives even if its set ended up empty.
    G_mention_fixed.add_node(mentioner)
    for mentioned_name in mentioned_names:
        G_mention_fixed.add_edge(mentioner, mentioned_name)
nx.write_gml(G_mention_fixed, "graphs/mention_graph_with_nodes_from_summaries.gml")
Out[175]:
In [ ]: