In [1]:
import testimony.graph as graph
import testimony.nameutils as nameutils
import testimony.testimony_utils as testimony_utils

# Builds the transcript index; the "Getting speech acts... / Merging..." output
# below this cell is emitted during this construction (presumably by Transcripts
# itself — TODO confirm in testimony_utils).
transcripts = testimony_utils.Transcripts()


Getting speech acts...
Merging...

In [2]:
import networkx as nx
from classifier import Sentence
import nltk
import ner

# 'who named whom' graph (unweighted), read from a GML file produced elsewhere.
G = nx.read_gml('graphs/unweighted.gml')
# Punkt sentence tokenizer, used to split speech acts into sentences below.
tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
# Stanford NER client; assumes an NER server is running locally on port 8080.
tagger = ner.SocketNER(host='localhost', port=8080)
# --------------------------------------------------------


def people_mentioned_in_single_speechact(speechact):
    """Return a list of PERSON entities mentioned in a single speech act.

    The speech act is split into sentences with the module-level `tokenizer`,
    each sentence is sent to the NER `tagger`, and entities tagged 'PERSON'
    are collected. Duplicates are kept; callers de-duplicate if needed.
    """
    people = []
    for sentence in tokenizer.tokenize(speechact):
        entities = tagger.get_entities(sentence)
        # `in` replaces the Python-2-only dict.has_key() (removed in Python 3)
        if 'PERSON' in entities:
            people.extend(entities['PERSON'])
    return people
    
    
def people_mentioned_in_speechacts(speechacts):
    """Collect every person mentioned across a list of speech acts.

    Returns the concatenation of per-speech-act mention lists (may contain
    duplicates).
    """
    mentioned = []
    for act in speechacts:
        mentioned += people_mentioned_in_single_speechact(act)
    return mentioned


def naming_data_from_speechacts(speechact_dict):
    """Map each speaker to the de-duplicated people mentioned in their speech acts.

    speechact_dict -- dict of speaker name -> list of speech acts
    Returns a dict of speaker name -> list of unique mentioned entities.
    """
    entity_data = {}  # speaker name -> [unique mentioned entities]
    for speaker in speechact_dict:
        mentions = people_mentioned_in_speechacts(speechact_dict[speaker])
        entity_data[speaker] = list(set(mentions))  # drop duplicates (for now)
    return entity_data

def naming_data_for_node(node):
    """Naming data from the transcripts for a particular graph node.

    Looks up the transcript name closest to the node's 'label'; returns {}
    when no transcript name matches.
    """
    name = transcripts.get_closest_name(node['label'], True)
    if not name:
        return {}
    # transcript files are keyed with dashes instead of spaces
    testimony_key = name.replace(" ", "-")
    speechacts = transcripts.get_speech_acts_from_testimony(testimony_key)
    return naming_data_from_speechacts(speechacts)


# graph node label -> naming data from that person's testimony.
# (Name is slightly misleading: the keys come from the graph, not the transcripts.)
naming_data_from_transcripts = {}
for node_id in G.nodes_iter():
    node_data = G.node[node_id]
    naming_data_from_transcripts[node_data['label']] = naming_data_for_node(node_data)

In [41]:
from pandas import DataFrame

def is_close_to_name_in_transcripts(graph_name):
    """Return the closest transcript name for `graph_name`, or False if none."""
    match = transcripts.get_closest_name(graph_name)
    if not match:
        return False
    return match

# For every node label in the graph, record the closest transcript name
# (or False when no transcript matches).
graph_names = []
in_transcripts = []
for node_id in G.nodes_iter():
    label = G.node[node_id]['label']
    graph_names.append(label)
    in_transcripts.append(is_close_to_name_in_transcripts(label))

data = {'in transcripts': in_transcripts}

df = DataFrame(data, index=graph_names)

df.head()


Out[41]:
in transcripts
lillian lowenfels False
bart lytton bart lytton
larry parks larry parks
betty martin False
jean richardson False

In [131]:
# NOTE(review): the original comment here was truncated ("problem: name taken
# from"). `name` comes from the graph labels, which are exactly the keys of
# naming_data_from_transcripts (built above), so the direct lookup holds here.
def all_names_from_hearing(name):
    """Return the de-duplicated list of people mentioned in `name`'s hearing.

    name -- a graph node label; raises KeyError if it is not a key of
            naming_data_from_transcripts. Falsy names return [].
    """
    if not name:
        return []
    names = []
    # single lookup; the original fetched the dict twice and left `data` unused
    for mentioned in naming_data_from_transcripts[name].values():
        names.extend(mentioned)
    return list(set(names))  # remove duplicates

def node_by_label(name):
    """Return the id of the first graph node whose 'label' equals `name`.

    Returns None (implicitly) when no node matches.
    """
    for node_id in G.nodes_iter():
        if G.node[node_id]['label'] == name:
            return node_id


def all_named_in_graph(name):
    """Labels of every successor of `name`'s node, i.e. the people they named.

    NOTE(review): if `name` matches no node, node_by_label returns None and
    G.successors_iter(None) will raise — same behavior as the original.
    """
    node = node_by_label(name)
    return [G.node[succ]['label'] for succ in G.successors_iter(node)]

df_new = df.copy()
n_in_g_not_t = []  # kept for parity with the original cell (unused below)
n_in_t_not_g = []  # kept for parity with the original cell (unused below)


in_transcripts = []
in_graph = []
t_not_g = []  # mentioned people in testimony but named noone in the graph
g_not_t = []  # named people in the graph but mentioned noone in testimony

for name in df.index:
    mentioned = all_names_from_hearing(name)
    named = all_named_in_graph(name)

    in_transcripts.append(mentioned)
    in_graph.append(named)
    t_not_g.append(bool(mentioned) and not named)
    g_not_t.append(not mentioned and bool(named))

df_new['mentioned in transcripts'] = in_transcripts
df_new['named in graph'] = in_graph
df_new['t_not_g'] = t_not_g
df_new['g_not_t'] = g_not_t

In [182]:
# index labels whose hearings mention at least one person
indices = []
for index, row in df_new.iterrows():
    if row['mentioned in transcripts']:
        indices.append(index)
len(indices)

# only this final expression is displayed by the notebook; the len(indices)
# above is computed but its value is discarded
len(naming_data_from_transcripts)


Out[182]:
308

In [167]:
# people whose transcripts mention people, but who name noone in the
#'who named whom' graph.
# df_new[df_new['t_not_g']]

# constructing a 'mention' graph, unweighted.

G_mention = nx.Graph()

mention_graph_dict = {}

for index, row in df_new.iterrows():
    for name in row['mentioned in transcripts']:
        if mention_graph_dict.has_key(index):
            mention_graph_dict[index].append(name)
        else:
            mention_graph_dict[index] = [name]
        #G_mention.add_edge(index, name)

In [ ]:
distribution = nameutils.name_distribution_with_tokens(mention_graph_dict)


def fix_graph(graph, distribution):
    """
    Fixes misspelled names in the graph using name chunking and the
    name occurrence distribution.

    graph        -- dict mapping a name to an iterable of names it mentions
    distribution -- name occurrence distribution from
                    nameutils.name_distribution_with_tokens

    Returns a new dict mapping each canonicalized name to a *set* of the
    canonicalized names it mentions. (Original also printed each name as a
    debug trace; that noisy output is removed here.)
    """
    new_graph = {}
    for name in graph:
        mln = nameutils.most_likely_name(name, distribution)

        # I treat `informers` as a set because from looking at the
        # data, it seems that multiple occurrences is usually an
        # error.
        informers = set()
        for informer in graph[name]:
            # skip non-string junk (basestring: this notebook targets Python 2)
            if not isinstance(informer, basestring):
                continue
            informers.add(nameutils.most_likely_name(informer, distribution))

        # `in` replaces the Python-2-only dict.has_key()
        if mln in new_graph:
            new_graph[mln].update(informers)
        else:
            new_graph[mln] = informers

    return new_graph

In [175]:
# NOTE(review): fix_graph returns a plain dict of name -> set(names), not a
# networkx graph object, yet it is passed to nx.write_gml, which expects a
# graph — confirm this call actually succeeds and writes the intended file.
mention_graph_with_nodes_from_summaries= fix_graph(mention_graph_dict, distribution)
nx.write_gml(mention_graph_with_nodes_from_summaries, "graphs/mention_graph_with_nodes_from_summaries.gml")


Out[175]:
78

In [ ]: