In [2]:
import testimony.graph as graph
import testimony.nameutils as nameutils
import testimony.testimony_utils as testimony_utils
# Instantiate the project's transcript collection. Later cells read its
# `speechacts` attribute as a mapping of speaker -> iterable of speech acts.
transcripts = testimony_utils.Transcripts()
In [26]:
from classifier import Sentence
import nltk
import ner
# Sentence tokenizer loaded from NLTK's pickled English Punkt model.
tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
# Client for a named-entity tagging server expected at localhost:8080
# (presumably a Stanford NER server started separately -- TODO confirm).
tagger = ner.SocketNER(host='localhost', port=8080)
# --------------------------------------------------------
def people_mentioned_in_a_single_speechact(speechact):
    """Return the PERSON entities the tagger reports for one speech act.

    The speech act is split into sentences with the module-level `tokenizer`
    and every sentence is sent to the module-level `tagger`.

    Args:
        speechact: text of a single speech act.

    Returns:
        A list holding every entry found under the 'PERSON' key for any
        sentence, in sentence order. Repeated names are kept.
    """
    people = []
    for sentence in tokenizer.tokenize(speechact):
        entities_dict = tagger.get_entities(sentence)
        # dict.has_key() does not exist in Python 3; .get() with an empty
        # default covers the "no PERSON found" case the same way.
        people.extend(entities_dict.get('PERSON', []))
    return people
# Gather the people mentioned in every speech act of every speaker.
all_mentions_in_transcripts = []
for speechacts_of_speaker in transcripts.speechacts.values():
    for speechact in speechacts_of_speaker:
        mentioned = people_mentioned_in_a_single_speechact(speechact)
        all_mentions_in_transcripts.extend(mentioned)
len(all_mentions_in_transcripts)
Out[26]:
In [69]:
# Clean the list: lower-case every mention, then drop orphaned titles and
# entries that are too short to be a name.
# Note: this rebinds `all_mentions_in_transcripts`, so re-running the cell
# reports 0 removed entries.
orig_len = len(all_mentions_in_transcripts)
bad_entries = ['mr.', 'mrs.', 'mr. ', 'dr. ']  # sometimes there are orphaned titles
length_threshold = 3  # a name must be strictly longer than this many characters
all_mentions_in_transcripts = [
    name
    for name in (mention.lower() for mention in all_mentions_in_transcripts)
    # the threshold constant is now actually used instead of a literal 3
    if name not in bad_entries and len(name) > length_threshold
]
removed_count = orig_len - len(all_mentions_in_transcripts)
# Single-argument form: same output under the Python 2 print statement and
# the Python 3 print function.
print("removed %d bad entries" % removed_count)
In [120]:
import difflib
def has_been_chunked(s, chunklist):
    """Tell whether `s` is already a member of some chunk.

    Args:
        s: the item to look for.
        chunklist: iterable of chunks; each chunk is tested with `s in chunk`.

    Returns:
        True if at least one chunk contains `s`, otherwise False.
    """
    return any(s in chunk for chunk in chunklist)
def chunk_list(l):
    """Group near-duplicate strings of `l` into chunks of close matches.

    `l` is walked in order. Each string that has not yet landed in a chunk
    seeds a new chunk made of its close matches within `l`, as returned by
    difflib.get_close_matches (at most 100 matches, similarity cutoff 0.8).

    Args:
        l: list of strings.

    Returns:
        A list of chunks, each a list of strings taken from `l`. Duplicates
        in `l` are repeated inside a chunk, and a string that is close to
        several seeds can appear in more than one chunk.
    """
    chunklist = []
    # Everything placed in any chunk so far. Checking this set gives the same
    # skip decision as scanning every chunk, without the repeated rescans.
    already_chunked = set()
    for s in l:
        if s in already_chunked:
            continue
        close_matches = difflib.get_close_matches(s, l, 100, 0.8)
        chunklist.append(close_matches)
        already_chunked.update(close_matches)
    return chunklist
In [121]:
# Cluster the cleaned mentions into groups of near-identical spellings.
chunked_mentions = chunk_list(all_mentions_in_transcripts)
# The same clusters with repeated spellings inside each one collapsed.
chunked_mentions_no_repeats = [list(set(chunk)) for chunk in chunked_mentions]
len(chunked_mentions)
Out[121]:
In [72]:
import networkx as nx
# Read the graph from a GML file (path is relative to the working directory).
# Later cells treat edge[0] as the snitch and edge[1] as the person named,
# and read each node's 'label' attribute as the person's name.
G = nx.read_gml('graphs/unweighted.gml')
Out[72]:
In [135]:
# People named in the graph: the label of every edge target, one entry per
# edge, so a person who is the target of several edges is listed several times.
# NOTE(review): the summary cell prints len() of this list as
# "#people named in graph" -- confirm that counting repeats is intended.
people_named_in_graph = [G.node[edge[1]]['label'] for edge in G.edges_iter()]
# Keep the labels found in some mention chunk, then collapse duplicates.
named_in_graph_and_mentioned_in_transcripts = list(set(
    label for label in people_named_in_graph
    if has_been_chunked(label, chunked_mentions)
))
In [145]:
# Snitches in the graph: the label of every edge source, de-duplicated.
# (Disabled alternative source of speaker names:
#  map(lambda s: s.replace("-", " "), transcripts.names))
speakers_in_transcripts = transcripts.speechacts.keys()
chunked_speakers = chunk_list(speakers_in_transcripts)
snitches_in_graph = list(set(
    G.node[edge[0]]['label'] for edge in G.edges_iter()
))
# Snitches whose label also appears in some chunk of transcript speakers.
snitched_in_graph_and_speaker_in_transcripts = [
    snitch for snitch in snitches_in_graph
    if has_been_chunked(snitch, chunked_speakers)
]
snitches_in_graph[:10]
Out[145]:
In [153]:
def _report(*parts):
    """Print `parts` separated by single spaces.

    Produces the same text as a Python 2 `print a, b, c` statement while
    also being valid Python 3 syntax.
    """
    print(" ".join(str(part) for part in parts))


# Labels of graph nodes that appear in at least one transcript-mention chunk.
# The loop variable is `node_id`, not `id`: at cell level `id` would shadow
# the builtin for the rest of the kernel session.
names_present_in_graph_and_transcripts = []
for node_id in G.nodes_iter():
    name = G.node[node_id]['label']
    if has_been_chunked(name, chunked_mentions):
        names_present_in_graph_and_transcripts.append(name)

# Mention chunks that contain no graph label at all. Subtracting the total
# node count from the chunk count does not measure this and can go negative.
matched_graph_labels = set(names_present_in_graph_and_transcripts)
mention_chunks_not_in_graph = [
    chunk for chunk in chunked_mentions
    if matched_graph_labels.isdisjoint(chunk)
]

# TODO:
# people mentioned in transcripts by the people who named them in the graph.
_report("Total names in graph:", len(G.node))
_report("Total names mentioned in transcripts:", len(chunked_mentions), "\n")
_report("#mentioned in transcripts & present in graph:", len(names_present_in_graph_and_transcripts))
_report("#present in graph but not mentioned in transcripts:", len(G.node) - len(names_present_in_graph_and_transcripts))
_report("#mentioned in transcripts but not graph:", len(mention_chunks_not_in_graph), "\n")
_report("#people named in graph:", len(people_named_in_graph))
_report("#people named in graph and mentioned in transcripts:", len(named_in_graph_and_mentioned_in_transcripts), "\n")
_report("#snitches who spoke in transcripts:", len(snitched_in_graph_and_speaker_in_transcripts), "/", len(snitches_in_graph))
When referring to the "intersection", I'm using the following definitions:
"snitch": someone who named someone else in the summary and who also spoke in a transcript.
"accused": someone who was named in the summary and who is also mentioned in the transcripts.
54 snitches named 185 people
The fact that all the snitches are represented in the transcripts is super weird.
There are 94 names that are present in the graph but not in the transcripts.
This, combined with the fact that they name a large number of people (average is 4.8), makes me think that a non-trivial number of snitches were working as 'professionals'. The distribution will tell more, though.
In [199]:
# distribution of people naming other people.
# this only includes snitches who have spoken in hearings.
import matplotlib.pyplot as plt
import numpy as np
# snitch label -> size of G[snitch_id], filled in further down this cell
# (presumably the number of people that snitch named, assuming edges point
# from namer to named -- confirm).
naming_dist = {}
def get_id_by_label(G, label):
    """Find the id of the first node whose 'label' attribute equals `label`.

    Args:
        G: graph exposing `nodes_iter()` and a `node` mapping of
            node id -> attribute dict.
        label: the label value to search for.

    Returns:
        The id of the first matching node in iteration order, or None when
        no node carries that label.
    """
    matching_ids = (
        node_id for node_id in G.nodes_iter()
        if G.node[node_id]['label'] == label
    )
    return next(matching_ids, None)
def _plot_and_summarize(dist):
    """Bar-plot a {snitch label: number named} dict and print median and mean.

    Args:
        dist: dict mapping a snitch label to the number of people named.
    """
    labels = list(dist.keys())
    number_named = [dist[label] for label in labels]
    positions = range(len(labels))
    plt.bar(positions, number_named, align='center')
    plt.xticks(positions, labels, rotation='vertical')
    plt.show()
    # "%s %s" reproduces the single space a print statement puts between
    # its comma-separated items.
    for caption, statistic in (("median number of people named: ", np.median),
                               ("mean number of people named: ", np.mean)):
        print("%s %s" % (caption, statistic(number_named)))


# Count, for every snitch who also spoke in the transcripts, how many
# entries G holds for that snitch's node.
for snitch in snitched_in_graph_and_speaker_in_transcripts:
    snitch_id = get_id_by_label(G, snitch)
    naming_dist[snitch] = len(G[snitch_id])

_plot_and_summarize(naming_dist)

print("removing the bad apples...")
for outlier in ("anne kinney", "robert rossen", "david a. lang"):
    # A default keeps the cell from raising KeyError when a label is absent
    # (for example on a second run of this cell's tail).
    naming_dist.pop(outlier, None)

_plot_and_summarize(naming_dist)
So, there are a few (three) bad apples: Anne Kinney, David A. Lang, and Robert Rossen.
The median number named is 2, so it's definitely reasonable.
I can't find anything....
nope
http://en.wikipedia.org/wiki/Robert_Rossen#Examinations_by_HUAC
In [ ]:
# How many of the people who were named went on to name someone else?