In [2]:
# Project-local helpers for the HUAC testimony analysis.
import testimony.graph as graph
import testimony.nameutils as nameutils
import testimony.testimony_utils as testimony_utils

# Loads and merges every hearing transcript (emits the progress
# messages shown below while it works).
transcripts = testimony_utils.Transcripts()


Getting speech acts...
Merging...

In [26]:
from classifier import Sentence
import nltk
import ner

# Punkt sentence splitter -- requires the nltk 'punkt' model to be downloaded.
tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
# Client for a Stanford NER server; assumes one is already
# listening on localhost:8080 -- TODO confirm before re-running.
tagger = ner.SocketNER(host='localhost', port=8080)
# --------------------------------------------------------

def people_mentioned_in_a_single_speechact(speechact):
    """Return a list of PERSON entities mentioned in a single speechact.

    The speechact is split into sentences with the module-level ``tokenizer``
    and each sentence is tagged with the module-level NER ``tagger``.  The
    same name may appear multiple times in the returned list.
    """
    people = []
    for sen in tokenizer.tokenize(speechact):
        entities_dict = tagger.get_entities(sen)
        # dict.has_key() is deprecated (and gone in Python 3); the `in`
        # membership test is the idiomatic, compatible spelling.
        if 'PERSON' in entities_dict:
            people.extend(entities_dict['PERSON'])
    return people


# Collect every PERSON mention across all speechacts of all speakers.
all_mentions_in_transcripts = []

for speechacts in transcripts.speechacts.values():
    for speechact in speechacts:
        mentions = people_mentioned_in_a_single_speechact(speechact)
        all_mentions_in_transcripts.extend(mentions)

len(all_mentions_in_transcripts)


Out[26]:
14282

In [69]:
# clean the list
orig_len = len(all_mentions_in_transcripts)
all_mentions_in_transcripts = map(lambda n: n.lower(), all_mentions_in_transcripts)
bad_entries = ['mr.', 'mrs.', 'mr. ', 'dr. ', ] #sometimes there are orphaned titles
length_threshold = 3
all_mentions_in_transcripts = [name for name in all_mentions_in_transcripts if not name in bad_entries]
all_mentions_in_transcripts = [name for name in all_mentions_in_transcripts if len(name) > 3]

print "removed", orig_len - len(all_mentions_in_transcripts), "bad entries"


removed 0 bad entries

In [120]:
import difflib

def has_been_chunked(s, chunklist):
    """True if name `s` already appears in one of the chunks of `chunklist`."""
    return any(s in chunk for chunk in chunklist)

def chunk_list(l):
    """Group the names in `l` into chunks of fuzzy duplicates.

    For each name not already covered by an existing chunk, collect up to
    100 close matches (similarity ratio >= 0.8) from the whole list into a
    new chunk.  Note: quadratic in len(l), so slow for large lists.
    """
    chunklist = []
    for s in l:
        # Fixed: this line was a syntax error -- the `):` was missing.
        if has_been_chunked(s, chunklist):
            continue
        close_matches = difflib.get_close_matches(s, l, 100, 0.8)
        chunklist.append(close_matches)

    return chunklist

In [121]:
# Group the cleaned mention list into chunks of near-duplicate names.
chunked_mentions = chunk_list(all_mentions_in_transcripts)
chunked_mentions_no_repeats = [list(set(chunk)) for chunk in chunked_mentions]
len(chunked_mentions)


Out[121]:
2632

Total Names

  • 2632 total (chunked) names.
  • that's a lot.
  • So, next questions - how many names in the 'named graph' are present in the transcripts?

In [72]:
import networkx as nx

# The "named names" graph.  As consumed in the cells below, edge[0] is the
# accuser ("snitch") and edge[1] the person they named.
G = nx.read_gml('graphs/unweighted.gml')


Out[72]:
214

In [135]:
# People who were named: the target of every edge in the graph.
people_named_in_graph = [G.node[target_id]['label']
                         for (_, target_id) in G.edges_iter()]

# Of those, keep (deduplicated) the ones that also show up among the
# chunked transcript mentions.
named_in_graph_and_mentioned_in_transcripts = list(set(
    label for label in people_named_in_graph
    if has_been_chunked(label, chunked_mentions)
))

In [145]:
# snitches in graph

# People who actually testified; chunked for fuzzy matching.
speakers_in_transcripts = transcripts.speechacts.keys()
chunked_speakers = chunk_list(speakers_in_transcripts)

# Snitches are the sources of the graph's edges (deduplicated).
snitches_in_graph = list(set(
    G.node[source_id]['label'] for (source_id, _) in G.edges_iter()
))

snitched_in_graph_and_speaker_in_transcripts = [
    name for name in snitches_in_graph
    if has_been_chunked(name, chunked_speakers)
]

snitches_in_graph[:10]


Out[145]:
[u'sol short',
 u'bart lytton',
 u'leo bigelman',
 u'charles daggett',
 u'david a. lang',
 u'william l. alland',
 u'percy solotoy',
 u'melvin levy',
 u'reuben ship',
 u'harold ashe']

In [153]:
names_present_in_graph_and_transcripts = []

for id in G.nodes_iter():
    node = G.node[id]
    name = node['label']
    if has_been_chunked(name, chunked_mentions):
        names_present_in_graph_and_transcripts.append(name)



# TODO:
# people mentioned in transcripts by the people who named them in the graph.

print "Total names in graph:", len(G.node)
print "Total names mentioned in transcripts:", len(chunked_mentions), "\n"
print "#mentioned in transcripts & present in graph:", len(names_present_in_graph_and_transcripts)
print "#present in graph but not mentioned in transcripts:", len(G.node) - len(names_present_in_graph_and_transcripts)
print "#mentioned in transcripts but not graph:", len(chunked_mentions) - len(G.node), "\n"

print "#people named in graph:", len(people_named_in_graph)
print "#people named in graph and mentioned in transcripts:", len(named_in_graph_and_mentioned_in_transcripts), "\n"

print "#snitches who spoke in transcripts:", len(snitched_in_graph_and_speaker_in_transcripts), "/", len(snitches_in_graph)


Total names in graph: 308
Total names mentioned in transcripts: 2632 

#mentioned in transcripts & present in graph: 214
#present in graph but not mentioned in transcripts: 94
#mentioned in transcripts but not graph: 2324 

#people named in graph: 383
#people named in graph and mentioned in transcripts: 185 

#snitches who spoke in transcripts: 54 / 54

Summary of intersection

When referring to the "intersection", I'm using the following definitions:

  • "snitch" : someone who named someone else from the summary who also spoke in a transcript.

  • "accused" : someone who was named in the summary who is also mentioned in the transcripts

  • 54 snitches named 185 people

Questions

  • The fact that all the snitches are represented in the transcripts is super weird.

  • there are 94 names that are present in the graph, but not the transcripts.

  • This, combined with the fact that they name a large number of people (average is 4.8), makes me think that a non-trivial number of snitches were working as 'professionals'. The distribution will tell more, though.


In [199]:
# distribution of people naming other people.
# this only includes snitches who have spoken in hearings.
import matplotlib.pyplot as plt
import numpy as np

# snitch label -> number of people that snitch named (their out-degree in G)
naming_dist = {}

def get_id_by_label(G, label):
    """Return the id of the first node in `G` whose 'label' attribute equals
    `label`, or None when no node matches."""
    matches = (node_id for node_id in G.nodes_iter()
               if G.node[node_id]['label'] == label)
    return next(matches, None)

get_id_by_label(G, 'larry parks')

for snitch in snitched_in_graph_and_speaker_in_transcripts:
    snitch_id = get_id_by_label(G, snitch)
    naming_dist[snitch] = len(G[snitch_id].keys())

plt.bar(range(len(naming_dist)), naming_dist.values(), align='center')
plt.xticks(range(len(naming_dist)), naming_dist.keys(), rotation='vertical')

plt.show()

number_named = [tup[1] for tup in naming_dist.items()]
print "median number of people named: ", np.median(number_named)
print "mean number of people named: ", np.mean(number_named)

print "removing the bad apples..."

naming_dist.pop("anne kinney")
naming_dist.pop("robert rossen")
naming_dist.pop("david a. lang")


plt.bar(range(len(naming_dist)), naming_dist.values(), align='center')
plt.xticks(range(len(naming_dist)), naming_dist.keys(), rotation='vertical')

plt.show()

number_named = [tup[1] for tup in naming_dist.items()]
print "median number of people named: ", np.median(number_named)
print "mean number of people named: ", np.mean(number_named)


median number of people named:  2.0
mean number of people named:  7.09259259259
removing the bad apples...
median number of people named:  2.0
mean number of people named:  2.72549019608

results

  • so, there are a few (three) bad apples. Anne Kinney, David A. Lang, and Robert Rossen.

  • The median number named is 2, so it's definitely reasonable.

specific people

Anne Kinney

I can't find anything....

David A. Lang

nope

Robert Rossen

http://en.wikipedia.org/wiki/Robert_Rossen#Examinations_by_HUAC


In [ ]:
# how many people that were named named someone else?