notebook.community

Edit and run



In [2]:

    
import testimony.graph as graph
import testimony.nameutils as nameutils
import testimony.testimony_utils as testimony_utils

transcripts = testimony_utils.Transcripts()









    



Getting speech acts...
Merging...



In [26]:

    
from classifier import Sentence
import nltk
import ner

tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
tagger = ner.SocketNER(host='localhost', port=8080)
# --------------------------------------------------------

def people_mentioned_in_a_single_speechact(speechact):
    "returns a list of mentioned people from a single speechact "
    people = []
    sens = tokenizer.tokenize(speechact)
    for sen in sens:
        entities_dict = tagger.get_entities(sen)
        if entities_dict.has_key('PERSON'):
            people.extend(entities_dict['PERSON'])
    return people


all_mentions_in_transcripts = []

for speaker, speechacts in transcripts.speechacts.items():
    for speechact in speechacts:
        all_mentions_in_transcripts.extend(people_mentioned_in_a_single_speechact(speechact))
    #mentions = map(people_mentioned_in_a_single_speechact, speechacts)
    #total_mentions.extend(mentions)

len(all_mentions_in_transcripts)









    Out[26]:





14282



In [69]:

    
# clean the list
orig_len = len(all_mentions_in_transcripts)
all_mentions_in_transcripts = map(lambda n: n.lower(), all_mentions_in_transcripts)
bad_entries = ['mr.', 'mrs.', 'mr. ', 'dr. ', ] #sometimes there are orphaned titles
length_threshold = 3
all_mentions_in_transcripts = [name for name in all_mentions_in_transcripts if not name in bad_entries]
all_mentions_in_transcripts = [name for name in all_mentions_in_transcripts if len(name) > 3]

print "removed", orig_len - len(all_mentions_in_transcripts), "bad entries"









    



removed 0 bad entries



In [120]:

    
import difflib

def has_been_chunked(s, chunklist):
    for chunk in chunklist:
        if s in chunk: return True
    return False

def chunk_list(l):
    chunklist = []
    for s in l:
        if has_been_chunked(s, chunklist     continue
        close_matches = difflib.get_close_matches(s, l, 100, 0.8)
        chunklist.append(close_matches)

    return chunklist



In [121]:

    
chunked_mentions = chunk_list(all_mentions_in_transcripts)
chunked_mentions_no_repeats = map(lambda l: list(set(l)), chunked_mentions)
len(chunked_mentions)









    Out[121]:





2632

Total Names

2632 total (chunked) names.
that's a lot.
So, next questions - how many names in the 'named graph' are present in the transcripts?



In [72]:

    
import networkx as nx

G = nx.read_gml('graphs/unweighted.gml')









    Out[72]:





214



In [135]:

    
# people named in graph
people_named_in_graph = []

for edge in G.edges_iter():
    target_id = edge[1]
    people_named_in_graph.append(G.node[target_id]['label'])


named_in_graph_and_mentioned_in_transcripts = filter(lambda s: has_been_chunked(s, chunked_mentions),
                                                     people_named_in_graph)

named_in_graph_and_mentioned_in_transcripts = list(set(named_in_graph_and_mentioned_in_transcripts))



In [145]:

    
# snitches in graph

#speakers_in_transcripts = map(lambda s: s.replace("-", " "), transcripts.names)
speakers_in_transcripts = transcripts.speechacts.keys()
chunked_speakers = chunk_list(speakers_in_transcripts)

snitches_in_graph = []
for edge in G.edges_iter():
    snitch_id = edge[0]
    snitches_in_graph.append(G.node[snitch_id]['label'])

snitches_in_graph = list(set(snitches_in_graph))

snitched_in_graph_and_speaker_in_transcripts = filter(lambda s: has_been_chunked(s, chunked_speakers), 
                                                      snitches_in_graph)

snitches_in_graph[:10]









    Out[145]:





[u'sol short',
 u'bart lytton',
 u'leo bigelman',
 u'charles daggett',
 u'david a. lang',
 u'william l. alland',
 u'percy solotoy',
 u'melvin levy',
 u'reuben ship',
 u'harold ashe']



In [153]:

    
names_present_in_graph_and_transcripts = []

for id in G.nodes_iter():
    node = G.node[id]
    name = node['label']
    if has_been_chunked(name, chunked_mentions):
        names_present_in_graph_and_transcripts.append(name)



# TODO:
# people mentioned in transcripts by the people who named them in the graph.

print "Total names in graph:", len(G.node)
print "Total names mentioned in transcripts:", len(chunked_mentions), "\n"
print "#mentioned in transcripts & present in graph:", len(names_present_in_graph_and_transcripts)
print "#present in graph but not mentioned in transcripts:", len(G.node) - len(names_present_in_graph_and_transcripts)
print "#mentioned in transcripts but not graph:", len(chunked_mentions) - len(G.node), "\n"

print "#people named in graph:", len(people_named_in_graph)
print "#people named in graph and mentioned in transcripts:", len(named_in_graph_and_mentioned_in_transcripts), "\n"

print "#snitches who spoke in transcripts:", len(snitched_in_graph_and_speaker_in_transcripts), "/", len(snitches_in_graph)









    



Total names in graph: 308
Total names mentioned in transcripts: 2632 

#mentioned in transcripts & present in graph: 214
#present in graph but not mentioned in transcripts: 94
#mentioned in transcripts but not graph: 2324 

#people named in graph: 383
#people named in graph and mentioned in transcripts: 185 

#snitches who spoke in transcripts: 54 / 54

Summary of intersection

When referring to the "intersection", I'm using the following definitions:

"snitch" : someone who named someone else from the summary who also spoke in a transcript.
"accused" : someone who was named in the summary who is also mentioned in the transcripts
54 snitches named 185 people

Questions

The fact that all the snitches are represented in the transcripts is super weird.
there are 94 names that are present in the graph, but not the transcripts.
This, combined with the fact that they name a large number of people (average is 4.8), makes me think that a non-trivial number of snitches were working as 'professionals'. The distribution will tell more, though.



In [199]:

    
# distribution of people naming other people.
# this only includes snitches who have spoken in hearings.
import matplotlib.pyplot as plt
import numpy as np

naming_dist = {}

def get_id_by_label(G, label):
    for id in G.nodes_iter():
        if G.node[id]['label'] == label:
            return id

get_id_by_label(G, 'larry parks')

for snitch in snitched_in_graph_and_speaker_in_transcripts:
    snitch_id = get_id_by_label(G, snitch)
    naming_dist[snitch] = len(G[snitch_id].keys())

plt.bar(range(len(naming_dist)), naming_dist.values(), align='center')
plt.xticks(range(len(naming_dist)), naming_dist.keys(), rotation='vertical')

plt.show()

number_named = [tup[1] for tup in naming_dist.items()]
print "median number of people named: ", np.median(number_named)
print "mean number of people named: ", np.mean(number_named)

print "removing the bad apples..."

naming_dist.pop("anne kinney")
naming_dist.pop("robert rossen")
naming_dist.pop("david a. lang")


plt.bar(range(len(naming_dist)), naming_dist.values(), align='center')
plt.xticks(range(len(naming_dist)), naming_dist.keys(), rotation='vertical')

plt.show()

number_named = [tup[1] for tup in naming_dist.items()]
print "median number of people named: ", np.median(number_named)
print "mean number of people named: ", np.mean(number_named)









    












    



median number of people named:  2.0
mean number of people named:  7.09259259259
removing the bad apples...






    












    



median number of people named:  2.0
mean number of people named:  2.72549019608

results

so, there are a few (three) bad apples. Anne Kinney, David A. Lang, and Robert Rossen.
The median number named is 2, so it's definitely reasonable.

specific people

Anne Kinney

I can't find anything....

David A. Lang

nope

Robert Rossen

http://en.wikipedia.org/wiki/Robert_Rossen#Examinations_by_HUAC



In [ ]:

    
# how many people that were named named someone else?