The goal of this notebook is to construct the final dataframe used for all downstream analysis.
For each speechact, it should include the fields listed in the dictionary below: whether the speaker is an interviewee, whether the speechact is a response, LIWC categories and sentiment (per speechact and per sentence), and mention lists with and without anaphora.
Setup - load the original dataframe and modules
In [179]:
import pickle
import os
import difflib
import jsonrpclib
import simplejson
import text_utils
import collections
import numpy as np
import pandas as pd
import corenlp_utils
from corenlp_utils import *
from sentiment_utils import *
%load_ext autoreload
%autoreload 2
server = jsonrpclib.Server("http://localhost:8084")
transcript_dir = os.path.join('testimony/text/hearings')
interviewee_names = [f.replace(".txt", "") for f in os.listdir(transcript_dir)]
interviewee_names = map(lambda s: s.replace("-", " "), interviewee_names)
def is_interviewer(name):
    # No close fuzzy match against the interviewee name list means this speaker is an interviewer.
    return not difflib.get_close_matches(name, interviewee_names)
df = pd.read_pickle('pickles/speaker_speechacts.p')
Create a dictionary to add to the dataframe. Missing / inapplicable entries have 'np.nan' as the value.
In [164]:
d = {"is_interviewee": [],
"is_response": [],
"liwc_categories_for_speechact": [],
"liwc_categories_by_sentence": [],
"liwc_sentiment_for_speechact": [],
"liwc_sentiment_by_sentence": [],
"liwc_sentiment_towards_entities_with_anaphora": [],
"liwc_sentiment_towards_entities_without_anaphora": [],
"liwc_sentiment_towards_only_anaphora": [],
"mention_list_by_sentence_without_anaphora": [],
"mention_list_by_sentence_with_anaphora": [],
"mention_list_for_speechact_without_anaphora": [],
}
Now populate the dictionary. This makes many calls to the utility scripts.
This pass does not include disambiguated mention information or CoreNLP sentiment data; we'll get to those later.
In [165]:
for n, row in df.iterrows():
    # if n > 5000:
    #     break
    # interviewee
    d['is_interviewee'].append(not is_interviewer(row['speaker']))
    # response: an interviewee speechact immediately following an interviewer speechact
    # (guard n > 0 so we don't look back past the first row)
    is_response = n > 0 and not is_interviewer(row['speaker']) and is_interviewer(df.ix[n - 1]['speaker'])
    d['is_response'].append(is_response)
    if is_response:
        prev_speechact = corenlp_utils.get_corenlp_object(df.ix[n - 1]['speechact'], server)
        if not prev_speechact:
            is_response = False
    speechact = corenlp_utils.get_corenlp_object(row['speechact'], server)
    if not speechact:  # everything is nan.
        d['liwc_categories_for_speechact'].append(np.nan)
        d['liwc_categories_by_sentence'].append(np.nan)
        d['liwc_sentiment_for_speechact'].append(np.nan)
        d['liwc_sentiment_by_sentence'].append(np.nan)
        d['liwc_sentiment_towards_entities_without_anaphora'].append(np.nan)
        d['liwc_sentiment_towards_entities_with_anaphora'].append(np.nan)
        d['liwc_sentiment_towards_only_anaphora'].append(np.nan)
        d['mention_list_by_sentence_with_anaphora'].append(np.nan)
        d['mention_list_by_sentence_without_anaphora'].append(np.nan)
        d['mention_list_for_speechact_without_anaphora'].append(np.nan)
        continue
    else:
        # response entries
        if is_response:
            d['liwc_sentiment_towards_entities_with_anaphora'].append(liwc_sentiment_with_anaphora(speechact, prev_speechact, server))
            d['liwc_sentiment_towards_only_anaphora'].append(liwc_sentiment_towards_only_anaphora(speechact, prev_speechact, server))
            d['mention_list_by_sentence_with_anaphora'].append(mention_list_by_sentence_with_anaphora(speechact, prev_speechact, server))
        else:
            d['liwc_sentiment_towards_entities_with_anaphora'].append(np.nan)
            d['liwc_sentiment_towards_only_anaphora'].append(np.nan)
            d['mention_list_by_sentence_with_anaphora'].append(np.nan)
        # mention lists
        d['liwc_sentiment_towards_entities_without_anaphora'].append(liwc_sentiment_towards_all_entities_in_speechact_no_anaphora(speechact))
        d['mention_list_by_sentence_without_anaphora'].append(mention_list_by_sentence_no_anaphora(speechact))
        d['mention_list_for_speechact_without_anaphora'].append(mention_list_for_speechact_no_anaphora(speechact))
        # liwc categories
        d['liwc_categories_for_speechact'].append(liwc_categories_for_speechact(speechact))
        d['liwc_categories_by_sentence'].append(liwc_categories_by_sentence(speechact))
        # overall liwc sentiment
        d['liwc_sentiment_for_speechact'].append(liwc_overall_sentiment_of_speechact(speechact))
        d['liwc_sentiment_by_sentence'].append(liwc_overall_sentiment_by_sentence(speechact))
    if n % 500 == 0:
        temp_pickle_name = "final_analysis_" + str(n) + "_tmp.p"
        print "analyzed", n, "speechacts. Saving temporary pickle as", temp_pickle_name
        pickle.dump(d, open("pickles/final/" + temp_pickle_name, 'wb'))
        #df.to_pickle("pickles/final/" + temp_pickle_name)

temp_pickle_name = "final_analysis_" + str(n) + "_FINAL.p"
print "analyzed", n, "speechacts. Saving FINAL pickle as", temp_pickle_name
pickle.dump(d, open("pickles/final/" + temp_pickle_name, 'wb'))
Run this if the final pickle has already been generated:
In [82]:
d = pickle.load(open('pickles/final/final_analysis_12953_FINAL.p', 'rb'))
In [166]:
#df_d = pd.DataFrame(d)
# Sanity check: every column list should be the same length before we build the dataframe.
for key in d.keys():
    print key, "is", len(d[key]), "long"
Let's add the speaker and speechact columns, and save it for later:
In [168]:
df_d = pd.DataFrame(d)
df_d['speechact'] = df['speechact']
df_d['speaker'] = df['speaker']
df_d.to_pickle("pickles/final/final_dataframe.p")
In [ ]:
df_d.head()
In [255]:
def flatten(l):
    return [item for sublist in l for item in sublist]

# get full list of all mentions with and without anaphora.
# the with-anaphora lists are nested per sentence, so they need two levels of flattening.
all_mentions_with_anaphora = [s.lower() for s in flatten(flatten(df_d['mention_list_by_sentence_with_anaphora'].dropna()))]
all_mentions_without_anaphora = [s.lower() for s in flatten(df_d['mention_list_for_speechact_without_anaphora'].dropna())]
all_mentions = list(set(all_mentions_with_anaphora + all_mentions_without_anaphora))
print "original number of mentions:", len(all_mentions)
disambiguated_names = text_utils.chunk_list(all_mentions)
print "number of disambiguated mentions:", len(disambiguated_names)
The disambiguated mention list generated previously had ~1227 elements. An increase of about 140 is plausible once anaphora are added; it may seem low, but we found earlier that expanding the mention list with anaphora didn't add much, so this is consistent.
A helper to look up the key (the index of a chunk in the disambiguated list) for an individual mention:
In [147]:
def get_key(mention, l):
    "Return the index of the chunk in the chunked list l that contains this mention, or None."
    mention = mention.lower()
    for i, chunk in enumerate(l):
        if mention in chunk:
            return i
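For example, on a toy chunked list (hypothetical names, not drawn from the transcripts), get_key behaves like this:
In [ ]:
toy_chunks = [["john smith", "mr. smith"], ["jane doe"]]
print get_key("Mr. Smith", toy_chunks)         # 0 -- matched via lowercasing
print get_key("jane doe", toy_chunks)          # 1
print get_key("nobody mentioned", toy_chunks)  # None -- no chunk contains it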
Next, add the speakers:
In [254]:
for speaker in list(set(df_d['speaker'])):
if get_key(speaker, disambiguated_names):
continue # the key's already there.
else:
disambiguated_names.append([speaker]) # gets its' own list if it's not already there.
print "# of disambiguated names, with the speaker data:", len(disambiguated_names)
pickle.dump(disambiguated_names, open('pickles/final/disambiguated_names.p', 'wb'))
An increase of ~8 is expected, and consistent with the previous mention graph data.
Now, when we construct various graphs, we should call get_key to find the mention id for some individual.
Each edge carries a set of properties corresponding to LIWC categories: one score per category.
Setup:
In [145]:
import networkx as nx
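As a minimal sketch of the structure we're about to build (the node ids and scores below are hypothetical, just to show the shape of an edge's attribute dict, and assume the older networkx API used elsewhere in this notebook, where the attribute dict is passed positionally):
In [ ]:
G_example = nx.DiGraph()
# nodes are indices into disambiguated_names; the attribute dict holds one score per LIWC category
G_example.add_edge(597, 12, {'Negemo': 0.4, 'Tentat': 0.2, 'Social': 0.6})
print G_example[597][12]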
Some functions to help construct the LIWC category scores for edges:
In [266]:
def update_dict(old, to_add):
    "Accumulate the category counts from to_add into old."
    for key in to_add.keys():
        if key in old:
            old[key] += to_add[key]
        else:
            old[key] = to_add[key]
    return old

def normalize_dict(d, count):
    "Normalizes by a count of edges; values that normalize to 1 or more are dropped."
    newdict = {}
    for k, v in d.items():
        newval = v / float(count)
        if newval < 1:
            newdict[k] = newval
    return newdict

def filter_categories(d, cats):
    "Keep only the categories listed in cats."
    newdict = {}
    for k, v in d.items():
        if k in cats:
            newdict[k] = v
    return newdict
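A quick toy example of how these helpers compose (made-up counts, not real data): accumulate per-sentence category counts with update_dict, then divide by the edge count with normalize_dict.
In [ ]:
acc = {}
acc = update_dict(acc, {'Negemo': 1, 'Tentat': 2})
acc = update_dict(acc, {'Negemo': 1})
print acc                     # {'Negemo': 2, 'Tentat': 2}
print normalize_dict(acc, 4)  # {'Negemo': 0.5, 'Tentat': 0.5}; anything that normalizes to >= 1 is dropped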
In [ ]:
G = nx.DiGraph()
# the graph data has to be stored in a separate dict until we
# construct the graph; adding multiple edges between two nodes just
# replaces the attributes. We want to average/accumulate them.
graph_data = collections.defaultdict(lambda: collections.defaultdict(dict))
count_data = collections.defaultdict(int)
skipped = 0

# categories we ultimately care about (not used in this cell; see relevant_categories below)
categories = ['Negemo', 'Posemo', 'Anx', 'Anger', 'Tentat', 'Social', 'Excl', 'Occup', 'Swear']

def drop_irrelevant_categories(categories):
    "Remove high-frequency, low-signal LIWC categories (renamed so it does not shadow filter_categories above)."
    for cat in ['Number', 'Present', 'Othref', 'Pronoun', 'Cogmech', 'Preps',
                'Space', 'Negate', 'Achieve', 'Time', 'We', 'Senses', 'Article',
                'Insight', 'Hear', 'Motion', 'Up', 'Leisure']:
        categories.pop(cat, None)
    return categories

for n, row in df_d.iterrows():
    if n % 500 == 0:
        print n, "rows analyzed"
    speaker = row['speaker']
    if is_interviewer(speaker):
        skipped += 1
        continue
    # prefer the mention lists that include anaphora, if they exist
    mention_list_w_anaphora = row['mention_list_by_sentence_with_anaphora']
    mention_list_wo_anaphora = row['mention_list_by_sentence_without_anaphora']
    categories_by_sentence = row['liwc_categories_by_sentence']
    speaker = get_key(speaker, disambiguated_names)
    if isinstance(mention_list_w_anaphora, list):
        for i, mentions in enumerate(mention_list_w_anaphora):
            for mention in mentions:
                mention = get_key(mention, disambiguated_names)
                # don't include self-mentions; this will screw up
                # centrality measures and is not a meaningful measure
                if speaker == mention:
                    continue
                categories_for_mention = drop_irrelevant_categories(categories_by_sentence[i])
                graph_data[speaker][mention] = update_dict(graph_data[speaker][mention], categories_for_mention)
                count_data[(speaker, mention)] += 1
                # G.add_edge(speaker, mention, categories_for_mention)
    elif isinstance(mention_list_wo_anaphora, list):
        for i, mentions in enumerate(mention_list_wo_anaphora):
            for mention in mentions:
                mention = get_key(mention, disambiguated_names)
                # don't include self-mentions; this will screw up
                # centrality measures and is not a meaningful measure
                if speaker == mention:
                    continue
                categories_for_mention = drop_irrelevant_categories(categories_by_sentence[i])
                graph_data[speaker][mention] = update_dict(graph_data[speaker][mention], categories_for_mention)
                count_data[(speaker, mention)] += 1
                # G.add_edge(speaker, mention, categories_for_mention)
Add the normalized edge data to the graph, then write it out as a GML file:
In [263]:
for speaker, mentions in graph_data.items():
    for mentioned, attrs in mentions.items():
        count = count_data[(speaker, mentioned)]
        normalized_attrs = normalize_dict(attrs, count)
        G.add_edge(speaker, mentioned, normalized_attrs)
In [264]:
nx.write_gml(G, "graphs/liwc_categories_mentions.gml")
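To sanity-check the export before opening it in Gephi, the GML file can be read back with networkx (node labels may come back as strings depending on the networkx version):
In [ ]:
G_check = nx.read_gml("graphs/liwc_categories_mentions.gml")
print len(G_check.nodes()), "nodes,", len(G_check.edges()), "edges"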
Okay, this is great, but Gephi can't actually do anything special with this data.
Instead, I'm going to classify each edge according to its dominant LIWC category, chosen from a predefined list.
In [267]:
relevant_categories = ['Negemo', 'Posemo', 'Posfeel', 'Anx', 'Certain', 'Tentat', 'Anger', 'Swear']
G_dominant_scores = nx.DiGraph()
for speaker, mentions in graph_data.items():
    for mentioned, attrs in mentions.items():
        count = count_data[(speaker, mentioned)]
        normalized_attrs = normalize_dict(attrs, count)
        filtered = filter_categories(normalized_attrs, relevant_categories)
        try:
            # keep only the single highest-scoring relevant category on this edge
            dominant = max(filtered.items(), key=lambda p: p[1])
            dominant = {dominant[0]: dominant[1]}
            G_dominant_scores.add_edge(speaker, mentioned, dominant)
        except ValueError:
            # no relevant categories survived filtering; add a bare edge
            G_dominant_scores.add_edge(speaker, mentioned)
nx.write_gml(G_dominant_scores, 'graphs/liwc_dominant_categories.gml')
In [257]:
disambiguated_speakers = list(set([get_key(s, disambiguated_names) for s in df_d['speaker']]))
In [ ]:
G_mentions = nx.DiGraph()
skipped = 0
for n, row in df_d.iterrows():
    if n % 500 == 0:
        print n, "rows analyzed"
    speaker = row['speaker']
    if is_interviewer(speaker):
        skipped += 1
        continue
    speaker = get_key(speaker, disambiguated_names)
    mentions_without_anaphora = row['mention_list_for_speechact_without_anaphora']
    if isinstance(mentions_without_anaphora, list):
        for mention in mentions_without_anaphora:
            mention = get_key(mention, disambiguated_names)
            if speaker == mention:
                continue
            G_mentions.add_edge(speaker, mention)
print "skipped:", skipped
nx.write_gml(G_mentions, "graphs/new_mention_graph.gml")
In [258]:
len(disambiguated_speakers)
Out[258]:
In [232]:
len(df_d['speaker'])
Out[232]:
In [259]:
len(disambiguated_names)
Out[259]:
In [271]:
disambiguated_names[597]
Out[271]: