In [88]:
import os
import pickle
import collections
import pandas as pd
import numpy as np
After this, I split the file into one-line parts using the following command:
$ split -l 1 -a 7 speechacts.txt
and then removed the original speechacts.txt.
This ensures that each speechact is in its own file and receives its own CoreNLP analysis.
Then we compute the overall sentiment for each speechact by batch-parsing the files. parsed is a generator, so no computation is done until we start iterating over it.
The split files are in alphabetical order, and batch_parse iterates through the files alphabetically, so iterating over the parsed results should retain the order from the dataframe.
This takes roughly 3-4 hours to compute, so be sure to save intermediary results to a file. (108 seconds for 26 speechacts, and ~26*100 speechacts total, gives about 100 * 108 = 10,800 seconds, or ~3 hours.)
(see batch-parse-sentiment.py)
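For reference, batch-parse-sentiment.py looked roughly like the sketch below. This is only a sketch: it assumes the corenlp-python wrapper's batch_parse generator, and the dictionary keys ('file_name', 'sentences', 'sentiment', 'sentimentValue'), the CoreNLP directory, and the checkpoint interval are assumptions rather than a copy of the actual script.

import pickle
from corenlp import batch_parse

CORENLP_DIR = 'stanford-corenlp-full/'  # assumed location of the CoreNLP install
results = []
# parsed is a generator; nothing is computed until we iterate over it
parsed = batch_parse('split_speechacts/', CORENLP_DIR)
for n, doc in enumerate(parsed):
    fname = doc['file_name']
    # one (filename, sentiment category, sentiment score) tuple per sentence
    results.append([(fname, s['sentiment'], float(s['sentimentValue']))
                    for s in doc['sentences']])
    if n % 100 == 0:  # save intermediary results as we go
        pickle.dump(results, open('corenlp_sentiment_partial.p', 'wb'))
pickle.dump(results, open('corenlp_sentiment_FINAL.p', 'wb'))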
In [3]:
sent = pickle.load(open('pickles/corenlp_sentiment/corenlp_sentiment_FINAL.p', 'rb'))
In [5]:
sent[:10]
Out[5]:
In [22]:
s = []
for l in sent:
    if type(l) != list:
        # non-list entries are passed through unchanged
        s.append(l)
        continue
    # each tuple starts with the filename; regroup as (filename, [(category, score), ...])
    fname = l[0][0]
    newl = []
    for tup in l:
        newl.append(tup[1:])
    s.append((fname, newl))
s[:10]
Out[22]:
Make sure we only have unique filenames
In [15]:
s_no_dupes = [l for n,l in enumerate(s) if l not in s[:n] and l not in s[n+1:]]
In [17]:
print len(s_no_dupes)
print len(s)
In [18]:
a = [1,2,3,4, 4, 8, 9, 3, 1, 10]
print a[:2]
print a[2:]
a_no_dupes = [l for n,l in enumerate(a) if l not in a[:n] and l not in a[n+1:]]
print a_no_dupes
In [43]:
filenames = [t[0] for t in s if type(t) == tuple]
In [44]:
len(list(set(filenames)))
Out[44]:
Make a dict of filename -> sentiment
In [46]:
s = [item for item in s if type(item) == tuple]
sent_filename_dict = dict(s)
Get the speechacts for each filename
In [47]:
fpath = '/Users/dan/classes/research/huac-testimony/pickles/speechacts_old/'
speech_dict = {}
for filename in filenames:
    filepath = os.path.join(fpath, filename)
    speechact = ""
    with open(filepath, 'rb') as f:
        speechact = f.readline()
    speech_dict[speechact] = sent_filename_dict[filename]
In [48]:
speech_dict.items()[:5]
Out[48]:
Read the pickle
In [180]:
df = pd.read_pickle('pickles/final/final_analysis.p')
Add the column for the CoreNLP sentiment data, matching each dataframe speechact to its file by Levenshtein similarity.
In [49]:
import Levenshtein

corenlp_sentiment = []
for n, row in df.iterrows():
    speechact = row['speechact']
    found_match = False
    for sa, sent in speech_dict.items():
        if Levenshtein.ratio(sa, speechact) > 0.85:
            corenlp_sentiment.append(sent)
            found_match = True
            break  # don't want multiple matches; just take the first.
    if not found_match:
        corenlp_sentiment.append(np.nan)
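As a quick illustration of the 0.85 threshold (the strings here are hypothetical, not from the data):

# Levenshtein.ratio returns a similarity between 0 (completely different)
# and 1 (identical); near-identical strings score close to 1.
print Levenshtein.ratio("Mr. Chairman, I refuse to answer that question.",
                        "Mr Chairman, I refuse to answer that question")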
In [50]:
corenlp_sentiment[:10]
Out[50]:
In [53]:
pickle.dump(corenlp_sentiment, open('pickles/final/corenlp_sentiment_on_speechacts_list.p', 'wb'))
In [156]:
sentiment = pickle.load(open('pickles/final/corenlp_sentiment_on_speechacts_list.p', 'rb'))
In [165]:
sentiment[:5]
Out[165]:
In [171]:
def base_and_multiplier_from_category(category):
    categories = ["Verynegative", "Negative", "Positive", "Verypositive"]
    # Python 2 integer division: bases are -5, 0, 0, 5 for the four categories.
    base = (categories.index(category) - 1)/2 * 5
    # positive categories add the score, negative categories subtract it
    multiplier = categories.index(category) > 1
    if multiplier:
        return (base, 1)
    else:
        return (base, -1)

def category_and_score_to_number(t):
    "produces a number that represents the given intensity and score."
    category, score = t
    base, multiplier = base_and_multiplier_from_category(category)
    return base + multiplier*score
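A quick sanity check of the coding under Python 2 division (the score 0.8 is an arbitrary illustrative value, not taken from the data):

for cat in ["Verynegative", "Negative", "Positive", "Verypositive"]:
    print cat, category_and_score_to_number((cat, 0.8))
# expected: Verynegative -5.8, Negative -0.8, Positive 0.8, Verypositive 5.8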
In [172]:
coded_sentiment = []
for s in sentiment:
    new_s = []
    if type(s) == float:
        coded_sentiment.append(np.nan)
        continue
    for pair in s:
        if pair[0] == "Neutral":
            new_s.append(np.nan)
        else:
            new_s.append(category_and_score_to_number(pair))
    coded_sentiment.append(new_s)
In [173]:
coded_sentiment[:5]
Out[173]:
In [174]:
pickle.dump(coded_sentiment, open('pickles/final/corenlp_sentiment_on_speechacts_list_coded.p', 'wb'))
In [181]:
sentiment = pickle.load(open('pickles/final/corenlp_sentiment_on_speechacts_list_coded.p', 'rb'))
In [182]:
df['corenlp_sentiment_by_sentence'] = sentiment
In [183]:
df.head()
Out[183]:
In [184]:
df.to_pickle('pickles/final/with_corenlp_sentiment_df.p')
We want to construct a graph that contains every edge CoreNLP picked up as having sentiment. Each edge should carry the CoreNLP sentiment measure as well as the LIWC sentiment measure and the LIWC pos/neg categories.
Then we can compare the two measures on the same graph.
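Roughly, each edge should end up looking like the sketch below; the attribute names follow the construction code later in this notebook, and the numbers are made up.

import networkx as nx
G_sketch = nx.DiGraph()
# networkx 1.x style: attributes passed as a dict, as in the construction code below
G_sketch.add_edge('witness_a', 'person_b', {'stanford_sent': -3.2, 'liwc_sent': -1.0})
print G_sketch['witness_a']['person_b']  # prints the attribute dict for that edge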
Name disambiguation
In [185]:
disambiguated_names = pickle.load(open('pickles/final/disambiguated_names.p', 'rb'))

def get_key(mention, l):
    "returns the numerical key for the given mention"
    mention = mention.lower()
    for chunk in l:
        if mention in chunk:
            return l.index(chunk)
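For illustration, assuming each entry of disambiguated_names groups the lowercased variants of one person's name (the actual structure comes from the name-disambiguation step), get_key behaves like this:

# Hypothetical structure; the real list is loaded from the pickle above.
example_names = [['john smith', 'mr. smith'], ['jane doe', 'miss doe']]
print get_key('Mr. Smith', example_names)        # 0
print get_key('JANE DOE', example_names)         # 1
print get_key('someone unknown', example_names)  # None (no match)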
Some utilities
In [ ]:
import difflib

transcript_dir = os.path.join("testimony/text/hearings")
interviewee_names = [f.replace(".txt", "") for f in os.listdir(transcript_dir)]
interviewee_names = map(lambda s: s.replace("-", " "), interviewee_names)

def is_interviewer(name):
    return not difflib.get_close_matches(name, interviewee_names)

graph_data = sentiment_graph_data = collections.defaultdict(lambda : collections.defaultdict(int))
In [192]:
import difflib

transcript_dir = os.path.join("testimony/text/hearings")
interviewee_names = [f.replace(".txt", "") for f in os.listdir(transcript_dir)]
interviewee_names = map(lambda s: s.replace("-", " "), interviewee_names)
relevant_categories = ['Posemo', 'Negemo', 'Anger', 'Posfeel']

def is_interviewer(name):
    return not difflib.get_close_matches(name, interviewee_names)

# the graph data has to be stored in separate dicts until we
# construct the graph; adding multiple edges between two nodes just
# replaces the attributes. We want to average/accumulate them.
nn_categories_by_sentence = []
nn_graph_data = collections.defaultdict(lambda : collections.defaultdict(dict))
sentiment_graph_data = collections.defaultdict(lambda : collections.defaultdict(int))
liwc_sentiment_graph_data = collections.defaultdict(lambda : collections.defaultdict(int))
count_data = collections.defaultdict(int)
skipped = 0
for n, row in df.iterrows():
    if n % 500 == 0:
        print n, "rows analyzed"
    speaker = row['speaker']
    if is_interviewer(speaker):
        skipped += 1
        continue
    # mention lists with and without anaphora resolution
    mention_list_w_anaphora = row['mention_list_by_sentence_with_anaphora']
    mention_list_wo_anaphora = row['mention_list_by_sentence_without_anaphora']
    corenlp_sentiment_by_sentence = row['corenlp_sentiment_by_sentence']
    liwc_sentiment_by_sentence = row['liwc_sentiment_by_sentence']
    if type(corenlp_sentiment_by_sentence) == float:  # np.nan: no corenlp data for this row
        skipped += 1
        continue
    speaker = get_key(speaker, disambiguated_names)
    if type(mention_list_w_anaphora) == list:
        sentiment_towards_mentions = {}
        for n, mentions in enumerate(mention_list_w_anaphora):
            for mention in mentions:
                mention = get_key(mention, disambiguated_names)
                if speaker == mention or not mention or type(corenlp_sentiment_by_sentence[n]) == float:
                    skipped += 1
                    continue
                sentiment_graph_data[speaker][mention] += corenlp_sentiment_by_sentence[n]
                liwc_sentiment_graph_data[speaker][mention] += liwc_sentiment_by_sentence[n]
                count_data[(speaker, mention)] += 1
    elif type(mention_list_wo_anaphora) == list:
        categories_towards_mentions = {}
        for n, mentions in enumerate(mention_list_wo_anaphora):
            for mention in mentions:
                mention = get_key(mention, disambiguated_names)
                if speaker == mention or not mention or type(corenlp_sentiment_by_sentence[n]) == float:
                    skipped += 1
                    continue
                sentiment_graph_data[speaker][mention] += corenlp_sentiment_by_sentence[n]
                liwc_sentiment_graph_data[speaker][mention] += liwc_sentiment_by_sentence[n]
                count_data[(speaker, mention)] += 1
print "skipped", skipped
print "sentiment_graph_data and liwc_sentiment_graph_data are now populated."
In [206]:
sentiment_graph_data.items()[:5]
Out[206]:
In [212]:
pos_stanford = 0
neg_stanford = 0
pos_liwc = 0
neg_liwc = 0
import networkx as nx

G = nx.DiGraph()
for source, targets in sentiment_graph_data.items():
    for accused, sent in targets.items():
        attrs = {'stanford_sent': sent, 'liwc_sent': liwc_sentiment_graph_data[source][accused]}
        if sent > 0:
            pos_stanford += 1
        elif sent < 0:
            neg_stanford += 1
        if liwc_sentiment_graph_data[source][accused] > 0:
            pos_liwc += 1
        elif liwc_sentiment_graph_data[source][accused] < 0:
            neg_liwc += 1
        G.add_edge(source, accused, attrs)

for speaker, mentions in anaphora_graph_data.items():
    if n % 10 == 0:
        print "analyzing", n, "mentions."
    for mentioned, attrs in mentions.items():
        count = anaphora_count_data[(speaker, mentioned)]
        normalized_attrs = normalize_dict(attrs, count)
        filtered = filter_categories(normalized_attrs, relevant_categories)
        n += 1
        try:
            dominant = max(filtered.items(), key=lambda p: p[1])
            dominant = {dominant[0]: dominant[1]}
            G_only_anaphora_with_dominant_categories.add_edge(speaker, mentioned, dominant)
        except ValueError:
            G_only_anaphora_with_dominant_categories.add_edge(speaker, mentioned)
In [211]:
nx.write_gml(G, 'graphs/final/corenlp_vs_liwc_sentiment.gml')
In [209]:
G.node
Out[209]:
In [204]:
count_data[(229, 287)]
Out[204]:
In [ ]: