In [3]:
import os
import difflib
import pandas as pd
import numpy as np
import corenlp_utils
import jsonrpclib
import lexicons.lexiconUtils as sentimentUtils
from collections import defaultdict
# Shared notebook state: the LIWC lexicon, the speech-act dataframe, the
# CoreNLP JSON-RPC client, and the list of interviewee names.
liwc = sentimentUtils.LiwcDict()
# NOTE(review): unpickling can execute arbitrary code; only load pickles
# produced by this project.
df = pd.read_pickle('pickles/with_interviewee_anaphora_sentiment.p')
server = jsonrpclib.Server("http://localhost:8084") # corenlp server
transcript_dir = 'testimony/text/hearings'
# One name per file in transcript_dir: strip ".txt" and turn dashes into
# spaces. This is built as a real list (not `map`) because is_interviewer()
# scans it on every call; a lazy Python 3 `map` iterator would be exhausted
# after the first call and every later speaker would look like an interviewer.
interviewee_names = [
    f.replace(".txt", "").replace("-", " ")
    for f in os.listdir(transcript_dir)
]
def is_interviewer(name, candidates=None):
    """Return True when `name` has no close match among the interviewee names.

    Matching is fuzzy (difflib.get_close_matches with its default settings),
    so small spelling differences still count as a match.

    Parameters:
        name: speaker name to look up.
        candidates: optional collection of interviewee names to match
            against. Defaults to the notebook-level `interviewee_names`.

    Returns:
        bool: True if no close match was found, False otherwise.
    """
    if candidates is None:
        candidates = interviewee_names
    return not difflib.get_close_matches(name, candidates)
In [ ]:
# Count LIWC category hits per speech act.
#
# df_categories gets exactly one entry per row of df, in row order, so it can
# be assigned back as a column afterwards:
#   * a defaultdict(int) mapping category -> number of lemmas in that category
#   * np.nan when CoreNLP returned nothing for the speech act
# The earlier debugging cap (`if n > 10: break`) and the per-lemma prints were
# removed: the cap left df_categories shorter than df, which breaks the column
# assignment, and the prints flood the cell output.
df_categories = []
for speech in df['speechact']:
    res = corenlp_utils.get_corenlp_object(speech, server)
    if not res:
        # Keep a placeholder so the list stays aligned with df's rows.
        df_categories.append(np.nan)
        continue
    # Must stay a defaultdict: the normalization step checks for this type.
    cats = defaultdict(int)
    for sen in res['sentences']:
        for word in sen['words']:
            # NOTE(review): assumes each word is a (token, attributes) pair
            # with a 'Lemma' attribute - confirm against the CoreNLP wrapper.
            lemma = word[1]['Lemma']
            # getCategories may return nothing for lemmas not in the lexicon.
            for category in liwc.getCategories(lemma) or ():
                cats[category] += 1
    df_categories.append(cats)
In [ ]:
# Attach the raw per-speech-act category counts and checkpoint the frame.
non_normalized_pickle = 'pickles/with_liwc_categories_non_normalized.p'
df['categories'] = df_categories
df.to_pickle(non_normalized_pickle)
Normalize the category counts by the length of each speechact
In [ ]:
# Turn raw category counts into per-speech-act rates: each count is divided by
# len(speechact).
# NOTE(review): if speechact is a string this is a character count, not a
# word count - confirm that is the intended denominator.
normalized = []
for _, record in df.iterrows():
    counts = record['categories']
    if type(counts) == defaultdict:
        normalized.append(
            {name: float(hits) / len(record['speechact'])
             for name, hits in counts.items()}
        )
    else:
        # Anything that is not a defaultdict of counts (e.g. the NaN
        # placeholder) is passed through unchanged.
        normalized.append(counts)
In [ ]:
# Replace the raw counts with the normalized values and checkpoint the frame.
normalized_pickle = 'pickles/liwc_categories_normalized.p'
df['categories'] = normalized
df.to_pickle(normalized_pickle)
In [4]:
# Reload the normalized checkpoint so the analysis below can start from here
# without re-running the CoreNLP pass, then preview the first rows.
normalized_pickle = 'pickles/liwc_categories_normalized.p'
df = pd.read_pickle(normalized_pickle)
df.head(n=5)
Out[4]:
I want to add a column to the dataframe that shows whether each speechact is an interviewee response.
(I don't know why I haven't done this before.)
Also, I just realized that the dataframe still has the old `mentions` list column. We should get rid of that.
I'll add a column for just the keys of the anaphora sentiment.
In [5]:
# Drop the stale `mentions` column. `axis` is passed by keyword because the
# positional form was deprecated and later removed from DataFrame.drop.
df = df.drop('mentions', axis=1)
In [39]:
# Label every speech act:
#   True   - speaker is an interviewee and anaphora_sentiment is present
#   False  - speaker is an interviewee and anaphora_sentiment is null
#   np.nan - speaker is an interviewer
# is_interviewer() does a fuzzy name match, so it is called once per row
# rather than once per branch.
response = []
for _, row in df.iterrows():
    if is_interviewer(row['speaker']):
        response.append(np.nan)
    else:
        response.append(not pd.isnull(row['anaphora_sentiment']))
df['interviewee_response'] = response
In [40]:
# Two-column view used for the category analysis: the interviewee-response
# flag (renamed to `response`) next to the per-speech-act category values.
cat_columns = {
    'response': df['interviewee_response'],
    'categories': df['categories'],
}
cat_df = pd.DataFrame(cat_columns)
def aggregate_dicts(ds):
    """Sum a collection of {key: number} mappings key by key.

    Parameters:
        ds: iterable of dict-like objects with numeric values.

    Returns:
        defaultdict(float) holding, for every key seen in any mapping, the
        sum of that key's values across all mappings.
    """
    totals = defaultdict(float)
    for mapping in ds:
        for key in mapping:
            totals[key] += mapping[key]
    return totals
# Empty placeholders; both names are reassigned with the aggregated totals in
# the plotting cell.
total_response, total_not_response = {}, {}
In [57]:
import matplotlib.pyplot as plt
# Sum the normalized category values separately for interviewee responses
# (response == True) and interviewee non-responses (response == False).
# Rows whose `categories` entry is missing are dropped before summing.
categories_by_response = cat_df.groupby('response')['categories']
total_response = aggregate_dicts(categories_by_response.get_group(True).dropna())
total_not_response = aggregate_dicts(categories_by_response.get_group(False).dropna())

# Build explicit, sorted lists for plotting. Dict key/value views are not
# accepted as sequences by matplotlib on Python 3, and sorting gives the bars
# a reproducible order instead of arbitrary dict order.
category_labels = sorted(total_response)
category_totals = [total_response[label] for label in category_labels]
positions = list(range(len(category_labels)))

fig, ax1 = plt.subplots()
ax1.bar(positions, category_totals, align='center', label='responses', color='g', width=.7)
plt.xticks(positions, category_labels, rotation='vertical', size='small')
ax1.set_xlabel('liwc category')
ax1.set_ylabel('frequency (normalized)')
ax1.set_title('liwc categories in speechacts')
# TODO: total_not_response is computed but not drawn yet; an earlier draft
# overlaid it in red ('non-responses') on a twin axis via ax1.twinx().
plt.show()
In [ ]: