Other categories from LIWC

Using other word categories, we can identify feelings like anxiety, anger, etc. in speechacts.

It's a rough measure when applied to an entire speechact, but let's see what it shows.

Each element of 'categories' will be a dictionary of {category: # of occurrences}


In [3]:
import os
import difflib
import pandas as pd
import numpy as np
import corenlp_utils
import jsonrpclib
import lexicons.lexiconUtils as sentimentUtils
from collections import defaultdict

# LIWC lexicon; queried later through liwc.getCategories(lemma).
liwc = sentimentUtils.LiwcDict()
# Dataframe checkpoint this notebook starts from.
df = pd.read_pickle('pickles/with_interviewee_anaphora_sentiment.p')
server = jsonrpclib.Server("http://localhost:8084")  # corenlp server

# Interviewee names are derived from the transcript file names:
# the ".txt" suffix is removed and dashes become spaces.
transcript_dir = os.path.join('testimony', 'text', 'hearings')
# Built as a real list (not a lazy map object) because is_interviewer()
# scans it on every call and a one-shot iterator would be exhausted after
# the first lookup on Python 3.
interviewee_names = [
    f.replace(".txt", "").replace("-", " ")
    for f in os.listdir(transcript_dir)
]

def is_interviewer(name):
    """Return True when *name* fuzzy-matches none of the interviewee names.

    The comparison uses difflib.get_close_matches against the module-level
    ``interviewee_names`` list, so small spelling differences still count as
    a match (and therefore as an interviewee).
    """
    close_matches = difflib.get_close_matches(name, interviewee_names)
    return len(close_matches) == 0

In [ ]:
# Debug knob: set to an int to process only the first N rows and trace every
# lemma lookup. Leave as None for a real run -- the following cell assigns
# df_categories as a column, so it needs exactly one entry per row of df.
DEBUG_MAX_ROWS = None

df_categories = []

for position, (_, row) in enumerate(df.iterrows()):
    # Count rows by position rather than by index label so the limit works
    # regardless of how df is indexed.
    if DEBUG_MAX_ROWS is not None and position >= DEBUG_MAX_ROWS:
        break
    res = corenlp_utils.get_corenlp_object(row['speechact'], server)
    if not res:
        # No parse available: keep a NaN placeholder so the list stays
        # aligned with df's rows.
        df_categories.append(np.nan)
        continue
    # {category: number of lemmas in this speechact that belong to it}
    cats = defaultdict(int)
    for sen in res['sentences']:
        for word in sen['words']:
            lemma = word[1]['Lemma']
            categories = liwc.getCategories(lemma)
            if DEBUG_MAX_ROWS is not None:
                print("lemma:  %s categories: %s" % (lemma, categories))
            if not categories:
                continue
            for category in categories:
                cats[category] += 1
            if DEBUG_MAX_ROWS is not None:
                print("cats: %s" % (cats,))
    if DEBUG_MAX_ROWS is not None:
        print("adding: %s" % (cats,))
    df_categories.append(cats)

In [ ]:
# Attach the raw per-speechact category counts as a column and checkpoint
# the frame before any normalization is applied.
non_normalized_path = 'pickles/with_liwc_categories_non_normalized.p'
df['categories'] = df_categories
df.to_pickle(non_normalized_path)

Normalize the category counts by the length of each speechact


In [ ]:
normalized = []

for _, row in df.iterrows():
    categories = row['categories']
    if not isinstance(categories, defaultdict):
        # Anything that is not a raw count table (e.g. the NaN placeholder
        # stored when no parse was available) is passed through untouched.
        normalized.append(categories)
        continue
    # NOTE(review): len() of the speechact is presumably its character
    # count, not its word count -- confirm this is the intended denominator.
    speech_length = float(len(row['speechact']))
    normalized.append(
        {category: count / speech_length
         for category, count in categories.items()}
    )

In [ ]:
# Replace the raw counts with their normalized form and checkpoint the
# result; the "Start Here" cell reloads this file.
normalized_path = 'pickles/liwc_categories_normalized.p'
df['categories'] = normalized
df.to_pickle(normalized_path)

Start Here


In [4]:
# Reload the normalized checkpoint written earlier in this notebook; the
# trailing expression makes the notebook render a preview of the frame.
normalized_checkpoint = 'pickles/liwc_categories_normalized.p'
df = pd.read_pickle(normalized_checkpoint)
df.head()


Out[4]:
mentions speaker speechact sentiment anaphora_sentiment categories
0 [] macia 'I'AvENNEit. Have both counsel identified them... NaN NaN {u'School': 0.0144927536232, u'Pronoun': 0.014...
1 [] macia AI'AENNEI1. When and where- NaN NaN {u'Space': 0.0357142857143, u'Incl': 0.0357142...
2 [] macia ,JACK ON. No. I am sorry, you cannot. NaN NaN {u'Pronoun': 0.0263157894737, u'Future': 0.026...
3 [] jackson The request, in line with the rules of the co... NaN NaN {u'Cogmech': 0.0121457489879, u'Tentat': 0.008...
4 [] macia ''M NER. illt volt (Otill emtoyvl its italelo ... NaN NaN {u'Eating': 0.0131578947368, u'Pronoun': 0.026...

I want to add a column to the dataframe that shows whether each speechact is an interviewee response.

(I don't know why I haven't done this before)

Also, I just realized that I still have the old mentions list. We should get rid of that.

I'll add a column for just the keys of the anaphora sentiment.


In [5]:
# Drop the stale 'mentions' column. The axis is passed by keyword because
# newer pandas releases no longer accept it positionally.
df = df.drop('mentions', axis=1)

In [39]:
# Label every speechact:
#   True  -> speaker is an interviewee and anaphora_sentiment is present
#   False -> speaker is an interviewee but anaphora_sentiment is null
#   NaN   -> speaker is an interviewer (label not applicable)
response = []
# is_interviewer() does a fuzzy match against every interviewee name, so
# evaluate it once per distinct speaker instead of up to twice per row.
interviewer_by_speaker = {}

for _, row in df.iterrows():
    speaker = row['speaker']
    if speaker not in interviewer_by_speaker:
        interviewer_by_speaker[speaker] = is_interviewer(speaker)
    if interviewer_by_speaker[speaker]:
        response.append(np.nan)
    else:
        response.append(not pd.isnull(row['anaphora_sentiment']))
df['interviewee_response'] = response

In [40]:
# Narrow working frame: the response label next to the category dict of
# each speechact.
cat_df = pd.DataFrame(
    {
        'response': df['interviewee_response'],
        'categories': df['categories'],
    }
)

def aggregate_dicts(ds):
    """Sum a collection of dicts key by key.

    ds: iterable of dict-like objects (each must provide ``.items()``)
        whose values can be added to a float.

    Returns a ``defaultdict(float)`` mapping every key seen in any input
    to the total of its values; keys that never occur read as 0.0.
    """
    totals = defaultdict(float)
    # Flatten lazily so the inputs are still consumed one dict at a time.
    pairs = (pair for mapping in ds for pair in mapping.items())
    for key, amount in pairs:
        totals[key] += amount
    return totals
            

# Placeholders for the per-group category totals; the plotting cell
# fills them in.
total_response, total_not_response = {}, {}

In [57]:
import matplotlib.pyplot as plt

# Group once and reuse the grouping for both labels. Rows whose categories
# value is NaN are dropped before summing because aggregate_dicts() calls
# .items() on every element.
categories_by_response = cat_df.groupby('response')['categories']
total_response = aggregate_dicts(categories_by_response.get_group(True).dropna())
total_not_response = aggregate_dicts(categories_by_response.get_group(False).dropna())

# Materialise labels and heights as paired lists: bar() needs real
# sequences (dict views do not convert to arrays on Python 3), and building
# the heights from the labels keeps each bar under the right tick.
labels = list(total_response.keys())
heights = [total_response[label] for label in labels]
positions = list(range(len(labels)))

fig, ax1 = plt.subplots()
ax1.bar(positions, heights, align='center', label='responses', color='g', width=.7)
ax1.set_xticks(positions)
ax1.set_xticklabels(labels, rotation='vertical', size='small')
ax1.set_xlabel('liwc category')
ax1.set_ylabel('frequency (normalized)')
ax1.set_title('liwc categories in speechacts')

# TODO: overlay total_not_response (e.g. red bars on ax1.twinx()) so that
# responses and non-responses can be compared in the same figure.

plt.show()



In [ ]: