Other categories from LIWC

Using other word categories, we can identify feelings like anxiety, anger, etc. in speechacts.

It's a rough measure when applied to an entire speechact, but we'll see what happens, I guess.

Each element of 'categories' will be a dictionary of {category: # of occurrences}



In [3]:

    
import os
import difflib
import pandas as pd
import numpy as np
import corenlp_utils
import jsonrpclib
import lexicons.lexiconUtils as sentimentUtils
from collections import defaultdict

# Lexicon object; its getCategories(lemma) is used below to look up the
# categories of each lemma.
liwc = sentimentUtils.LiwcDict()
df = pd.read_pickle('pickles/with_interviewee_anaphora_sentiment.p')
server = jsonrpclib.Server("http://localhost:8084") # corenlp server

transcript_dir = os.path.join('testimony/text/hearings')
# One transcript file per interviewee: strip the ".txt" marker and turn the
# dashes into spaces to recover the name. Built as a real list (not a lazy
# map object) because is_interviewer() scans it once per call; a one-shot
# iterator would be exhausted after the first lookup on Python 3.
interviewee_names = [
    f.replace(".txt", "").replace("-", " ")
    for f in os.listdir(transcript_dir)
]

def is_interviewer(name):
    """Report whether *name* has no close match among the interviewees.

    Returns True when difflib finds no close match for *name* in the
    module-level ``interviewee_names``, and False as soon as at least one
    close match exists.
    """
    close_matches = difflib.get_close_matches(name, interviewee_names)
    return len(close_matches) == 0



In [ ]:

    
# Set to a small integer to smoke-test on the first few rows. None processes
# every row, which is required for df_categories to have one entry per row of
# df when it is later assigned as a column.
MAX_ROWS = None
# Per-lemma tracing is extremely noisy on a full dataframe; enable only when
# debugging.
VERBOSE = False

df_categories = []

# Count by position rather than by index label so the cap behaves the same
# regardless of how df is indexed.
for position, (_, row) in enumerate(df.iterrows()):
    if MAX_ROWS is not None and position >= MAX_ROWS:
        break
    res = corenlp_utils.get_corenlp_object(row['speechact'], server)
    if not res:
        # Nothing came back for this speechact: keep a NaN placeholder so the
        # list stays aligned with the rows of df.
        df_categories.append(np.nan)
        continue
    # Kept as a defaultdict on purpose: the normalization step further down
    # recognises count dictionaries by that type.
    cats = defaultdict(int)
    for sen in res['sentences']:
        for word in sen['words']:
            lemma = word[1]['Lemma']
            categories = liwc.getCategories(lemma)
            if VERBOSE:
                print("lemma: %s categories: %s" % (lemma, categories))
            if not categories:
                continue
            for category in categories:
                cats[category] += 1
            if VERBOSE:
                print("cats: %s" % (dict(cats),))
    if VERBOSE:
        print("adding: %s" % (dict(cats),))
    df_categories.append(cats)



In [ ]:

    
# Attach the per-speechact category counts as a new column; df_categories must
# hold exactly one entry per row of df for this assignment to succeed.
df['categories'] = df_categories
# Persist the raw (non-normalized) counts.
df.to_pickle('pickles/with_liwc_categories_non_normalized.p')

Normalize the categories: divide each category count by the length of its speechact (in characters).



In [ ]:

    
normalized = []

for categories, speech in zip(df['categories'], df['speechact']):
    # Only the defaultdict count dictionaries are normalized. Anything else
    # (the NaN placeholders, or plain dicts that were already normalized) is
    # passed through untouched, so re-running this cell is harmless.
    if not isinstance(categories, defaultdict):
        normalized.append(categories)
        continue
    # The divisor is len() of the speechact value itself, i.e. its length in
    # characters when the speechact is a string.
    length = float(len(speech))
    normalized.append(
        dict((name, count / length) for name, count in categories.items())
    )



In [ ]:

    
# Replace the raw counts with their normalized counterparts (same column name).
df['categories'] = normalized
# Persist the normalized frame; the analysis cells below reload this file.
df.to_pickle('pickles/liwc_categories_normalized.p')

Start Here



In [4]:

    
# Reload the normalized frame from disk so the analysis can start here without
# re-running the cells above.
df = pd.read_pickle('pickles/liwc_categories_normalized.p')
df.head()









    Out[4]:






  
    
      
      mentions
      speaker
      speechact
      sentiment
      anaphora_sentiment
      categories
    
  
  
    
      0
       []
         macia
       'I'AvENNEit. Have both counsel identified them...
       NaN
       NaN
       {u'School': 0.0144927536232, u'Pronoun': 0.014...
    
    
      1
       []
         macia
                            AI'AENNEI1. When and where- 
       NaN
       NaN
       {u'Space': 0.0357142857143, u'Incl': 0.0357142...
    
    
      2
       []
         macia
                  ,JACK ON. No. I am sorry, you cannot. 
       NaN
       NaN
       {u'Pronoun': 0.0263157894737, u'Future': 0.026...
    
    
      3
       []
       jackson
        The request, in line with the rules of the co...
       NaN
       NaN
       {u'Cogmech': 0.0121457489879, u'Tentat': 0.008...
    
    
      4
       []
         macia
       ''M NER. illt volt (Otill emtoyvl its italelo ...
       NaN
       NaN
       {u'Eating': 0.0131578947368, u'Pronoun': 0.026...

I want to add a column to the dataframe that shows if that speechact is an interviewee response.

(I don't know why I haven't done this before)

Also, I just realized that I still have the old mentions list. We should get rid of that.

I'll add a column for just the keys of the anaphora sentiment.



In [5]:

    
# Drop the stale 'mentions' column. `axis` is passed by keyword because newer
# pandas releases no longer accept it positionally.
df = df.drop('mentions', axis=1)



In [39]:

    
response = []
# is_interviewer() fuzzy-matches the speaker against every interviewee name,
# and the same speakers recur on many rows, so remember one verdict per
# distinct speaker instead of recomputing it (up to twice) for each row.
interviewer_by_speaker = {}

for speaker, sentiment in zip(df['speaker'], df['anaphora_sentiment']):
    if speaker not in interviewer_by_speaker:
        interviewer_by_speaker[speaker] = is_interviewer(speaker)
    if interviewer_by_speaker[speaker]:
        # Interviewer speechacts are neither a response nor a non-response.
        response.append(np.nan)
    else:
        # Interviewee speechact: True when it has an anaphora sentiment
        # value, False when that value is missing.
        response.append(not pd.isnull(sentiment))
df['interviewee_response'] = response



In [40]:

    
# Narrow working frame: the interviewee-response flag next to the category
# dictionary of each speechact.
cat_df = pd.DataFrame({'response': df['interviewee_response'], 'categories': df['categories']})

def aggregate_dicts(ds):
    """Sum the values of several dictionaries key by key.

    ds -- an iterable of dictionaries whose values can be added together.

    Returns a defaultdict(float) mapping every key seen in any of the
    dictionaries to the total of its values across all of them.
    """
    totals = defaultdict(float)
    # Flatten every (key, value) pair of every dictionary into one stream.
    pairs = (item for mapping in ds for item in mapping.items())
    for key, value in pairs:
        totals[key] += value
    return totals
            

# Empty placeholders; both names are reassigned with the aggregated totals in
# the plotting cell below.
total_response = {}
total_not_response = {}



In [57]:

    
import matplotlib.pyplot as plt

# Group once and reuse the grouping for both the response and the
# non-response speechacts; rows without a category dict are dropped first.
categories_by_response = cat_df.groupby('response')['categories']
total_response = aggregate_dicts(categories_by_response.get_group(True).dropna())
total_not_response = aggregate_dicts(categories_by_response.get_group(False).dropna())

# Materialise labels and bar heights as lists: on Python 3 keys()/values()
# are views rather than sequences, which matplotlib does not reliably accept.
# Both are read from the same unmodified dict, so their order matches.
labels = list(total_response.keys())
heights = list(total_response.values())
positions = list(range(len(labels)))

fig, ax1 = plt.subplots()
ax1.bar(positions, heights, align='center', label='responses', color='g', width=.7)
plt.xticks(positions, labels, rotation='vertical', size='small')
ax1.set_xlabel('liwc category')
ax1.set_ylabel('frequency (normalized)')
ax1.set_title('liwc categories in speechacts')

# Overlay of the non-response totals, currently disabled:
#ax2 = ax1.twinx()
#ax2.bar(range(len(total_not_response)), total_not_response.values(), align='center', label='non-responses', color='r')



plt.show()



In [ ]:

	mentions	speaker	speechact	sentiment	anaphora_sentiment	categories
0	[]	macia	'I'AvENNEit. Have both counsel identified them...	NaN	NaN	{u'School': 0.0144927536232, u'Pronoun': 0.014...
1	[]	macia	AI'AENNEI1. When and where-	NaN	NaN	{u'Space': 0.0357142857143, u'Incl': 0.0357142...
2	[]	macia	,JACK ON. No. I am sorry, you cannot.	NaN	NaN	{u'Pronoun': 0.0263157894737, u'Future': 0.026...
3	[]	jackson	The request, in line with the rules of the co...	NaN	NaN	{u'Cogmech': 0.0121457489879, u'Tentat': 0.008...
4	[]	macia	''M NER. illt volt (Otill emtoyvl its italelo ...	NaN	NaN	{u'Eating': 0.0131578947368, u'Pronoun': 0.026...