setup


In [224]:
import pickle
import os
import difflib
import random
import corenlp_utils
import jsonrpclib
import simplejson
import numpy as np
import pandas as pd

transcript_dir = os.path.join('testimony/text/hearings')


df = pd.read_pickle('pickles/final_sentiment.p')
anaphora_and_mention_sentiment = pickle.load(open('pickles/anaphora_or_mention.p', 'rb'))

interviewee_names = [f.replace(".txt", "") for f in os.listdir(transcript_dir)]
interviewee_names = map(lambda s: s.replace("-", " "), interviewee_names)

adding information about whether an interviewee response refers to an interviewer's mention.


In [32]:
def is_interviewer(name):
    return not difflib.get_close_matches(name, interviewee_names)

anaphora_mentions = []
for n, row in df.iterrows():
    # don't want interviewer sentiment here.
    if is_interviewer(row['speaker']):
        to_add = np.nan
    if anaphora_and_mention_sentiment.has_key(n):
        to_add = anaphora_and_mention_sentiment[n]
    anaphora_mentions.append(to_add)

df['anaphora_sentiment'] = anaphora_mentions
df.head()


Out[32]:
mentions speaker speechact sentiment anaphora_sentiment
0 [] macia 'I'AvENNEit. Have both counsel identified them... NaN NaN
1 [] macia AI'AENNEI1. When and where- NaN NaN
2 [] macia ,JACK ON. No. I am sorry, you cannot. NaN NaN
3 [] jackson The request, in line with the rules of the co... NaN NaN
4 [] macia ''M NER. illt volt (Otill emtoyvl its italelo ... NaN NaN

In [179]:
df.to_pickle('pickles/with_interviewee_anaphora_sentiment.p')

let's make it a series, and see what we've got:


In [42]:
vals = pd.Series(anaphora_mentions)
vals[vals.notnull()]


Out[42]:
1156                                  {u'Jane Howe': 0.0}
1157                                  {u'Jane Howe': 0.0}
1216                            {u'Harrison George': 0.0}
1217                 {u'Harrison George': 0.117028985507}
1224                {u'Harrison George': 0.0133333333333}
1225                {u'Harrison George': 0.0133333333333}
1233                               {u'Moxley Carey': 0.2}
1234                               {u'Moxley Carey': 0.2}
1243    {u'Dick Lewis': 0.0, u'Richard B. Lewis': 0.0,...
1244          {u'Richard B. Lewis': 0.0, u'Richard': 0.0}
1245                                      {u'Lewis': 0.0}
1246                                      {u'Lewis': 0.0}
1249                                 {u'Abe Minkus': 0.0}
1250                                 {u'Abe Minkus': 0.0}
1253                                {u'Zara Becker': 0.0}
...
7998                              {u'Victor Killian': 0.0}
7999                              {u'Victor Killian': 0.0}
8000                              {u'Victor Killian': 0.0}
8001                              {u'Victor Killian': 0.0}
8008                          {u'Hester Sondergaard': 0.0}
8009     {u'Gale Sondergaard': 0.0, u'Hester Sondergaar...
8496            {u'Ann Morgan': 0.0, u'Steve Morgan': 0.0}
8497                             {u'Ann Roth Morgan': 0.0}
8506                              {u'Michael Wilson': 0.0}
8507                              {u'Michael Wilson': 0.0}
8596                             {u'Catherine Brant': 0.1}
8597                             {u'Catherine Brant': 0.1}
11788                                {u'Lester Cole': 0.0}
11887                                     {u'Offner': 0.0}
11888                                     {u'Offner': 0.0}
Length: 222, dtype: object

Analysis #1: Sentiment of mentions

I'm interested in seeing if there's a difference in sentiment when an individual cold-mentions someone vs when they mention them via anaphora, in response to an interviewer.

I expect there to be generally positive sentiment when the mention is via anaphora, and negative sentiment when it is a cold mention.


In [87]:
mentioned_sentiment = []
referenced_sentiment = []

for n, row in df.iterrows():
    a_sent_val = np.nan
    sent_val = np.nan

    a_sent = row['anaphora_sentiment']
    if not pd.isnull(a_sent):
        referenced_sentiment.extend(a_sent.values())

    sent = row['sentiment']
    if not pd.isnull(sent) and not is_interviewer(row['speaker']):
        mentioned_sentiment.extend(sent.values())

    # referenced_sentiment.extend(a_sent_val)
    # mentioned_sentiment.extend(sent_val)

In [91]:
import matplotlib.pyplot as plt
%pylab inline
plt.hist(referenced_sentiment, bins=30, alpha=0.5, label='sentiment when responding')
plt.hist(mentioned_sentiment, bins=30, alpha=0.5, label='sentiment when mentioning')

legend(bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0.)
plt.show()


Populating the interactive namespace from numpy and matplotlib

doesn't look a lot different, but there are some stragglers on the referenced side that look to be negative. What does it look like if we get rid of the neutral sentiment?


In [236]:
referenced_sentiment_no_zeroes = filter(lambda x:.005 <= x or x <= -0.005, referenced_sentiment)
print "removed", len(referenced_sentiment) - len(referenced_sentiment_no_zeroes), "elements from referenced_sentiment."
mentioned_sentiment_no_zeroes = filter(lambda x: 0.005 <= x or x <= -0.005, mentioned_sentiment)
print "removed", len(mentioned_sentiment) - len(mentioned_sentiment_no_zeroes), "elements from mentioned_sentiment."

fig, ax1 = plt.subplots()
ax1.hist(referenced_sentiment_no_zeroes, bins=30, alpha=0.5, label='responding', color='g')

ax1.set_xlabel('sentiment score')
ax1.set_ylabel('frequency(responding)')
ax1.set_title('non-neutral sentiment')

ax2 = ax1.twinx()
ax2.hist(mentioned_sentiment_no_zeroes, bins=30, alpha=0.5, label='mentioning', color='r')
ax2.set_ylabel('frequency(mentioning)')

handles, labels = ax1.get_legend_handles_labels()
ax1.legend(handles, labels, loc=2)

handles, labels = ax2.get_legend_handles_labels()
ax2.legend(handles, labels, loc=1)

pylab.rcParams['figure.figsize'] = (10.0, 8.0)
plt.show()


removed 297 elements from referenced_sentiment.
removed 1285 elements from mentioned_sentiment.

there seems to be stronger overall sentiment when someone is responding to an interviewer than when they mention them on their own.

This is surprising because I would think that mentioning someone's name without that person being mentioned by the interviewer would invoke stronger sentiment.

Both distributions appear to be positive-leaning, as well. This is especially surprising in the mentioning dataset. I would expect responses that mention someone to carry some amount of positive sentiment, but cold mentions to carry generally negative sentiment.

Future analysis

I'd like to run sentiment with other keywords from liwc to discover if there are any patterns in emotional sentiment in the responses and mentions.

It would be interesting to see if many people were, say, defensive when responding, but aggressive when mentioning cold.

Sanity-check on these results

let's get a random sample of mentioning speechacts:


In [ ]:
# get all the sentiment values
sentiment = df['sentiment'].dropna()
sentiment = pd.DataFrame(sentiment.apply(lambda x: tuple(x.values())))

# get all the sentiment values that are non-zero
sentiment = sentiment.groupby('sentiment')['sentiment'].filter(lambda x: any(list(x)))

# take a random sample
sample_rows = np.random.choice(sentiment.index.values, 20)
sample = sentiment.ix[sample_rows]

# get the speechacts & mentionsfor this random sample
sample_speechacts = df.ix[sample.index.values]

for n, row in sample_speechacts.iterrows():
    print "speaker:", row['speaker']
    print "sentiment:", row['sentiment']
    print "text:", row['speechact']
    print "----"


#rand_smpl = [ mylist[i] for i in sorted(random.sample(xrange(len(mylist)), 4)) ]

Other categories from liwc

Using other word categories, we can identify feelings like anxiety, anger, etc. in speechacts.

It's a rough measure when used in an entire speechact, but we'll see what happens, I guess.

Each element of 'categories' will be a dictionary of {category: # of occurrences}


In [19]:
import lexicons.lexiconUtils as sentimentUtils
from collections import defaultdict
import corenlp_utils
import jsonrpclib

liwc = sentimentUtils.LiwcDict()
df = pd.read_pickle('pickles/with_interviewee_anaphora_sentiment.p')
server = jsonrpclib.Server("http://localhost:8084") # corenlp server

In [ ]:
df_categories = []

for n, row in df.iterrows():
    if n > 10:
        break
    speech = row['speechact']
    res = corenlp_utils.get_corenlp_object(speech, server)
    if not res:
        df_categories.append(np.nan)
        continue
    cats = defaultdict(int)
    for sen in res['sentences']:
        for word in sen['words']:
            lemma = word[1]['Lemma']
            categories = liwc.getCategories(lemma)
            print "lemma: ", lemma, "categories:", categories

            if not categories:
                continue
            for category in categories:
                cats[category] += 1
            print "cats:", cats
    print "adding:", cats
    df_categories.append(cats)

In [ ]:
df_categories[:20]

In [36]:
df['categories'] = df_categories
df.to_pickle('pickles/with_liwc_categories_non_normalized.p')

Normalize the categories


In [65]:
normalized = []

for n, row in df.iterrows():
    categories = row['categories']
    if type(categories) != defaultdict:
        normalized.append(categories)
        continue
    normalized_dict = dict((k, (float(v)/len(row['speechact']))) for k,v in categories.items())
    normalized.append(normalized_dict)

In [68]:
df['categories'] = normalized
df.to_pickle('pickles/liwc_categories_normalized.p')

uncooperativeness.

We want to find on how many occasions an interviewee was uncooperative, and look at:

- the sentiment expressed in the previous speechact
- the sentiment expressed when they were being cooperative

How do we measure uncooperativeness in a single speechact?

I don't think uncooperativeness is a sliding scale. I think it's binary. Given what I've seen in the testimony, using keywords might be enough.

However, it might be useful to try to classify speechacts into 'cooperative' and 'uncooperative' using ml. I don't think that would be a difficult task for svm/human annotation.


In [ ]:
keywords = ['refuse', 'i refuse', 'refuses', 'decline', 'declining', 'illegal', 'declination']
uncooperative_lemmas = ['refuse', 'decline', 'illegal']
server = jsonrpclib.Server("http://localhost:8084") # corenlp server

uncooperative = []
for row in df.iterrows():

    if is_interviewer(row['speaker']):
        uncooperative.append(False)
        continue

    speech = row['speechact']
    res = corenlp_utils.get_corenlp_object(speech)
    sentences = res['sentences']
    words = []
    for sen in sentences:
        words.extend(sen['words'])
    lemmas = [word['Lemma'] for word in words]

    intersection = [l in uncooperative_lemmas for l in lemmas]

    if any(intersection):
        uncooperative.append(True)
    else:
        uncooperative.append(False)

In [ ]:
#speakers = df.groupby(['speaker'])['speaker'].nunique().reset_index()
server = jsonrpclib.Server("http://localhost:8084") # corenlp server
corenlp_utils.get_corenlp_object('I refuse to answer the question on the grounds preivously stated.', server)