The goal of this notebook is to construct the final dataframe used for all downstream analysis.
For each speechact, it should include the fields listed in the dictionary below: whether the speaker is an interviewee, whether the speechact is a response, LIWC categories and sentiment (per speechact and per sentence), and mention lists with and without anaphora.
Setup - load the original dataframe and modules
In [179]:
import pickle
import os
import difflib
import jsonrpclib
import simplejson
import text_utils
import collections
import numpy as np
import pandas as pd
import corenlp_utils
from corenlp_utils import *
from sentiment_utils import *
%load_ext autoreload
%autoreload 2
server = jsonrpclib.Server("http://localhost:8084")
transcript_dir = os.path.join('testimony/text/hearings')
interviewee_names = [f.replace(".txt", "") for f in os.listdir(transcript_dir)]
interviewee_names = map(lambda s: s.replace("-", " "), interviewee_names)
def is_interviewer(name):
    # No close fuzzy match against the interviewee name list means this speaker is an interviewer.
    return not difflib.get_close_matches(name, interviewee_names)
df = pd.read_pickle('pickles/speaker_speechacts.p')
Create a dictionary to add to the dataframe. Missing / inapplicable entries have 'np.nan' as the value.
In [164]:
d = {"is_interviewee": [],
"is_response": [],
"liwc_categories_for_speechact": [],
"liwc_categories_by_sentence": [],
"liwc_sentiment_for_speechact": [],
"liwc_sentiment_by_sentence": [],
"liwc_sentiment_towards_entities_with_anaphora": [],
"liwc_sentiment_towards_entities_without_anaphora": [],
"liwc_sentiment_towards_only_anaphora": [],
"mention_list_by_sentence_without_anaphora": [],
"mention_list_by_sentence_with_anaphora": [],
"mention_list_for_speechact_without_anaphora": [],
}
Now populate the dictionary. This makes many calls to the utility scripts.
This pass does not include disambiguated mention information or CoreNLP sentiment data; we'll get to those later.
In [165]:
for n, row in df.iterrows():
    # if n > 5000:
    #     break
    # interviewee
    d['is_interviewee'].append(not is_interviewer(row['speaker']))
    # response: an interviewee speechact immediately following an interviewer speechact
    # (guard n > 0 so we don't look back past the first row)
    is_response = n > 0 and not is_interviewer(row['speaker']) and is_interviewer(df.ix[n - 1]['speaker'])
    d['is_response'].append(is_response)
    if is_response:
        prev_speechact = corenlp_utils.get_corenlp_object(df.ix[n - 1]['speechact'], server)
        if not prev_speechact:
            is_response = False
    speechact = corenlp_utils.get_corenlp_object(row['speechact'], server)
    if not speechact:  # everything is nan.
        d['liwc_categories_for_speechact'].append(np.nan)
        d['liwc_categories_by_sentence'].append(np.nan)
        d['liwc_sentiment_for_speechact'].append(np.nan)
        d['liwc_sentiment_by_sentence'].append(np.nan)
        d['liwc_sentiment_towards_entities_without_anaphora'].append(np.nan)
        d['liwc_sentiment_towards_entities_with_anaphora'].append(np.nan)
        d['liwc_sentiment_towards_only_anaphora'].append(np.nan)
        d['mention_list_by_sentence_with_anaphora'].append(np.nan)
        d['mention_list_by_sentence_without_anaphora'].append(np.nan)
        d['mention_list_for_speechact_without_anaphora'].append(np.nan)
        continue
    else:
        # response entries
        if is_response:
            d['liwc_sentiment_towards_entities_with_anaphora'].append(liwc_sentiment_with_anaphora(speechact, prev_speechact, server))
            d['liwc_sentiment_towards_only_anaphora'].append(liwc_sentiment_towards_only_anaphora(speechact, prev_speechact, server))
            d['mention_list_by_sentence_with_anaphora'].append(mention_list_by_sentence_with_anaphora(speechact, prev_speechact, server))
        else:
            d['liwc_sentiment_towards_entities_with_anaphora'].append(np.nan)
            d['liwc_sentiment_towards_only_anaphora'].append(np.nan)
            d['mention_list_by_sentence_with_anaphora'].append(np.nan)
        # mention lists
        d['liwc_sentiment_towards_entities_without_anaphora'].append(liwc_sentiment_towards_all_entities_in_speechact_no_anaphora(speechact))
        d['mention_list_by_sentence_without_anaphora'].append(mention_list_by_sentence_no_anaphora(speechact))
        d['mention_list_for_speechact_without_anaphora'].append(mention_list_for_speechact_no_anaphora(speechact))
        # liwc categories
        d['liwc_categories_for_speechact'].append(liwc_categories_for_speechact(speechact))
        d['liwc_categories_by_sentence'].append(liwc_categories_by_sentence(speechact))
        # overall liwc sentiment
        d['liwc_sentiment_for_speechact'].append(liwc_overall_sentiment_of_speechact(speechact))
        d['liwc_sentiment_by_sentence'].append(liwc_overall_sentiment_by_sentence(speechact))
    if n % 500 == 0:
        temp_pickle_name = "final_analysis_" + str(n) + "_tmp.p"
        print "analyzed", n, "speechacts. Saving temporary pickle as", temp_pickle_name
        pickle.dump(d, open("pickles/final/" + temp_pickle_name, 'wb'))
        #df.to_pickle("pickles/final/" + temp_pickle_name)

temp_pickle_name = "final_analysis_" + str(n) + "_FINAL.p"
print "analyzed", n, "speechacts. Saving FINAL pickle as", temp_pickle_name
pickle.dump(d, open("pickles/final/" + temp_pickle_name, 'wb'))
Run this if the final pickle has already been generated:
In [82]:
d = pickle.load(open('pickles/final/final_analysis_12953_FINAL.p', 'rb'))
In [166]:
#df_d = pd.DataFrame(d)
# Sanity check: every column list should be the same length before we build the dataframe.
for key in d.keys():
    print key, "is", len(d[key]), "long"
Let's add the speaker and speechact columns, and save it for later:
In [168]:
df_d = pd.DataFrame(d)
df_d['speechact'] = df['speechact']
df_d['speaker'] = df['speaker']
df_d.to_pickle("pickles/final/final_dataframe.p")
In [ ]:
df_d.head()
In [255]:
def flatten(l):
    return [item for sublist in l for item in sublist]

# get full list of all mentions with and without anaphora.
# the with-anaphora lists are nested per sentence, so they need two levels of flattening.
all_mentions_with_anaphora = [s.lower() for s in flatten(flatten(df_d['mention_list_by_sentence_with_anaphora'].dropna()))]
all_mentions_without_anaphora = [s.lower() for s in flatten(df_d['mention_list_for_speechact_without_anaphora'].dropna())]
all_mentions = list(set(all_mentions_with_anaphora + all_mentions_without_anaphora))
print "original number of mentions:", len(all_mentions)
disambiguated_names = text_utils.chunk_list(all_mentions)
print "number of disambiguated mentions:", len(disambiguated_names)
The disambiguated mention list generated previously had ~1227 elements. An increase of about 140 is plausible once anaphora are added; it may seem low, but we found earlier that expanding the mention list with anaphora didn't add much, so this is consistent.
A helper to look up the key (the index of a chunk in the disambiguated list) for an individual mention:
In [147]:
def get_key(mention, l):
    "Return the index of the chunk in the chunked list l that contains this mention, or None."
    mention = mention.lower()
    for i, chunk in enumerate(l):
        if mention in chunk:
            return i
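For example, on a toy chunked list (hypothetical names, not drawn from the transcripts), get_key behaves like this:
In [ ]:
toy_chunks = [["john smith", "mr. smith"], ["jane doe"]]
print get_key("Mr. Smith", toy_chunks)         # 0 -- matched via lowercasing
print get_key("jane doe", toy_chunks)          # 1
print get_key("nobody mentioned", toy_chunks)  # None -- no chunk contains it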
Next, add the speakers:
In [254]:
for speaker in list(set(df_d['speaker'])):
if get_key(speaker, disambiguated_names):
continue # the key's already there.
else:
disambiguated_names.append([speaker]) # gets its' own list if it's not already there.
print "# of disambiguated names, with the speaker data:", len(disambiguated_names)
pickle.dump(disambiguated_names, open('pickles/final/disambiguated_names.p', 'wb'))
An increase of ~8 is expected, and consistent with the previous mention graph data.
Now, when we construct various graphs, we should call get_key to find the mention id for some individual.
Each edge carries a set of properties corresponding to LIWC categories: one score per category.
Setup:
In [145]:
import networkx as nx
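As a minimal sketch of the structure we're about to build (the node ids and scores below are hypothetical, just to show the shape of an edge's attribute dict, and assume the older networkx API used elsewhere in this notebook, where the attribute dict is passed positionally):
In [ ]:
G_example = nx.DiGraph()
# nodes are indices into disambiguated_names; the attribute dict holds one score per LIWC category
G_example.add_edge(597, 12, {'Negemo': 0.4, 'Tentat': 0.2, 'Social': 0.6})
print G_example[597][12]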
Some functions to help construct the LIWC category scores for edges:
In [266]:
def update_dict(old, to_add):
    "Accumulate the category counts from to_add into old."
    for key in to_add.keys():
        if key in old:
            old[key] += to_add[key]
        else:
            old[key] = to_add[key]
    return old

def normalize_dict(d, count):
    "Normalizes by a count of edges; values that normalize to 1 or more are dropped."
    newdict = {}
    for k, v in d.items():
        newval = v / float(count)
        if newval < 1:
            newdict[k] = newval
    return newdict

def filter_categories(d, cats):
    "Keep only the categories listed in cats."
    newdict = {}
    for k, v in d.items():
        if k in cats:
            newdict[k] = v
    return newdict
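A quick toy example of how these helpers compose (made-up counts, not real data): accumulate per-sentence category counts with update_dict, then divide by the edge count with normalize_dict.
In [ ]:
acc = {}
acc = update_dict(acc, {'Negemo': 1, 'Tentat': 2})
acc = update_dict(acc, {'Negemo': 1})
print acc                     # {'Negemo': 2, 'Tentat': 2}
print normalize_dict(acc, 4)  # {'Negemo': 0.5, 'Tentat': 0.5}; anything that normalizes to >= 1 is dropped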
In [ ]:
G = nx.DiGraph()
# the graph data has to be stored in a separate dict until we
# construct the graph; adding multiple edges between two nodes just
# replaces the attributes. We want to average/accumulate them.
graph_data = collections.defaultdict(lambda: collections.defaultdict(dict))
count_data = collections.defaultdict(int)
skipped = 0

# categories we ultimately care about (not used in this cell; see relevant_categories below)
categories = ['Negemo', 'Posemo', 'Anx', 'Anger', 'Tentat', 'Social', 'Excl', 'Occup', 'Swear']

def drop_irrelevant_categories(categories):
    "Remove high-frequency, low-signal LIWC categories (renamed so it does not shadow filter_categories above)."
    for cat in ['Number', 'Present', 'Othref', 'Pronoun', 'Cogmech', 'Preps',
                'Space', 'Negate', 'Achieve', 'Time', 'We', 'Senses', 'Article',
                'Insight', 'Hear', 'Motion', 'Up', 'Leisure']:
        categories.pop(cat, None)
    return categories

for n, row in df_d.iterrows():
    if n % 500 == 0:
        print n, "rows analyzed"
    speaker = row['speaker']
    if is_interviewer(speaker):
        skipped += 1
        continue
    # prefer the mention lists that include anaphora, if they exist
    mention_list_w_anaphora = row['mention_list_by_sentence_with_anaphora']
    mention_list_wo_anaphora = row['mention_list_by_sentence_without_anaphora']
    categories_by_sentence = row['liwc_categories_by_sentence']
    speaker = get_key(speaker, disambiguated_names)
    if isinstance(mention_list_w_anaphora, list):
        for i, mentions in enumerate(mention_list_w_anaphora):
            for mention in mentions:
                mention = get_key(mention, disambiguated_names)
                # don't include self-mentions; this will screw up
                # centrality measures and is not a meaningful measure
                if speaker == mention:
                    continue
                categories_for_mention = drop_irrelevant_categories(categories_by_sentence[i])
                graph_data[speaker][mention] = update_dict(graph_data[speaker][mention], categories_for_mention)
                count_data[(speaker, mention)] += 1
                # G.add_edge(speaker, mention, categories_for_mention)
    elif isinstance(mention_list_wo_anaphora, list):
        for i, mentions in enumerate(mention_list_wo_anaphora):
            for mention in mentions:
                mention = get_key(mention, disambiguated_names)
                # don't include self-mentions; this will screw up
                # centrality measures and is not a meaningful measure
                if speaker == mention:
                    continue
                categories_for_mention = drop_irrelevant_categories(categories_by_sentence[i])
                graph_data[speaker][mention] = update_dict(graph_data[speaker][mention], categories_for_mention)
                count_data[(speaker, mention)] += 1
                # G.add_edge(speaker, mention, categories_for_mention)
Add the normalized edge data to the graph, then write it out as a GML file:
In [263]:
for speaker, mentions in graph_data.items():
    for mentioned, attrs in mentions.items():
        count = count_data[(speaker, mentioned)]
        normalized_attrs = normalize_dict(attrs, count)
        G.add_edge(speaker, mentioned, normalized_attrs)
In [264]:
nx.write_gml(G, "graphs/liwc_categories_mentions.gml")
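To sanity-check the export before opening it in Gephi, the GML file can be read back with networkx (node labels may come back as strings depending on the networkx version):
In [ ]:
G_check = nx.read_gml("graphs/liwc_categories_mentions.gml")
print len(G_check.nodes()), "nodes,", len(G_check.edges()), "edges"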
Okay, this is great, but Gephi can't actually do anything special with this data.
Instead, I'm going to classify each edge according to its dominant LIWC category, chosen from a predefined list.
In [267]:
relevant_categories = ['Negemo', 'Posemo', 'Posfeel', 'Anx', 'Certain', 'Tentat', 'Anger', 'Swear']
G_dominant_scores = nx.DiGraph()
for speaker, mentions in graph_data.items():
    for mentioned, attrs in mentions.items():
        count = count_data[(speaker, mentioned)]
        normalized_attrs = normalize_dict(attrs, count)
        filtered = filter_categories(normalized_attrs, relevant_categories)
        try:
            # keep only the single highest-scoring relevant category on this edge
            dominant = max(filtered.items(), key=lambda p: p[1])
            dominant = {dominant[0]: dominant[1]}
            G_dominant_scores.add_edge(speaker, mentioned, dominant)
        except ValueError:
            # no relevant categories survived filtering; add a bare edge
            G_dominant_scores.add_edge(speaker, mentioned)
nx.write_gml(G_dominant_scores, 'graphs/liwc_dominant_categories.gml')
In [257]:
disambiguated_speakers = list(set([get_key(s, disambiguated_names) for s in df_d['speaker']]))
In [ ]:
G_mentions = nx.DiGraph()
skipped = 0
for n, row in df_d.iterrows():
    if n % 500 == 0:
        print n, "rows analyzed"
    speaker = row['speaker']
    if is_interviewer(speaker):
        skipped += 1
        continue
    speaker = get_key(speaker, disambiguated_names)
    mentions_without_anaphora = row['mention_list_for_speechact_without_anaphora']
    if isinstance(mentions_without_anaphora, list):
        for mention in mentions_without_anaphora:
            mention = get_key(mention, disambiguated_names)
            if speaker == mention:
                continue
            G_mentions.add_edge(speaker, mention)
print "skipped:", skipped
nx.write_gml(G_mentions, "graphs/new_mention_graph.gml")
In [258]:
len(disambiguated_speakers)
Out[258]:
In [232]:
len(df_d['speaker'])
Out[232]:
In [259]:
len(disambiguated_names)
Out[259]:
In [271]:
disambiguated_names[597]
Out[271]: