In [1]:
%matplotlib inline
import numpy as np
import scipy as sp
import matplotlib as mpl
import matplotlib.cm as cm
import matplotlib.pyplot as plt
import pandas as pd
import time
import json
import statsmodels.api as sm
from statsmodels.formula.api import glm, ols

pd.set_option('display.width', 500)
pd.set_option('display.max_columns', 100)
pd.set_option('display.notebook_repr_html', True)
import seaborn as sns
sns.set_style("whitegrid")
sns.set_context("poster")
import math

In [2]:
# Let's first open our data frame of reviews for all relevant movies.
with open("IMDB_dftouse_dict.json", "r") as fd:
    IMDB = json.load(fd)
IMDB_df = pd.DataFrame(IMDB)

In [5]:
IMDB_df.head()


Out[5]:
movie_id movie_name positive stars text url
0 10027 Titanic True 7 Sure, Titanic was a good movie, the first time... http://www.imdb.com/title/tt0120338/usercommen...
1 10028 Titanic True 10 When I saw this movie I was stunned by what a ... http://www.imdb.com/title/tt0120338/usercommen...
2 10029 Titanic True 10 Why do people bitch about this movie and not a... http://www.imdb.com/title/tt0120338/usercommen...
3 10030 Titanic True 10 What's inexplicable? Firstly, the hatred towar... http://www.imdb.com/title/tt0120338/usercommen...
4 10031 Titanic True 10 Previously, I wrote that I loved "Titanic", cr... http://www.imdb.com/title/tt0120338/usercommen...

Now let's download labMT, a word score list for sentiment analysis containing over 10,000 words. For each word the file gives a "happiness" average and standard deviation, ranks the words by happiness, and lists frequency ranks from Twitter, Google, the New York Times, and song lyrics.


In [16]:
url = 'http://www.plosone.org/article/fetchSingleRepresentation.action?uri=info:doi/10.1371/journal.pone.0026752.s001'
labmt = pd.read_csv(url, skiprows=2, sep='\t', index_col=0)

In [20]:
labmt.head()


Out[20]:
happiness_rank happiness_average happiness_standard_deviation twitter_rank google_rank nyt_rank lyrics_rank
word
laughter 1 8.50 0.9313 3600 -- -- 1728
happiness 2 8.44 0.9723 1853 2458 -- 1230
love 3 8.42 1.1082 25 317 328 23
happy 4 8.30 0.9949 65 1372 1313 375
laughed 5 8.26 1.1572 3334 3542 -- 2332

Now let's create a happiness dictionary of (word, valence) pairs, where each word's valence is its labMT happiness average minus the mean happiness average over all words. Centering the scores this way makes positive valences correspond to happier-than-average words and negative valences to sadder-than-average ones.


In [21]:
average = labmt.happiness_average.mean()
happiness = (labmt.happiness_average - average).to_dict()
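
As a quick sanity check, the centered valences should now average to roughly zero (a minimal sketch using the NumPy import from the first cell):


In [ ]:
# The mean of the centered scores should be approximately zero
print "Mean centered valence: ", np.mean(happiness.values())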

In [32]:
# Save to disk
with open("happiness.json", "w") as fp:
    json.dump(happiness, fp)

In [33]:
# Reopen
with open("happiness.json", "r") as fp:
    happiness = json.load(fp)

In [26]:
print "Score(happy): ", happiness['happy']
print "Score(miserable): ", happiness['miserable']
print "Best score: ", max(happiness.values())
print "Worst score: ", min(happiness.values())


Score(happy):  2.92476032088
Score(miserable):  -2.83523967912
Best score:  3.12476032088
Worst score:  -4.07523967912
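
Out of curiosity, we can also look up which words sit at the extremes of the centered scale (a small sketch against the dictionary we just built):


In [ ]:
# Words carrying the best and worst centered valences
print "Happiest word: ", max(happiness, key=happiness.get)
print "Saddest word:  ", min(happiness, key=happiness.get)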

Analyzing and Saving Review Attributes Using the Happiness Dictionary

Now let's write a function to collect several attributes from a given review's text body, and save all valuable information into a new data frame.

First let's write a function that removes stop words (words that carry little valence information) from a text body and strips trailing punctuation from the remaining words.


In [27]:
from sklearn.feature_extraction import text
stopwords = text.ENGLISH_STOP_WORDS
punctuation = list(".,;:!?()[]{}`'\"@#$%^&*+-|=~_")

def removeStopWords(text, stopwords = stopwords):
    new_text = ""
    for word in text.split():
        # Keep only non-stop words, stripping any trailing punctuation
        if word not in stopwords:
            while len(word) != 0 and word[-1] in punctuation:
                word = word[:len(word)-1]
            new_text += word + ' '
    return new_text
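
A quick illustration on a made-up sentence (the input string is hypothetical, just to show the behavior):


In [ ]:
# "was", "not", and "that" are dropped as stop words; trailing punctuation is stripped.
# Note that matching is case-sensitive, so the capitalized "The" survives the stop list.
removeStopWords("The movie was not that bad, honestly!")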

Now we'll write a function that, for a given review text, returns the total happiness, the average happiness per scorable word, the total word count, and the fraction of words that are scorable (i.e., found in the happiness dictionary).


In [35]:
'''
Name: getValenceInfo()
Inputs: review text, dictionary of happiness
Returns: a 4-tuple of (happiness total, happiness average, total # of words, % of scorable words)
'''
def getValenceInfo(text, valenceDict):
    words = text.split()
    total_words = len(words)
    happiness_total, count_relevant = 0, 0
    for word in words:
        # Membership test on the dict itself is O(1); 'in valenceDict.keys()' would scan a list
        if word in valenceDict:
            count_relevant += 1
            happiness_total += valenceDict[word]
    if count_relevant != 0:
        avg_valence = 1.*happiness_total/count_relevant
    else:
        avg_valence = 0
    # Guard against empty reviews to avoid dividing by zero
    pct_scorable = 1.*count_relevant/total_words if total_words != 0 else 0
    return happiness_total, avg_valence, total_words, pct_scorable
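
A quick check on a toy string (hypothetical input; the exact numbers depend on the labMT scores):


In [ ]:
# All three words should be scorable, giving a clearly positive average valence
getValenceInfo(removeStopWords("wonderful happy film"), happiness)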

Now we'll write a function that, given a data frame, returns a new data frame with the valence (happiness) info concatenated in 4 new columns: valence_sum, valence_avg, n_scorables (the review's word count after stop-word removal), and pct_scorables (the fraction of those words that are scorable).


In [74]:
'''
Name: getAllInfo
Input: data frame, happiness dictionary, list of stop words
Returns: a new data frame with 4 new columns: valence_sum, valence_avg, n_scorables, pct_scorables
'''
def getAllInfo(df, valenceDict, stopwords):
    valence_suml, valence_avgl, review_lenl, review_fractionl = [], [], [], []
    for i, row in df.iterrows():
        # Clean each review, then collect its valence statistics
        cleaned_review = removeStopWords(row['text'], stopwords)
        valence_sum, valence_avg, review_len, review_fraction = getValenceInfo(cleaned_review, valenceDict)
        valence_suml.append(valence_sum)
        valence_avgl.append(valence_avg)
        review_lenl.append(review_len)
        review_fractionl.append(review_fraction)
    conc = pd.DataFrame({'valence_sum': valence_suml, 'valence_avg': valence_avgl, 'n_scorables': review_lenl,
                         'pct_scorables': review_fractionl})
    # pd.concat(..., axis=1) aligns on the index, so df is assumed to have a default 0..n-1 index
    return pd.concat([df, conc], axis=1)
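
Before running it on the full data set, here's a minimal sketch on a made-up two-review data frame (both texts are invented for illustration):


In [ ]:
toy_df = pd.DataFrame({'text': ["What a wonderful, happy film!",
                                "Dull, miserable, and far too long."]})
getAllInfo(toy_df, happiness, stopwords)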

Now let's create a new dataframe valence_df with the valence statistics run on our IMDB_df. This code takes a few minutes to run.


In [93]:
%%time
valence_df = getAllInfo(IMDB_df, happiness, stopwords)


CPU times: user 2min 32s, sys: 708 ms, total: 2min 33s
Wall time: 2min 33s

In [94]:
valence_df.head()


Out[94]:
movie_id movie_name positive stars text url n_scorables pct_scorables valence_avg valence_sum
0 10027 Titanic True 7 Sure, Titanic was a good movie, the first time... http://www.imdb.com/title/tt0120338/usercommen... 120 0.666667 0.479760 38.380826
1 10028 Titanic True 10 When I saw this movie I was stunned by what a ... http://www.imdb.com/title/tt0120338/usercommen... 65 0.538462 0.508760 17.806611
2 10029 Titanic True 10 Why do people bitch about this movie and not a... http://www.imdb.com/title/tt0120338/usercommen... 75 0.586667 0.710669 31.269454
3 10030 Titanic True 10 What's inexplicable? Firstly, the hatred towar... http://www.imdb.com/title/tt0120338/usercommen... 235 0.587234 0.239253 33.016924
4 10031 Titanic True 10 Previously, I wrote that I loved "Titanic", cr... http://www.imdb.com/title/tt0120338/usercommen... 302 0.450331 0.189907 25.827404

In [95]:
# Convert True/False to 1/0: needed to make valence_df JSON serializable, and generally better practice
valence_df.positive = 1.0*valence_df.positive

In [96]:
# Save to disk
# fp = open("valence_df_dict.json","w")
# json.dump(valence_df.to_dict(), fp)
# fp.close()

In [97]:
with open("valence_df_dict.json", "r") as fp:
    valence_df_dict = json.load(fp)
valence_df = pd.DataFrame(valence_df_dict)
