In [1]:
%matplotlib inline
import numpy as np
import scipy as sp
import matplotlib as mpl
import matplotlib.cm as cm
import matplotlib.pyplot as plt
import pandas as pd
import time
import json
import statsmodels.api as sm
from statsmodels.formula.api import glm, ols
pd.set_option('display.width', 500)
pd.set_option('display.max_columns', 100)
pd.set_option('display.notebook_repr_html', True)
import seaborn as sns
sns.set_style("whitegrid")
sns.set_context("poster")
import math
In [2]:
# Let's first open our data frame of all relevant movies.
with open("IMDB_dftouse_dict.json", "r") as fd:
IMDB = json.load(fd)
IMDB_df = pd.DataFrame(IMDB)
In [5]:
IMDB_df.head()
Out[5]:
Now let's download labMT, a word score list for sentiment analysis containing over 10,000 words. The file gives each word an average "happiness" score and ranks the words by happiness; it also includes the standard deviation of the ratings and each word's Twitter and Google ranks.
In [16]:
url = 'http://www.plosone.org/article/fetchSingleRepresentation.action?uri=info:doi/10.1371/journal.pone.0026752.s001'
labmt = pd.read_csv(url, skiprows=2, sep='\t', index_col=0)
In [20]:
labmt.head()
Out[20]:
Now let's create a happiness dictionary of (word, valence) pairs, where each word's valence is its original valence minus the average valence across all words.
In [21]:
average = labmt.happiness_average.mean()
happiness = (labmt.happiness_average - average).to_dict()
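Since the scores are now centered on the corpus mean, a quick sanity check (this cell is our addition, not part of the original analysis) is that the adjusted values average to roughly zero:
In [ ]:
# Sanity check: after subtracting the mean, the average valence should be ~0
print(round(sum(happiness.values()) / len(happiness), 6))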
In [32]:
# Save to disc
fp = open("happiness.json","w")
json.dump(happiness, fp)
fp.close()
In [33]:
# Reopen
with open("happiness.json", "r") as fp:
happiness = json.load(fp)
In [26]:
print "Score(happy): ", happiness['happy']
print "Score(miserable): ", happiness['miserable']
print "Best score: ", max(happiness.values())
print "Worst score: ", min(happiness.values())
Now let's write a function to collect several attributes from a given review's text body and save all the valuable information into a new data frame.
First, let's write a function that removes stop words (words that carry no useful information from a valence perspective) from a text body.
In [27]:
from sklearn.feature_extraction import text
stopwords = text.ENGLISH_STOP_WORDS
punctuation = list('.,;:!?()[]{}`\'"@#$%^&*+-|=~_')

def removeStopWords(text, stopwords=stopwords):
    # Drop stop words and strip trailing punctuation from the remaining words
    new_text = ""
    for word in text.split():
        if word not in stopwords:
            while len(word) != 0 and word[-1] in punctuation:
                word = word[:-1]
            new_text += word + ' '
    return new_text
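As a quick illustration (this example cell is ours, not from the original notebook), stop words drop out and trailing punctuation is stripped. Note the membership test is case-sensitive, so a capitalized "The" would survive:
In [ ]:
# Stop words ("this", "was", "a", ...) are removed; "delight," and "superb!" lose their punctuation
removeStopWords("this movie was a delight, and the acting is superb!")
# -> 'movie delight acting superb ' (note the trailing space)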
Now we'll write a function that returns the total happiness, average happiness, total word count, and percentage of scorable words in a given review text.
In [35]:
'''
Name: getValenceInfo()
Inputs: review text, dictionary of happiness
Returns: a 4-tuple of (happiness total, happiness average, total # of words, % of scorable words)
'''
def getValenceInfo(text, valenceDict):
    total_words = len(text.split())
    happiness_total, count_relevant = 0, 0
    for word in text.split():
        if word in valenceDict:  # membership test on the dict itself; no need for .keys()
            count_relevant += 1
            happiness_total += valenceDict[word]
    avg_valence = 1. * happiness_total / count_relevant if count_relevant != 0 else 0
    # guard against empty reviews to avoid a ZeroDivisionError
    pct_scorables = 1. * count_relevant / total_words if total_words != 0 else 0
    return happiness_total, avg_valence, total_words, pct_scorables
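Here's a quick example call (our addition; the exact numbers depend on the labMT scores, so we just print the tuple):
In [ ]:
# Score a short cleaned review: (happiness total, happiness average, # words, % scorable)
total, avg, n_words, pct = getValenceInfo(
    removeStopWords("this movie was a delight, and the acting is superb!"), happiness)
print(total, avg, n_words, pct)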
Now we'll write a function that, given a data frame, returns a new data frame with the concatenation of valence (happiness) info in 4 new columns: valence sum, valence average, # of scorable words, % of scorable words.
In [74]:
'''
Name: getAllInfo
Input: data frame, happiness dictionary, list of stop words
Returns: a new data frame with 4 new columns: valence_sum, valence_avg, n_scorables, pct_scorables
'''
def getAllInfo(df, valenceDict, stopwords):
    valence_suml, valence_avgl, review_lenl, review_fractionl = [], [], [], []
    for i, row in df.iterrows():
        cleaned_review = removeStopWords(row['text'], stopwords)
        valence_sum, valence_avg, review_len, review_fraction = getValenceInfo(cleaned_review, valenceDict)
        valence_suml.append(valence_sum)
        valence_avgl.append(valence_avg)
        review_lenl.append(review_len)
        review_fractionl.append(review_fraction)
    # build the new columns on df's own index so pd.concat aligns row-for-row
    conc = pd.DataFrame({'valence_sum': valence_suml, 'valence_avg': valence_avgl,
                         'n_scorables': review_lenl, 'pct_scorables': review_fractionl},
                        index=df.index)
    return pd.concat([df, conc], axis=1)
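To see the shape of the output before committing to the full run, here's a tiny demo (the two toy reviews below are made up, not drawn from IMDB_df):
In [ ]:
# Two hypothetical reviews; the result gains the four valence columns
toy = pd.DataFrame({'text': ["an absolutely wonderful, joyous film",
                             "a dull, miserable waste of time"]})
getAllInfo(toy, happiness, stopwords)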
Now let's create a new dataframe valence_df with the valence statistics run on our IMDB_df. This code takes a few minutes to run.
In [93]:
%%time
valence_df = getAllInfo(IMDB_df, happiness, stopwords)
In [94]:
valence_df.head()
Out[94]:
In [95]:
# Convert True/False to 1/0: needed to make valence_df JSON serializable, also better practice
valence_df.positive = 1.0*valence_df.positive
In [96]:
# Save to disk
# with open("valence_df_dict.json", "w") as fp:
#     json.dump(valence_df.to_dict(), fp)
In [97]:
with open("valence_df_dict.json", "r") as fp:
valence_df_dict = json.load(fp)
valence_df = pd.DataFrame(valence_df_dict)