In [1]:
%matplotlib inline
import numpy as np
import scipy as sp
import matplotlib as mpl
import matplotlib.cm as cm
import matplotlib.pyplot as plt
import pandas as pd
import time
import json
import statsmodels.api as sm
from statsmodels.formula.api import glm, ols
pd.set_option('display.width', 500)
pd.set_option('display.max_columns', 100)
pd.set_option('display.notebook_repr_html', True)
import seaborn as sns
sns.set_style("whitegrid")
sns.set_context("poster")
import math
In [2]:
# Let's first open our data frame of all relevant movies.
with open("IMDB_dftouse_dict.json", "r") as fd:
IMDB = json.load(fd)
IMDB_df = pd.DataFrame(IMDB)
In [5]:
IMDB_df.head()
Out[5]:
Now let's download labMT, a word score list for sentiment analysis containing over 10,000 words. The file gives each word an average "happiness" score and ranks the words by happiness; it also includes the standard deviation of the ratings and each word's Twitter and Google ranks.
In [16]:
url = 'http://www.plosone.org/article/fetchSingleRepresentation.action?uri=info:doi/10.1371/journal.pone.0026752.s001'
labmt = pd.read_csv(url, skiprows=2, sep='\t', index_col=0)
In [20]:
labmt.head()
Out[20]:
Now let's create a happiness dictionary of (word, valence) pairs, where each word's valence is its original valence minus the average valence across all words.
In [21]:
average = labmt.happiness_average.mean()
happiness = (labmt.happiness_average - average).to_dict()
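Since the scores are now centered on the corpus mean, a quick sanity check (this cell is our addition, not part of the original analysis) is that the adjusted values average to roughly zero:
In [ ]:
# Sanity check: after subtracting the mean, the average valence should be ~0
print(round(sum(happiness.values()) / len(happiness), 6))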
In [32]:
# Save to disc
fp = open("happiness.json","w")
json.dump(happiness, fp)
fp.close()
In [33]:
# Reopen
with open("happiness.json", "r") as fp:
happiness = json.load(fp)
In [26]:
print "Score(happy): ", happiness['happy']
print "Score(miserable): ", happiness['miserable']
print "Best score: ", max(happiness.values())
print "Worst score: ", min(happiness.values())
Now let's write a function to collect several attributes from a given review's text body and save all the valuable information into a new data frame.
First, let's write a function that removes stop words (words that carry no useful information from a valence perspective) from a text body.
In [27]:
from sklearn.feature_extraction import text
stopwords = text.ENGLISH_STOP_WORDS
punctuation = list('.,;:!?()[]{}`\'"@#$%^&*+-|=~_')

def removeStopWords(text, stopwords=stopwords):
    # Drop stop words and strip trailing punctuation from the remaining words
    new_text = ""
    for word in text.split():
        if word not in stopwords:
            while len(word) != 0 and word[-1] in punctuation:
                word = word[:-1]
            new_text += word + ' '
    return new_text
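As a quick illustration (this example cell is ours, not from the original notebook), stop words drop out and trailing punctuation is stripped. Note the membership test is case-sensitive, so a capitalized "The" would survive:
In [ ]:
# Stop words ("this", "was", "a", ...) are removed; "delight," and "superb!" lose their punctuation
removeStopWords("this movie was a delight, and the acting is superb!")
# -> 'movie delight acting superb ' (note the trailing space)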
Now we'll write a function that returns the total happiness, average happiness, total word count, and percentage of scorable words in a given review text.
In [35]:
'''
Name: getValenceInfo()
Inputs: review text, dictionary of happiness
Returns: a 4-tuple of (happiness total, happiness average, total # of words, % of scorable words)
'''
def getValenceInfo(text, valenceDict):
    total_words = len(text.split())
    happiness_total, count_relevant = 0, 0
    for word in text.split():
        if word in valenceDict:  # membership test on the dict itself; no need for .keys()
            count_relevant += 1
            happiness_total += valenceDict[word]
    avg_valence = 1. * happiness_total / count_relevant if count_relevant != 0 else 0
    # guard against empty reviews to avoid a ZeroDivisionError
    pct_scorables = 1. * count_relevant / total_words if total_words != 0 else 0
    return happiness_total, avg_valence, total_words, pct_scorables
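Here's a quick example call (our addition; the exact numbers depend on the labMT scores, so we just print the tuple):
In [ ]:
# Score a short cleaned review: (happiness total, happiness average, # words, % scorable)
total, avg, n_words, pct = getValenceInfo(
    removeStopWords("this movie was a delight, and the acting is superb!"), happiness)
print(total, avg, n_words, pct)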
Now we'll write a function that, given a data frame, returns a new data frame with the concatenation of valence (happiness) info in 4 new columns: valence sum, valence average, # of scorable words, % of scorable words.
In [74]:
'''
Name: getAllInfo
Input: data frame, happiness dictionary, list of stop words
Returns: a new data frame with 4 new columns: valence_sum, valence_avg, n_scorables, pct_scorables
'''
def getAllInfo(df, valenceDict, stopwords):
    valence_suml, valence_avgl, review_lenl, review_fractionl = [], [], [], []
    for i, row in df.iterrows():
        cleaned_review = removeStopWords(row['text'], stopwords)
        valence_sum, valence_avg, review_len, review_fraction = getValenceInfo(cleaned_review, valenceDict)
        valence_suml.append(valence_sum)
        valence_avgl.append(valence_avg)
        review_lenl.append(review_len)
        review_fractionl.append(review_fraction)
    # build the new columns on df's own index so pd.concat aligns row-for-row
    conc = pd.DataFrame({'valence_sum': valence_suml, 'valence_avg': valence_avgl,
                         'n_scorables': review_lenl, 'pct_scorables': review_fractionl},
                        index=df.index)
    return pd.concat([df, conc], axis=1)
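To see the shape of the output before committing to the full run, here's a tiny demo (the two toy reviews below are made up, not drawn from IMDB_df):
In [ ]:
# Two hypothetical reviews; the result gains the four valence columns
toy = pd.DataFrame({'text': ["an absolutely wonderful, joyous film",
                             "a dull, miserable waste of time"]})
getAllInfo(toy, happiness, stopwords)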
Now let's create a new dataframe valence_df with the valence statistics run on our IMDB_df. This code takes a few minutes to run.
In [93]:
%%time
valence_df = getAllInfo(IMDB_df, happiness, stopwords)
In [94]:
valence_df.head()
Out[94]:
In [95]:
# Convert True/False to 1/0: needed to make valence_df JSON serializable, also better practice
valence_df.positive = 1.0*valence_df.positive
In [96]:
# Save to disk
# with open("valence_df_dict.json", "w") as fp:
#     json.dump(valence_df.to_dict(), fp)
In [97]:
with open("valence_df_dict.json", "r") as fp:
valence_df_dict = json.load(fp)
valence_df = pd.DataFrame(valence_df_dict)