In [1]:
%matplotlib inline
import numpy as np
import scipy as sp
import matplotlib as mpl
import matplotlib.cm as cm
import matplotlib.pyplot as plt
import pandas as pd
import time
import json
import statsmodels.api as sm
from statsmodels.formula.api import glm, ols
pd.set_option('display.width', 500)
pd.set_option('display.max_columns', 100)
pd.set_option('display.notebook_repr_html', True)
import seaborn as sns
sns.set_style("whitegrid")
sns.set_context("poster")
import math
For our project we will be using data from 3 different sources.
First we import the requests and BeautifulSoup libraries: requests makes working with HTTP requests easier, and BeautifulSoup makes it easy to turn the returned HTML content into Python data structures.
In [2]:
from bs4 import BeautifulSoup
# The "requests" library makes working with HTTP requests easier
# than the built-in urllib libraries.
import requests
Secondly, we prepare the dataframe movie_df to store the data that we will scrape from BOM (Box Office Mojo). We give this dataframe 9 columns:
* ranking: the ranking of the movie in its release year by gross
* title: movie title
* gross: how much the movie grossed while in theatres
* total_theaters: the total number of theaters that showed this movie
* opening_gross: how much the movie grossed in the opening weekend (Fri-Sun)
* opening_theaters: the total number of theaters that showed this movie in the opening weekend (Fri-Sun)
* open_date: date of opening
* close_date: date of closing
* year: year of release
In [3]:
movie_df = pd.DataFrame(columns=['close_date', 'gross', 'open_date', 'opening_gross', 'opening_theaters','ranking','title','total_theaters','year'])
Now we write a function rowInfoGrabber that we will call in a loop over the table on the BOM webpage to grab each movie's attributes and save them into the corresponding columns of movie_df.
In [4]:
def rowInfoGrabber(r):
    info = []
    # Ranking
    info.append(int(r.find("font").get_text()))
    # Title
    info.append(r.find("a").get_text())
    # Gross
    info.append(int(r.find("td", attrs={"align":"right"}).find("b").get_text().strip("$").replace(",","")))
    '''
    For the next 3 categories we need to deal with the 2000 anomaly "Fantasia", where numbers are missing.
    In this case we have chosen to replace the missing 'N/A' values with the values from 'Final Destination',
    which is right above it in the movie table and differs in gross income by only about $1 million, a small
    difference. See the picture below for a snapshot of the anomaly in the movie table from 2000.
    '''
    # Total number of theaters
    if r.find_all("td", attrs={"align":"right"})[1].find("font").get_text().replace(",","") == 'N/A':
        info.append(2587)
    else:
        info.append(int(r.find_all("td", attrs={"align":"right"})[1].find("font").get_text().replace(",","")))
    # Opening gross
    if r.find_all("td", attrs={"align":"right"})[2].find("font").get_text().strip("$").replace(",","") == 'N/A':
        info.append(10015822)
    else:
        info.append(int(r.find_all("td", attrs={"align":"right"})[2].find("font").get_text().strip("$").replace(",","")))
    # Opening number of theaters
    if r.find_all("td", attrs={"align":"right"})[3].find("font").get_text().replace(",","") == 'N/A':
        info.append(2587)
    else:
        info.append(int(r.find_all("td", attrs={"align":"right"})[3].find("font").get_text().replace(",","")))
    # Date of opening
    info.append(r.find_all("td", attrs={"align":"right"})[4].find("a").get_text())
    # Date of closing: before 2002 the tables did not include a "closing" date. We must account for this.
    if len(r.find_all("td", attrs={"align":"right"})) <= 5:
        info.append('-')
    else:
        info.append(r.find_all("td", attrs={"align":"right"})[5].find("font").get_text())
    return info
This is the image:
In [5]:
fields = ["ranking", "title", "gross", "total_theaters", "opening_gross", "opening_theaters", "open_date", "close_date"]
Finally we're ready to scrape!
Because IMDB was created in 1990, we will scrape BOM back to that year, giving us the past 26 years (1990 - 2015). Also note that because the HTML changed starting in 2001, our scraping will be a little different before and after that year.
In [6]:
%%time
years = [1990 + i for i in range(26)]
for year in years:
    pageText = requests.get("http://www.boxofficemojo.com/yearly/chart/?yr=%(yr)d&p=.htm" % {'yr':year})
    soup = BeautifulSoup(pageText.text, "html.parser")
    movieTable = soup.find("td", attrs={"colspan":"3"})
    movieRows = movieTable.find("table").find_all("tr")[2:102]
    print year
    movie_dicts = [dict(zip(fields, rowInfoGrabber(row))) for row in movieRows]
    year_df = pd.DataFrame(movie_dicts)
    year_df['year'] = year
    movie_df = movie_df.append(year_df, ignore_index=True)
    time.sleep(1)
In [7]:
movie_df.shape
Out[7]:
In [8]:
movie_df.head()
Out[8]:
Because some films do not have a close date, we will have to be careful when handling that column!
In [9]:
# if we decide it's worth just dumping the movies with no close_date, we can use the code below
# movie_df=movie_df[movie_df.close_date != '-'].reset_index(drop=True)
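As a quick check, we can count how many movies still have the '-' placeholder that rowInfoGrabber uses for a missing close date (a small exploratory cell, not part of the original pipeline):
In [ ]:
# number of movies whose close_date is the '-' placeholder
(movie_df.close_date == '-').sum()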
Next, we combine the close_date, open_date and year columns into two time-series columns, close_date and open_date. This will make it easier for us to work with the data in the future.
In [9]:
# splitting the close_date and open_date into their respective month and day
movie_df['close_month'] = movie_df['close_date'].map(lambda x: '0' if x=='-' else x[:x.find('/')])
movie_df['close_day'] = movie_df['close_date'].map(lambda x: '0' if x=='-' else x[x.find('/')+1:len(x)])
movie_df['open_month'] = movie_df['open_date'].map(lambda x: x[:x.find('/')])
movie_df['open_day'] = movie_df['open_date'].map(lambda x: x[x.find('/')+1:len(x)])
# dropping the old close_date and open_date
movie_df = movie_df.drop('close_date', 1)
movie_df = movie_df.drop('open_date', 1)
# creating an open_year column as a string (year may be stored as a float after the appends above)
movie_df['open_year'] = movie_df.year.astype(int).astype(str)
# creating a close_year column, by checking whether the close month falls earlier or later than the open month in the year
close_month = movie_df['close_month'].astype(int)
open_month = movie_df['open_month'].astype(int)
year = movie_df['year'].astype(int)
close_year = []
for i in range(0, len(year)):
    if close_month[i] >= open_month[i]:
        close_year.append(year[i])
    else:
        close_year.append(year[i]+1)
movie_df['close_year'] = close_year
movie_df['close_year'] = movie_df['close_year'].astype(str)
In [10]:
movie_df.head()
Out[10]:
Let's take a look at the data now!
In [11]:
movie_df.head()
Out[11]:
Let's see whether we can get the run time for each movie!
In [14]:
# NOTE: close_date/open_date were dropped above and are rebuilt as datetime columns later in the
# notebook; this exploratory cell assumes those datetime columns are present.
run_time = []
for index, row in movie_df.iterrows():
    if row.close_date is not None:
        run_time.append(row['close_date'] - row['open_date'])
    else:
        run_time.append('N/A')
In [13]:
movie_df.head()
Out[13]:
Looks like the data is ready for us to use! Let's save this data so we're ready to use it next time.
In [17]:
! pip install pymongo
In [15]:
# Save the movie Dictionaries corresponding to each row of the BoxOfficeMojo table.
import json
# import pymongo
# from bson import json_util
# Make a dictionary out of the dataset for storage in JSON format.
movieSaved = {feature: movie_df[feature].values.tolist() for feature in movie_df.columns.values}
fp = open("allMovies_new.json","w")
json.dump(movieSaved, fp)
fp.close()
We have cleaned up our IMDB review dataset in the IPython notebook New.ipynb. From that notebook we saved dictionaries of our data, which we will now load.
In [16]:
with open("train_df_dict.json", "r") as fd:
train_df_dict = json.load(fd)
with open("test_df_dict.json", "r") as fd:
test_df_dict = json.load(fd)
In [17]:
train_df = pd.DataFrame(train_df_dict)
test_df = pd.DataFrame(test_df_dict)
The Stanford group distinguishes between train and test because this was relevant for their project. This may prove to be useful later, so we will keep two separate dataframes. However, for our purposes at the moment, we can combine them since the BOM data will serve as our true data set.
In [18]:
IMDB_df = train_df.append(test_df)
We want to figure out which movies from IMDB_df are also present in our BOM dataframe. So let's get all the movie titles in our BOM table.
In [19]:
BOM_movie_list = movie_df.title.values.tolist()
Now let's create a mask over IMDB_df, the boolean values of which indicate whether or not a movie is in the BOM list.
In [20]:
movie_mask = [(movie in BOM_movie_list) for movie in IMDB_df.movie_name]
In [21]:
sum(movie_mask)
Out[21]:
We can now create our final, relevant IMDB data frame, with only those movies that also appear in the BOM tables from 1990 - 2015.
In [22]:
IMDB_dftouse=IMDB_df[movie_mask]
Finally we want to save our dictionary of IMDB_dftouse into a JSON file for storage.
In [23]:
IMDB_dftouse_dict = {feature: IMDB_dftouse[feature].values.tolist() for feature in IMDB_dftouse.columns.values}
fp = open("IMDB_dftouse_dict.json","w")
json.dump(IMDB_dftouse_dict, fp)
fp.close()
In [24]:
# Reopen
with open("IMDB_dftouse_dict.json", "r") as fd:
IMDB_dftouse_dict = json.load(fd)
Now let's download labMT, a word score list for sentiment analysis containing over 10,000 words. For each word the file gives a "happiness" score and a happiness rank, along with the mean and standard deviation of its happiness ratings and Twitter and Google ranks.
In [25]:
url = 'http://www.plosone.org/article/fetchSingleRepresentation.action?uri=info:doi/10.1371/journal.pone.0026752.s001'
labmt = pd.read_csv(url, skiprows=2, sep='\t', index_col=0)
In [26]:
labmt.head()
Out[26]:
Now let's create a happiness dictionary of (word, valence) pairs where each valence is that word's original valence minus the average valence.
In [27]:
average = labmt.happiness_average.mean()
happiness = (labmt.happiness_average - average).to_dict()
In [28]:
print "Score(happy): ", happiness['happy']
print "Score(miserable): ", happiness['miserable']
print "Best score: ", max(happiness.values())
print "Worst score: ", min(happiness.values())
In [29]:
# Save to disc
# fp = open("happiness.json","w")
# json.dump(happiness, fp)
# fp.close()
In [30]:
# Reopen
with open("happiness.json", "r") as fp:
happiness = json.load(fp)
Now let's collect several attributes from a given review's text body and save all the valuable information into a new data frame. First we define a function that removes stop words (words that are unimportant from a valence perspective) from a text body.
In [31]:
from sklearn.feature_extraction import text
stopwords = text.ENGLISH_STOP_WORDS
punctuation = list('.,;:!?()[]{}`''\"@#$%^&*+-|-=~_')
def removeStopWords(text, stopwords=stopwords):
    new_text = ""
    for word in text.split():
        if word not in stopwords:
            while len(word) != 0 and word[-1] in punctuation:
                word = word[:len(word)-1]
            new_text += word + ' '
    return new_text
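As a quick sanity check on a made-up sentence (a hypothetical example, not drawn from the review data):
In [ ]:
# stop words should be dropped and trailing punctuation stripped
removeStopWords("the movie was not very good, but the acting was superb!")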
Now we'll write a function that returns total happiness, average happiness, total scorable words, and percentage of scorable words in a given review text.
In [32]:
'''
Name: getValenceInfo()
Inputs: review text, dictionary of happiness
Returns: a 4-tuple of (happiness total, happiness average, total # of scorable words, % of scorable words)
'''
def getValenceInfo(text, valenceDict):
    total_words = len(text.split())
    happiness_total, count_relevant = 0, 0
    for word in text.split():
        if word in valenceDict:
            count_relevant += 1
            happiness_total += valenceDict[word]
    if count_relevant != 0:
        avg_valence = 1.*happiness_total/count_relevant
    else:
        avg_valence = 0
    # guard against an empty review to avoid dividing by zero
    if total_words == 0:
        return 0, 0, 0, 0
    return happiness_total, avg_valence, total_words, 1.*count_relevant/total_words
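For example, on a short cleaned-up snippet (hypothetical input; the numbers returned depend on the labMT scores loaded above):
In [ ]:
sample = removeStopWords("the ending was happy and the acting was wonderful")
getValenceInfo(sample, happiness)  # (happiness total, happiness average, # of words, % scorable)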
Now we'll write a function that, given a data frame, returns a new data frame with the concatenation of valence (happiness) info in 4 new columns: valence sum, valence average, # of scorable words, % of scorable words.
In [33]:
'''
Name: getAllInfo
Input: data frame, happiness dictionary, list of stop words
Returns: a new data frame with 4 new columns: valence_sum, valence_avg, n_scorables, pct_scorables
'''
def getAllInfo(df, valenceDict, stopwords):
    valence_suml, valence_avgl, review_lenl, review_fractionl = [], [], [], []
    for i, row in df.iterrows():
        cleaned_review = removeStopWords(row['text'], stopwords)
        valence_sum, valence_avg, review_len, review_fraction = getValenceInfo(cleaned_review, valenceDict)
        valence_suml.append(valence_sum)
        valence_avgl.append(valence_avg)
        review_lenl.append(review_len)
        review_fractionl.append(review_fraction)
    conc = pd.DataFrame({'valence_sum': valence_suml, 'valence_avg': valence_avgl, 'n_scorables': review_lenl,
                         'pct_scorables': review_fractionl})
    return pd.concat([df, conc], axis=1)
Now let's create a new dataframe valence_df with the valence statistics run on our IMDB_df. This code takes a few minutes to run.
In [34]:
%%time
valence_df = getAllInfo(IMDB_df, happiness, stopwords)
In [36]:
valence_df.head()
In [ ]:
# Convert True/False to 1/0: needed to make valence_df JSON serializable, also better practice
valence_df.positive = 1.0*valence_df.positive
In [ ]:
# Save to disc
# fp = open("valence_df_dict.json","w")
# json.dump(valence_df.to_dict(), fp)
# fp.close()
In [37]:
# Reopen
with open("valence_df_dict.json", "r") as fp:
valence_df_dict = json.load(fp)
valence_df = pd.DataFrame(valence_df_dict)
We're going to reimport our prepared datasets: dictionary.csv (which contains the dictionary from the AFINN dataset) and IMDB_dftouse_dict.json (which contains all the reviews that cross over with our BOM data). We will import these as the pandas dataframes dictionary and IMDB_df respectively.
In [38]:
dictionary = pd.read_csv("dictionary.csv")
with open("IMDB_dftouse_dict.json", "r") as fd:
    IMDB = json.load(fd)
IMDB_df = pd.DataFrame(IMDB)
Now we'll import allMovies_new.json. Remember that we still need to convert some of the columns to the datetime type, so we'll do that afterwards.
In [39]:
with open("allMovies_new.json", "r") as fd:
movie_df = json.load(fd)
movie_df = pd.DataFrame(movie_df)
In [40]:
# making close_date and open_date by concatenating the year, month and day
import datetime
close_date = []
for index, row in movie_df.iterrows():
    if row.close_day != '0':
        close_date.append(datetime.datetime(int(row.close_year), int(row.close_month), int(row.close_day)))
    else:
        close_date.append(None)
movie_df['close_date'] = close_date
movie_df['open_date'] = movie_df.open_year + '-' + movie_df.open_month + '-' + movie_df.open_day
movie_df['open_date'] = pd.to_datetime(movie_df['open_date'])
# dropping unnecessary columns
movie_df = movie_df.drop('close_day', 1)
movie_df = movie_df.drop('close_month', 1)
movie_df = movie_df.drop('open_day', 1)
movie_df = movie_df.drop('open_month', 1)
Great! Let's take a look at all the dataframes we've created.
In [41]:
movie_df
Out[41]:
In [42]:
IMDB_df.head()
Out[42]:
Now we want to make a new dataframe flattened_df that we can use to run our regressions on. This dataframe will include all the columns in movie_df, plus extra columns derived from IMDB_df: the number of reviews (review_count), the average star rating (star_avg), and the mean of the positive label (positive).
In [43]:
# set index to title
indexed_df = movie_df.set_index("title")
In [44]:
# use groupby to get the review_count and the star_avg
gold = IMDB_df.groupby("movie_name")
review_count = gold.movie_name.count()
star_avg = gold.stars.mean()
positive = gold.positive.mean()
In [45]:
# concatenate the two series into our final dataframe flattened_df
flattened_df = pd.concat([indexed_df, review_count], axis=1, join_axes=[indexed_df.index])
flattened_df.rename(columns={'movie_name': 'review_count'}, inplace=True)
flattened_df = pd.concat([flattened_df, star_avg], axis=1, join_axes=[indexed_df.index])
flattened_df.rename(columns={'stars': 'star_avg'}, inplace=True)
flattened_df = pd.concat([flattened_df, positive], axis=1, join_axes=[indexed_df.index])
Let's take a look at what our dataframe looks like now!
In [46]:
flattened_df.head()
Out[46]:
In [47]:
dftouse = flattened_df[~flattened_df['review_count'].map(np.isnan)]
dftouse.shape
Out[47]:
In [48]:
dftouse.head()
Out[48]:
In [49]:
plt.scatter(y=dftouse.opening_gross, x=dftouse.star_avg)
Out[49]:
Now let's look separately at low-rated (star_avg <= 4) and high-rated (star_avg >= 7) movies: we define the two subsets first, then plot opening gross against average stars with marker size proportional to review count.
In [51]:
dftouse_four = flattened_df[flattened_df['star_avg'] <= 4]
dftouse_seven = flattened_df[flattened_df['star_avg'] >= 7]
In [50]:
plt.scatter(y=dftouse_four.opening_gross, x=dftouse_four.star_avg, s=dftouse_four.review_count)
In [160]:
plt.scatter(y=dftouse_seven.opening_gross, x=dftouse_seven.star_avg, s=dftouse_seven.review_count)
Out[160]:
In [52]:
dftouse_four.head()
Out[52]:
In [53]:
bobby_ols = ols('opening_gross ~ star_avg',dftouse).fit()
bobby_ols.summary()
Out[53]:
In [162]:
bobby_ols = ols('opening_gross ~ star_avg',dftouse_four).fit()
bobby_ols.summary()
Out[162]:
In [159]:
bobby_ols = ols('opening_gross ~ star_avg',dftouse_seven).fit()
bobby_ols.summary()
Out[159]:
Notes and next steps:
* Ask Andrew whether we need a test/train split for our regression (a rough hold-out sketch follows this list).
* Work flow 0 goal: descriptive statistics.
* Work flow 1 goal: make a valence score that closely matches stars.
* Stephen's edits: number 2.
* Work flow 2 goal: find a regression model that best predicts the following Y variables.
* 3.
* (Just realised that I was meant to take notes.)
* Re: the Pretty Woman question.
* Re: regression - measures of dispersion on the sentiment analysis.
* We can adopt the hypothesis that stars predict box office gross, or the hypothesis that very polarized responses predict box office gross. It's okay to have a dud hypothesis.
* We are graded on the effort we put in, so low or unimpressive results should not be equated with not having done much work. We can make up for it via lit reviews or extra research.
* By next Monday: the analysis should be done, and at that point we will just be touching up the visualizations.
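The notes above raise the test/train question. Purely as a sketch (not part of the original analysis; the 70/30 split and the random seed are arbitrary choices of ours), one rough way to check how the star_avg regression holds up out of sample:
In [ ]:
# Hypothetical hold-out check of the opening_gross ~ star_avg regression
np.random.seed(0)
mask = np.random.rand(len(dftouse)) < 0.7          # ~70% train / ~30% test
train, test = dftouse[mask], dftouse[~mask]
holdout_ols = ols('opening_gross ~ star_avg', train).fit()
pred = holdout_ols.predict(test)
print "Out-of-sample squared correlation:", np.corrcoef(pred, test.opening_gross)[0, 1]**2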
In [55]:
#getting inflation data and creating a new dataframe
inflation = pd.read_csv("inf.csv")
print inflation
In [90]:
# Creating a dataframe of cumulative inflation
years_90 = range(1990, 2015)
infdict = {}
infindex = 0
infvalue = 1
testlist = []
for row in inflation.values:
    currentval = 1 + (row[1]/100)
    cuminf = infvalue*currentval
    infdict[years_90[infindex]] = cuminf
    infindex += 1
    infvalue = cuminf
    testlist.append(cuminf)
inframe = pd.DataFrame(data=testlist, index=range(1990,2015))
# infdict exists in case we need it later
In [91]:
inframe
Out[91]:
In [ ]:
adjframe =
In [ ]:
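The unfinished adjframe cell above presumably builds an inflation-adjusted version of the movie data. Below is a minimal sketch of one way it might be computed, assuming we want gross figures in constant 2014 dollars; the column names gross_adj and opening_gross_adj and the 2014 base year are our assumptions, not something from the original notebook.
In [ ]:
# Hypothetical sketch: express gross figures in constant 2014 dollars using the
# cumulative inflation factors stored in inframe (indexed by year, single column 0).
adjframe = movie_df.copy()
final_factor = inframe.loc[2014, 0]                          # cumulative inflation through 2014
year_factor = adjframe['year'].astype(int).map(inframe[0])   # factor for each movie's release year
# years not covered by the inflation table (e.g. 2015) will come out as NaN
adjframe['gross_adj'] = adjframe['gross'] * final_factor / year_factor
adjframe['opening_gross_adj'] = adjframe['opening_gross'] * final_factor / year_factor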