In [1]:
%matplotlib inline
import numpy as np
import scipy as sp
import matplotlib as mpl
import matplotlib.cm as cm
import matplotlib.pyplot as plt
import pandas as pd
import time
pd.set_option('display.width', 500)
pd.set_option('display.max_columns', 100)
pd.set_option('display.notebook_repr_html', True)
import seaborn as sns
sns.set_style("whitegrid")
sns.set_context("poster")
For our project we will be using data from three different sources.
First we import the requests and BeautifulSoup libraries, which make working with HTTP requests easier and let us parse HTML content into Python data structures.
In [2]:
from bs4 import BeautifulSoup
# The "requests" library makes working with HTTP requests easier
# than the built-in urllib libraries.
import requests
Second, we prepare the dataframe movie_df to store the data that we will scrape from Box Office Mojo (BOM). We give this dataframe 9 columns:
* ranking: the ranking of the movie in its release year by gross
* title: movie title
* gross: how much the movie grossed while in theaters
* total_theaters: the total number of theaters that showed this movie
* opening_gross: how much the movie grossed in its opening weekend (Fri-Sun)
* opening_theaters: the total number of theaters that showed this movie in its opening weekend (Fri-Sun)
* open_date: date of opening
* close_date: date of closing
* year: year of release
In [3]:
movie_df = pd.DataFrame(columns=['close_date', 'gross', 'open_date', 'opening_gross', 'opening_theaters','ranking','title','total_theaters','year'])
Now we write a function rowInfoGrabber that we will call in a loop over the table on the BOM webpage to grab the attributes and save them into the corresponding columns of movie_df.
In [4]:
def rowInfoGrabber(r):
    info = []
    # Ranking
    info.append(int(r.find("font").get_text()))
    # Title
    info.append(r.find("a").get_text())
    # Gross
    info.append(int(r.find("td", attrs={"align": "right"}).find("b").get_text().strip("$").replace(",", "")))
    '''
    For the next 3 categories, we need to deal with the 2000 anomaly "Fantasia", where numbers are missing.
    In this case I have chosen to replace the missing 'N/A' values with the values from 'Final Destination',
    which is right above it in the movie table and differs in gross income by about $1 million, a small
    difference. See the picture below for a snapshot of the anomaly in the movie table from 2000.
    '''
    # Cache the right-aligned cells so we only traverse the row once.
    cells = r.find_all("td", attrs={"align": "right"})
    # Total number of theaters
    if cells[1].find("font").get_text().replace(",", "") == 'N/A':
        info.append(2587)
    else:
        info.append(int(cells[1].find("font").get_text().replace(",", "")))
    # Opening gross
    if cells[2].find("font").get_text().strip("$").replace(",", "") == 'N/A':
        info.append(10015822)
    else:
        info.append(int(cells[2].find("font").get_text().strip("$").replace(",", "")))
    # Opening number of theaters
    if cells[3].find("font").get_text().replace(",", "") == 'N/A':
        info.append(2587)
    else:
        info.append(int(cells[3].find("font").get_text().replace(",", "")))
    # Date of opening
    info.append(cells[4].find("a").get_text())
    # Date of closing: before 2002 the tables didn't have a "closing" date. We must account for this.
    if len(cells) <= 5:
        info.append('-')
    else:
        info.append(cells[5].find("font").get_text())
    return info
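To sanity-check the parser, here is a minimal sketch that runs rowInfoGrabber on a hypothetical HTML row mimicking the structure of a BOM table row (the tags and values below are made up for illustration):
In [ ]:
# Hypothetical row in the shape rowInfoGrabber expects.
sample_row = """
<tr>
  <td><font>1</font></td>
  <td><a>Example Movie</a></td>
  <td align="right"><font><b>$100,000,000</b></font></td>
  <td align="right"><font>3,000</font></td>
  <td align="right"><font>$25,000,000</font></td>
  <td align="right"><font>3,000</font></td>
  <td align="right"><font><a>6/15</a></font></td>
  <td align="right"><font>9/21</font></td>
</tr>
"""
row = BeautifulSoup(sample_row, "html.parser").find("tr")
print(rowInfoGrabber(row))
# expected: [1, 'Example Movie', 100000000, 3000, 25000000, 3000, '6/15', '9/21']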
Below is the referenced snapshot of the 2000 "Fantasia" anomaly in the BOM table:
In [5]:
fields = ["ranking", "title", "gross", "total_theaters", "opening_gross", "opening_theaters", "open_date", "close_date"]
Finally we're ready to scrape!
Because IMDB was created in 1990, we will scrape BOM back to that year, covering the past 26 years (1990-2015). Also note that because the page HTML changed starting in 2001, our scraping has to handle the tables slightly differently before and after that year.
In [6]:
%%time
years = [1990 + i for i in range(26)]
for year in years:
    pageText = requests.get("http://www.boxofficemojo.com/yearly/chart/?yr=%(yr)d&p=.htm" % {'yr': year})
    soup = BeautifulSoup(pageText.text, "html.parser")
    movieTable = soup.find("td", attrs={"colspan": "3"})
    movieRows = movieTable.find("table").find_all("tr")[2:102]
    print(year)
    movie_dicts = [dict(zip(fields, rowInfoGrabber(row))) for row in movieRows]
    year_df = pd.DataFrame(movie_dicts)
    year_df['year'] = year
    movie_df = movie_df.append(year_df, ignore_index=True)
    time.sleep(1)
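One caveat: the loop above assumes every request succeeds; if BOM returned an error page, the find calls would fail with a confusing AttributeError. A defensive variant (a sketch, not what we ran) would check the response before parsing:
In [ ]:
# Illustrative guard: raise_for_status() throws on an HTTP error
# instead of letting us parse an error page.
resp = requests.get("http://www.boxofficemojo.com/yearly/chart/?yr=1990&p=.htm")
resp.raise_for_status()
soup = BeautifulSoup(resp.text, "html.parser")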
In [7]:
movie_df.shape
Out[7]:
In [8]:
movie_df.head()
Out[8]:
Because some films do not have a close date, we will have to be careful when working with the close_date column!
In [9]:
# if we decide it's worth just dumping the movies with no close_date, we can use the code below
# movie_df=movie_df[movie_df.close_date != '-'].reset_index(drop=True)
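Before deciding, it helps to quantify how many movies are affected (a quick sketch using the '-' placeholder that rowInfoGrabber inserts for missing close dates):
In [ ]:
# Count movies whose close_date is the '-' placeholder.
(movie_df.close_date == '-').sum()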
Next, we combine the close_date, open_date and year columns into two time-series columns, close_date and open_date. This will make it easier for us to work with the data in the future.
In [10]:
# splitting the close_date and open_date into the respective month and day
movie_df['close_month'] = movie_df['close_date'].map(lambda x: '0' if x=='-' else x[:x.find('/')])
movie_df['close_day'] = movie_df['close_date'].map(lambda x: '0' if x=='-' else x[x.find('/')+1:len(x)])
movie_df['open_month'] = movie_df['open_date'].map(lambda x: x[:x.find('/')])
movie_df['open_day'] = movie_df['open_date'].map(lambda x: x[x.find('/')+1:len(x)])
# dropping the old close_date and open_date
movie_df = movie_df.drop(['close_date', 'open_date'], axis=1)
# creating an open_year column by turning the year column into a string; the appends left
# year as a float (e.g. '1990.0'), so we keep only the part before the decimal point
movie_df['open_year'] = movie_df.year.astype(str)
movie_df['open_year'] = movie_df.open_year.map(lambda x: x.split('.')[0])
# creating a close_year column by checking whether the close month falls before or after the open month
close_month = movie_df['close_month'].astype(int)
open_month = movie_df['open_month'].astype(int)
year = movie_df['year'].astype(int)
close_year = []
for i in range(len(year)):
    if close_month[i] >= open_month[i]:
        close_year.append(year[i])
    else:
        close_year.append(year[i] + 1)
movie_df['close_year'] = close_year
movie_df['close_year'] = movie_df['close_year'].astype(str)
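As an aside, the same close-year logic can be written without the loop using numpy (a sketch equivalent to the loop above):
In [ ]:
# Vectorized equivalent: if the close month is on or after the open month the
# movie closed in the same year, otherwise in the next year.
close_year_vec = np.where(close_month >= open_month, year, year + 1)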
In [11]:
movie_df.head()
Out[11]:
In [12]:
# making close_date and open_date by concatenating the year, month and day
import datetime
close_date = []
for index, row in movie_df.iterrows():
    if row.close_day != '0':
        close_date.append(datetime.datetime(int(row.close_year), int(row.close_month), int(row.close_day)))
    else:
        close_date.append(None)
movie_df['close_date'] = close_date
movie_df['open_date'] = movie_df.open_year + '-' + movie_df.open_month + '-' + movie_df.open_day
# pd.to_datetime parses the 'YYYY-M-D' strings (pd.datetools.parse has since been removed from pandas)
movie_df['open_date'] = pd.to_datetime(movie_df['open_date'])
In [13]:
# dropping the intermediate columns now that the dates are assembled
movie_df = movie_df.drop(['close_day', 'close_month', 'close_year',
                          'open_day', 'open_month', 'open_year', 'year'], axis=1)
Let's take a look at the data now!
In [14]:
movie_df.head()
Out[14]:
Let's see whether we can compute the run time for each movie!
In [15]:
run_time = []
for index, row in movie_df.iterrows():
    if row.close_date is not None:
        run_time.append(row['close_date'] - row['open_date'])
    else:
        # None becomes NaT, which keeps the column a proper timedelta type
        run_time.append(None)
movie_df['run_time'] = run_time
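Since the new column holds timedeltas, run times in days are one accessor away (a quick sketch; rows with no close_date come out as NaN):
In [ ]:
# Summarize run times in whole days.
movie_df['run_time'].dt.days.describe()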
In [16]:
movie_df.head()
Out[16]:
Looks like the data is ready for us to use! Let's save this data so we're ready to use it next time.
In [17]:
! pip install pymongo
In [18]:
# Save the movie Dictionaries corresponding to each row of the BoxOfficeMojo table.
import json
import pymongo
from bson import json_util
# Make a dictionary out of the dataset for storage in JSON format.
movieSaved = {feature: movie_df[feature].values.tolist() for feature in movie_df.columns.values}
fp = open("allMovies.json","w")
json.dump(movieSaved, fp, default=json_util.default)
fp.close()
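For completeness, here is a sketch of how this JSON could be loaded back into a dataframe later (json_util's object_hook restores the datetime objects that json_util.default serialized above):
In [ ]:
# Reload the saved movie data and rebuild the dataframe.
with open("allMovies.json", "r") as fp:
    movieLoaded = json.load(fp, object_hook=json_util.object_hook)
movie_df_reloaded = pd.DataFrame(movieLoaded)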
We have cleaned up our IMDB review dataset in the IPython notebook New.ipynb. From that notebook we saved dictionaries of our data, which we now load.
In [25]:
with open("train_df_dict.json", "r") as fd:
train_df_dict = json.load(fd)
with open("test_df_dict.json", "r") as fd:
test_df_dict = json.load(fd)
In [26]:
train_df = pd.DataFrame(train_df_dict)
test_df = pd.DataFrame(test_df_dict)
The Stanford group distinguishes between train and test sets because that split was relevant for their project. It may prove useful later, so we will keep the two dataframes separate. For our purposes at the moment, however, we can combine them, since the BOM data will serve as our ground-truth dataset.
In [27]:
IMDB_df = train_df.append(test_df)
We want to figure out which movies from IMDB_df are also present in our BOM dataframe. So let's get all the movie titles in our BOM table.
In [28]:
BOM_movie_list = movie_df.title.values.tolist()
Now let's create a boolean mask over IMDB_df indicating whether each movie is in the BOM list.
In [29]:
movie_mask = [(movie in BOM_movie_list) for movie in IMDB_df.movie_name]
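As an aside, pandas can build the same mask natively, which also scales better on large frames (a sketch equivalent to the list comprehension above):
In [ ]:
# Equivalent vectorized mask; Series.isin hashes the lookup list internally.
movie_mask_alt = IMDB_df.movie_name.isin(BOM_movie_list)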
In [30]:
sum(movie_mask)
Out[30]:
We can now create our final, relevant IMDB dataframe, with only those movies that also appear in the BOM tables from 1990-2015.
In [31]:
IMDB_dftouse=IMDB_df[movie_mask]
Finally we want to save our dictionary of IMDB_dftouse into a JSON file for storage.
In [32]:
IMDB_dftouse_dict = {feature: IMDB_dftouse[feature].values.tolist() for feature in IMDB_dftouse.columns.values}
fp = open("IMDB_dftouse_dict.json","w")
json.dump(IMDB_dftouse_dict, fp)
fp.close()
In [33]:
with open("IMDB_dftouse_dict.json", "r") as fd:
IMDB_dftouse_dict = json.load(fd)
We will read in the AFINN-111 dictionary from the tab-delimited .txt file and save it into a pandas dataframe called dictionary.
In [19]:
words = []
valences = []
with open('AFINN/AFINN-111.txt', 'r') as source:
    for line in source:
        fields = line.split('\t')
        words.append(fields[0])
        valences.append(fields[1])
dictionary = pd.DataFrame({'words': words, 'valences': valences})
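The same file can also be loaded in one call (a sketch; read_csv parses the valences straight to integers, which would make the cleanup step below unnecessary):
In [ ]:
# One-line alternative: AFINN-111 is a two-column, tab-separated file with no header.
dictionary_alt = pd.read_csv('AFINN/AFINN-111.txt', sep='\t', header=None,
                             names=['words', 'valences'])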
Because the valences are read in as strings, we will clean them up and convert them to integers so they are ready for manipulation.
In [20]:
dictionary['valences'] = dictionary['valences'].map(lambda x: x if x.find('\n')== -1 else x[:x.find('\n')])
dictionary['valences']=dictionary['valences'].astype(int)
Let's take a look at the dictionary!
In [21]:
dictionary.head()
Out[21]:
Looks like the dictionary is ready for us to use! Let's save it to a CSV so we can reload it later without rebuilding it.
In [24]:
# The dictionary was saved once with the line below; afterwards we just reload it.
# dictionary.to_csv("dictionary.csv", index=False)
dictionary = pd.read_csv("dictionary.csv")
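To preview how the dictionary will be used, here is a minimal sketch of scoring a piece of text with AFINN valences (the sentence is made up; real reviews come from IMDB_dftouse):
In [ ]:
# Sum the valences of any words that appear in the AFINN dictionary.
afinn = dict(zip(dictionary.words, dictionary.valences))

def afinn_score(text):
    return sum(afinn.get(word, 0) for word in text.lower().split())

afinn_score("this movie was wonderful but the ending was bad")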
In [ ]: