In [2]:
%matplotlib inline
import numpy as np
import scipy as sp
import matplotlib as mpl
import matplotlib.cm as cm
import matplotlib.pyplot as plt
import pandas as pd
import time
pd.set_option('display.width', 500)
pd.set_option('display.max_columns', 100)
pd.set_option('display.notebook_repr_html', True)
import seaborn as sns
sns.set_style("whitegrid")
sns.set_context("poster")
In [3]:
from bs4 import BeautifulSoup
# The "requests" library makes working with HTTP requests easier
# than the built-in urllib libraries.
import requests
In [28]:
def rowInfoGrabber(r):
    info = []
    # Ranking
    info.append(int(r.find("font").get_text()))
    # Title
    info.append(r.find("a").get_text())
    # Gross
    info.append(int(r.find("td", attrs={"align":"right"}).find("b").get_text().strip("$").replace(",","")))
    # Total number of theaters
    info.append(int(r.find_all("td", attrs={"align":"right"})[1].find("font").get_text().replace(",","")))
    # Opening weekend gross
    info.append(int(r.find_all("td", attrs={"align":"right"})[2].find("font").get_text().strip("$").replace(",","")))
    # Opening number of theaters
    info.append(int(r.find_all("td", attrs={"align":"right"})[3].find("font").get_text().replace(",","")))
    # Date of opening
    info.append(r.find_all("td", attrs={"align":"right"})[4].find("a").get_text())
    # Date of closing
    info.append(r.find_all("td", attrs={"align":"right"})[5].find("font").get_text())
    return info

fields = ["ranking", "title", "gross", "total_theaters", "opening", "opening_theaters", "open", "close"]
# movieRows must already hold the <tr> elements of a scraped chart page (see the sketch below).
movies = [dict(zip(fields, rowInfoGrabber(row))) for row in movieRows]
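Note that the movies list comprehension above assumes movieRows has already been populated with the table rows of a fetched chart page. A minimal sketch of how that could be done for a single year, using the same parsing that the per-year loop below applies to every year (2015 is just an illustrative choice):
In [ ]:
# Illustrative single-page fetch; the loop below does this for every year 1990-2015.
pageText = requests.get("http://www.boxofficemojo.com/yearly/chart/?yr=2015&p=.htm")
soup = BeautifulSoup(pageText.text, "html.parser")
movieTable = soup.find("td", attrs={"colspan":"3"})
movieRows = movieTable.find("table").find_all("tr")[2:102]  # top 100 movie rows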
In [29]:
# Create an empty DataFrame with the scraped fields plus the year; the loop below fills it in.
movie_df = pd.DataFrame(columns=['close', 'gross', 'open', 'opening', 'opening_theaters','ranking','title','total_theaters','year'])
IMDb was created in 1990, so we'll only go that far back in our scraping of Box Office Mojo.
In [30]:
years = [str(1990 + i) for i in range(26)]
for year in years:
    pageText = requests.get("http://www.boxofficemojo.com/yearly/chart/?yr=%(yr)s&p=.htm" % {'yr': year})
    soup = BeautifulSoup(pageText.text, "html.parser")
    movieTable = soup.find("td", attrs={"colspan":"3"})
    movieRows = movieTable.find("table").find_all("tr")[2:102]  # top 100 movie rows
    movie_dicts = [dict(zip(fields, rowInfoGrabber(row))) for row in movieRows]
    year_df = pd.DataFrame(movie_dicts)
    year_df['year'] = year
    movie_df = movie_df.append(year_df)
    time.sleep(1)  # be polite to the server
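DataFrame.append was removed in pandas 2.0, so on recent pandas versions the same accumulation can be done by collecting the per-year frames in a list and calling pd.concat once at the end. A sketch of that variant, assuming the same page structure and helper functions as above:
In [ ]:
# Alternative accumulation for pandas versions without DataFrame.append.
year_frames = []
for year in years:
    pageText = requests.get("http://www.boxofficemojo.com/yearly/chart/?yr=%(yr)s&p=.htm" % {'yr': year})
    soup = BeautifulSoup(pageText.text, "html.parser")
    movieTable = soup.find("td", attrs={"colspan":"3"})
    movieRows = movieTable.find("table").find_all("tr")[2:102]
    year_df = pd.DataFrame([dict(zip(fields, rowInfoGrabber(row))) for row in movieRows])
    year_df['year'] = year
    year_frames.append(year_df)
    time.sleep(1)  # be polite to the server
movie_df = pd.concat(year_frames, ignore_index=True)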
In [31]:
movie_df.shape
Out[31]:
In [32]:
movie_df
Out[32]:
In [54]:
# Save the scraped Box Office Mojo data to disk as JSON.
import json
# Make a column-oriented dictionary (field -> list of values) out of the DataFrame for storage in JSON format.
movieSaved = {feature: movie_df[feature].values.tolist() for feature in movie_df.columns.values}
with open("allMovies.json", "w") as fp:
    json.dump(movieSaved, fp)
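To read the saved file back into a DataFrame later, the dictionary of lists can be passed straight to the pandas constructor (a minimal sketch; the loaded variable name is just illustrative):
In [ ]:
# Reload the column-oriented dictionary and rebuild the DataFrame.
with open("allMovies.json") as fp:
    movieSaved = json.load(fp)
movie_df_loaded = pd.DataFrame(movieSaved)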
In [ ]: