In [2]:
%matplotlib inline
import numpy as np
import scipy as sp
import matplotlib as mpl
import matplotlib.cm as cm
import matplotlib.pyplot as plt
import pandas as pd
import time
# Let pandas show wide frames: up to 500 characters per line and 100 columns.
pd.set_option('display.width', 500)
pd.set_option('display.max_columns', 100)
# Render DataFrames as HTML tables when displayed in the notebook.
pd.set_option('display.notebook_repr_html', True)
import seaborn as sns
# Seaborn look applied to every figure: "whitegrid" style, "poster" context.
sns.set_style("whitegrid")
sns.set_context("poster")

In [3]:
from bs4 import BeautifulSoup
# The "requests" library makes working with HTTP requests easier
# than the built-in urllib libraries.
import requests

In [28]:
def _to_int(text):
    """Convert a comma-grouped number string such as "4,163" to an int."""
    return int(text.replace(",", ""))


def rowInfoGrabber(r):
    """Extract one movie's values from a single row of the yearly chart table.

    Parameters
    ----------
    r : object exposing ``find`` and ``find_all``
        One table row (a BeautifulSoup ``<tr>`` tag in this notebook).

    Returns
    -------
    list
        ``[ranking, title, gross, total_theaters, opening, opening_theaters,
        open, close]``. The ranking, gross, opening and both theater counts
        are ints; the title and the two dates are returned as the raw text
        found in the row.

    Raises
    ------
    AttributeError, IndexError, ValueError
        If the row lacks an expected tag or cell, or a numeric cell does not
        hold a number.
    """
    # Every numeric/date value sits in a right-aligned <td>; search for them
    # once instead of repeating the same lookup for each field.
    cells = r.find_all("td", attrs={"align": "right"})

    return [
        # Ranking
        int(r.find("font").get_text()),
        # Title
        r.find("a").get_text(),
        # Gross (dollar amount, wrapped in <b>)
        _to_int(cells[0].find("b").get_text().strip("$")),
        # Total Number of Theaters
        _to_int(cells[1].find("font").get_text()),
        # Opening Cost (dollar amount)
        _to_int(cells[2].find("font").get_text().strip("$")),
        # Opening Number of Theaters
        _to_int(cells[3].find("font").get_text()),
        # Date of Opening
        cells[4].find("a").get_text(),
        # Date of Closing
        cells[5].find("font").get_text(),
    ]
# Column names, in the same order as the values returned by rowInfoGrabber.
fields = ["ranking", "title", "gross", "total_theaters", "opening", "opening_theaters", "open", "close"]

# NOTE: a `movies = [dict(zip(fields, rowInfoGrabber(row))) for row in movieRows]`
# line used to live here. No earlier cell defines `movieRows`, so it raised
# NameError on a fresh kernel, and no visible cell reads `movies`; the per-year
# scraping loop builds the same dictionaries itself.

In [29]:
# Empty frame that fixes the column layout of the scraped results.
# NOTE(review): the comment previously here read "$80 million"; its meaning
# is not evident from this cell.
movie_df = pd.DataFrame(
    columns=[
        'close',
        'gross',
        'open',
        'opening',
        'opening_theaters',
        'ranking',
        'title',
        'total_theaters',
        'year',
    ]
)

Scraping 26 years of yearly charts (1990–2015)

IMDB was created in 1990, so we'll only go that far back in our scraping of Box Office Mojo.


In [30]:
# Scrape the top-100 rows of each yearly chart, 1990 through 2015.
# Years are kept as strings, so the 'year' column holds strings.
years = [str(1990 + i) for i in range(26)]

# Collect one frame per year and concatenate once at the end; growing a
# DataFrame inside the loop copies all rows on every iteration, and
# DataFrame.append no longer exists in pandas 2.x.
year_frames = []
for year in years:
    # `year` is a string, so it is formatted with %s (the %d conversion
    # raises "TypeError: %d format: a number is required, not str").
    pageText = requests.get("http://www.boxofficemojo.com/yearly/chart/?yr=%(yr)s&p=.htm" % {'yr': year})
    # Fail loudly on an HTTP error instead of parsing an error page.
    pageText.raise_for_status()
    soup = BeautifulSoup(pageText.text, "html.parser")
    # The chart is the <table> nested in the cell that spans three columns.
    movieTable = soup.find("td", attrs={"colspan": "3"})
    # Skip the first two rows and keep at most the next 100.
    movieRows = movieTable.find("table").find_all("tr")[2:102]
    movie_dicts = [dict(zip(fields, rowInfoGrabber(row))) for row in movieRows]
    year_df = pd.DataFrame(movie_dicts)
    year_df['year'] = year
    year_frames.append(year_df)
    # Pause between requests to be polite to the server.
    time.sleep(1)

# Reuse the column layout declared on the empty `movie_df`; only its column
# names are read, so re-running this cell does not duplicate rows. Each year
# keeps its own 0-based index, as before.
movie_df = pd.concat(year_frames)[list(movie_df.columns)]

In [31]:
# Sanity check: (rows, columns) of the combined frame.
movie_df.shape


Out[31]:
(1200, 9)

In [32]:
# Show the combined frame as this cell's rich output.
movie_df


Out[32]:
close gross open opening opening_theaters ranking title total_theaters year
0 11/25 441226247 5/19 108037878 4163 1 Shrek 2 4223 2004
1 12/19 373585825 6/30 88156227 4152 2 Spider-Man 2 4166 2004
2 7/29 370274604 2/25 83848082 3043 3 The Passion of the Christ 3408 2004
3 6/16 279261160 12/22 46120980 3518 4 Meet the Fockers 3554 2004
4 4/14 261441092 11/5 70467623 3933 5 The Incredibles 3933 2004
5 12/19 249541069 6/4 93687367 3855 6 Harry Potter and the Prisoner of Azkaban 3855 2004
6 11/4 186740799 5/28 68743584 3425 7 The Day After Tomorrow 3444 2004
7 12/23 176241941 7/23 52521865 3165 8 The Bourne Supremacy 3304 2004
8 6/2 173008894 11/19 35142554 3017 9 National Treasure 3243 2004
9 3/10 162775358 11/10 23323463 3650 10 The Polar Express 3650 2004
10 1/6 160861908 10/1 47604606 4016 11 Shark Tale 4070 2004
11 12/30 144801023 7/16 52179887 3420 12 I, Robot 3494 2004
12 9/30 133378256 5/14 46865412 3411 13 Troy 3411 2004
13 3/24 125544280 12/10 39153380 3290 14 Ocean's Twelve 3290 2004
14 7/8 120908074 2/13 39852237 3591 15 50 First Dates 3612 2004
15 8/26 120177084 5/7 51748040 3575 16 Van Helsing 3580 2004
16 10/28 119194771 6/23 23920637 868 17 Fahrenheit 9/11 2011 2004
17 4/28 118634549 12/17 30061756 3620 18 Lemony Snicket's A Series of Unfortunate Events 3623 2004
18 11/4 114326736 6/18 30070196 2694 19 DodgeBall: A True Underdog Story 3020 2004
19 12/2 114197520 7/30 50746142 3730 20 The Village 3733 2004
20 12/30 110359362 10/22 39128715 3245 21 The Grudge 3348 2004
21 6/2 102610330 12/17 858021 40 22 The Aviator 2530 2004
22 11/25 101005703 8/6 24701458 3188 23 Collateral 3205 2004
23 6/30 100492203 12/15 179953 8 24 Million Dollar Baby 2375 2004
24 12/23 95170481 8/11 22956453 3472 25 The Princess Diaries 2: Royal Engagement 3490 2004
25 7/15 88237754 3/5 28103367 3185 26 Starsky and Hutch 3185 2004
26 6/10 88097164 1/16 27721185 2984 27 Along Came Polly 3052 2004
27 9/9 86058055 4/30 24432195 2839 28 Mean Girls 3054 2004
28 3/24 85417988 11/19 32018216 3212 29 The SpongeBob SquarePants Movie 3307 2004
29 10/7 85288303 7/9 28416365 3091 30 Anchorman: The Legend of Ron Burgundy 3104 2004
... ... ... ... ... ... ... ... ... ...
70 - 27285953 8/26 8111264 3355 71 No Escape 3415 2015
71 - 26822658 10/23 10812861 3082 72 The Last Witch Hunter 3082 2015
72 - 26822144 8/7 6610961 1603 73 Ricki and the Flash 2064 2015
73 3/19 26501323 1/2 15027415 2602 74 The Woman in Black 2: Angel of Death 2602 2015
74 5/7 26461644 3/13 11012305 3171 75 Run All Night 3171 2015
75 6/11 25801047 2/27 10203437 2666 76 The Lazarus Effect 2666 2015
76 9/3 25442958 4/10 237264 4 77 Ex Machina 2004 2015
77 9/17 22764410 7/10 9808463 2720 78 The Gallows 2720 2015
78 10/15 22467450 8/21 8326530 3261 79 Hitman: Agent 47 3273 2015
79 3/26 22348241 1/30 8310252 2893 80 Project Almanac 2900 2015
80 5/14 21571189 1/30 6213362 1823 81 Black or White 1823 2015
81 7/30 21067116 5/29 9670235 2815 82 Aloha 2815 2015
82 10/22 19375982 8/5 4038962 2320 83 Shaun the Sheep Movie 2360 2015
83 5/21 18754371 1/16 197000 12 84 Still Alice 1318 2015
84 - 18247445 10/23 8070493 1656 85 Paranormal Activity: The Ghost Dimension 1656 2015
85 11/5 17737646 7/17 2434908 361 86 Mr. Holmes 898 2015
86 - 17614323 10/9 521522 4 87 Steve Jobs 2493 2015
87 9/17 17506470 6/19 6100010 2002 88 Dope 2002 2015
88 3/19 17223265 2/6 7217640 2875 89 Seventh Son 2875 2015
89 7/23 16432322 4/17 4577861 2012 90 Monkey Kingdom 2012 2015
90 11/19 16029670 9/4 7355622 3434 91 The Transporter Refueled 3434 2015
91 - 15128355 11/13 8317545 2603 92 Love the Coopers 2603 2015
92 6/25 14674076 3/13 160089 4 93 It Follows 1655 2015
93 10/8 14440985 8/21 5454284 2778 94 American Ultra 2778 2015
94 - 14036500 10/16 4002226 1553 95 Woodlawn 1553 2015
95 - 13443407 10/30 5002521 3003 96 Burnt 3003 2015
96 6/25 12985600 3/20 3591282 1320 97 Do You Believe? 1356 2015
97 10/1 12551031 6/5 2122177 481 98 Love & Mercy 791 2015
98 4/16 12429583 1/23 5504441 3020 99 Strange Magic 3020 2015
99 3/26 12314651 2/20 5963324 2880 100 Hot Tub Time Machine 2 2901 2015

1200 rows × 9 columns


In [54]:
# Save the scraped BoxOfficeMojo table to disk as JSON.
import json

# Store the frame column-wise so it is JSON-serialisable:
# {column name: list of that column's values}.
movieSaved = {feature: movie_df[feature].values.tolist() for feature in movie_df.columns.values}

# The context manager closes the file even if json.dump raises.
with open("allMovies.json", "w") as fp:
    json.dump(movieSaved, fp)

In [ ]: