In [2]:

    
%matplotlib inline
import numpy as np
import scipy as sp
import matplotlib as mpl
import matplotlib.cm as cm
import matplotlib.pyplot as plt
import pandas as pd
import time
# pandas display options: 500-character display width, show up to 100 columns,
# and use the HTML representation for frames in the notebook.
pd.set_option('display.width', 500)
pd.set_option('display.max_columns', 100)
pd.set_option('display.notebook_repr_html', True)
import seaborn as sns
# seaborn plot appearance: "whitegrid" style with the "poster" context.
sns.set_style("whitegrid")
sns.set_context("poster")



In [3]:

    
from bs4 import BeautifulSoup
# The "requests" library makes working with HTTP requests easier
# than the built-in urllib libraries.
import requests



In [28]:

    
def rowInfoGrabber(r):
    """Extract the eight chart values from a single table row.

    Parameters
    ----------
    r : row element exposing ``find`` / ``find_all`` (the scraping loop passes
        ``<tr>`` elements parsed by BeautifulSoup).

    Returns
    -------
    list
        ``[ranking, title, gross, total_theaters, opening, opening_theaters,
        open, close]``. Ranking, gross, both theater counts and the opening
        figure are converted to ``int``; title and the two dates stay as the
        text found in the row.

    Raises
    ------
    AttributeError, IndexError, ValueError
        If an expected element is missing from the row or a numeric cell does
        not contain a parseable integer.
    """
    right_aligned = {"align": "right"}

    # The first <font> holds the rank, the first <a> holds the title.
    ranking = int(r.find("font").get_text())
    title = r.find("a").get_text()

    # Total gross is the bold text of the first right-aligned cell, e.g. "$441,226,247".
    gross_text = r.find("td", attrs=right_aligned).find("b").get_text()
    gross = int(gross_text.strip("$").replace(",", ""))

    # The remaining values sit in the following right-aligned cells, by position.
    cells = r.find_all("td", attrs=right_aligned)
    total_theaters = int(cells[1].find("font").get_text().replace(",", ""))
    opening = int(cells[2].find("font").get_text().strip("$").replace(",", ""))
    opening_theaters = int(cells[3].find("font").get_text().replace(",", ""))
    open_date = cells[4].find("a").get_text()
    close_date = cells[5].find("font").get_text()

    return [
        ranking,
        title,
        gross,
        total_theaters,
        opening,
        opening_theaters,
        open_date,
        close_date,
    ]
# Column names, in the same order as the values returned by rowInfoGrabber.
fields = ["ranking", "title", "gross", "total_theaters", "opening", "opening_theaters", "open", "close"]

# NOTE: rows are turned into dicts inside the per-year scraping loop. No row list
# (`movieRows`) exists yet when this cell runs on a fresh kernel, so building a
# `movies` list here raised NameError under Restart & Run All and is omitted.



In [29]:

    
# $80 million
# NOTE(review): the "$80 million" note above is kept from the original cell;
# what it refers to is not evident from this cell.
# Empty frame declaring the nine result columns of the scraped table.
movie_df = pd.DataFrame(
    columns=[
        'close',
        'gross',
        'open',
        'opening',
        'opening_theaters',
        'ranking',
        'title',
        'total_theaters',
        'year',
    ]
)

Scraping the past 26 years (1990-2015)

IMDB was created in 1990, so we'll only go that far back in our scraping of Box Office Mojo.



In [30]:

    
# Scrape the yearly chart page for each of the 26 years 1990..2015 and stack the
# per-year tables into `movie_df`.
years = [str(1990 + i) for i in range(26)]
year_frames = []
for year in years:
    # `year` is a str, so it is interpolated with %s (a %d conversion raises TypeError).
    pageText = requests.get(
        "http://www.boxofficemojo.com/yearly/chart/?yr=%(yr)s&p=.htm" % {'yr': year},
        timeout=30,  # do not hang forever on an unresponsive server
    )
    # Fail with a clear HTTP error instead of trying to parse an error page.
    pageText.raise_for_status()
    soup = BeautifulSoup(pageText.text, "html.parser")
    # The chart table is nested inside the <td colspan="3"> cell.
    movieTable = soup.find("td", attrs={"colspan":"3"})
    # Skip the first two <tr> rows and keep at most the next 100.
    movieRows = movieTable.find("table").find_all("tr")[2:102]
    movie_dicts = [dict(zip(fields, rowInfoGrabber(row))) for row in movieRows]
    year_df = pd.DataFrame(movie_dicts)
    year_df['year'] = year
    year_frames.append(year_df)
    # Pause between requests to be polite to the server.
    time.sleep(1)

# Concatenate once instead of appending inside the loop (DataFrame.append was
# removed in pandas 2.0 and copies the whole frame on every call). Each year
# keeps its own 0-based index, and the column order follows the existing
# `movie_df` layout. Rebuilding from `year_frames` makes re-running this cell
# idempotent.
movie_df = pd.concat(year_frames).reindex(columns=movie_df.columns)



In [31]:

    
# (rows, columns) of the accumulated frame.
movie_df.shape









    Out[31]:





(1200, 9)



In [32]:

    
# Display the accumulated frame.
movie_df









    Out[32]:






  
    
      
      close
      gross
      open
      opening
      opening_theaters
      ranking
      title
      total_theaters
      year
    
  
  
    
      0
      11/25
      441226247
      5/19
      108037878
      4163
      1
      Shrek 2
      4223
      2004
    
    
      1
      12/19
      373585825
      6/30
      88156227
      4152
      2
      Spider-Man 2
      4166
      2004
    
    
      2
      7/29
      370274604
      2/25
      83848082
      3043
      3
      The Passion of the Christ
      3408
      2004
    
    
      3
      6/16
      279261160
      12/22
      46120980
      3518
      4
      Meet the Fockers
      3554
      2004
    
    
      4
      4/14
      261441092
      11/5
      70467623
      3933
      5
      The Incredibles
      3933
      2004
    
    
      5
      12/19
      249541069
      6/4
      93687367
      3855
      6
      Harry Potter and the Prisoner of Azkaban
      3855
      2004
    
    
      6
      11/4
      186740799
      5/28
      68743584
      3425
      7
      The Day After Tomorrow
      3444
      2004
    
    
      7
      12/23
      176241941
      7/23
      52521865
      3165
      8
      The Bourne Supremacy
      3304
      2004
    
    
      8
      6/2
      173008894
      11/19
      35142554
      3017
      9
      National Treasure
      3243
      2004
    
    
      9
      3/10
      162775358
      11/10
      23323463
      3650
      10
      The Polar Express
      3650
      2004
    
    
      10
      1/6
      160861908
      10/1
      47604606
      4016
      11
      Shark Tale
      4070
      2004
    
    
      11
      12/30
      144801023
      7/16
      52179887
      3420
      12
      I, Robot
      3494
      2004
    
    
      12
      9/30
      133378256
      5/14
      46865412
      3411
      13
      Troy
      3411
      2004
    
    
      13
      3/24
      125544280
      12/10
      39153380
      3290
      14
      Ocean's Twelve
      3290
      2004
    
    
      14
      7/8
      120908074
      2/13
      39852237
      3591
      15
      50 First Dates
      3612
      2004
    
    
      15
      8/26
      120177084
      5/7
      51748040
      3575
      16
      Van Helsing
      3580
      2004
    
    
      16
      10/28
      119194771
      6/23
      23920637
      868
      17
      Fahrenheit 9/11
      2011
      2004
    
    
      17
      4/28
      118634549
      12/17
      30061756
      3620
      18
      Lemony Snicket's A Series of Unfortunate Events
      3623
      2004
    
    
      18
      11/4
      114326736
      6/18
      30070196
      2694
      19
      DodgeBall: A True Underdog Story
      3020
      2004
    
    
      19
      12/2
      114197520
      7/30
      50746142
      3730
      20
      The Village
      3733
      2004
    
    
      20
      12/30
      110359362
      10/22
      39128715
      3245
      21
      The Grudge
      3348
      2004
    
    
      21
      6/2
      102610330
      12/17
      858021
      40
      22
      The Aviator
      2530
      2004
    
    
      22
      11/25
      101005703
      8/6
      24701458
      3188
      23
      Collateral
      3205
      2004
    
    
      23
      6/30
      100492203
      12/15
      179953
      8
      24
      Million Dollar Baby
      2375
      2004
    
    
      24
      12/23
      95170481
      8/11
      22956453
      3472
      25
      The Princess Diaries 2: Royal Engagement
      3490
      2004
    
    
      25
      7/15
      88237754
      3/5
      28103367
      3185
      26
      Starsky and Hutch
      3185
      2004
    
    
      26
      6/10
      88097164
      1/16
      27721185
      2984
      27
      Along Came Polly
      3052
      2004
    
    
      27
      9/9
      86058055
      4/30
      24432195
      2839
      28
      Mean Girls
      3054
      2004
    
    
      28
      3/24
      85417988
      11/19
      32018216
      3212
      29
      The SpongeBob SquarePants Movie
      3307
      2004
    
    
      29
      10/7
      85288303
      7/9
      28416365
      3091
      30
      Anchorman: The Legend of Ron Burgundy
      3104
      2004
    
    
      ...
      ...
      ...
      ...
      ...
      ...
      ...
      ...
      ...
      ...
    
    
      70
      -
      27285953
      8/26
      8111264
      3355
      71
      No Escape
      3415
      2015
    
    
      71
      -
      26822658
      10/23
      10812861
      3082
      72
      The Last Witch Hunter
      3082
      2015
    
    
      72
      -
      26822144
      8/7
      6610961
      1603
      73
      Ricki and the Flash
      2064
      2015
    
    
      73
      3/19
      26501323
      1/2
      15027415
      2602
      74
      The Woman in Black 2: Angel of Death
      2602
      2015
    
    
      74
      5/7
      26461644
      3/13
      11012305
      3171
      75
      Run All Night
      3171
      2015
    
    
      75
      6/11
      25801047
      2/27
      10203437
      2666
      76
      The Lazarus Effect
      2666
      2015
    
    
      76
      9/3
      25442958
      4/10
      237264
      4
      77
      Ex Machina
      2004
      2015
    
    
      77
      9/17
      22764410
      7/10
      9808463
      2720
      78
      The Gallows
      2720
      2015
    
    
      78
      10/15
      22467450
      8/21
      8326530
      3261
      79
      Hitman: Agent 47
      3273
      2015
    
    
      79
      3/26
      22348241
      1/30
      8310252
      2893
      80
      Project Almanac
      2900
      2015
    
    
      80
      5/14
      21571189
      1/30
      6213362
      1823
      81
      Black or White
      1823
      2015
    
    
      81
      7/30
      21067116
      5/29
      9670235
      2815
      82
      Aloha
      2815
      2015
    
    
      82
      10/22
      19375982
      8/5
      4038962
      2320
      83
      Shaun the Sheep Movie
      2360
      2015
    
    
      83
      5/21
      18754371
      1/16
      197000
      12
      84
      Still Alice
      1318
      2015
    
    
      84
      -
      18247445
      10/23
      8070493
      1656
      85
      Paranormal Activity: The Ghost Dimension
      1656
      2015
    
    
      85
      11/5
      17737646
      7/17
      2434908
      361
      86
      Mr. Holmes
      898
      2015
    
    
      86
      -
      17614323
      10/9
      521522
      4
      87
      Steve Jobs
      2493
      2015
    
    
      87
      9/17
      17506470
      6/19
      6100010
      2002
      88
      Dope
      2002
      2015
    
    
      88
      3/19
      17223265
      2/6
      7217640
      2875
      89
      Seventh Son
      2875
      2015
    
    
      89
      7/23
      16432322
      4/17
      4577861
      2012
      90
      Monkey Kingdom
      2012
      2015
    
    
      90
      11/19
      16029670
      9/4
      7355622
      3434
      91
      The Transporter Refueled
      3434
      2015
    
    
      91
      -
      15128355
      11/13
      8317545
      2603
      92
      Love the Coopers
      2603
      2015
    
    
      92
      6/25
      14674076
      3/13
      160089
      4
      93
      It Follows
      1655
      2015
    
    
      93
      10/8
      14440985
      8/21
      5454284
      2778
      94
      American Ultra
      2778
      2015
    
    
      94
      -
      14036500
      10/16
      4002226
      1553
      95
      Woodlawn
      1553
      2015
    
    
      95
      -
      13443407
      10/30
      5002521
      3003
      96
      Burnt
      3003
      2015
    
    
      96
      6/25
      12985600
      3/20
      3591282
      1320
      97
      Do You Believe?
      1356
      2015
    
    
      97
      10/1
      12551031
      6/5
      2122177
      481
      98
      Love & Mercy
      791
      2015
    
    
      98
      4/16
      12429583
      1/23
      5504441
      3020
      99
      Strange Magic
      3020
      2015
    
    
      99
      3/26
      12314651
      2/20
      5963324
      2880
      100
      Hot Tub Time Machine 2
      2901
      2015
    
  

1200 rows × 9 columns



In [54]:

    
# Save the scraped BoxOfficeMojo table to disk as JSON.
import json # (dong)

# Make a dictionary out of the dataset for storage in JSON format:
# each column name maps to the list of that column's values.
movieSaved = {feature: movie_df[feature].values.tolist() for feature in movie_df.columns.values}

# The context manager closes the file even if json.dump raises.
with open("allMovies.json", "w") as fp:
    json.dump(movieSaved, fp)



In [ ]:

	close	gross	open	opening	opening_theaters	ranking	title	total_theaters	year
0	11/25	441226247	5/19	108037878	4163	1	Shrek 2	4223	2004
1	12/19	373585825	6/30	88156227	4152	2	Spider-Man 2	4166	2004
2	7/29	370274604	2/25	83848082	3043	3	The Passion of the Christ	3408	2004
3	6/16	279261160	12/22	46120980	3518	4	Meet the Fockers	3554	2004
4	4/14	261441092	11/5	70467623	3933	5	The Incredibles	3933	2004
5	12/19	249541069	6/4	93687367	3855	6	Harry Potter and the Prisoner of Azkaban	3855	2004
6	11/4	186740799	5/28	68743584	3425	7	The Day After Tomorrow	3444	2004
7	12/23	176241941	7/23	52521865	3165	8	The Bourne Supremacy	3304	2004
8	6/2	173008894	11/19	35142554	3017	9	National Treasure	3243	2004
9	3/10	162775358	11/10	23323463	3650	10	The Polar Express	3650	2004
10	1/6	160861908	10/1	47604606	4016	11	Shark Tale	4070	2004
11	12/30	144801023	7/16	52179887	3420	12	I, Robot	3494	2004
12	9/30	133378256	5/14	46865412	3411	13	Troy	3411	2004
13	3/24	125544280	12/10	39153380	3290	14	Ocean's Twelve	3290	2004
14	7/8	120908074	2/13	39852237	3591	15	50 First Dates	3612	2004
15	8/26	120177084	5/7	51748040	3575	16	Van Helsing	3580	2004
16	10/28	119194771	6/23	23920637	868	17	Fahrenheit 9/11	2011	2004
17	4/28	118634549	12/17	30061756	3620	18	Lemony Snicket's A Series of Unfortunate Events	3623	2004
18	11/4	114326736	6/18	30070196	2694	19	DodgeBall: A True Underdog Story	3020	2004
19	12/2	114197520	7/30	50746142	3730	20	The Village	3733	2004
20	12/30	110359362	10/22	39128715	3245	21	The Grudge	3348	2004
21	6/2	102610330	12/17	858021	40	22	The Aviator	2530	2004
22	11/25	101005703	8/6	24701458	3188	23	Collateral	3205	2004
23	6/30	100492203	12/15	179953	8	24	Million Dollar Baby	2375	2004
24	12/23	95170481	8/11	22956453	3472	25	The Princess Diaries 2: Royal Engagement	3490	2004
25	7/15	88237754	3/5	28103367	3185	26	Starsky and Hutch	3185	2004
26	6/10	88097164	1/16	27721185	2984	27	Along Came Polly	3052	2004
27	9/9	86058055	4/30	24432195	2839	28	Mean Girls	3054	2004
28	3/24	85417988	11/19	32018216	3212	29	The SpongeBob SquarePants Movie	3307	2004
29	10/7	85288303	7/9	28416365	3091	30	Anchorman: The Legend of Ron Burgundy	3104	2004
...	...	...	...	...	...	...	...	...	...
70	-	27285953	8/26	8111264	3355	71	No Escape	3415	2015
71	-	26822658	10/23	10812861	3082	72	The Last Witch Hunter	3082	2015
72	-	26822144	8/7	6610961	1603	73	Ricki and the Flash	2064	2015
73	3/19	26501323	1/2	15027415	2602	74	The Woman in Black 2: Angel of Death	2602	2015
74	5/7	26461644	3/13	11012305	3171	75	Run All Night	3171	2015
75	6/11	25801047	2/27	10203437	2666	76	The Lazarus Effect	2666	2015
76	9/3	25442958	4/10	237264	4	77	Ex Machina	2004	2015
77	9/17	22764410	7/10	9808463	2720	78	The Gallows	2720	2015
78	10/15	22467450	8/21	8326530	3261	79	Hitman: Agent 47	3273	2015
79	3/26	22348241	1/30	8310252	2893	80	Project Almanac	2900	2015
80	5/14	21571189	1/30	6213362	1823	81	Black or White	1823	2015
81	7/30	21067116	5/29	9670235	2815	82	Aloha	2815	2015
82	10/22	19375982	8/5	4038962	2320	83	Shaun the Sheep Movie	2360	2015
83	5/21	18754371	1/16	197000	12	84	Still Alice	1318	2015
84	-	18247445	10/23	8070493	1656	85	Paranormal Activity: The Ghost Dimension	1656	2015
85	11/5	17737646	7/17	2434908	361	86	Mr. Holmes	898	2015
86	-	17614323	10/9	521522	4	87	Steve Jobs	2493	2015
87	9/17	17506470	6/19	6100010	2002	88	Dope	2002	2015
88	3/19	17223265	2/6	7217640	2875	89	Seventh Son	2875	2015
89	7/23	16432322	4/17	4577861	2012	90	Monkey Kingdom	2012	2015
90	11/19	16029670	9/4	7355622	3434	91	The Transporter Refueled	3434	2015
91	-	15128355	11/13	8317545	2603	92	Love the Coopers	2603	2015
92	6/25	14674076	3/13	160089	4	93	It Follows	1655	2015
93	10/8	14440985	8/21	5454284	2778	94	American Ultra	2778	2015
94	-	14036500	10/16	4002226	1553	95	Woodlawn	1553	2015
95	-	13443407	10/30	5002521	3003	96	Burnt	3003	2015
96	6/25	12985600	3/20	3591282	1320	97	Do You Believe?	1356	2015
97	10/1	12551031	6/5	2122177	481	98	Love & Mercy	791	2015
98	4/16	12429583	1/23	5504441	3020	99	Strange Magic	3020	2015
99	3/26	12314651	2/20	5963324	2880	100	Hot Tub Time Machine 2	2901	2015