In [1]:
import os
from pprint import pprint

import requests
import pandas as pd
from bs4 import BeautifulSoup
# Box Office Mojo page with the yearly box-office summary table; this is the
# address the next cell downloads and parses.
# NOTE(review): plain-HTTP URL -- confirm whether the https:// form should be used.
url = 'http://www.boxofficemojo.com/yearly/'
In [2]:
# Download the page and parse it into a BeautifulSoup tree.
# requests has no default timeout, so without one a stalled connection would
# hang the kernel indefinitely; 30 s is a generous bound for a single page.
resp = requests.get(url, timeout=30)
# Stop here on 4xx/5xx instead of silently parsing an error page.
resp.raise_for_status()
# Force UTF-8 decoding of the body rather than relying on the guessed charset.
resp.encoding = 'utf-8'
soup = BeautifulSoup(resp.text, 'lxml')
In [3]:
# Locate the summary table: the <table> element carrying cellspacing="1".
table = soup.find('table', attrs={'cellspacing': '1'})
if table is None:
    # Give a readable error instead of an AttributeError on None further down.
    raise ValueError('could not find the table with cellspacing="1" in the page')

table_rows = table.find_all('tr')
if not table_rows:
    raise ValueError('the table contains no <tr> rows')

# First row holds the column headers, the remaining rows hold the data.
header_row, data_rows = table_rows[0], table_rows[1:]

# Take labels from the row's direct cell children only. Iterating the <tr>
# itself would also yield any whitespace text nodes sitting between cells.
# get_text() without a separator keeps labels as before (e.g. 'TotalGross*').
colname = [cell.get_text()
           for cell in header_row.find_all(['td', 'th'], recursive=False)]
pprint(colname)

# One list of stripped cell strings per data row.
# NOTE(review): stripped_strings skips whitespace-only strings, so a row with
# an empty cell would come out shorter than the header -- verify if the page
# layout changes.
rows = [list(row.stripped_strings) for row in data_rows]

# NOTE(review): the header contains the label 'Change' twice, so the frame has
# duplicate column names.
df = pd.DataFrame(rows, columns=colname)
df
['Year',
'TotalGross*',
'Change',
'TicketsSold',
'Change',
'# ofMovies',
'TotalScreens',
'Avg.TicketPrice',
'Avg.Cost^',
'#1 Movie']
Out[3]:
Year
TotalGross*
Change
TicketsSold
Change
# ofMovies
TotalScreens
Avg.TicketPrice
Avg.Cost^
#1 Movie
0
2018
$4,310.3
-
470.6
-
264
-
$9.16
-
Black Panther
1
2017
$11,071.9
-2.7%
1,234.3
-6.2%
738
-
$8.97
-
Star Wars: The Last Jedi
2
2016
$11,377.7
+2.2%
1,315.3
-0.4%
736
-
$8.65
-
Rogue One
3
2015
$11,129.4
+7.4%
1,320.2
+4.1%
705
-
$8.43
-
Star Wars: The Force Awakens
4
2014
$10,361.2
-5.2%
1,268.2
-5.6%
706
-
$8.17
-
American Sniper
5
2013
$10,924.6
+0.8%
1,343.7
-1.3%
688
-
$8.13
-
Catching Fire
6
2012
$10,837.6
+6.5%
1,361.5
+6.1%
669
-
$7.96
-
The Avengers
7
2011
$10,174.2
-3.7%
1,283.0
-4.2%
602
-
$7.93
-
Harry Potter / Deathly Hallows (P2)
8
2010
$10,565.6
-0.3%
1,339.1
-5.2%
537
-
$7.89
-
Toy Story 3
9
2009
$10,595.5
+10.0%
1,412.7
+5.3%
521
-
$7.50
-
Avatar
10
2008
$9,630.7
-0.3%
1,341.3
-4.5%
607
-
$7.18
-
The Dark Knight
11
2007
$9,663.8
+4.9%
1,404.6
-0.1%
631
-
$6.88
-
Spider-Man 3
12
2006
$9,209.5
+4.2%
1,406.0
+2.0%
608
-
$6.55
-
Dead Man's Chest
13
2005
$8,840.5
-5.8%
1,379.2
-8.7%
547
-
$6.41
-
Revenge of the Sith
14
2004
$9,380.5
+1.5%
1,510.5
-1.4%
551
-
$6.21
-
Shrek 2
15
2003
$9,239.7
+0.9%
1,532.3
-2.8%
506
-
$6.03
$63.8
Return of the King
16
2002
$9,155.1
+8.8%
1,575.7
+6.0%
480
35,592
$5.81
$58.8
Spider-Man
17
2001
$8,412.5
+9.8%
1,487.3
+4.7%
482
36,764
$5.66
$47.7
Harry Potter / Sorcerer's Stone
18
2000
$7,661.0
+2.9%
1,420.8
-3.0%
478
37,396
$5.39
$54.8
The Grinch
19
1999
$7,448.0
+7.2%
1,465.2
-1.1%
461
37,185
$5.08
$51.5
The Phantom Menace
20
1998
$6,949.0
+9.2%
1,480.7
+6.7%
509
34,186
$4.69
$52.7
Saving Private Ryan
21
1997
$6,365.9
+7.7%
1,387.7
+3.7%
510
31,640
$4.59
$53.4
Titanic
22
1996
$5,911.5
+7.6%
1,338.6
+6.0%
471
29,690
$4.42
$39.8
Independence Day
23
1995
$5,493.5
+1.8%
1,262.6
-2.3%
411
27,805
$4.35
$36.4
Toy Story
24
1994
$5,396.2
+4.7%
1,291.7
+3.8%
453
26,586
$4.18
$34.3
Forrest Gump
25
1993
$5,154.2
+5.8%
1,244.0
+6.0%
462
25,737
$4.14
$29.9
Jurassic Park
26
1992
$4,871.0
+1.4%
1,173.2
+2.9%
480
25,105
$4.15
$28.9
Aladdin
27
1991
$4,803.2
-4.4%
1,140.6
-4.0%
458
24,570
$4.21
$26.1
Terminator 2
28
1990
$5,021.8
-0.2%
1,188.6
-5.9%
410
23,689
$4.23
$26.8
Home Alone
29
1989
$5,033.4
+12.9%
1,262.8
+16.4%
502
23,132
$3.97
$23.5
Batman
30
1988
$4,458.4
+4.8%
1,084.8
-0.3%
510
23,234
$4.11
$18.1
Rain Man
31
1987
$4,252.9
+12.6%
1,088.5
+7.0%
509
23,555
$3.91
$20.1
Three Men and a Baby
32
1986
$3,778.0
+0.8%
1,017.2
-3.7%
451
22,765
$3.71
$17.5
Top Gun
33
1985
$3,749.2
-7.0%
1,056.1
-11.9%
470
21,147
$3.55
$16.8
Back to the Future
34
1984
$4,031.0
+7.0%
1,199.0
+0.2%
536
20,200
$3.36
$14.4
Beverly Hills Cop
35
1983
$3,766.0
+9.1%
1,197.0
+1.9%
495
18,884
$3.15
$11.9
Return of the Jedi
36
1982
$3,453.0
+16.4%
1,175.0
+10.1%
428
18,020
$2.94
$11.8
E.T.
37
1981
$2,966.0
+7.9%
1,067.0
+4.4%
173
18,040
$2.78
$11.3
Raiders / Lost Ark
38
1980
$2,749.0
-
1,022.0
-
161
17,590
$2.69
$9.4
The Empire Strikes Back
In [4]:
# Write the scraped table to ../results/boxofficemojo.csv (resolved against the
# current working directory).
results = os.path.abspath('../results')
# exist_ok=True creates the directory when missing and is a no-op otherwise,
# avoiding the check-then-create race of a separate os.path.exists() test.
os.makedirs(results, exist_ok=True)
filename = os.path.join(results, 'boxofficemojo.csv')
# index=False: leave the 0..N-1 row index out of the file.
df.to_csv(filename, index=False)
print('Save csv to {}'.format(filename))
Save csv to /home/dirl/github/Python-Crawling-Tutorial/results/boxofficemojo.csv
Content source: afunTW/dsc-crawling
Similar notebooks: