練習


In [1]:
import os
import requests
import pandas as pd

from bs4 import BeautifulSoup

# Box Office Mojo "yearly" index page; this is the page requested with
# requests.get(url) and parsed into a DataFrame further down.
url = 'http://www.boxofficemojo.com/yearly/'

In [2]:
# Download the page.
# - timeout: without it requests waits forever on a stalled connection,
#   which would hang the whole notebook run.
# - raise_for_status(): fail loudly on 4xx/5xx instead of silently parsing
#   an error page as if it were the data page.
resp = requests.get(url, timeout=30)
resp.raise_for_status()
# Force UTF-8 when decoding resp.text rather than relying on the encoding
# requests guesses from the response headers.
resp.encoding = 'utf-8'
soup = BeautifulSoup(resp.text, 'lxml')

In [3]:
from pprint import pprint


def _cell_texts(row, sep=''):
    """Return the stripped text of every direct <td>/<th> cell in a <tr>.

    Working cell-by-cell (instead of over raw child nodes or
    ``stripped_strings``) guarantees exactly one value per table cell:
    whitespace between tags is ignored, an empty cell yields '' rather than
    vanishing, and a cell holding several text nodes stays a single value.

    Parameters
    ----------
    row : bs4.element.Tag
        A ``<tr>`` element.
    sep : str
        String used to join multiple text nodes found inside one cell.

    Returns
    -------
    list of str
        One entry per cell, in document order.
    """
    return [
        cell.get_text(sep, strip=True)
        for cell in row.find_all(['td', 'th'], recursive=False)
    ]


# The table of interest is located via its cellspacing="1" attribute.
table = soup.find('table', attrs={'cellspacing': '1'})
if table is None:
    # soup.find returns None when nothing matches; fail with a clear message
    # instead of an AttributeError on the next line.
    raise ValueError('Could not find a <table cellspacing="1"> in the page')

rows = table.find_all('tr')
if not rows:
    raise ValueError('The table contains no <tr> rows')

# The first row holds the column headers. Text nodes inside a header cell are
# joined without a separator so the resulting names match what ``.text``
# produced before (e.g. 'TotalGross*').
colname = _cell_texts(rows.pop(0))

# Remaining rows are data: one list of cell strings per row. Rows without any
# cells are dropped so they do not become all-empty records.
rows = [cells for cells in (_cell_texts(row, sep=' ') for row in rows) if cells]

df = pd.DataFrame(rows, columns=colname)
df


['Year',
 'TotalGross*',
 'Change',
 'TicketsSold',
 'Change',
 '# ofMovies',
 'TotalScreens',
 'Avg.TicketPrice',
 'Avg.Cost^',
 '#1 Movie']
Out[3]:
Year TotalGross* Change TicketsSold Change # ofMovies TotalScreens Avg.TicketPrice Avg.Cost^ #1 Movie
0 2018 $4,310.3 - 470.6 - 264 - $9.16 - Black Panther
1 2017 $11,071.9 -2.7% 1,234.3 -6.2% 738 - $8.97 - Star Wars: The Last Jedi
2 2016 $11,377.7 +2.2% 1,315.3 -0.4% 736 - $8.65 - Rogue One
3 2015 $11,129.4 +7.4% 1,320.2 +4.1% 705 - $8.43 - Star Wars: The Force Awakens
4 2014 $10,361.2 -5.2% 1,268.2 -5.6% 706 - $8.17 - American Sniper
5 2013 $10,924.6 +0.8% 1,343.7 -1.3% 688 - $8.13 - Catching Fire
6 2012 $10,837.6 +6.5% 1,361.5 +6.1% 669 - $7.96 - The Avengers
7 2011 $10,174.2 -3.7% 1,283.0 -4.2% 602 - $7.93 - Harry Potter / Deathly Hallows (P2)
8 2010 $10,565.6 -0.3% 1,339.1 -5.2% 537 - $7.89 - Toy Story 3
9 2009 $10,595.5 +10.0% 1,412.7 +5.3% 521 - $7.50 - Avatar
10 2008 $9,630.7 -0.3% 1,341.3 -4.5% 607 - $7.18 - The Dark Knight
11 2007 $9,663.8 +4.9% 1,404.6 -0.1% 631 - $6.88 - Spider-Man 3
12 2006 $9,209.5 +4.2% 1,406.0 +2.0% 608 - $6.55 - Dead Man's Chest
13 2005 $8,840.5 -5.8% 1,379.2 -8.7% 547 - $6.41 - Revenge of the Sith
14 2004 $9,380.5 +1.5% 1,510.5 -1.4% 551 - $6.21 - Shrek 2
15 2003 $9,239.7 +0.9% 1,532.3 -2.8% 506 - $6.03 $63.8 Return of the King
16 2002 $9,155.1 +8.8% 1,575.7 +6.0% 480 35,592 $5.81 $58.8 Spider-Man
17 2001 $8,412.5 +9.8% 1,487.3 +4.7% 482 36,764 $5.66 $47.7 Harry Potter / Sorcerer's Stone
18 2000 $7,661.0 +2.9% 1,420.8 -3.0% 478 37,396 $5.39 $54.8 The Grinch
19 1999 $7,448.0 +7.2% 1,465.2 -1.1% 461 37,185 $5.08 $51.5 The Phantom Menace
20 1998 $6,949.0 +9.2% 1,480.7 +6.7% 509 34,186 $4.69 $52.7 Saving Private Ryan
21 1997 $6,365.9 +7.7% 1,387.7 +3.7% 510 31,640 $4.59 $53.4 Titanic
22 1996 $5,911.5 +7.6% 1,338.6 +6.0% 471 29,690 $4.42 $39.8 Independence Day
23 1995 $5,493.5 +1.8% 1,262.6 -2.3% 411 27,805 $4.35 $36.4 Toy Story
24 1994 $5,396.2 +4.7% 1,291.7 +3.8% 453 26,586 $4.18 $34.3 Forrest Gump
25 1993 $5,154.2 +5.8% 1,244.0 +6.0% 462 25,737 $4.14 $29.9 Jurassic Park
26 1992 $4,871.0 +1.4% 1,173.2 +2.9% 480 25,105 $4.15 $28.9 Aladdin
27 1991 $4,803.2 -4.4% 1,140.6 -4.0% 458 24,570 $4.21 $26.1 Terminator 2
28 1990 $5,021.8 -0.2% 1,188.6 -5.9% 410 23,689 $4.23 $26.8 Home Alone
29 1989 $5,033.4 +12.9% 1,262.8 +16.4% 502 23,132 $3.97 $23.5 Batman
30 1988 $4,458.4 +4.8% 1,084.8 -0.3% 510 23,234 $4.11 $18.1 Rain Man
31 1987 $4,252.9 +12.6% 1,088.5 +7.0% 509 23,555 $3.91 $20.1 Three Men and a Baby
32 1986 $3,778.0 +0.8% 1,017.2 -3.7% 451 22,765 $3.71 $17.5 Top Gun
33 1985 $3,749.2 -7.0% 1,056.1 -11.9% 470 21,147 $3.55 $16.8 Back to the Future
34 1984 $4,031.0 +7.0% 1,199.0 +0.2% 536 20,200 $3.36 $14.4 Beverly Hills Cop
35 1983 $3,766.0 +9.1% 1,197.0 +1.9% 495 18,884 $3.15 $11.9 Return of the Jedi
36 1982 $3,453.0 +16.4% 1,175.0 +10.1% 428 18,020 $2.94 $11.8 E.T.
37 1981 $2,966.0 +7.9% 1,067.0 +4.4% 173 18,040 $2.78 $11.3 Raiders / Lost Ark
38 1980 $2,749.0 - 1,022.0 - 161 17,590 $2.69 $9.4 The Empire Strikes Back

In [4]:
# Output directory, resolved relative to the current working directory.
results = os.path.abspath('../results')
# exist_ok=True creates the directory only when it is missing and avoids the
# check-then-create race of a separate os.path.exists() test.
os.makedirs(results, exist_ok=True)

# Write the scraped table without the DataFrame's integer index column.
filename = os.path.join(results, 'boxofficemojo.csv')
df.to_csv(filename, index=False)
print('Save csv to {}'.format(filename))


Save csv to /home/dirl/github/Python-Crawling-Tutorial/results/boxofficemojo.csv