In [1]:
    
import os
import requests
import pandas as pd
from bs4 import BeautifulSoup
# Address of the yearly box-office summary page that the following cells download.
url = "http://www.boxofficemojo.com/yearly/"
    
In [2]:
    
# Download the page. A timeout keeps the cell from hanging forever on a stalled
# connection (requests applies no timeout by default).
resp = requests.get(url, timeout=30)
# Stop here on a 4xx/5xx response instead of parsing an error page as if it were data.
resp.raise_for_status()
# Decode the body as UTF-8 regardless of what the response headers declare.
resp.encoding = 'utf-8'
# Parse the decoded HTML with the lxml backend.
soup = BeautifulSoup(resp.text, 'lxml')
    
In [3]:
    
from pprint import pprint  # not used in this cell; kept in case other cells rely on it

# Pick the first <table> whose cellspacing attribute is "1".
table = soup.find('table', attrs={'cellspacing': '1'})
if table is None:
    # soup.find returns None when nothing matches; fail with a clear message
    # rather than an AttributeError on the next line.
    raise ValueError("no <table cellspacing='1'> found in the downloaded page")

rows = table.find_all('tr')
if not rows:
    raise ValueError("the matched table contains no <tr> rows")

# The first row holds the column headings; the remaining rows are data.
header_row = rows.pop(0)
# Read only the direct cell elements: iterating the <tr> itself would also
# yield any whitespace text nodes sitting between the cells.
colname = [cell.text for cell in header_row.find_all(['th', 'td'], recursive=False)]
# Each data row becomes the list of its non-empty, whitespace-stripped strings.
rows = [list(row.stripped_strings) for row in rows]
df = pd.DataFrame(rows, columns=colname)
df
    
    
['Year',
 'TotalGross*',
 'Change',
 'TicketsSold',
 'Change',
 '# ofMovies',
 'TotalScreens',
 'Avg.TicketPrice',
 'Avg.Cost^',
 '#1 Movie']
    Out[3]:
  
    
       
      Year 
      TotalGross* 
      Change 
      TicketsSold 
      Change 
      # ofMovies 
      TotalScreens 
      Avg.TicketPrice 
      Avg.Cost^ 
      #1 Movie 
     
  
  
    
      0 
      2018 
      $4,310.3 
      - 
      470.6 
      - 
      264 
      - 
      $9.16 
      - 
      Black Panther 
     
    
      1 
      2017 
      $11,071.9 
      -2.7% 
      1,234.3 
      -6.2% 
      738 
      - 
      $8.97 
      - 
      Star Wars: The Last Jedi 
     
    
      2 
      2016 
      $11,377.7 
      +2.2% 
      1,315.3 
      -0.4% 
      736 
      - 
      $8.65 
      - 
      Rogue One 
     
    
      3 
      2015 
      $11,129.4 
      +7.4% 
      1,320.2 
      +4.1% 
      705 
      - 
      $8.43 
      - 
      Star Wars: The Force Awakens 
     
    
      4 
      2014 
      $10,361.2 
      -5.2% 
      1,268.2 
      -5.6% 
      706 
      - 
      $8.17 
      - 
      American Sniper 
     
    
      5 
      2013 
      $10,924.6 
      +0.8% 
      1,343.7 
      -1.3% 
      688 
      - 
      $8.13 
      - 
      Catching Fire 
     
    
      6 
      2012 
      $10,837.6 
      +6.5% 
      1,361.5 
      +6.1% 
      669 
      - 
      $7.96 
      - 
      The Avengers 
     
    
      7 
      2011 
      $10,174.2 
      -3.7% 
      1,283.0 
      -4.2% 
      602 
      - 
      $7.93 
      - 
      Harry Potter / Deathly Hallows (P2) 
     
    
      8 
      2010 
      $10,565.6 
      -0.3% 
      1,339.1 
      -5.2% 
      537 
      - 
      $7.89 
      - 
      Toy Story 3 
     
    
      9 
      2009 
      $10,595.5 
      +10.0% 
      1,412.7 
      +5.3% 
      521 
      - 
      $7.50 
      - 
      Avatar 
     
    
      10 
      2008 
      $9,630.7 
      -0.3% 
      1,341.3 
      -4.5% 
      607 
      - 
      $7.18 
      - 
      The Dark Knight 
     
    
      11 
      2007 
      $9,663.8 
      +4.9% 
      1,404.6 
      -0.1% 
      631 
      - 
      $6.88 
      - 
      Spider-Man 3 
     
    
      12 
      2006 
      $9,209.5 
      +4.2% 
      1,406.0 
      +2.0% 
      608 
      - 
      $6.55 
      - 
      Dead Man's Chest 
     
    
      13 
      2005 
      $8,840.5 
      -5.8% 
      1,379.2 
      -8.7% 
      547 
      - 
      $6.41 
      - 
      Revenge of the Sith 
     
    
      14 
      2004 
      $9,380.5 
      +1.5% 
      1,510.5 
      -1.4% 
      551 
      - 
      $6.21 
      - 
      Shrek 2 
     
    
      15 
      2003 
      $9,239.7 
      +0.9% 
      1,532.3 
      -2.8% 
      506 
      - 
      $6.03 
      $63.8 
      Return of the King 
     
    
      16 
      2002 
      $9,155.1 
      +8.8% 
      1,575.7 
      +6.0% 
      480 
      35,592 
      $5.81 
      $58.8 
      Spider-Man 
     
    
      17 
      2001 
      $8,412.5 
      +9.8% 
      1,487.3 
      +4.7% 
      482 
      36,764 
      $5.66 
      $47.7 
      Harry Potter / Sorcerer's Stone 
     
    
      18 
      2000 
      $7,661.0 
      +2.9% 
      1,420.8 
      -3.0% 
      478 
      37,396 
      $5.39 
      $54.8 
      The Grinch 
     
    
      19 
      1999 
      $7,448.0 
      +7.2% 
      1,465.2 
      -1.1% 
      461 
      37,185 
      $5.08 
      $51.5 
      The Phantom Menace 
     
    
      20 
      1998 
      $6,949.0 
      +9.2% 
      1,480.7 
      +6.7% 
      509 
      34,186 
      $4.69 
      $52.7 
      Saving Private Ryan 
     
    
      21 
      1997 
      $6,365.9 
      +7.7% 
      1,387.7 
      +3.7% 
      510 
      31,640 
      $4.59 
      $53.4 
      Titanic 
     
    
      22 
      1996 
      $5,911.5 
      +7.6% 
      1,338.6 
      +6.0% 
      471 
      29,690 
      $4.42 
      $39.8 
      Independence Day 
     
    
      23 
      1995 
      $5,493.5 
      +1.8% 
      1,262.6 
      -2.3% 
      411 
      27,805 
      $4.35 
      $36.4 
      Toy Story 
     
    
      24 
      1994 
      $5,396.2 
      +4.7% 
      1,291.7 
      +3.8% 
      453 
      26,586 
      $4.18 
      $34.3 
      Forrest Gump 
     
    
      25 
      1993 
      $5,154.2 
      +5.8% 
      1,244.0 
      +6.0% 
      462 
      25,737 
      $4.14 
      $29.9 
      Jurassic Park 
     
    
      26 
      1992 
      $4,871.0 
      +1.4% 
      1,173.2 
      +2.9% 
      480 
      25,105 
      $4.15 
      $28.9 
      Aladdin 
     
    
      27 
      1991 
      $4,803.2 
      -4.4% 
      1,140.6 
      -4.0% 
      458 
      24,570 
      $4.21 
      $26.1 
      Terminator 2 
     
    
      28 
      1990 
      $5,021.8 
      -0.2% 
      1,188.6 
      -5.9% 
      410 
      23,689 
      $4.23 
      $26.8 
      Home Alone 
     
    
      29 
      1989 
      $5,033.4 
      +12.9% 
      1,262.8 
      +16.4% 
      502 
      23,132 
      $3.97 
      $23.5 
      Batman 
     
    
      30 
      1988 
      $4,458.4 
      +4.8% 
      1,084.8 
      -0.3% 
      510 
      23,234 
      $4.11 
      $18.1 
      Rain Man 
     
    
      31 
      1987 
      $4,252.9 
      +12.6% 
      1,088.5 
      +7.0% 
      509 
      23,555 
      $3.91 
      $20.1 
      Three Men and a Baby 
     
    
      32 
      1986 
      $3,778.0 
      +0.8% 
      1,017.2 
      -3.7% 
      451 
      22,765 
      $3.71 
      $17.5 
      Top Gun 
     
    
      33 
      1985 
      $3,749.2 
      -7.0% 
      1,056.1 
      -11.9% 
      470 
      21,147 
      $3.55 
      $16.8 
      Back to the Future 
     
    
      34 
      1984 
      $4,031.0 
      +7.0% 
      1,199.0 
      +0.2% 
      536 
      20,200 
      $3.36 
      $14.4 
      Beverly Hills Cop 
     
    
      35 
      1983 
      $3,766.0 
      +9.1% 
      1,197.0 
      +1.9% 
      495 
      18,884 
      $3.15 
      $11.9 
      Return of the Jedi 
     
    
      36 
      1982 
      $3,453.0 
      +16.4% 
      1,175.0 
      +10.1% 
      428 
      18,020 
      $2.94 
      $11.8 
      E.T. 
     
    
      37 
      1981 
      $2,966.0 
      +7.9% 
      1,067.0 
      +4.4% 
      173 
      18,040 
      $2.78 
      $11.3 
      Raiders / Lost Ark 
     
    
      38 
      1980 
      $2,749.0 
      - 
      1,022.0 
      - 
      161 
      17,590 
      $2.69 
      $9.4 
      The Empire Strikes Back 
     
  
In [4]:
    
# Output directory: '../results' resolved against the current working directory.
results = os.path.abspath('../results')
# exist_ok=True creates the directory if needed and avoids the check-then-create
# race of a separate os.path.exists test.
os.makedirs(results, exist_ok=True)
filename = os.path.join(results, 'boxofficemojo.csv')
# index=False leaves the DataFrame's integer row index out of the file.
df.to_csv(filename, index=False)
print('Save csv to {}'.format(filename))
    
    
Save csv to /home/dirl/github/Python-Crawling-Tutorial/results/boxofficemojo.csv
Content source: afunTW/dsc-crawling
Similar notebooks: