notebook.community

Edit and run



In [1]:

    
# Print the kernel's current working directory; the relative file names
# opened later in this notebook are resolved against it.
!pwd









    



/Users/seanreed1/AnacondaProjects/scrapy-projects/movie-project/box-office-data/parsed-movie-files



In [2]:

    
# List the working directory to confirm which saved HTML pages are available.
!ls









    



1.html                        3801.html
1001.html                     3901.html
101.html                      4001.html
1101.html                     401.html
1201.html                     4101.html
1301.html                     4201.html
1401.html                     4301.html
1501.html                     4401.html
1601.html                     4501.html
1701.html                     4601.html
1801.html                     4701.html
1901.html                     4801.html
2001.html                     4901.html
201.html                      5001.html
2101.html                     501.html
2201.html                     5101.html
2301.html                     5201.html
2401.html                     5301.html
2501.html                     5401.html
2601.html                     5501.html
2701.html                     5601.html
2801.html                     5701.html
2901.html                     5801.html
3001.html                     5901.html
301.html                      6001.html
3101.html                     601.html
3201.html                     701.html
3301.html                     801.html
3401.html                     901.html
3501.html                     The-Numbers-HTML-Parser.ipynb
3601.html                     The-Numbers-Parser-v2.ipynb
3701.html



In [3]:

    
from bs4 import BeautifulSoup
from pandas import DataFrame, read_csv
import pandas as pd

headings = ["Rank","Title", "Release_Year",  "Domestic Box Office", "International Box Office", "Total Box Office"]

# The saved pages are numbered 1.html, 101.html, ..., 6001.html.
PAGE_STARTS = range(1, 6101, 100)
# Every table row contributes six <td> cells, in page order:
# rank, release year, title, domestic, international, total.
CELLS_PER_ROW = 6

movie_list = []

for n in PAGE_STARTS:
    fname = str(n) + '.html'
    with open(fname) as f:
        soup = BeautifulSoup(f, 'lxml')
        td = soup.table.tbody.find_all('td')

    # Step through the flat cell list one row at a time. The bound is derived
    # from the cells actually found, so a page with fewer than 100 rows no
    # longer raises IndexError and a longer page is not silently truncated.
    for i in range(0, len(td) - CELLS_PER_ROW + 1, CELLS_PER_ROW):
        rank, year, title, domestic, international, total = td[i:i + CELLS_PER_ROW]
        # str() turns each NavigableString into a plain string; otherwise every
        # stored value keeps a reference to its page's entire parse tree.
        # Year and title are wrapped in a link, hence the extra `.a`.
        movie_list.append((
            str(rank.contents[0]),
            str(title.a.contents[0]),
            str(year.a.contents[0]),
            str(domestic.contents[0]),
            str(international.contents[0]),
            str(total.contents[0]),
        ))

# One row per movie, with the columns in `headings` order.
df = pd.DataFrame(data=movie_list, columns=headings)



In [5]:

    
# Inspect the last five parsed rows as a sanity check on the final page.
df.tail(n=5)









    Out[5]:






  
    
      
      Rank
      Title
      Release_Year
      Domestic Box Office
      International Box Office
      Total Box Office
    
  
  
    
      6095
      6,096
      Heart Condition
      1990
      $4,134,992
      $0
      $4,134,992
    
    
      6096
      6,097
      Venom
      1982
      $4,117,958
      $0
      $4,117,958
    
    
      6097
      6,098
      Partners
      1982
      $4,109,724
      $0
      $4,109,724
    
    
      6098
      6,099
      Kama Sutra
      1997
      $4,109,095
      $0
      $4,109,095
    
    
      6099
      6,100
      Serbuan maut
      2012
      $4,105,123
      $5,192,284
      $9,297,407



In [10]:

    
# Write the parsed table to disk: column names are kept as the header row and
# the positional index is left out of the file.
df.to_csv(
    'final-box-office-data.csv',
    header=True,
    index=False,
)



In [7]:

    
# List the working directory again to check that the CSV export is present.
!ls









    



1.html                        3801.html
1001.html                     3901.html
101.html                      4001.html
1101.html                     401.html
1201.html                     4101.html
1301.html                     4201.html
1401.html                     4301.html
1501.html                     4401.html
1601.html                     4501.html
1701.html                     4601.html
1801.html                     4701.html
1901.html                     4801.html
2001.html                     4901.html
201.html                      5001.html
2101.html                     501.html
2201.html                     5101.html
2301.html                     5201.html
2401.html                     5301.html
2501.html                     5401.html
2601.html                     5501.html
2701.html                     5601.html
2801.html                     5701.html
2901.html                     5801.html
3001.html                     5901.html
301.html                      6001.html
3101.html                     601.html
3201.html                     701.html
3301.html                     801.html
3401.html                     901.html
3501.html                     The-Numbers-HTML-Parser.ipynb
3601.html                     The-Numbers-Parser-v2.ipynb
3701.html                     final-box-office-data.csv



In [8]:

    
# General form for loading a CSV from an explicit location (not needed here,
# since the file is already in the working directory):
#     Location = r'C:\Users\david\notebooks\update\births1880.csv'
#     df = pd.read_csv(Location)
#
# This note is kept as `#` comments rather than a triple-quoted string: inside
# a normal (non-raw) string literal Python 3 treats `\U` and `\u` as Unicode
# escapes, so the Windows path above would make the cell fail with a SyntaxError.

# Read the exported file back to verify the round trip.
df2 = read_csv('final-box-office-data.csv')



In [9]:

    
# Inspect the last five rows of the re-loaded CSV to compare with the parsed table.
df2.tail(n=5)









    Out[9]:






  
    
      
      1
      Star Wars Ep. VII: The Force Awakens
      2015
      $936,662,225
      $1,122,000,000
      $2,058,662,225
    
  
  
    
      6094
      6,096
      Heart Condition
      1990
      $4,134,992
      $0
      $4,134,992
    
    
      6095
      6,097
      Venom
      1982
      $4,117,958
      $0
      $4,117,958
    
    
      6096
      6,098
      Partners
      1982
      $4,109,724
      $0
      $4,109,724
    
    
      6097
      6,099
      Kama Sutra
      1997
      $4,109,095
      $0
      $4,109,095
    
    
      6098
      6,100
      Serbuan maut
      2012
      $4,105,123
      $5,192,284
      $9,297,407



In [ ]:

	Rank	Title	Release_Year	Domestic Box Office	International Box Office	Total Box Office
6095	6,096	Heart Condition	1990	$4,134,992	$0	$4,134,992
6096	6,097	Venom	1982	$4,117,958	$0	$4,117,958
6097	6,098	Partners	1982	$4,109,724	$0	$4,109,724
6098	6,099	Kama Sutra	1997	$4,109,095	$0	$4,109,095
6099	6,100	Serbuan maut	2012	$4,105,123	$5,192,284	$9,297,407

	1	Star Wars Ep. VII: The Force Awakens	2015	$936,662,225	$1,122,000,000	$2,058,662,225
6094	6,096	Heart Condition	1990	$4,134,992	$0	$4,134,992
6095	6,097	Venom	1982	$4,117,958	$0	$4,117,958
6096	6,098	Partners	1982	$4,109,724	$0	$4,109,724
6097	6,099	Kama Sutra	1997	$4,109,095	$0	$4,109,095
6098	6,100	Serbuan maut	2012	$4,105,123	$5,192,284	$9,297,407