In [1]:
# Print the kernel's working directory; later cells open the page files and the CSV by bare relative names.
!pwd


/Users/seanreed1/AnacondaProjects/scrapy-projects/movie-project/box-office-data/parsed-movie-files

In [2]:
# List the working directory to see which saved <n>.html pages are available for parsing.
!ls


1.html                        3801.html
1001.html                     3901.html
101.html                      4001.html
1101.html                     401.html
1201.html                     4101.html
1301.html                     4201.html
1401.html                     4301.html
1501.html                     4401.html
1601.html                     4501.html
1701.html                     4601.html
1801.html                     4701.html
1901.html                     4801.html
2001.html                     4901.html
201.html                      5001.html
2101.html                     501.html
2201.html                     5101.html
2301.html                     5201.html
2401.html                     5301.html
2501.html                     5401.html
2601.html                     5501.html
2701.html                     5601.html
2801.html                     5701.html
2901.html                     5801.html
3001.html                     5901.html
301.html                      6001.html
3101.html                     601.html
3201.html                     701.html
3301.html                     801.html
3401.html                     901.html
3501.html                     The-Numbers-HTML-Parser.ipynb
3601.html                     The-Numbers-Parser-v2.ipynb
3701.html

In [3]:
from bs4 import BeautifulSoup
from pandas import DataFrame, read_csv
import pandas as pd

# Column labels, in the same order the record tuples are assembled below.
headings = ["Rank","Title", "Release_Year",  "Domestic Box Office", "International Box Office", "Total Box Office"]

# Layout of the saved pages: files are named after the first rank they hold
# (1.html, 101.html, ..., 6001.html) and each movie occupies six <td> cells.
FIRST_RANK = 1
LAST_RANK = 6100
ROWS_PER_PAGE = 100
CELLS_PER_ROW = 6

movie_list = []

for n in range(FIRST_RANK, LAST_RANK + 1, ROWS_PER_PAGE):
    fname = str(n) + '.html'
    with open(fname) as f:
        soup = BeautifulSoup(f, 'lxml')
        td = soup.table.tbody.find_all('td')

    # Walk only the complete rows that are actually present. A page with
    # fewer than ROWS_PER_PAGE rows used to raise IndexError because the cell
    # count was hard-coded to 600; full pages are read exactly as before.
    usable_cells = min(len(td), ROWS_PER_PAGE * CELLS_PER_ROW)
    usable_cells -= usable_cells % CELLS_PER_ROW

    for i in range(0, usable_cells, CELLS_PER_ROW):
        # The second and third cells are read through their <a> link and are
        # swapped here so the tuple matches the order of `headings`
        # (Title before Release_Year).
        movie_tuple = (
            td[i].contents[0],
            td[i + 2].a.contents[0],
            td[i + 1].a.contents[0],
            td[i + 3].contents[0],
            td[i + 4].contents[0],
            td[i + 5].contents[0],
        )
        movie_list.append(movie_tuple)

# One row per parsed movie; every value is kept as the text found in the HTML.
df = pd.DataFrame(data = movie_list, columns=headings)

In [5]:
# Show the last five parsed rows as a quick check that the final page was read.
df.tail()


Out[5]:
Rank Title Release_Year Domestic Box Office International Box Office Total Box Office
6095 6,096 Heart Condition 1990 $4,134,992 $0 $4,134,992
6096 6,097 Venom 1982 $4,117,958 $0 $4,117,958
6097 6,098 Partners 1982 $4,109,724 $0 $4,109,724
6098 6,099 Kama Sutra 1997 $4,109,095 $0 $4,109,095
6099 6,100 Serbuan maut 2012 $4,105,123 $5,192,284 $9,297,407

In [10]:
# Save the parsed table next to the notebook: column labels go out as the
# first CSV line and the DataFrame's integer index is left out.
df.to_csv(
    path_or_buf='final-box-office-data.csv',
    header=True,
    index=False,
)

In [7]:
# List the working directory again; final-box-office-data.csv should now be present.
!ls


1.html                        3801.html
1001.html                     3901.html
101.html                      4001.html
1101.html                     401.html
1201.html                     4101.html
1301.html                     4201.html
1401.html                     4301.html
1501.html                     4401.html
1601.html                     4501.html
1701.html                     4601.html
1801.html                     4701.html
1901.html                     4801.html
2001.html                     4901.html
201.html                      5001.html
2101.html                     501.html
2201.html                     5101.html
2301.html                     5201.html
2401.html                     5301.html
2501.html                     5401.html
2601.html                     5501.html
2701.html                     5601.html
2801.html                     5701.html
2901.html                     5801.html
3001.html                     5901.html
301.html                      6001.html
3101.html                     601.html
3201.html                     701.html
3301.html                     801.html
3401.html                     901.html
3501.html                     The-Numbers-HTML-Parser.ipynb
3601.html                     The-Numbers-Parser-v2.ipynb
3701.html                     final-box-office-data.csv

In [8]:
# General form (not needed here, since the notebook already runs in the
# directory that holds the CSV):
#
#     Location = r'C:\Users\david\notebooks\update\births1880.csv'
#     df = pd.read_csv(Location)
#
# This note is kept as `#` comments rather than a triple-quoted string: inside
# a normal (non-raw) string literal the backslashes in the Windows path are
# parsed as escape sequences, and on Python 3 `\U...` / `\u...` are malformed
# unicode escapes that make the whole cell a SyntaxError.

# Read the exported file back to verify the round trip.
df2 = read_csv('final-box-office-data.csv')

In [9]:
# Show the last five rows of the re-loaded CSV for comparison with df.tail().
# NOTE(review): the output recorded below has the first data row where the
# column labels should be, and its row indices are one lower than in the
# df.tail() output. That looks like it was produced from a CSV written without
# a header row (the to_csv cell carries a later execution count) - re-run this
# cell after the to_csv cell to refresh it.
df2.tail()


Out[9]:
1 Star Wars Ep. VII: The Force Awakens 2015 $936,662,225 $1,122,000,000 $2,058,662,225
6094 6,096 Heart Condition 1990 $4,134,992 $0 $4,134,992
6095 6,097 Venom 1982 $4,117,958 $0 $4,117,958
6096 6,098 Partners 1982 $4,109,724 $0 $4,109,724
6097 6,099 Kama Sutra 1997 $4,109,095 $0 $4,109,095
6098 6,100 Serbuan maut 2012 $4,105,123 $5,192,284 $9,297,407

In [ ]: