In [1]:
# Print the kernel's current working directory via the shell; the cells below
# open and write files by bare relative name, so they resolve against it.
!pwd
In [2]:
# List the working directory before parsing (the parsing cell opens
# 1.html, 101.html, ... from here by relative name).
!ls
In [3]:
from bs4 import BeautifulSoup
from pandas import DataFrame, read_csv
import pandas as pd
# Column labels for the output table, in the same order as the tuple built below.
headings = ["Rank","Title", "Release_Year", "Domestic Box Office", "International Box Office", "Total Box Office"]

# Layout of the saved pages as this cell reads them.
PAGE_STARTS = range(1, 6101, 100)  # input files are 1.html, 101.html, ..., 6001.html
CELLS_PER_ROW = 6                  # six <td> cells are consumed per movie
ROWS_PER_PAGE = 100                # at most this many movies are taken from one page

# Plain text type on both Python 2 (unicode) and Python 3 (str).
_text = type(u"")

movie_list = []
for n in PAGE_STARTS:
    fname = str(n) + '.html'
    # NOTE(review): the file is opened with the platform default mode/encoding,
    # exactly as before -- confirm that matches how the pages were saved.
    with open(fname) as f:
        soup = BeautifulSoup(f, 'lxml')

    # All data cells of the first table's body, flattened in document order.
    td = soup.table.tbody.find_all('td')

    # Only walk complete rows that are actually present. A fixed bound of 600
    # cells raised IndexError on any page holding fewer than 100 rows.
    row_count = min(len(td) // CELLS_PER_ROW, ROWS_PER_PAGE)

    for i in range(0, row_count * CELLS_PER_ROW, CELLS_PER_ROW):
        # Cell i+1 feeds Release_Year and cell i+2 feeds Title, so those two
        # are deliberately swapped relative to cell order to match `headings`.
        # Each value is copied into a plain string: a NavigableString keeps a
        # reference to its whole parse tree, which would otherwise keep every
        # parsed page alive for as long as movie_list exists.
        movie_tuple = (
            _text(td[i].contents[0]),
            _text(td[i + 2].a.contents[0]),
            _text(td[i + 1].a.contents[0]),
            _text(td[i + 3].contents[0]),
            _text(td[i + 4].contents[0]),
            _text(td[i + 5].contents[0]),
        )
        movie_list.append(movie_tuple)

# Build the frame once, after all pages have been collected.
df = pd.DataFrame(data=movie_list, columns=headings)
In [5]:
# Show the last five parsed rows as a quick check of the end of the table.
df.tail(5)
Out[5]:
In [10]:
# Write the table to the working directory: header row included, index column omitted.
df.to_csv(
    'final-box-office-data.csv',
    header=True,
    index=False,
)
In [7]:
# List the working directory again (the export cell writes
# final-box-office-data.csv here).
!ls
In [8]:
# General form (not needed here since the file is already in the working directory):
#     Location = r'C:\Users\david\notebooks\update\births1880.csv'
#     df = pd.read_csv(Location)
#
# This note is kept as `#` comments rather than a triple-quoted string: inside
# a normal string literal the backslashes in the Windows path are parsed as
# escape sequences (\U, \u, \n, \b), which is a SyntaxError on Python 3.

# Read the exported CSV back into a separate frame to check the round trip.
df2 = read_csv('final-box-office-data.csv')
In [9]:
# Show the last five rows of the frame that was read back from the CSV.
df2.tail(5)
Out[9]:
In [ ]: