In [1]:
import os
import pandas as pd
from bs4 import BeautifulSoup
import requests
from s3fs.core import S3FileSystem
In [2]:
# root url of the paginated budget table; the scrape loop below appends the
# starting record number of each page to this address
base_address = 'http://www.the-numbers.com/movie/budgets/all/'
In [3]:
# Scrape the budget table page by page into `movies_dict`, then build `df`.
# Each request asks for base_address + record_count; record_count advances by
# 100 per iteration and the loop stops at the first page with no <td> cells.
movies_dict = {}
record_count = 1
while True:
    address = '{0}{1}'.format(base_address, record_count)
    # bound the wait so a single stalled connection cannot hang the whole
    # scrape (requests waits indefinitely when no timeout is given)
    web_req = requests.get(address, timeout=30)
    soup = BeautifulSoup(web_req.text, 'lxml')
    # collect table data tags in a list
    html_td_tags = [td_tag.text for td_tag in soup.select('td ')]
    # a page without any table cells means we have paged past the last record
    if not html_td_tags:
        break
    # chunk table data tag list to groups of 6 (6 columns in original table)
    movie_records = [html_td_tags[i:i + 6] for i in range(0, len(html_td_tags), 6)]
    # update record number modifier based on page number
    # NOTE(review): assumes the first cell of every row restarts at 1 on each
    # page, so the offset turns it into a global record number - TODO confirm
    record_mod = record_count - 1
    # move data into dictionary to send to pandas dataframe
    # (key: global record number, value: the remaining five cells of the row)
    for movie in movie_records:
        movies_dict[int(movie[0]) + record_mod] = movie[1:]
    record_count += 100

# one dataframe row per scraped record, indexed by record number
df = pd.DataFrame.from_dict(movies_dict, orient='index')
print(len(df))
df.head()
Out[3]:
In [4]:
# name the five scraped columns in the order they were collected, then put
# the title first for display
scraped_order = ['release_date', 'title', 'production_budget', 'domestic_gross', 'world_gross']
display_order = ['title', 'release_date', 'production_budget', 'domestic_gross', 'world_gross']

df.columns = scraped_order
df = df[display_order]
df.head()
Out[4]:
In [5]:
# convert currency values to integers
# strip the '$' sign and thousands separators, then cast to an explicit
# 64-bit integer: plain `int` maps to a 32-bit type on some platforms
# (Windows with NumPy < 2), which cannot hold values above ~2.1 billion
for col in ['production_budget', 'domestic_gross', 'world_gross']:
    df[col] = df[col].replace(r'[\$,]', '', regex=True).astype('int64')
In [6]:
# parse the release date strings (month/day/four-digit-year) into datetimes
release_date_text = df['release_date']
df['release_date'] = pd.to_datetime(release_date_text, format='%m/%d/%Y')
df.head()
Out[6]:
In [7]:
# remove apostrophe conversion errors
# NOTE(review): these artefacts look like mis-decoded apostrophes from the
# scrape - confirm the page encoding before relying on this clean-up
def _strip_apostrophe_artifacts(title):
    """Drop the garbled possessive first, then blank any remaining artefact."""
    without_possessive = title.replace('âs', 's')
    return without_possessive.replace('â', ' ')


df['title'] = df['title'].apply(_strip_apostrophe_artifacts)
df.head()
Out[7]:
In [8]:
# Upload the cleaned table to S3 as a csv object.
# set the AWS config file location before the filesystem object is created
os.environ['AWS_CONFIG_FILE'] = 'aws_config.ini'
s3 = S3FileSystem(anon=False)

bucket = 'movie-torrents'
key = 'TheNumbers_budgets.csv'
s3_path = 's3://{0}/{1}'.format(bucket, key)

# to_csv(None) returns the csv as a string; encode() turns it into bytes
# (utf-8 by default) for the binary-mode write below
write_bytes = df.to_csv(None).encode()
with s3.open(s3_path, 'wb') as s3_file:
    s3_file.write(write_bytes)
In [9]:
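# optional: also write the cleaned data to a local csv file
# (set `write_file` to the target path before uncommenting the line below)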
# df.to_csv(write_file, sep=',', index=False)
In [ ]: