In [1]:
import os
import pandas as pd
from bs4 import BeautifulSoup
import requests
from s3fs.core import S3FileSystem

In [2]:
base_address = 'http://www.the-numbers.com/movie/budgets/all/'

In [3]:
movies_dict = {}
record_count = 1

while True:
    address = '{0}{1}'.format(base_address, record_count)
    web_req  = requests.get(address)
    soup = BeautifulSoup(web_req.text, 'lxml')

    # collect table data tags in a list
    html_td_tags = [td_tag.text for td_tag in soup.select('td ')]
    
    # when to exit loop
    if not html_td_tags:
        break
    
    # chunk table data tag list to groups of 6 (6 colums in original table)
    movie_records = [html_td_tags[i:i + 6] for i in range(0, len(html_td_tags), 6)]

    # update record number modifier based on page number
    record_mod = record_count - 1

    # move data into dictionary to send to pandas dataframe
    for movie in movie_records:
        movies_dict[int(movie[0]) + record_mod] = movie[1:]
    
    record_count += 100
        
df = pd.DataFrame.from_dict(movies_dict, orient='index')
print(len(df))
df.head()


5396
Out[3]:
0 1 2 3 4
1 12/18/2009 Avatar $425,000,000 $760,507,625 $2,783,918,982
2 12/18/2015 Star Wars Ep. VII: The Force Awakens $306,000,000 $936,662,225 $2,058,662,225
3 5/24/2007 Pirates of the Caribbean: At World’s End $300,000,000 $309,420,425 $963,420,425
4 11/6/2015 Spectre $300,000,000 $200,074,175 $879,620,923
5 7/20/2012 The Dark Knight Rises $275,000,000 $448,139,099 $1,084,439,099

In [4]:
df.columns = ['release_date', 'title', 'production_budget', 'domestic_gross', 'world_gross']
df = df[['title', 'release_date', 'production_budget', 'domestic_gross', 'world_gross']]
df.head()


Out[4]:
title release_date production_budget domestic_gross world_gross
1 Avatar 12/18/2009 $425,000,000 $760,507,625 $2,783,918,982
2 Star Wars Ep. VII: The Force Awakens 12/18/2015 $306,000,000 $936,662,225 $2,058,662,225
3 Pirates of the Caribbean: At World’s End 5/24/2007 $300,000,000 $309,420,425 $963,420,425
4 Spectre 11/6/2015 $300,000,000 $200,074,175 $879,620,923
5 The Dark Knight Rises 7/20/2012 $275,000,000 $448,139,099 $1,084,439,099

In [5]:
# convert currency values to integers
for col in ['production_budget','domestic_gross','world_gross']:
    df[[col]] = df[[col]].replace(r'[\$,]', '', regex=True).astype(int)

In [6]:
# convert release date to datetime
df['release_date'] = pd.to_datetime(df['release_date'], format='%m/%d/%Y')
df.head()


Out[6]:
title release_date production_budget domestic_gross world_gross
1 Avatar 2009-12-18 425000000 760507625 2783918982
2 Star Wars Ep. VII: The Force Awakens 2015-12-18 306000000 936662225 2058662225
3 Pirates of the Caribbean: At World’s End 2007-05-24 300000000 309420425 963420425
4 Spectre 2015-11-06 300000000 200074175 879620923
5 The Dark Knight Rises 2012-07-20 275000000 448139099 1084439099

In [7]:
# remove apostrophe conversion errors
df['title'] = df['title'].apply(lambda x: x.replace('’s', 's'))
df['title'] = df['title'].apply(lambda x: x.replace('â', ' '))
df.head()


Out[7]:
title release_date production_budget domestic_gross world_gross
1 Avatar 2009-12-18 425000000 760507625 2783918982
2 Star Wars Ep. VII: The Force Awakens 2015-12-18 306000000 936662225 2058662225
3 Pirates of the Caribbean: At Worlds End 2007-05-24 300000000 309420425 963420425
4 Spectre 2015-11-06 300000000 200074175 879620923
5 The Dark Knight Rises 2012-07-20 275000000 448139099 1084439099

In [8]:
os.environ['AWS_CONFIG_FILE'] = 'aws_config.ini'

s3 = S3FileSystem(anon=False)
key = 'TheNumbers_budgets.csv'
bucket = 'movie-torrents'

write_bytes = df.to_csv(None).encode()
with s3.open('s3://{0}/{1}'.format(bucket, key), 'wb') as f:
    f.write(write_bytes)

In [9]:
# drop out clean csv
# df.to_csv(write_file, sep=',', index=False)

In [ ]: