notebook.community

Edit and run



In [1]:

    
import os
import pandas as pd
from bs4 import BeautifulSoup
import requests
from s3fs.core import S3FileSystem



In [2]:

    
base_address = 'http://www.the-numbers.com/movie/budgets/all/'



In [3]:

    
movies_dict = {}
record_count = 1

while True:
    address = '{0}{1}'.format(base_address, record_count)
    web_req  = requests.get(address)
    soup = BeautifulSoup(web_req.text, 'lxml')

    # collect table data tags in a list
    html_td_tags = [td_tag.text for td_tag in soup.select('td ')]
    
    # when to exit loop
    if not html_td_tags:
        break
    
    # chunk table data tag list to groups of 6 (6 colums in original table)
    movie_records = [html_td_tags[i:i + 6] for i in range(0, len(html_td_tags), 6)]

    # update record number modifier based on page number
    record_mod = record_count - 1

    # move data into dictionary to send to pandas dataframe
    for movie in movie_records:
        movies_dict[int(movie[0]) + record_mod] = movie[1:]
    
    record_count += 100
        
df = pd.DataFrame.from_dict(movies_dict, orient='index')
print(len(df))
df.head()









    



5396






    Out[3]:







  
    
      
      0
      1
      2
      3
      4
    
  
  
    
      1
      12/18/2009
      Avatar
      $425,000,000
      $760,507,625
      $2,783,918,982
    
    
      2
      12/18/2015
      Star Wars Ep. VII: The Force Awakens
      $306,000,000
      $936,662,225
      $2,058,662,225
    
    
      3
      5/24/2007
      Pirates of the Caribbean: At Worldâs End
      $300,000,000
      $309,420,425
      $963,420,425
    
    
      4
      11/6/2015
      Spectre
      $300,000,000
      $200,074,175
      $879,620,923
    
    
      5
      7/20/2012
      The Dark Knight Rises
      $275,000,000
      $448,139,099
      $1,084,439,099



In [4]:

    
df.columns = ['release_date', 'title', 'production_budget', 'domestic_gross', 'world_gross']
df = df[['title', 'release_date', 'production_budget', 'domestic_gross', 'world_gross']]
df.head()









    Out[4]:







  
    
      
      title
      release_date
      production_budget
      domestic_gross
      world_gross
    
  
  
    
      1
      Avatar
      12/18/2009
      $425,000,000
      $760,507,625
      $2,783,918,982
    
    
      2
      Star Wars Ep. VII: The Force Awakens
      12/18/2015
      $306,000,000
      $936,662,225
      $2,058,662,225
    
    
      3
      Pirates of the Caribbean: At Worldâs End
      5/24/2007
      $300,000,000
      $309,420,425
      $963,420,425
    
    
      4
      Spectre
      11/6/2015
      $300,000,000
      $200,074,175
      $879,620,923
    
    
      5
      The Dark Knight Rises
      7/20/2012
      $275,000,000
      $448,139,099
      $1,084,439,099



In [5]:

    
# convert currency values to integers
for col in ['production_budget','domestic_gross','world_gross']:
    df[[col]] = df[[col]].replace(r'[\$,]', '', regex=True).astype(int)



In [6]:

    
# convert release date to datetime
df['release_date'] = pd.to_datetime(df['release_date'], format='%m/%d/%Y')
df.head()









    Out[6]:







  
    
      
      title
      release_date
      production_budget
      domestic_gross
      world_gross
    
  
  
    
      1
      Avatar
      2009-12-18
      425000000
      760507625
      2783918982
    
    
      2
      Star Wars Ep. VII: The Force Awakens
      2015-12-18
      306000000
      936662225
      2058662225
    
    
      3
      Pirates of the Caribbean: At Worldâs End
      2007-05-24
      300000000
      309420425
      963420425
    
    
      4
      Spectre
      2015-11-06
      300000000
      200074175
      879620923
    
    
      5
      The Dark Knight Rises
      2012-07-20
      275000000
      448139099
      1084439099



In [7]:

    
# remove apostrophe conversion errors
df['title'] = df['title'].apply(lambda x: x.replace('âs', 's'))
df['title'] = df['title'].apply(lambda x: x.replace('â', ' '))
df.head()









    Out[7]:







  
    
      
      title
      release_date
      production_budget
      domestic_gross
      world_gross
    
  
  
    
      1
      Avatar
      2009-12-18
      425000000
      760507625
      2783918982
    
    
      2
      Star Wars Ep. VII: The Force Awakens
      2015-12-18
      306000000
      936662225
      2058662225
    
    
      3
      Pirates of the Caribbean: At Worlds End
      2007-05-24
      300000000
      309420425
      963420425
    
    
      4
      Spectre
      2015-11-06
      300000000
      200074175
      879620923
    
    
      5
      The Dark Knight Rises
      2012-07-20
      275000000
      448139099
      1084439099



In [8]:

    
os.environ['AWS_CONFIG_FILE'] = 'aws_config.ini'

s3 = S3FileSystem(anon=False)
key = 'TheNumbers_budgets.csv'
bucket = 'movie-torrents'

write_bytes = df.to_csv(None).encode()
with s3.open('s3://{0}/{1}'.format(bucket, key), 'wb') as f:
    f.write(write_bytes)



In [9]:

    
# drop out clean csv
# df.to_csv(write_file, sep=',', index=False)



In [ ]:

	0	1	2	3	4
1	12/18/2009	Avatar	$425,000,000	$760,507,625	$2,783,918,982
2	12/18/2015	Star Wars Ep. VII: The Force Awakens	$306,000,000	$936,662,225	$2,058,662,225
3	5/24/2007	Pirates of the Caribbean: At Worldâs End	$300,000,000	$309,420,425	$963,420,425
4	11/6/2015	Spectre	$300,000,000	$200,074,175	$879,620,923
5	7/20/2012	The Dark Knight Rises	$275,000,000	$448,139,099	$1,084,439,099

	title	release_date	production_budget	domestic_gross	world_gross
1	Avatar	2009-12-18	425000000	760507625	2783918982
2	Star Wars Ep. VII: The Force Awakens	2015-12-18	306000000	936662225	2058662225
3	Pirates of the Caribbean: At Worldâs End	2007-05-24	300000000	309420425	963420425
4	Spectre	2015-11-06	300000000	200074175	879620923
5	The Dark Knight Rises	2012-07-20	275000000	448139099	1084439099