In [88]:
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
import urllib
import pandas as pd
import pickle
import glob
In [40]:
def get_data(years, month_start, month_stop):
    """Scrape the-numbers.com horror-movie reports and pickle one DataFrame per year.

    For every entry in ``years`` a new Firefox session is started, the report
    builder form is filled in (genre "Horror", min/max year both set to that
    year, min/max month set to ``month_start``/``month_stop``), the full
    report is opened and re-ordered, and every table row that contains
    ``<td>`` cells is collected as a list of cell texts.  The rows are wrapped
    in a ``pandas.DataFrame`` and pickled to
    ``<year>-<month_start>-<month_stop>.pickle`` in the current working
    directory.

    Parameters
    ----------
    years : iterable
        Years to fetch; one browser session and one pickle file per entry.
    month_start : int or str
        Value typed into the "min-month" form field.
    month_stop : int or str
        Value typed into the "max-month" form field.

    Returns
    -------
    None

    Notes
    -----
    The browser is shut down in a ``finally`` clause, so a failed lookup or
    click no longer leaves an orphaned Firefox/driver process behind, and no
    pickle file is written for a year whose scrape failed.

    NOTE(review): the ``find_element_by_*`` helpers used here belong to the
    Selenium 3 API (they were removed in Selenium 4.3) -- confirm the
    installed Selenium version before running.
    """
    for year in years:
        # Fire up the web driver; each year gets its own browser session.
        firefox = webdriver.Firefox()
        try:
            _build_report(firefox, year, month_start, month_stop)
            _customize_report(firefox)
            data = _scrape_rows(firefox)
        finally:
            # quit() ends the whole driver session, not just the current
            # window, and runs even when one of the steps above raised.
            firefox.quit()

        # Put data into dataframe
        df = pd.DataFrame(data)

        # Pickle dataframe
        filename = '{}-{}-{}.pickle'.format(year, month_start, month_stop)
        with open(filename, 'wb') as f:
            pickle.dump(df, f)


def _build_report(driver, year, month_start, month_stop):
    """Open the report builder, fill in genre/year/month fields and submit the form."""
    driver.get("http://www.the-numbers.com/movies/report-builder")

    # Select the genre
    genre_selector = '//select[@name="genre"]/option[@value="Horror"]'
    driver.find_element_by_xpath(genre_selector).click()

    # Min and max year are both set to the same year; the values are typed
    # as strings, consistent with the month fields.
    driver.find_element_by_name("min-year").send_keys(str(year))
    driver.find_element_by_name("max-year").send_keys(str(year))

    # Month window
    driver.find_element_by_name("min-month").send_keys(str(month_start))
    driver.find_element_by_name("max-month").send_keys(str(month_stop))

    # Build report
    driver.find_element_by_xpath("//input[@value='Build Report']").click()

    # Follow the link to the full report page
    driver.find_element_by_link_text('Show and customize full report').click()


def _customize_report(driver):
    """Click the extra-column options, order by release date and update the report."""
    # Each option is clicked once (presumably ticking a checkbox that is
    # unticked by default -- verify against the page).
    for option_name in ('show-release-year',
                        'show-production-budget',
                        'show-inflation-adjusted-domestic-box-office'):
        driver.find_element_by_name(option_name).click()

    # Select view order by
    order_selector = '//select[@name="view-order-by"]/option[@value="release-date"]'
    driver.find_element_by_xpath(order_selector).click()

    # Update report
    driver.find_element_by_xpath("//input[@value='Update Report']").click()


def _scrape_rows(driver):
    """Return the report table as a list of rows, each a list of ``<td>`` cell texts."""
    rows = []
    for tr in driver.find_elements_by_xpath('//center/table/tbody/tr'):
        tds = tr.find_elements_by_tag_name('td')
        # Rows without any <td> cell (e.g. a header row, presumably) are skipped.
        if tds:
            rows.append([td.text for td in tds])
    return rows
In [48]:
# Get full-year (January-December) reports for 1970 through 2010 and pickle them.
# range() excludes its stop value, so the stop must be 2011 for 2010 to be
# included; the later cells start at 2011, so 2010 would otherwise be skipped.
years = range(1970, 2011)
start_month = 1
stop_month = 12
get_data(years, start_month, stop_month)
In [ ]:
# First half of each year 2011-2015 (months 1 to 6): scrape and pickle.
years = list(range(2011, 2016))
start_month, stop_month = 1, 6
get_data(years, month_start=start_month, month_stop=stop_month)
In [45]:
# Second half of each year 2011-2015 (months 7 to 12): scrape and pickle.
years = list(range(2011, 2016))
start_month, stop_month = 7, 12
get_data(years, month_start=start_month, month_stop=stop_month)