In [88]:
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
import urllib
import pandas as pd
import pickle
import glob

In [40]:
def get_data(years,month_start, month_stop):
    for year in years: 
    
        # Fire up the web driver and navigate to starting webpage
        firefox = webdriver.Firefox()
        matrix_url = "http://www.the-numbers.com/movies/report-builder"
        firefox.get(matrix_url)

        # Report builder page

        # Select the genre
        genre_selector = '//select[@name="genre"]/option[@value="Horror"]'
        firefox.find_element_by_xpath(genre_selector).click()

        # Specify min year
        elem = firefox.find_element_by_name("min-year")
        elem.send_keys(year)

        # Specify max year
        elem = firefox.find_element_by_name("max-year")
        elem.send_keys(year)

        # Specify min month
        elem = firefox.find_element_by_name("min-month")
        elem.send_keys(str(month_start))

        # Specify max month
        elem = firefox.find_element_by_name("max-month")
        elem.send_keys(str(month_stop))

        # Build report
        firefox.find_element_by_xpath("//input[@value='Build Report']").click()

        # Click link to go to navigate to full report webpage
        link = firefox.find_element_by_link_text('Show and customize full report')
        link.click()

        # Movie report page

        # Check show release year
        firefox.find_element_by_name('show-release-year').click()

        # Check show production budget
        firefox.find_element_by_name('show-production-budget').click()

        # Check show 
        firefox.find_element_by_name('show-inflation-adjusted-domestic-box-office').click()

        # Select view order by
        order_selector = '//select[@name="view-order-by"]/option[@value="release-date"]'
        firefox.find_element_by_xpath(order_selector).click()

        # Update report
        firefox.find_element_by_xpath("//input[@value='Update Report']").click()

        # Get data
        data = []
        for tr in firefox.find_elements_by_xpath('//center/table/tbody/tr'):
            tds=tr.find_elements_by_tag_name('td')
            if tds: 
                data.append([td.text for td in tds])

        # Put data into dataframe
        df = pd.DataFrame(data)

        # Pickle dataframe
        with open(str(year) + '-' + str(month_start) + '-' + str(month_stop)+ '.pickle', 'wb') as f:
            pickle.dump(df, f)

        # Close Firefox window
        firefox.close()

In [48]:
# get 1970 through 2010
years = range(1970,2010)
start_month = 1
stop_month = 12
get_data(years,start_month, stop_month)


[1970, 1971, 1972, 1973, 1974, 1975, 1976, 1977, 1978, 1979, 1980, 1981, 1982, 1983, 1984, 1985, 1986, 1987, 1988, 1989, 1990, 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999, 2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008, 2009]

In [ ]:
# get first half years and pickle data
years = [2011, 2012, 2013, 2014, 2015]
start_month = 1
stop_month = 6
get_data(years,start_month, stop_month)

In [45]:
# get first half years and pickle data
years = [2011, 2012, 2013, 2014, 2015]
start_month = 7
stop_month = 12
get_data(years,start_month, stop_month)