Reading in 2015 movie box office data from Box Office Mojo (boxofficemojo.com)


In [270]:
import requests
from bs4 import BeautifulSoup

In [271]:
import os

# Work from the project directory so that relative paths (e.g. the
# 'boxofficedata.pkl' output) resolve there.  The location can be overridden
# with the LUTHER_DIR environment variable to run the notebook on another
# machine; when it is unset the original hard-coded path is used unchanged.
os.chdir(os.environ.get('LUTHER_DIR', '/Users/ItelinaMa/Documents/Metis/Luther'))

In [272]:
# Listing pages to scrape: the "by movie" daily view, sorted by title.
# Only the year and page number vary, so the URLs are built from one template
# instead of repeating the full literal for every page.
BASE_URL = ('http://www.boxofficemojo.com/daily/'
            '?view=bymovie&yr={year}&page={page}&sort=title&order=ASC&p=.htm')
YEAR = 2015
PAGES = (1, 2)

urllist = [BASE_URL.format(year=YEAR, page=page) for page in PAGES]

# Second listing page; kept in case a later cell refers to `url`.
url = urllist[1]

In [273]:
def returnSoup(urllist, timeout=30):
    """Download each URL and parse the response body with BeautifulSoup.

    The HTTP status code of every response is printed.  The status is not
    checked: a non-200 response is parsed and returned like any other.

    Args:
        urllist: iterable of URL strings, fetched in order.
        timeout: seconds passed to ``requests.get`` as its ``timeout``
            argument, so a stalled connection raises instead of blocking the
            notebook indefinitely.

    Returns:
        list of BeautifulSoup objects, one per URL, in the same order.

    Raises:
        requests.exceptions.RequestException: propagated from
            ``requests.get`` (connection failure, timeout, ...).
    """
    soups = []
    for url in urllist:
        response = requests.get(url, timeout=timeout)
        # Parenthesised form prints the same text under Python 2 and is also
        # valid Python 3 (the bare print statement is a SyntaxError there).
        print(response.status_code)
        # NOTE(review): no parser is named, so BeautifulSoup picks whichever
        # one is installed; left as-is because switching parsers could change
        # how the page's tables are parsed.
        soup = BeautifulSoup(response.text)
        soups.append(soup)
    return soups

In [274]:
# Fetch and parse every listing page; returnSoup prints each HTTP status code.
soups = returnSoup(urllist)


200
200

In [275]:
def makeData(soups):
    """Collect per-movie box office fields from parsed listing pages.

    For each page, the rows of its second ``<table>`` are read, skipping the
    first two rows (presumably header rows -- confirm against the page).
    From every remaining row, the text of cell 0 is used as the movie name
    and the text of cells 1, 3 and 4 as studio, release gross and release
    date.  These indices assume the page layout the notebook was written
    against.

    Args:
        soups: iterable of parsed pages exposing ``find_all`` (e.g. the
            BeautifulSoup objects returned by ``returnSoup``).

    Returns:
        dict mapping movie name to a ``(studio, releasegross, releasedate)``
        tuple of strings.  If a name occurs more than once, the last row
        seen wins.

    Raises:
        IndexError: if a page has fewer than two tables or a data row has
            fewer than five cells.
    """
    boxoffice = {}
    for soup in soups:
        # Look the table rows up once per page instead of once per cell.
        rows = soup.find_all('table')[1].find_all('tr')
        for row in rows[2:]:
            cells = row.find_all('td')
            name = cells[0].text
            # Build the tuple directly: indexing zip(...) fails on Python 3
            # (zip is an iterator) and rebuilt the whole list per movie on
            # Python 2.
            boxoffice[name] = (cells[1].text, cells[3].text, cells[4].text)
    return boxoffice

In [276]:
# Build the {movie name: (studio, release gross, release date)} mapping from the parsed pages.
boxoffice = makeData(soups)

In [280]:
# Sanity check on the number of distinct movie names scraped.
# NOTE(review): 156 is presumably the count observed when the notebook was
# first run against the live site -- confirm where this number comes from.
len(boxoffice) == 156


Out[280]:
True

In [281]:
import pickle

# Pickle streams are binary data, so the file is opened with 'wb'.  Text mode
# ('w') makes pickle.dump fail on Python 3 and can corrupt the stream on
# platforms that translate newlines.
with open('boxofficedata.pkl', 'wb') as picklefile:
    pickle.dump(boxoffice, picklefile)