In [1]:
import os
from urllib.request import urlopen

import pandas as pd
import requests
import seaborn as sns
from bs4 import BeautifulSoup

import charts_function_list
# Unpack the three values returned by the project helper; the first one is discarded.
# NOTE(review): `outputs` is later passed to os.chdir() right before the CSV export,
# so it is presumably a directory path; `data` is presumably the matching input-data
# folder -- confirm against charts_function_list.folder_setup.
_,data,outputs = charts_function_list.folder_setup()

Functions for pulling films and their unstructured actor listings


In [472]:
def box_office_mojo_compile(year, top_n=100):
    """Scrape the yearly Box Office Mojo chart into a DataFrame.

    The yearly chart page is downloaded, the seventh <table> on the page is
    read, its first child row is skipped, and the next ``top_n`` rows are
    turned into one DataFrame row each.

    Parameters
    ----------
    year : int or str
        Year appended to the chart URL and stored in the 'Year' column.
    top_n : int, optional
        Maximum number of chart rows to keep (default 100, the value that was
        previously hard-coded).

    Returns
    -------
    pandas.DataFrame
        Columns, in order: 'Rank' (int), 'Title', 'Studio', 'Total Gross',
        'Theaters', 'Opening', 'Opening Theaters' (cell text as scraped),
        'Year' and 'link' (href of the anchor in the title cell, or None when
        the cell has no anchor). Every row carries index label 0.

    Raises
    ------
    IndexError
        If the page contains fewer than seven tables.
    ValueError
        If a kept row has a 'Rank' cell that cannot be converted to int.
    """
    columns = ['Rank','Title','Studio','Total Gross','Theaters','Opening','Opening Theaters']
    url = 'http://www.boxofficemojo.com/yearly/chart/?yr='+str(year)

    # The context manager closes the HTTP response once the page is parsed.
    with urlopen(url) as response:
        soup = BeautifulSoup(response,"lxml")

    # One list of <td> cells per direct child of the chart table.
    table_rows = [child.find_all('td') for child in soup.find_all('table')[6]]

    records = []
    links = []
    # Slicing (instead of indexing 1..100) tolerates tables shorter than top_n.
    for cells in table_rows[1:top_n+1]:
        if len(cells) < len(columns):
            # Not a film row (too few cells to fill every column); skip it.
            continue
        records.append([cell.text for cell in cells[:len(columns)]])
        anchor = cells[1].find('a')
        links.append(anchor.get('href') if anchor is not None else None)

    # Build the frame once rather than concatenating one-row frames.
    year_frame = pd.DataFrame(records, columns=columns)
    year_frame['Year'] = year
    year_frame['Rank'] = year_frame['Rank'].astype('int')
    year_frame['link'] = links
    # Keep the index the previous row-by-row concat produced (every row
    # labelled 0) so CSVs written with the index column stay unchanged.
    year_frame.index = [0]*len(year_frame)
    return year_frame

def top_actors(movie):
    """Return the raw actor text from a film's Box Office Mojo page.

    The film page is downloaded, the text of every <font> element inside the
    first <table> under <div id="body"> is collected, and the text of the
    element directly after the first one containing 'Actor' is returned.

    Parameters
    ----------
    movie : str
        Site-relative link of the film page; it is appended to
        'http://www.boxofficemojo.com'.

    Returns
    -------
    str
        Text of the <font> element that follows the first 'Actor' label,
        returned as scraped (not split into individual names).

    Raises
    ------
    IndexError
        If no <font> text contains 'Actor', or the label is the last element.
    AttributeError
        If the page has no <div id="body"> or no table inside it.
    """
    url = 'http://www.boxofficemojo.com'+movie

    # The context manager closes the HTTP response once the page is parsed.
    with urlopen(url) as response:
        soup = BeautifulSoup(response,"lxml")

    body = soup.find("div",{"id":"body"})
    table1 = body.find("table")

    cast_crew = [font.get_text() for font in table1.find_all('font')]

    # A plain scan replaces the former DataFrame round-trip: find the first
    # label mentioning 'Actor' and hand back the entry right after it.
    for position, text in enumerate(cast_crew):
        if 'Actor' in text:
            if position+1 >= len(cast_crew):
                raise IndexError("'Actor' label has no following entry on "+url)
            return cast_crew[position+1]
    raise IndexError("no 'Actor' label found on "+url)

Compile table for Top 100 in Box Office Mojo by year


In [428]:
# Scrape each year from 1980 through 2017, then stack the yearly tables.
all_years = [box_office_mojo_compile(chart_year) for chart_year in range(1980,2018)]
all_year = pd.concat(all_years)

# Keep only the five highest-ranked films of every year.
is_top_5 = all_year['Rank']<=5
top_5_by_year = all_year.loc[is_top_5].reset_index(drop=True)

Compile the top 5 films for each year and pull their actors (films whose page fails to scrape are recorded as errors)


In [491]:
# Pull the actor text for every top-5 film. A film whose page cannot be
# scraped gets an "ERROR FOR: <title>" placeholder, so the list stays aligned
# with the rows of top_5_by_year.
empty = []
for title,link in zip(top_5_by_year['Title'],top_5_by_year['link']):
    try:
        empty.append(top_actors(link))
    except Exception:
        # Exception (not a bare except) lets KeyboardInterrupt/SystemExit
        # through, so this long scraping loop can still be stopped by hand.
        empty.append("ERROR FOR: "+title)

top_5_by_year['Actors']=empty

Export


In [502]:
# Switch into the outputs folder, then write both tables there as CSV files.
os.chdir(outputs)
for table, csv_name in ((all_year,'top_100_films.csv'), (top_5_by_year,'Top_5.csv')):
    table.to_csv(csv_name)