In [1]:
import os
from urllib.request import urlopen

import pandas as pd
import requests
import seaborn as sns
from bs4 import BeautifulSoup

import charts_function_list
# folder_setup() hands back three values: the first is discarded, `outputs`
# is what the CSV export cell later passes to os.chdir().
# NOTE(review): `data` is presumably the input-data folder -- confirm in
# charts_function_list; nothing in the visible cells reads it.
_, data, outputs = charts_function_list.folder_setup()
In [472]:
def box_office_mojo_compile(year):
    """Scrape the Box Office Mojo yearly chart for ``year`` into a DataFrame.

    Downloads ``http://www.boxofficemojo.com/yearly/chart/?yr=<year>``, takes
    the seventh ``<table>`` of the page and reads entries 1 through 100 of its
    children. Entry 0 is skipped (presumably the header row -- confirm
    against the live page layout).

    Parameters
    ----------
    year : int or str
        Value appended to the ``yr`` query parameter and stored unchanged in
        the ``Year`` column.

    Returns
    -------
    pandas.DataFrame
        100 rows with the columns ``Rank`` (converted to int), ``Title``,
        ``Studio``, ``Total Gross``, ``Theaters``, ``Opening`` and
        ``Opening Theaters`` (raw cell text), plus ``Year`` and ``link`` (the
        ``href`` of the first anchor inside the title cell). The one-row
        frames are concatenated as they are, so every row carries index
        label 0.

    Raises
    ------
    IndexError
        If the page has fewer than seven tables, the table has fewer than
        101 entries, or a row has fewer than seven cells.
    ValueError
        If a rank cell does not contain an integer.
    AttributeError
        If a title cell contains no anchor.
    """
    columns = ['Rank', 'Title', 'Studio', 'Total Gross', 'Theaters',
               'Opening', 'Opening Theaters']
    url = 'http://www.boxofficemojo.com/yearly/chart/?yr=' + str(year)

    # Parse inside the ``with`` block so the HTTP response is closed as soon
    # as the document has been read rather than being left open.
    with urlopen(url) as response:
        soup = BeautifulSoup(response, "lxml")

    # One list of <td> cells per child of the seventh table on the page.
    row_cells = [row.find_all('td') for row in soup.find_all('table')[6]]

    row_frames = []
    # Strict indexing (not slicing) on purpose: a short table should fail
    # loudly instead of silently yielding fewer than 100 films.
    for position in range(1, 101):
        cells = row_cells[position]
        texts = [cells[col].text for col in range(len(columns))]
        row_frame = pd.DataFrame([texts], columns=columns)
        row_frame['Year'] = year
        row_frame['Rank'] = row_frame['Rank'].astype('int')
        # The second cell holds the title; its anchor is the film's page link.
        row_frame['link'] = cells[1].find('a').get('href')
        row_frames.append(row_frame)
    return pd.concat(row_frames)
def top_actors(movie):
    """Return the text that follows the first "Actor" label on a movie page.

    Parameters
    ----------
    movie : str
        Site-relative link appended to ``http://www.boxofficemojo.com``
        (the notebook passes values from the ``link`` column).

    Returns
    -------
    str
        Text of the ``<font>`` element that comes immediately after the first
        ``<font>`` element whose text contains ``'Actor'``, looking only at
        the first table inside ``<div id="body">``.

    Raises
    ------
    IndexError
        If no ``<font>`` text contains ``'Actor'``, or the matching element
        is the last one so nothing follows it.
    AttributeError
        If the page has no ``<div id="body">`` or that div has no table.
    """
    url = 'http://www.boxofficemojo.com' + movie

    # Parse inside the ``with`` block so the HTTP response is always closed.
    with urlopen(url) as response:
        soup = BeautifulSoup(response, "lxml")

    body = soup.find("div", {"id": "body"})
    first_table = body.find("table")
    cast_crew = [font.getText() for font in first_table.find_all('font')]

    # Position of the first label mentioning 'Actor' (case-sensitive); the
    # value wanted is the element right after it.
    label_position = next(
        (pos for pos, text in enumerate(cast_crew) if 'Actor' in text), None)
    if label_position is None:
        raise IndexError("no 'Actor' label found on " + url)
    return cast_crew[label_position + 1]
In [428]:
# Scrape one chart per year, 1980 through 2017 (the range end, 2018, is excluded).
all_years = [box_office_mojo_compile(year) for year in range(1980, 2018)]
# Stack the per-year frames into one table.
all_year = pd.concat(all_years)
# Keep the films ranked 1-5 in each year and renumber the rows from zero.
top_5_by_year = all_year.loc[all_year['Rank'].le(5)].reset_index(drop=True)
In [491]:
# Fetch the actor text for every top-5 film. A failed lookup is stored as an
# "ERROR FOR: <title>" placeholder so the list stays aligned with the rows.
empty = []
for title, link in zip(top_5_by_year['Title'], top_5_by_year['link']):
    try:
        empty.append(top_actors(link))
    except Exception:
        # Exception rather than a bare except: a kernel interrupt (Ctrl-C)
        # must still be able to stop this long scraping loop.
        empty.append("ERROR FOR: " + title)
top_5_by_year['Actors'] = empty
In [502]:
# Make `outputs` the working directory; the relative file names below are
# therefore written there, and the working directory stays changed afterwards.
os.chdir(outputs)
# Export the full scrape and the top-5 frame, index column included.
all_year.to_csv(path_or_buf='top_100_films.csv')
top_5_by_year.to_csv(path_or_buf='Top_5.csv')