In [1]:
# List the contents of the data directory that the cells below read from.
!ls ../data/
In [3]:
import os
import re
import sys
import glob
import bs4
import pandas as pd
sys.path.append('../code')
from utils import *
pd.set_option('display.max_columns', 100)
In [4]:
# Column names handed to load_labels for the acting-category label files.
actor_cols = [
    'year',
    'winner',
    'wiki_slug_person',
    'name_actor',
    'wiki_slug_film',
    'name_film',
]
In [5]:
# Read the best-actor label file through the load_labels helper (imported from utils).
nominees_actor = load_labels(
    '../data/best_actor.txt',
    actor_cols,
    'actor',
)
In [6]:
# Read the best-actress label file through the load_labels helper (imported from utils).
nominees_actress = load_labels(
    '../data/best_actress.txt',
    actor_cols,
    'actress',
)
In [7]:
# Read the best-supporting-actor label file through the load_labels helper.
nominees_supporting_actor = load_labels(
    '../data/best_supporting_actor.txt',
    actor_cols,
    'supporting actor',
)
In [8]:
# Read the best-supporting-actress label file through the load_labels helper.
nominees_supporting_actress = load_labels(
    '../data/best_supporting_actress.txt',
    actor_cols,
    'supporting actress',
)
In [9]:
# Column names handed to load_labels for the film label file.
film_cols = [
    'year',
    'winner',
    'wiki_slug_film',
    'name_film',
]
In [10]:
# Read the film label file through the load_labels helper (imported from utils).
nominees_film = load_labels(
    '../data/film.txt',
    film_cols,
    'film',
)
In [11]:
# Column names handed to load_labels for the director label file.
director_cols = [
    'year',
    'winner',
    'wiki_slug_person',
    'name_director',
    'wiki_slug_film',
    'name_film',
]
In [12]:
# Read the director label file through the load_labels helper (imported from utils).
nominees_director = load_labels(
    '../data/director.txt',
    director_cols,
    'director',
)
In [13]:
# Column names handed to load_features for the box-office file.
box_office_cols = [
    'boxoffice',
    'revenue',
    'wiki_slug_film',
]
In [14]:
# Read the box-office file through the load_features helper (imported from utils).
features_box_office = load_features(
    '../data/box_office.txt',
    box_office_cols,
)
In [15]:
# Keep only the join key and the revenue column.
features_box_office = features_box_office[
    ['wiki_slug_film', 'revenue']
]
In [16]:
# Column names handed to load_features for the Golden Globe file.
golden_globe_cols = [
    'outcome',
    'value',
    'wiki_slug_film',
]
In [17]:
# Read the Golden Globe file through the load_features helper (imported from utils).
features_golden_globes = load_features(
    '../data/golden_globe_winners.txt',
    golden_globe_cols,
)
In [18]:
# Keep only the join key and the outcome column.
features_golden_globes = features_golden_globes[
    ['wiki_slug_film', 'outcome']
]
In [19]:
# Drop the first 12 characters of every outcome string.
# NOTE(review): 12 is presumably the length of a fixed prefix in the raw file --
# confirm against ../data/golden_globe_winners.txt.
# `.str[12:]` leaves missing values as NaN instead of raising, and `assign`
# builds a new frame rather than writing into a column-sliced one (which
# triggers SettingWithCopyWarning).
features_golden_globes = features_golden_globes.assign(
    outcome=features_golden_globes['outcome'].str[12:]
)
In [20]:
# Column names handed to load_features for the runtime file.
runtime_cols = [
    'runtime',
    'minutes',
    'wiki_slug_film',
]
In [21]:
# Read the runtime file through the load_features helper (imported from utils).
features_runtime = load_features(
    '../data/runtime.txt',
    runtime_cols,
)
In [22]:
# Keep only the join key and the minutes column.
features_runtime = features_runtime[
    ['wiki_slug_film', 'minutes']
]
In [23]:
# Directory whose files are parsed one by one into movie_data below.
movie_path = '../data/wiki/movies'
In [24]:
# Parse every file under movie_path with wiki_data and stack the results.
# Frames are collected in a list and concatenated once: DataFrame.append was
# removed in pandas 2.0 (and the old bare `except` hid that error, leaving
# movie_data silently empty), and growing a frame row-by-row is quadratic.
movie_frames = []
movie_failures = []  # (file name, exception) for every page that could not be parsed
for f in sorted(os.listdir(movie_path)):  # sorted for a reproducible row order
    # Drop the last five characters of the file name to build the wiki slug.
    # NOTE(review): presumably a '.html' suffix -- confirm against the directory contents.
    film_id = '/wiki/' + f[:-5]
    fname = movie_path + '/' + f
    try:
        # assumes wiki_data returns a DataFrame (helper from utils) -- TODO confirm
        parsed = wiki_data(fname, film_id, 'infobox vevent')
    except Exception as exc:
        # Best effort: skip the page, but remember why instead of swallowing it.
        movie_failures.append((fname, exc))
        continue
    if parsed is not None:
        movie_frames.append(parsed)

# pd.concat raises on an empty list, so fall back to an empty frame.
movie_data = pd.concat(movie_frames) if movie_frames else pd.DataFrame([])
if movie_failures:
    print('wiki_data failed for %d of %d movie files'
          % (len(movie_failures), len(movie_failures) + len(movie_frames)))
In [25]:
# Show the (rows, columns) size of the parsed movie table.
movie_data.shape
Out[25]:
In [26]:
# Directory whose files are parsed one by one into people_data below.
people_path = '../data/wiki/people'
In [27]:
# Parse every file under people_path with wiki_data and stack the results.
# Frames are collected in a list and concatenated once: DataFrame.append was
# removed in pandas 2.0 (and the old bare `except` hid that error, leaving
# people_data silently empty), and growing a frame row-by-row is quadratic.
people_frames = []
people_failures = []  # (file name, exception) for every page that could not be parsed
for f in sorted(os.listdir(people_path)):  # sorted for a reproducible row order
    # Drop the last five characters of the file name to build the wiki slug.
    # NOTE(review): presumably a '.html' suffix -- confirm against the directory contents.
    person_id = '/wiki/' + f[:-5]
    fname = people_path + '/' + f
    try:
        # assumes wiki_data returns a DataFrame (helper from utils) -- TODO confirm
        parsed = wiki_data(fname, person_id, 'infobox biography vcard')
    except Exception as exc:
        # Best effort: skip the page, but remember why instead of swallowing it.
        people_failures.append((fname, exc))
        continue
    if parsed is not None:
        people_frames.append(parsed)

# pd.concat raises on an empty list, so fall back to an empty frame.
people_data = pd.concat(people_frames) if people_frames else pd.DataFrame([])
if people_failures:
    print('wiki_data failed for %d of %d people files'
          % (len(people_failures), len(people_failures) + len(people_frames)))
In [28]:
# Show the (rows, columns) size of the parsed people table.
people_data.shape
Out[28]:
In [29]:
# Preview the first rows of the box-office features before merging.
features_box_office.head()
Out[29]:
In [30]:
# Preview the first rows of the runtime features before merging.
features_runtime.head()
Out[30]:
In [31]:
# Outer join on the film slug, so a film present in only one of the two
# feature tables is still kept.
films = features_box_office.merge(
    features_runtime,
    how='outer',
    on='wiki_slug_film',
)
In [32]:
# Inner join: only films that also have a parsed infobox row survive.
films = films.merge(
    movie_data,
    how='inner',
    on='wiki_slug_film',
)
In [33]:
# Replace every missing value with an empty string so the .str operations
# in the following cells see text everywhere.
films = films.fillna('')
In [34]:
# Columns of `films` that are cleaned and split into lists of lines below.
text_columns = [
    'based_on',
    'box_office',
    'budget',
    'cinematography',
    'country',
    'distributed_by',
    'music_by',
    'produced_by',
    'production_companies',
    'production_company',
    'release_dates',
    'running_time',
    'screenplay_by',
    'starring',
    'written_by',
]
In [35]:
# Turn each free-text column into a list of trimmed, non-empty lines.
# The marker pattern uses `+` so multi-digit citations such as [12] are removed
# too (the previous `\[[0-9]\]` only matched [0]-[9]); it is a raw string so the
# backslashes are not treated as (invalid) string escapes.
_footnote_re = re.compile(r'\[[0-9]+\]')


def _clean_infobox_text(text):
    """Strip citation markers from ``text`` and return its non-empty, trimmed lines."""
    if not isinstance(text, str):
        # Non-string cells carry no text; treat them like an empty string.
        return []
    stripped_lines = (line.strip() for line in _footnote_re.sub('', text).split('\n'))
    return [line for line in stripped_lines if line]


for c in text_columns:
    # Swap non-breaking spaces for plain ones (literal match, not a regex),
    # then clean and split every cell.
    films[c] = films[c].str.replace(u'\xa0', u' ', regex=False).apply(_clean_infobox_text)
In [36]:
# Derive simple features from the list-valued columns produced by the cleaning step.
films['n_starring'] = films['starring'].apply(len)
# Binary flag: 1 when the 'based_on' list has at least one entry.
films['based_on'] = films['based_on'].apply(lambda items: 1 if len(items) > 0 else 0)
films['n_countries'] = films['country'].apply(len)

re_months = re.compile('(january|february|march|april|may|june|july|august|september|october|november|december)')


def _first_release_month(dates):
    """Return the first month name found in the first entry of ``dates``, or ''.

    An empty list (no release date recorded) yields '' instead of raising
    IndexError on ``dates[0]``.
    """
    if not dates:
        return ''
    found = re_months.search(dates[0].lower())  # search once, reuse the match
    return found.group() if found is not None else ''


films['release_month'] = films['release_dates'].apply(_first_release_month)
# Keep only the digits of the revenue text (raw-string pattern avoids the
# invalid '\D' string escape).
films['revenue'] = films['revenue'].apply(lambda x: re.sub(r'\D', '', x))
In [37]:
# Narrow the frame to the join key plus the engineered feature columns.
films = films[[
    'wiki_slug_film',
    'revenue',
    'minutes',
    'based_on',
    'n_starring',
    'n_countries',
    'release_month',
]]
In [38]:
# Convert revenue and minutes to integers, mapping the empty-string
# placeholder to 0 first.
# The previous `films.revenue.replace('', 0, inplace=True)` mutated a temporary
# Series obtained from a sliced frame; with pandas Copy-on-Write that never
# reaches `films`, so the following astype(int) fails on ''. Building the
# columns explicitly and rebinding `films` avoids both that and the
# SettingWithCopyWarning.
films = films.assign(
    revenue=films['revenue'].replace('', 0).astype(int),
    minutes=films['minutes'].replace('', 0).astype(int),
)
In [39]:
def month_to_q(m):
    """Map an English month name to its calendar quarter.

    Parameters
    ----------
    m : str
        Month name. Matching ignores case and surrounding whitespace.

    Returns
    -------
    int
        1-4 for a recognised month name, 0 for anything else (including
        the empty string and non-string values such as None or NaN).
    """
    if not isinstance(m, str):
        return 0
    month = m.strip().lower()
    quarters = (
        ('january', 'february', 'march'),
        ('april', 'may', 'june'),
        ('july', 'august', 'september'),
        ('october', 'november', 'december'),
    )
    for quarter, months in enumerate(quarters, start=1):
        if month in months:
            return quarter
    return 0
In [40]:
# Translate each release month into its quarter via month_to_q.
films['release_quarter'] = films['release_month'].map(month_to_q)
In [41]:
# release_month has been folded into release_quarter and is no longer needed.
films = films.drop(columns='release_month')
In [42]:
# Preview the first rows of the finished film feature table.
films.head()
Out[42]:
In [43]:
# Left join keeps every film row and attaches year/winner where the film
# appears in nominees_film. The result is displayed, not stored.
films.merge(
    nominees_film[['wiki_slug_film', 'year', 'winner']],
    how='left',
    on='wiki_slug_film',
)
Out[43]:
In [ ]: