In [1]:
!ls ../data/


best_actor.txt              box_office_max_min.txt      runtime_max_min.txt
best_actress.txt            director.txt                transcripts
best_supporting_actor.txt   film.txt                    wiki
best_supporting_actress.txt golden_globe_winners.txt
box_office.txt              runtime.txt

In [3]:
import os
import re
import sys
import glob

import bs4
import pandas as pd

sys.path.append('../code')
from utils import *

pd.set_option('display.max_columns', 100)

Load Data

Labels

Actors


In [4]:
actor_cols = ['year', 'winner', 'wiki_slug_person',
              'name_actor', 'wiki_slug_film', 'name_film']

In [5]:
nominees_actor = load_labels('../data/best_actor.txt',
                             actor_cols,
                             'actor')

In [6]:
nominees_actress = load_labels('../data/best_actress.txt',
                               actor_cols,
                               'actress')

In [7]:
nominees_supporting_actor = load_labels('../data/best_supporting_actor.txt',
                                        actor_cols,
                                        'supporting actor')

In [8]:
nominees_supporting_actress = load_labels('../data/best_supporting_actress.txt',
                                          actor_cols,
                                          'supporting actress')

Film


In [9]:
film_cols = ['year', 'winner', 'wiki_slug_film', 'name_film']

In [10]:
nominees_film = load_labels('../data/film.txt', film_cols, 'film')

Director


In [11]:
director_cols = ['year', 'winner', 'wiki_slug_person',
                 'name_director', 'wiki_slug_film', 'name_film']

In [12]:
nominees_director = load_labels('../data/director.txt',
                                director_cols,
                                'director')

Features

Box Office


In [13]:
box_office_cols = ['boxoffice', 'revenue', 'wiki_slug_film']

In [14]:
features_box_office = load_features('../data/box_office.txt', box_office_cols)

In [15]:
features_box_office = features_box_office[['wiki_slug_film', 'revenue']]

Golden Globe Winners


In [16]:
golden_globe_cols = ['outcome', 'value', 'wiki_slug_film']

In [17]:
features_golden_globes = load_features('../data/golden_globe_winners.txt',
                                       golden_globe_cols)

In [18]:
features_golden_globes = features_golden_globes[['wiki_slug_film', 'outcome']]

In [19]:
features_golden_globes['outcome'] = features_golden_globes.outcome.apply(lambda x: x[12:])

Runtime


In [20]:
runtime_cols = ['runtime', 'minutes', 'wiki_slug_film']

In [21]:
features_runtime = load_features('../data/runtime.txt', runtime_cols)

In [22]:
features_runtime = features_runtime[['wiki_slug_film', 'minutes']]

Wiki


In [23]:
movie_path = '../data/wiki/movies'

In [24]:
movie_data = pd.DataFrame([])
for f in os.listdir(movie_path):
    film_id = '/wiki/'+f[:-5]
    fname = movie_path+'/'+f
    try:
        movie_data = movie_data.append(wiki_data(fname, film_id, 'infobox vevent'))
    except:
        pass

In [25]:
movie_data.shape


Out[25]:
(1263, 21)

In [26]:
people_path = '../data/wiki/people'

In [27]:
people_data = pd.DataFrame([])
for f in os.listdir(people_path):
    person_id = '/wiki/'+f[:-5]
    fname = people_path+'/'+f
    try:
        people_data = people_data.append(wiki_data(fname, person_id, 'infobox biography vcard'))
    except:
        pass

In [28]:
people_data.shape


Out[28]:
(549, 41)

Merging


In [29]:
features_box_office.head()


Out[29]:
wiki_slug_film revenue
0 /wiki/127_Hours 60700000
1 /wiki/12_Angry_Men_(1957_film) 1000000
2 /wiki/12_Monkeys 168800000
3 /wiki/12_Years_a_Slave_(film) 187700000
4 /wiki/2001:_A_Space_Odyssey_(film) 190000000

In [30]:
features_runtime.head()


Out[30]:
wiki_slug_film minutes
0 /wiki/127_Hours 93
1 /wiki/12_Angry_Men_(1957_film) 96
2 /wiki/12_Monkeys 129
3 /wiki/12_Years_a_Slave_(film) 134
4 /wiki/2001:_A_Space_Odyssey_(film) 161

In [31]:
films = pd.merge(features_box_office,
                 features_runtime,
                 on='wiki_slug_film',
                 how='outer')

In [32]:
films = pd.merge(films, movie_data, on='wiki_slug_film', how='inner')

In [33]:
films.fillna('', inplace=True)

In [34]:
text_columns = ['based_on', 'box_office', 'budget', 'cinematography',
                'country', 'distributed_by', 'music_by', 'produced_by',
                'production_companies', 'production_company', 'release_dates',
                'running_time', 'screenplay_by', 'starring', 'written_by']

In [35]:
for c in text_columns:
    films[c] = films[c].str.replace(u'\xa0', u' ')
    films[c] = films[c].apply(lambda x: re.sub('\[[0-9]\]', '', x).strip())
    films[c] = films[c].str.split('\n')
    films[c] = films[c].apply(lambda x: [a.strip() for a in x])
    films[c] = films[c].apply(lambda x: [a for a in filter(None, x)])

In [36]:
films['n_starring'] = films.starring.apply(lambda x: len(x))
films['based_on'] = films.based_on.apply(lambda x: 1 if len(x) > 0 else 0)
films['n_countries'] = films.country.apply(lambda x: len(x))
re_months = re.compile('(january|february|march|april|may|june|july|august|september|october|november|december)')
films['release_month'] = films.release_dates.apply(lambda x: re_months.search(x[0].lower()).group()
                                                   if re_months.search(x[0].lower()) is not None
                                                   else '')
films['revenue'] = films.revenue.apply(lambda x: re.sub('\D', '', x))

In [37]:
films = films[['wiki_slug_film', 'revenue', 'minutes', 'based_on', 'n_starring', 'n_countries', 'release_month']]

In [38]:
films.revenue.replace('', 0, inplace=True)
films['revenue'] = films.revenue.astype(int)
films.minutes.replace('', 0, inplace=True)
films['minutes'] = films.minutes.astype(int)

In [39]:
def month_to_q(m):
    if m in ['january', 'february', 'march']:
        return 1
    elif m in ['april', 'may', 'june']:
        return 2
    elif m in ['july', 'august', 'september']:
        return 3
    elif m in ['october', 'november', 'december']:
        return 4
    else:
        return 0

In [40]:
films['release_quarter'] = films.release_month.apply(lambda x: month_to_q(x))

In [41]:
films.drop('release_month', axis=1, inplace=True)

In [42]:
films.head()


Out[42]:
wiki_slug_film revenue minutes based_on n_starring n_countries release_quarter
0 /wiki/127_Hours 60700000 93 1 3 2 3
1 /wiki/12_Angry_Men_(1957_film) 1000000 96 0 12 1 2
2 /wiki/12_Monkeys 168800000 129 1 5 1 4
3 /wiki/12_Years_a_Slave_(film) 187700000 134 1 9 2 3
4 /wiki/2001:_A_Space_Odyssey_(film) 190000000 161 0 2 2 2

In [43]:
pd.merge(films, nominees_film[['wiki_slug_film', 'year', 'winner']], on='wiki_slug_film', how='left')


Out[43]:
wiki_slug_film revenue minutes based_on n_starring n_countries release_quarter year winner
0 /wiki/127_Hours 60700000 93 1 3 2 3 2010 0
1 /wiki/12_Angry_Men_(1957_film) 1000000 96 0 12 1 2 1957 0
2 /wiki/12_Monkeys 168800000 129 1 5 1 4 NaN NaN
3 /wiki/12_Years_a_Slave_(film) 187700000 134 1 9 2 3 2013 1
4 /wiki/2001:_A_Space_Odyssey_(film) 190000000 161 0 2 2 2 NaN NaN
5 /wiki/21_Grams 60400000 124 0 4 1 4 NaN NaN
6 /wiki/42nd_Street_(film) 2250000 89 1 4 1 1 1932 0
7 /wiki/45_Years 9300000 95 1 2 1 1 NaN NaN
8 /wiki/49th_Parallel_(film) 5000000 123 0 5 1 4 1942 0
9 /wiki/5_Fingers 1350000 108 1 3 1 1 NaN NaN
10 /wiki/7th_Heaven_(1927_film) 2500000 110 1 3 1 2 NaN NaN
11 /wiki/7th_Heaven_(1927_film) 2500000 110 1 3 1 2 NaN NaN
12 /wiki/7th_Heaven_(1927_film) 2500000 110 1 3 1 2 NaN NaN
13 /wiki/7th_Heaven_(1927_film) 2500000 110 1 3 1 2 NaN NaN
14 /wiki/A_Beautiful_Mind_(film) 313600000 135 1 9 1 4 2001 1
15 /wiki/A_Civil_Action_(film) 112215759 115 1 9 1 4 NaN NaN
16 /wiki/A_Clockwork_Orange_(film) 26600000 136 1 4 2 4 1971 0
17 /wiki/A_Double_Life 1700000 104 0 3 1 4 NaN NaN
18 /wiki/A_Dry_White_Season 3766879 97 1 6 1 3 NaN NaN
19 /wiki/A_Farewell_to_Arms_(1957_film) 20000000 152 1 3 1 4 NaN NaN
20 /wiki/A_Few_Good_Men 243200000 138 1 8 1 4 1992 0
21 /wiki/A_Fish_Called_Wanda 62493712 109 0 4 2 3 NaN NaN
22 /wiki/A_Free_Soul 1422000 91 0 4 1 2 NaN NaN
23 /wiki/A_Hatful_of_Rain 1500000 109 1 4 1 3 NaN NaN
24 /wiki/A_History_of_Violence 60700000 96 1 5 1 2 NaN NaN
25 /wiki/A_Man_and_a_Woman 14000000 102 0 2 1 3 NaN NaN
26 /wiki/A_Man_for_All_Seasons_(1966_film) 28400000 120 1 6 1 4 1966 1
27 /wiki/A_Midsummer_Night%27s_Dream_(1935_film) 1229000 133 1 7 1 4 1935 0
28 /wiki/A_Passage_to_India_(film) 27200000 163 1 6 2 4 1984 0
29 /wiki/A_Patch_of_Blue 6750000 105 1 3 1 4 NaN NaN
... ... ... ... ... ... ... ... ... ...
1244 /wiki/The_Roman_Spring_of_Mrs._Stone 0 103 0 5 1 4 NaN NaN
1245 /wiki/The_Ruling_Class_(film) 0 154 0 4 0 3 NaN NaN
1246 /wiki/The_Search 0 105 0 5 2 1 NaN NaN
1247 /wiki/The_Sin_of_Madelon_Claudet 0 75 1 4 1 4 NaN NaN
1248 /wiki/The_Smiling_Lieutenant 0 89 1 3 1 3 1931 0
1249 /wiki/The_Story_of_G.I._Joe 0 108 0 2 1 2 NaN NaN
1250 /wiki/The_Story_of_Louis_Pasteur 0 87 0 4 1 1 1936 0
1251 /wiki/The_Sunshine_Boys_(1975_film) 0 111 1 4 1 4 NaN NaN
1252 /wiki/The_Trespasser 0 90 0 2 1 4 NaN NaN
1253 /wiki/The_Westerner_(film) 0 100 0 4 1 3 NaN NaN
1254 /wiki/The_Whales_of_August 0 90 0 4 1 4 NaN NaN
1255 /wiki/The_Whisperers 0 105 0 1 1 3 NaN NaN
1256 /wiki/The_White_Parade 0 80 0 2 1 4 1934 0
1257 /wiki/Their_Own_Desire 0 65 1 5 1 4 NaN NaN
1258 /wiki/Theodora_Goes_Wild 0 94 0 2 1 4 NaN NaN
1259 /wiki/These_Three 0 93 0 4 1 1 NaN NaN
1260 /wiki/This_Sporting_Life 0 134 0 4 1 1 NaN NaN
1261 /wiki/Three_Smart_Girls 0 84 0 4 1 4 1936 0
1262 /wiki/Thunderbolt_(1929_film) 0 85 0 5 1 3 NaN NaN
1263 /wiki/Topper_(film) 0 97 1 2 1 3 NaN NaN
1264 /wiki/Trader_Horn_(1931_film) 0 122 1 3 1 2 1930 0
1265 /wiki/Twilight_of_Honor 0 104 0 2 1 4 NaN NaN
1266 /wiki/Two_Arabian_Knights 0 92 0 3 1 3 NaN NaN
1267 /wiki/Valiant_Is_the_Word_for_Carrie 0 110 0 3 1 4 NaN NaN
1268 /wiki/Voyage_of_the_Damned 0 155 1 6 1 4 NaN NaN
1269 /wiki/White_Banners 0 92 0 6 1 2 NaN NaN
1270 /wiki/Who_Is_Harry_Kellerman_and_Why_Is_He_Say... 0 108 0 6 1 2 NaN NaN
1271 /wiki/Wild_Is_the_Wind 0 114 0 2 1 4 NaN NaN
1272 /wiki/Wings_(1927_film) 0 111 0 4 1 3 1927 1
1273 /wiki/You%27re_a_Big_Boy_Now 0 97 1 10 1 4 NaN NaN

1274 rows × 9 columns


In [ ]: