In [1]:

    
!ls ../data/









    



best_actor.txt              box_office_max_min.txt      runtime_max_min.txt
best_actress.txt            director.txt                transcripts
best_supporting_actor.txt   film.txt                    wiki
best_supporting_actress.txt golden_globe_winners.txt
box_office.txt              runtime.txt



In [3]:

    
import os
import re
import sys
import glob

import bs4
import pandas as pd

sys.path.append('../code')
from utils import *

pd.set_option('display.max_columns', 100)

Load Data

Labels

Actors



In [4]:

    
actor_cols = ['year', 'winner', 'wiki_slug_person',
              'name_actor', 'wiki_slug_film', 'name_film']



In [5]:

    
nominees_actor = load_labels('../data/best_actor.txt',
                             actor_cols,
                             'actor')



In [6]:

    
nominees_actress = load_labels('../data/best_actress.txt',
                               actor_cols,
                               'actress')



In [7]:

    
nominees_supporting_actor = load_labels('../data/best_supporting_actor.txt',
                                        actor_cols,
                                        'supporting actor')



In [8]:

    
nominees_supporting_actress = load_labels('../data/best_supporting_actress.txt',
                                          actor_cols,
                                          'supporting actress')

Film



In [9]:

    
film_cols = ['year', 'winner', 'wiki_slug_film', 'name_film']



In [10]:

    
nominees_film = load_labels('../data/film.txt', film_cols, 'film')

Director



In [11]:

    
director_cols = ['year', 'winner', 'wiki_slug_person',
                 'name_director', 'wiki_slug_film', 'name_film']



In [12]:

    
nominees_director = load_labels('../data/director.txt',
                                director_cols,
                                'director')

Features

Box Office



In [13]:

    
box_office_cols = ['boxoffice', 'revenue', 'wiki_slug_film']



In [14]:

    
features_box_office = load_features('../data/box_office.txt', box_office_cols)



In [15]:

    
features_box_office = features_box_office[['wiki_slug_film', 'revenue']]

Golden Globe Winners



In [16]:

    
golden_globe_cols = ['outcome', 'value', 'wiki_slug_film']



In [17]:

    
features_golden_globes = load_features('../data/golden_globe_winners.txt',
                                       golden_globe_cols)



In [18]:

    
features_golden_globes = features_golden_globes[['wiki_slug_film', 'outcome']]



In [19]:

    
features_golden_globes['outcome'] = features_golden_globes.outcome.apply(lambda x: x[12:])

Runtime



In [20]:

    
runtime_cols = ['runtime', 'minutes', 'wiki_slug_film']



In [21]:

    
features_runtime = load_features('../data/runtime.txt', runtime_cols)



In [22]:

    
features_runtime = features_runtime[['wiki_slug_film', 'minutes']]

Wiki



In [23]:

    
movie_path = '../data/wiki/movies'



In [24]:

    
movie_data = pd.DataFrame([])
for f in os.listdir(movie_path):
    film_id = '/wiki/'+f[:-5]
    fname = movie_path+'/'+f
    try:
        movie_data = movie_data.append(wiki_data(fname, film_id, 'infobox vevent'))
    except:
        pass



In [25]:

    
movie_data.shape









    Out[25]:





(1263, 21)



In [26]:

    
people_path = '../data/wiki/people'



In [27]:

    
people_data = pd.DataFrame([])
for f in os.listdir(people_path):
    person_id = '/wiki/'+f[:-5]
    fname = people_path+'/'+f
    try:
        people_data = people_data.append(wiki_data(fname, person_id, 'infobox biography vcard'))
    except:
        pass



In [28]:

    
people_data.shape









    Out[28]:





(549, 41)

Merging



In [29]:

    
features_box_office.head()









    Out[29]:






  
    
      
      wiki_slug_film
      revenue
    
  
  
    
      0
      /wiki/127_Hours
      60700000
    
    
      1
      /wiki/12_Angry_Men_(1957_film)
      1000000
    
    
      2
      /wiki/12_Monkeys
      168800000
    
    
      3
      /wiki/12_Years_a_Slave_(film)
      187700000
    
    
      4
      /wiki/2001:_A_Space_Odyssey_(film)
      190000000



In [30]:

    
features_runtime.head()









    Out[30]:






  
    
      
      wiki_slug_film
      minutes
    
  
  
    
      0
      /wiki/127_Hours
      93
    
    
      1
      /wiki/12_Angry_Men_(1957_film)
      96
    
    
      2
      /wiki/12_Monkeys
      129
    
    
      3
      /wiki/12_Years_a_Slave_(film)
      134
    
    
      4
      /wiki/2001:_A_Space_Odyssey_(film)
      161



In [31]:

    
films = pd.merge(features_box_office,
                 features_runtime,
                 on='wiki_slug_film',
                 how='outer')



In [32]:

    
films = pd.merge(films, movie_data, on='wiki_slug_film', how='inner')



In [33]:

    
films.fillna('', inplace=True)



In [34]:

    
text_columns = ['based_on', 'box_office', 'budget', 'cinematography',
                'country', 'distributed_by', 'music_by', 'produced_by',
                'production_companies', 'production_company', 'release_dates',
                'running_time', 'screenplay_by', 'starring', 'written_by']



In [35]:

    
for c in text_columns:
    films[c] = films[c].str.replace(u'\xa0', u' ')
    films[c] = films[c].apply(lambda x: re.sub('\[[0-9]\]', '', x).strip())
    films[c] = films[c].str.split('\n')
    films[c] = films[c].apply(lambda x: [a.strip() for a in x])
    films[c] = films[c].apply(lambda x: [a for a in filter(None, x)])



In [36]:

    
films['n_starring'] = films.starring.apply(lambda x: len(x))
films['based_on'] = films.based_on.apply(lambda x: 1 if len(x) > 0 else 0)
films['n_countries'] = films.country.apply(lambda x: len(x))
re_months = re.compile('(january|february|march|april|may|june|july|august|september|october|november|december)')
films['release_month'] = films.release_dates.apply(lambda x: re_months.search(x[0].lower()).group()
                                                   if re_months.search(x[0].lower()) is not None
                                                   else '')
films['revenue'] = films.revenue.apply(lambda x: re.sub('\D', '', x))



In [37]:

    
films = films[['wiki_slug_film', 'revenue', 'minutes', 'based_on', 'n_starring', 'n_countries', 'release_month']]



In [38]:

    
films.revenue.replace('', 0, inplace=True)
films['revenue'] = films.revenue.astype(int)
films.minutes.replace('', 0, inplace=True)
films['minutes'] = films.minutes.astype(int)



In [39]:

    
def month_to_q(m):
    if m in ['january', 'february', 'march']:
        return 1
    elif m in ['april', 'may', 'june']:
        return 2
    elif m in ['july', 'august', 'september']:
        return 3
    elif m in ['october', 'november', 'december']:
        return 4
    else:
        return 0



In [40]:

    
films['release_quarter'] = films.release_month.apply(lambda x: month_to_q(x))



In [41]:

    
films.drop('release_month', axis=1, inplace=True)



In [42]:

    
films.head()









    Out[42]:






  
    
      
      wiki_slug_film
      revenue
      minutes
      based_on
      n_starring
      n_countries
      release_quarter
    
  
  
    
      0
      /wiki/127_Hours
      60700000
      93
      1
      3
      2
      3
    
    
      1
      /wiki/12_Angry_Men_(1957_film)
      1000000
      96
      0
      12
      1
      2
    
    
      2
      /wiki/12_Monkeys
      168800000
      129
      1
      5
      1
      4
    
    
      3
      /wiki/12_Years_a_Slave_(film)
      187700000
      134
      1
      9
      2
      3
    
    
      4
      /wiki/2001:_A_Space_Odyssey_(film)
      190000000
      161
      0
      2
      2
      2



In [43]:

    
pd.merge(films, nominees_film[['wiki_slug_film', 'year', 'winner']], on='wiki_slug_film', how='left')









    Out[43]:






  
    
      
      wiki_slug_film
      revenue
      minutes
      based_on
      n_starring
      n_countries
      release_quarter
      year
      winner
    
  
  
    
      0
      /wiki/127_Hours
      60700000
      93
      1
      3
      2
      3
      2010
      0
    
    
      1
      /wiki/12_Angry_Men_(1957_film)
      1000000
      96
      0
      12
      1
      2
      1957
      0
    
    
      2
      /wiki/12_Monkeys
      168800000
      129
      1
      5
      1
      4
      NaN
      NaN
    
    
      3
      /wiki/12_Years_a_Slave_(film)
      187700000
      134
      1
      9
      2
      3
      2013
      1
    
    
      4
      /wiki/2001:_A_Space_Odyssey_(film)
      190000000
      161
      0
      2
      2
      2
      NaN
      NaN
    
    
      5
      /wiki/21_Grams
      60400000
      124
      0
      4
      1
      4
      NaN
      NaN
    
    
      6
      /wiki/42nd_Street_(film)
      2250000
      89
      1
      4
      1
      1
      1932
      0
    
    
      7
      /wiki/45_Years
      9300000
      95
      1
      2
      1
      1
      NaN
      NaN
    
    
      8
      /wiki/49th_Parallel_(film)
      5000000
      123
      0
      5
      1
      4
      1942
      0
    
    
      9
      /wiki/5_Fingers
      1350000
      108
      1
      3
      1
      1
      NaN
      NaN
    
    
      10
      /wiki/7th_Heaven_(1927_film)
      2500000
      110
      1
      3
      1
      2
      NaN
      NaN
    
    
      11
      /wiki/7th_Heaven_(1927_film)
      2500000
      110
      1
      3
      1
      2
      NaN
      NaN
    
    
      12
      /wiki/7th_Heaven_(1927_film)
      2500000
      110
      1
      3
      1
      2
      NaN
      NaN
    
    
      13
      /wiki/7th_Heaven_(1927_film)
      2500000
      110
      1
      3
      1
      2
      NaN
      NaN
    
    
      14
      /wiki/A_Beautiful_Mind_(film)
      313600000
      135
      1
      9
      1
      4
      2001
      1
    
    
      15
      /wiki/A_Civil_Action_(film)
      112215759
      115
      1
      9
      1
      4
      NaN
      NaN
    
    
      16
      /wiki/A_Clockwork_Orange_(film)
      26600000
      136
      1
      4
      2
      4
      1971
      0
    
    
      17
      /wiki/A_Double_Life
      1700000
      104
      0
      3
      1
      4
      NaN
      NaN
    
    
      18
      /wiki/A_Dry_White_Season
      3766879
      97
      1
      6
      1
      3
      NaN
      NaN
    
    
      19
      /wiki/A_Farewell_to_Arms_(1957_film)
      20000000
      152
      1
      3
      1
      4
      NaN
      NaN
    
    
      20
      /wiki/A_Few_Good_Men
      243200000
      138
      1
      8
      1
      4
      1992
      0
    
    
      21
      /wiki/A_Fish_Called_Wanda
      62493712
      109
      0
      4
      2
      3
      NaN
      NaN
    
    
      22
      /wiki/A_Free_Soul
      1422000
      91
      0
      4
      1
      2
      NaN
      NaN
    
    
      23
      /wiki/A_Hatful_of_Rain
      1500000
      109
      1
      4
      1
      3
      NaN
      NaN
    
    
      24
      /wiki/A_History_of_Violence
      60700000
      96
      1
      5
      1
      2
      NaN
      NaN
    
    
      25
      /wiki/A_Man_and_a_Woman
      14000000
      102
      0
      2
      1
      3
      NaN
      NaN
    
    
      26
      /wiki/A_Man_for_All_Seasons_(1966_film)
      28400000
      120
      1
      6
      1
      4
      1966
      1
    
    
      27
      /wiki/A_Midsummer_Night%27s_Dream_(1935_film)
      1229000
      133
      1
      7
      1
      4
      1935
      0
    
    
      28
      /wiki/A_Passage_to_India_(film)
      27200000
      163
      1
      6
      2
      4
      1984
      0
    
    
      29
      /wiki/A_Patch_of_Blue
      6750000
      105
      1
      3
      1
      4
      NaN
      NaN
    
    
      ...
      ...
      ...
      ...
      ...
      ...
      ...
      ...
      ...
      ...
    
    
      1244
      /wiki/The_Roman_Spring_of_Mrs._Stone
      0
      103
      0
      5
      1
      4
      NaN
      NaN
    
    
      1245
      /wiki/The_Ruling_Class_(film)
      0
      154
      0
      4
      0
      3
      NaN
      NaN
    
    
      1246
      /wiki/The_Search
      0
      105
      0
      5
      2
      1
      NaN
      NaN
    
    
      1247
      /wiki/The_Sin_of_Madelon_Claudet
      0
      75
      1
      4
      1
      4
      NaN
      NaN
    
    
      1248
      /wiki/The_Smiling_Lieutenant
      0
      89
      1
      3
      1
      3
      1931
      0
    
    
      1249
      /wiki/The_Story_of_G.I._Joe
      0
      108
      0
      2
      1
      2
      NaN
      NaN
    
    
      1250
      /wiki/The_Story_of_Louis_Pasteur
      0
      87
      0
      4
      1
      1
      1936
      0
    
    
      1251
      /wiki/The_Sunshine_Boys_(1975_film)
      0
      111
      1
      4
      1
      4
      NaN
      NaN
    
    
      1252
      /wiki/The_Trespasser
      0
      90
      0
      2
      1
      4
      NaN
      NaN
    
    
      1253
      /wiki/The_Westerner_(film)
      0
      100
      0
      4
      1
      3
      NaN
      NaN
    
    
      1254
      /wiki/The_Whales_of_August
      0
      90
      0
      4
      1
      4
      NaN
      NaN
    
    
      1255
      /wiki/The_Whisperers
      0
      105
      0
      1
      1
      3
      NaN
      NaN
    
    
      1256
      /wiki/The_White_Parade
      0
      80
      0
      2
      1
      4
      1934
      0
    
    
      1257
      /wiki/Their_Own_Desire
      0
      65
      1
      5
      1
      4
      NaN
      NaN
    
    
      1258
      /wiki/Theodora_Goes_Wild
      0
      94
      0
      2
      1
      4
      NaN
      NaN
    
    
      1259
      /wiki/These_Three
      0
      93
      0
      4
      1
      1
      NaN
      NaN
    
    
      1260
      /wiki/This_Sporting_Life
      0
      134
      0
      4
      1
      1
      NaN
      NaN
    
    
      1261
      /wiki/Three_Smart_Girls
      0
      84
      0
      4
      1
      4
      1936
      0
    
    
      1262
      /wiki/Thunderbolt_(1929_film)
      0
      85
      0
      5
      1
      3
      NaN
      NaN
    
    
      1263
      /wiki/Topper_(film)
      0
      97
      1
      2
      1
      3
      NaN
      NaN
    
    
      1264
      /wiki/Trader_Horn_(1931_film)
      0
      122
      1
      3
      1
      2
      1930
      0
    
    
      1265
      /wiki/Twilight_of_Honor
      0
      104
      0
      2
      1
      4
      NaN
      NaN
    
    
      1266
      /wiki/Two_Arabian_Knights
      0
      92
      0
      3
      1
      3
      NaN
      NaN
    
    
      1267
      /wiki/Valiant_Is_the_Word_for_Carrie
      0
      110
      0
      3
      1
      4
      NaN
      NaN
    
    
      1268
      /wiki/Voyage_of_the_Damned
      0
      155
      1
      6
      1
      4
      NaN
      NaN
    
    
      1269
      /wiki/White_Banners
      0
      92
      0
      6
      1
      2
      NaN
      NaN
    
    
      1270
      /wiki/Who_Is_Harry_Kellerman_and_Why_Is_He_Say...
      0
      108
      0
      6
      1
      2
      NaN
      NaN
    
    
      1271
      /wiki/Wild_Is_the_Wind
      0
      114
      0
      2
      1
      4
      NaN
      NaN
    
    
      1272
      /wiki/Wings_(1927_film)
      0
      111
      0
      4
      1
      3
      1927
      1
    
    
      1273
      /wiki/You%27re_a_Big_Boy_Now
      0
      97
      1
      10
      1
      4
      NaN
      NaN
    
  

1274 rows × 9 columns



In [ ]:

	wiki_slug_film	revenue
0	/wiki/127_Hours	60700000
1	/wiki/12_Angry_Men_(1957_film)	1000000
2	/wiki/12_Monkeys	168800000
3	/wiki/12_Years_a_Slave_(film)	187700000
4	/wiki/2001:_A_Space_Odyssey_(film)	190000000

	wiki_slug_film	revenue	minutes	based_on	n_starring	n_countries	release_quarter	year	winner
0	/wiki/127_Hours	60700000	93	1	3	2	3	2010	0
1	/wiki/12_Angry_Men_(1957_film)	1000000	96	0	12	1	2	1957	0
2	/wiki/12_Monkeys	168800000	129	1	5	1	4	NaN	NaN
3	/wiki/12_Years_a_Slave_(film)	187700000	134	1	9	2	3	2013	1
4	/wiki/2001:_A_Space_Odyssey_(film)	190000000	161	0	2	2	2	NaN	NaN
5	/wiki/21_Grams	60400000	124	0	4	1	4	NaN	NaN
6	/wiki/42nd_Street_(film)	2250000	89	1	4	1	1	1932	0
7	/wiki/45_Years	9300000	95	1	2	1	1	NaN	NaN
8	/wiki/49th_Parallel_(film)	5000000	123	0	5	1	4	1942	0
9	/wiki/5_Fingers	1350000	108	1	3	1	1	NaN	NaN
10	/wiki/7th_Heaven_(1927_film)	2500000	110	1	3	1	2	NaN	NaN
11	/wiki/7th_Heaven_(1927_film)	2500000	110	1	3	1	2	NaN	NaN
12	/wiki/7th_Heaven_(1927_film)	2500000	110	1	3	1	2	NaN	NaN
13	/wiki/7th_Heaven_(1927_film)	2500000	110	1	3	1	2	NaN	NaN
14	/wiki/A_Beautiful_Mind_(film)	313600000	135	1	9	1	4	2001	1
15	/wiki/A_Civil_Action_(film)	112215759	115	1	9	1	4	NaN	NaN
16	/wiki/A_Clockwork_Orange_(film)	26600000	136	1	4	2	4	1971	0
17	/wiki/A_Double_Life	1700000	104	0	3	1	4	NaN	NaN
18	/wiki/A_Dry_White_Season	3766879	97	1	6	1	3	NaN	NaN
19	/wiki/A_Farewell_to_Arms_(1957_film)	20000000	152	1	3	1	4	NaN	NaN
20	/wiki/A_Few_Good_Men	243200000	138	1	8	1	4	1992	0
21	/wiki/A_Fish_Called_Wanda	62493712	109	0	4	2	3	NaN	NaN
22	/wiki/A_Free_Soul	1422000	91	0	4	1	2	NaN	NaN
23	/wiki/A_Hatful_of_Rain	1500000	109	1	4	1	3	NaN	NaN
24	/wiki/A_History_of_Violence	60700000	96	1	5	1	2	NaN	NaN
25	/wiki/A_Man_and_a_Woman	14000000	102	0	2	1	3	NaN	NaN
26	/wiki/A_Man_for_All_Seasons_(1966_film)	28400000	120	1	6	1	4	1966	1
27	/wiki/A_Midsummer_Night%27s_Dream_(1935_film)	1229000	133	1	7	1	4	1935	0
28	/wiki/A_Passage_to_India_(film)	27200000	163	1	6	2	4	1984	0
29	/wiki/A_Patch_of_Blue	6750000	105	1	3	1	4	NaN	NaN
...	...	...	...	...	...	...	...	...	...
1244	/wiki/The_Roman_Spring_of_Mrs._Stone	0	103	0	5	1	4	NaN	NaN
1245	/wiki/The_Ruling_Class_(film)	0	154	0	4	0	3	NaN	NaN
1246	/wiki/The_Search	0	105	0	5	2	1	NaN	NaN
1247	/wiki/The_Sin_of_Madelon_Claudet	0	75	1	4	1	4	NaN	NaN
1248	/wiki/The_Smiling_Lieutenant	0	89	1	3	1	3	1931	0
1249	/wiki/The_Story_of_G.I._Joe	0	108	0	2	1	2	NaN	NaN
1250	/wiki/The_Story_of_Louis_Pasteur	0	87	0	4	1	1	1936	0
1251	/wiki/The_Sunshine_Boys_(1975_film)	0	111	1	4	1	4	NaN	NaN
1252	/wiki/The_Trespasser	0	90	0	2	1	4	NaN	NaN
1253	/wiki/The_Westerner_(film)	0	100	0	4	1	3	NaN	NaN
1254	/wiki/The_Whales_of_August	0	90	0	4	1	4	NaN	NaN
1255	/wiki/The_Whisperers	0	105	0	1	1	3	NaN	NaN
1256	/wiki/The_White_Parade	0	80	0	2	1	4	1934	0
1257	/wiki/Their_Own_Desire	0	65	1	5	1	4	NaN	NaN
1258	/wiki/Theodora_Goes_Wild	0	94	0	2	1	4	NaN	NaN
1259	/wiki/These_Three	0	93	0	4	1	1	NaN	NaN
1260	/wiki/This_Sporting_Life	0	134	0	4	1	1	NaN	NaN
1261	/wiki/Three_Smart_Girls	0	84	0	4	1	4	1936	0
1262	/wiki/Thunderbolt_(1929_film)	0	85	0	5	1	3	NaN	NaN
1263	/wiki/Topper_(film)	0	97	1	2	1	3	NaN	NaN
1264	/wiki/Trader_Horn_(1931_film)	0	122	1	3	1	2	1930	0
1265	/wiki/Twilight_of_Honor	0	104	0	2	1	4	NaN	NaN
1266	/wiki/Two_Arabian_Knights	0	92	0	3	1	3	NaN	NaN
1267	/wiki/Valiant_Is_the_Word_for_Carrie	0	110	0	3	1	4	NaN	NaN
1268	/wiki/Voyage_of_the_Damned	0	155	1	6	1	4	NaN	NaN
1269	/wiki/White_Banners	0	92	0	6	1	2	NaN	NaN
1270	/wiki/Who_Is_Harry_Kellerman_and_Why_Is_He_Say...	0	108	0	6	1	2	NaN	NaN
1271	/wiki/Wild_Is_the_Wind	0	114	0	2	1	4	NaN	NaN
1272	/wiki/Wings_(1927_film)	0	111	0	4	1	3	1927	1
1273	/wiki/You%27re_a_Big_Boy_Now	0	97	1	10	1	4	NaN	NaN