Predicting Gross Income

Loading the dataset


In [166]:
import os

import pandas as pd
import sklearn as skl

# import holcrawl.shared

In [40]:
# dataset_dir = holcrawl.shared._get_dataset_dir_path()

In [41]:
# dataset_path = os.path.join(dataset_dir, 'movies_dataset.csv')

In [245]:
df = pd.read_csv('movies_dataset.csv')

In [246]:
df.year.value_counts()


Out[246]:
2009.0    163
2011.0    152
2012.0    148
2014.0    129
2013.0    122
2010.0    105
2015.0     67
2008.0     14
Name: year, dtype: int64

In [247]:
# list(df.columns)

Feature Generation


In [169]:
df['name_length'] = df['name'].map(lambda name: len(name))

In [170]:
len(df)


Out[170]:
900

In [171]:
df.isnull().sum()


Out[171]:
avg_screens                                 0
budget                                    149
budget_currency                           149
closing_date                                0
critic_review_count                         0
duration                                    0
gross_income                                0
imdb_user_reviews                           0
max_screens                                 0
mc_avg_user_score                           0
mc_metascore                                0
mc_mixed_rating_frequency                   0
mc_movie_name                               0
mc_negative_rating_frequency                0
mc_positive_rating_frequency                0
mc_pro_critic_reviews                       0
mc_user_reviews                             0
metascore                                   3
name                                        0
num_weekends                                0
opening_weekend_date                        0
opening_weekend_income                      0
opening_weekend_income_currency             0
opening_weekend_screens                     0
rating                                      0
rating_count                                0
release_day                               191
release_month                             191
release_year                              191
screens_by_weekend                          0
total_screens                               0
user_review_count                           0
year                                        0
avg_rating_per_demo.aged_18-29              0
avg_rating_per_demo.aged_30-44              0
avg_rating_per_demo.aged_45+                0
avg_rating_per_demo.aged_under_18          10
avg_rating_per_demo.females                 0
avg_rating_per_demo.females_aged_18-29      0
avg_rating_per_demo.females_aged_30-44      0
avg_rating_per_demo.females_aged_45+        0
avg_rating_per_demo.females_under_18       59
avg_rating_per_demo.imdb_staff             40
avg_rating_per_demo.imdb_users              0
avg_rating_per_demo.males                   0
avg_rating_per_demo.males_aged_18-29        0
avg_rating_per_demo.males_aged_30-44        0
avg_rating_per_demo.males_aged_45+          0
avg_rating_per_demo.males_under_18         18
avg_rating_per_demo.non-us_users            0
avg_rating_per_demo.top_1000_voters         0
avg_rating_per_demo.us_users                0
votes_per_demo.aged_18-29                   0
votes_per_demo.aged_30-44                   0
votes_per_demo.aged_45+                     0
votes_per_demo.aged_under_18               10
votes_per_demo.females                      0
votes_per_demo.females_aged_18-29           0
votes_per_demo.females_aged_30-44           0
votes_per_demo.females_aged_45+             0
votes_per_demo.females_under_18            59
votes_per_demo.imdb_staff                  40
votes_per_demo.imdb_users                   0
votes_per_demo.males                        0
votes_per_demo.males_aged_18-29             0
votes_per_demo.males_aged_30-44             0
votes_per_demo.males_aged_45+               0
votes_per_demo.males_under_18              18
votes_per_demo.non-us_users                 0
votes_per_demo.top_1000_voters              0
votes_per_demo.us_users                     0
rating_freq.1                               0
rating_freq.10                              0
rating_freq.2                               0
rating_freq.3                               0
rating_freq.4                               0
rating_freq.5                               0
rating_freq.6                               0
rating_freq.7                               0
rating_freq.8                               0
rating_freq.9                               0
genres.action                               0
genres.adventure                            0
genres.animation                            0
genres.biography                            0
genres.comedy                               0
genres.crime                                0
genres.documentary                          0
genres.drama                                0
genres.family                               0
genres.fantasy                              0
genres.history                              0
genres.horror                               0
genres.music                                0
genres.musical                              0
genres.mystery                              0
genres.news                                 0
genres.romance                              0
genres.sci-fi                               0
genres.sport                                0
genres.thriller                             0
genres.war                                  0
genres.western                              0
num_mc_critic                               0
avg_mc_critic                             235
num_mc_critic_by_opening                    0
avg_mc_critic_by_opening                  266
num_mc_user                                 0
avg_mc_user                                16
num_mc_user_by_opening                      0
avg_mc_user_by_opening                    441
num_imdb_user                               0
avg_imdb_user                               0
num_imdb_user_by_opening                    0
avg_imdb_user_by_opening                   45
opening_month                               0
opening_day                                 0
opening_day_of_year                         0
norm_gross                                149
profit                                    149
ROI                                       149
name_length                                 0
mc_avg_user_score_scaled                    0
rating_scaled                               0
dtype: int64

In [172]:
len(df[df['avg_mc_critic_by_opening'].notnull()])


Out[172]:
634

In [174]:
BASE_FEAT_TO_KEEP = [
    'duration', 'budget', 'opening_month', 'opening_day', 'opening_day_of_year', 'year',
    'avg_mc_critic_by_opening', 'num_mc_critic_by_opening', 'name_length', 
    'num_imdb_user_by_opening', 'avg_imdb_user_by_opening', 'opening_weekend_screens'# 'avg_mc_user_by_opening'
]

In [175]:
FEAT_TO_KEEP = BASE_FEAT_TO_KEEP + [col for col in df.columns if 'genres' in col]

In [176]:
features = df.drop([col for col in df.columns if col not in BASE_FEAT_TO_KEEP], axis=1)

In [177]:
dataset = df.drop([col for col in df.columns if col not in FEAT_TO_KEEP], axis=1)

In [180]:
dataset = dataset.dropna(axis=0)

In [181]:
len(dataset)


Out[181]:
518

In [186]:
dataset.columns


Out[186]:
Index(['budget', 'duration', 'opening_weekend_screens', 'year',
       'genres.action', 'genres.adventure', 'genres.animation',
       'genres.biography', 'genres.comedy', 'genres.crime',
       'genres.documentary', 'genres.drama', 'genres.family', 'genres.fantasy',
       'genres.history', 'genres.horror', 'genres.music', 'genres.musical',
       'genres.mystery', 'genres.news', 'genres.romance', 'genres.sci-fi',
       'genres.sport', 'genres.thriller', 'genres.war', 'genres.western',
       'num_mc_critic_by_opening', 'avg_mc_critic_by_opening',
       'num_imdb_user_by_opening', 'avg_imdb_user_by_opening', 'opening_month',
       'opening_day', 'opening_day_of_year', 'name_length'],
      dtype='object')

In [187]:
pd.options.display.max_columns = 999
dataset.iloc[0:3]


Out[187]:
budget duration opening_weekend_screens year genres.action genres.adventure genres.animation genres.biography genres.comedy genres.crime genres.documentary genres.drama genres.family genres.fantasy genres.history genres.horror genres.music genres.musical genres.mystery genres.news genres.romance genres.sci-fi genres.sport genres.thriller genres.war genres.western num_mc_critic_by_opening avg_mc_critic_by_opening num_imdb_user_by_opening avg_imdb_user_by_opening opening_month opening_day opening_day_of_year name_length
2 4000000.0 93.0 45.0 2014.0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 10 49.700000 15 7.000000 4 18 108 7
5 42000000.0 110.0 3121.0 2012.0 1 0 0 0 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 39 72.179487 23 7.000000 3 16 76 14
6 50000000.0 112.0 3306.0 2014.0 1 0 0 0 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 45 72.844444 31 8.290323 6 13 164 14

Predicting Gross Income


In [276]:
from sklearn import linear_model
from sklearn.model_selection import train_test_split

In [277]:
X = dataset
# Y = df['ROI'].ix[dataset.index]
Y = df['gross_income'].ix[dataset.index]

In [278]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.30)

Feature selection with Mutual Information Regression


In [279]:
FEATURES_TO_SELECT = 10

In [280]:
from sklearn.feature_selection import f_regression, mutual_info_regression
mi = mutual_info_regression(X_train, y_train)

In [281]:
mi_df = pd.DataFrame([mi]).T
mi_df.index = X_train.columns
mi_df.columns = ['MI']

In [282]:
mi_df = mi_df.sort('MI', ascending=False)
mi_df


/Users/shaypalachy/miniconda3/envs/ds/lib/python3.5/site-packages/ipykernel/__main__.py:1: FutureWarning: sort(columns=....) is deprecated, use sort_values(by=.....)
  if __name__ == '__main__':
Out[282]:
MI
opening_weekend_screens 5.257127e-01
budget 3.586542e-01
num_mc_critic_by_opening 2.638703e-01
num_imdb_user_by_opening 2.006672e-01
avg_mc_critic_by_opening 1.538417e-01
genres.adventure 9.011780e-02
opening_month 5.058676e-02
duration 4.018463e-02
opening_day_of_year 4.018256e-02
name_length 3.968138e-02
genres.animation 3.829288e-02
genres.family 2.957691e-02
genres.drama 2.613771e-02
genres.horror 2.348821e-02
genres.fantasy 1.613176e-02
genres.music 1.107048e-02
genres.crime 7.022980e-03
genres.documentary 5.541445e-03
genres.sport 4.538791e-03
genres.mystery 3.025178e-03
genres.war 9.131581e-04
genres.musical 2.886580e-15
genres.news 1.887379e-15
genres.romance 0.000000e+00
genres.sci-fi 0.000000e+00
genres.thriller 0.000000e+00
genres.western 0.000000e+00
genres.comedy 0.000000e+00
genres.biography 0.000000e+00
genres.history 0.000000e+00
avg_imdb_user_by_opening 0.000000e+00
genres.action 0.000000e+00
opening_day 0.000000e+00
year 0.000000e+00

In [283]:
support = mi_df.index[0:FEATURES_TO_SELECT]

Linear Regression


In [284]:
selected_X_train = X_train.drop([col for col in X_train if col not in support], axis=1)

In [285]:
selected_X_test = X_test.drop([col for col in X_test if col not in support], axis=1)

In [286]:
regr = linear_model.LinearRegression(fit_intercept=True, normalize=True)
regr = regr.fit(selected_X_train, y_train)

In [287]:
coef_df = pd.DataFrame({'coef': regr.coef_}, index=selected_X_train.columns)
coef_df.sort('coef', ascending=False)


/Users/shaypalachy/miniconda3/envs/ds/lib/python3.5/site-packages/ipykernel/__main__.py:2: FutureWarning: sort(columns=....) is deprecated, use sort_values(by=.....)
  from ipykernel import kernelapp as app
Out[287]:
coef
genres.adventure 9.753665e+06
opening_month 6.106549e+06
avg_mc_critic_by_opening 1.409603e+06
name_length 5.972524e+05
num_imdb_user_by_opening 4.134863e+05
num_mc_critic_by_opening 2.234796e+05
opening_weekend_screens 1.645777e+04
budget 5.119382e-01
duration -5.560414e+04
opening_day_of_year -2.007545e+05

In [288]:
y_predict = regr.predict(selected_X_test)

In [289]:
regr.score(selected_X_test, y_test)


Out[289]:
0.66185796843055644

In [290]:
plt.figure(figsize=(12,9))
plt.scatter(y_test / 1000000, y_predict / 1000000)
plt.xlabel('True Gross Income')
plt.ylabel('Predicted Gross Income');


With opening weekend income


In [256]:
X_train_weekend_inc = X_train
X_train_weekend_inc['opening_weekend_income'] = df.ix[X_train.index]['opening_weekend_income']
X_test_weekend_inc = X_test
X_test_weekend_inc['opening_weekend_income'] = df.ix[X_test.index]['opening_weekend_income']


/Users/shaypalachy/miniconda3/envs/ds/lib/python3.5/site-packages/ipykernel/__main__.py:2: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  from ipykernel import kernelapp as app
/Users/shaypalachy/miniconda3/envs/ds/lib/python3.5/site-packages/ipykernel/__main__.py:4: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy

In [257]:
mi = mutual_info_regression(X_train_weekend_inc, y_train)

In [258]:
mi_df = pd.DataFrame([mi]).T
mi_df.index = X_train_weekend_inc.columns
mi_df.columns = ['MI']

In [259]:
mi_df = mi_df.sort('MI', ascending=False)
mi_df


/Users/shaypalachy/miniconda3/envs/ds/lib/python3.5/site-packages/ipykernel/__main__.py:1: FutureWarning: sort(columns=....) is deprecated, use sort_values(by=.....)
  if __name__ == '__main__':
Out[259]:
MI
opening_weekend_income 1.229179e+00
opening_weekend_screens 6.176942e-01
budget 4.129944e-01
num_mc_critic_by_opening 2.830909e-01
num_imdb_user_by_opening 1.497448e-01
genres.adventure 1.076145e-01
opening_day_of_year 9.753823e-02
avg_mc_critic_by_opening 7.608544e-02
duration 6.042927e-02
genres.animation 5.459138e-02
genres.drama 4.253039e-02
genres.thriller 3.863557e-02
name_length 3.571095e-02
year 3.381337e-02
avg_imdb_user_by_opening 1.938500e-02
genres.sci-fi 1.696322e-02
genres.crime 1.560913e-02
opening_month 1.326453e-02
genres.comedy 8.335426e-03
genres.romance 8.279536e-03
genres.family 8.081836e-03
genres.horror 7.834359e-03
genres.sport 5.239986e-03
genres.war 3.660264e-03
genres.fantasy 2.390522e-03
genres.biography 1.910034e-03
genres.action 1.250647e-03
genres.documentary 1.550195e-04
genres.news 1.887379e-15
genres.musical 1.887379e-15
genres.western 0.000000e+00
genres.mystery 0.000000e+00
opening_day 0.000000e+00
genres.music 0.000000e+00
genres.history 0.000000e+00

In [260]:
support = mi_df.index[0:FEATURES_TO_SELECT]

In [261]:
selected_X_train_weekend_inc = X_train_weekend_inc
selected_X_test_weekend_inc = X_test_weekend_inc
selected_X_train_weekend_inc = X_train_weekend_inc.drop([col for col in X_train_weekend_inc if col not in support], axis=1)
selected_X_test_weekend_inc = X_test_weekend_inc.drop([col for col in X_test_weekend_inc if col not in support], axis=1)

In [262]:
regr2 = linear_model.LinearRegression(fit_intercept=True, normalize=True)
regr2 = regr2.fit(selected_X_train_weekend_inc, y_train)

In [263]:
coef_df2 = pd.DataFrame({'coef': regr2.coef_}, index=selected_X_train_weekend_inc.columns)
coef_df2.sort('coef', ascending=False)


/Users/shaypalachy/miniconda3/envs/ds/lib/python3.5/site-packages/ipykernel/__main__.py:2: FutureWarning: sort(columns=....) is deprecated, use sort_values(by=.....)
  from ipykernel import kernelapp as app
Out[263]:
coef
genres.animation 3.641685e+07
avg_mc_critic_by_opening 6.797033e+05
duration 2.911722e+05
opening_day_of_year 3.573639e+04
opening_weekend_screens 2.587271e+03
opening_weekend_income 2.356574e+00
budget 1.995159e-01
num_imdb_user_by_opening -3.475150e+04
num_mc_critic_by_opening -3.527992e+05
genres.adventure -2.864501e+06

In [266]:
y_predict2 = regr2.predict(selected_X_test_weekend_inc)
regr2.score(selected_X_test_weekend_inc, y_test)


Out[266]:
0.81705961271413918

In [267]:
plt.figure(figsize=(12,9))
plt.scatter(y_test / 1000000, y_predict2 / 1000000)
plt.xlabel('True Gross Income')
plt.ylabel('Predicted Gross Income');
# plt.title('Predicted vs True Gross Income');



In [ ]: