Predicting Failed Movies

Loading the dataset


In [167]:
import os

import pandas as pd
import sklearn as skl

import holcrawl.shared

In [168]:
# Resolve the dataset directory through holcrawl's underscore-prefixed helper.
# NOTE(review): this relies on a private-by-convention name; confirm whether
# holcrawl exposes a public accessor for the same path.
dataset_dir = holcrawl.shared._get_dataset_dir_path()

In [169]:
# Path of the movies CSV file inside the dataset directory.
dataset_path = os.path.join(dataset_dir, 'movies_dataset.csv')

In [170]:
# Load the movies CSV into a DataFrame with pandas' default parsing options.
df = pd.read_csv(dataset_path)

Feature Generation

Generating some additional basic features:


In [174]:
# ROI = (gross income - budget) / budget: net gain as a fraction of the budget.
# NOTE(review): assumes gross_income and budget share a currency -- confirm
# against the budget_currency column.
df['ROI'] = df['gross_income'].sub(df['budget']).div(df['budget'])

In [175]:
# Number of characters in each movie's name.
df['name_length'] = df['name'].map(len)

In [176]:
# Row count of the full, unfiltered frame.
df.shape[0]


Out[176]:
900

The number of null values per column:


In [177]:
# Per-column count of missing values.
pd.isnull(df).sum()


Out[177]:
avg_screens                          0
budget                             149
budget_currency                    149
closing_date                         0
critic_review_count                  0
duration                             0
gross_income                         0
imdb_user_reviews                    0
max_screens                          0
mc_avg_user_score                    0
mc_metascore                         0
mc_mixed_rating_frequency            0
mc_movie_name                        0
mc_negative_rating_frequency         0
mc_positive_rating_frequency         0
mc_pro_critic_reviews                0
mc_user_reviews                      0
metascore                            3
name                                 0
num_weekends                         0
opening_weekend_date                 0
opening_weekend_income               0
opening_weekend_income_currency      0
opening_weekend_screens              0
rating                               0
rating_count                         0
release_day                        191
release_month                      191
release_year                       191
screens_by_weekend                   0
                                  ... 
genres.horror                        0
genres.music                         0
genres.musical                       0
genres.mystery                       0
genres.news                          0
genres.romance                       0
genres.sci-fi                        0
genres.sport                         0
genres.thriller                      0
genres.war                           0
genres.western                       0
num_mc_critic                        0
avg_mc_critic                      235
num_mc_critic_by_opening             0
avg_mc_critic_by_opening           266
num_mc_user                          0
avg_mc_user                         16
num_mc_user_by_opening               0
avg_mc_user_by_opening             441
num_imdb_user                        0
avg_imdb_user                        0
num_imdb_user_by_opening             0
avg_imdb_user_by_opening            45
opening_month                        0
opening_day                          0
opening_day_of_year                  0
norm_gross                         149
profit                             149
ROI                                149
name_length                          0
dtype: int64

In [180]:
# Non-genre columns used as model features; genre dummies are appended separately.
# NOTE(review): 'year' does not appear in the (truncated) column listing above;
# the membership-based filters below silently ignore names that are absent from df.
BASE_FEAT_TO_KEEP = [
    'duration',
    'budget',
    'opening_month',
    'opening_day',
    'opening_day_of_year',
    'year',
    'avg_mc_critic_by_opening',
    'num_mc_critic_by_opening',
    'name_length',
    'opening_weekend_income',
    'num_imdb_user_by_opening',
    'avg_imdb_user_by_opening',
    'opening_weekend_screens',
    # 'avg_mc_user_by_opening' is deliberately left out -- presumably because it
    # is null for 441 of the 900 rows; confirm before re-enabling.
]

Keeping all genre dummy variables:


In [181]:
# Start from a copy of the base features, then append every column whose name
# contains 'genres' (the genre dummy variables), in df's column order.
FEAT_TO_KEEP = list(BASE_FEAT_TO_KEEP)
FEAT_TO_KEEP.extend(column for column in df.columns if 'genres' in column)

In [182]:
# Frame restricted to the base (non-genre) feature columns.
# NOTE(review): `features` is not read again anywhere in the visible notebook.
features = df.loc[:, df.columns.isin(BASE_FEAT_TO_KEEP)]

Dropping non-feature columns:


In [183]:
# Drop every column that is not a selected feature (base features + genre dummies).
dataset = df.drop(df.columns[~df.columns.isin(FEAT_TO_KEEP)], axis=1)

Dropping all rows that still have null values:


In [184]:
# Discard any row that has at least one missing feature value.
dataset = dataset.dropna(axis='index', how='any')

Now, making sure we have no null values:


In [185]:
# Total number of missing cells left in the feature frame.
dataset.isnull().values.sum()


Out[185]:
0

We end up with a dataset of size:


In [186]:
# Number of rows that survived the null filtering.
dataset.shape[0]


Out[186]:
518

Prediction


In [187]:
import numpy as np
from sklearn import linear_model
from sklearn.model_selection import cross_val_score

Predicting failed movies

We define a failed movie as a movie whose ROI (Return On Investment) is below zero, meaning the investors actually lost money making it:


In [188]:
# Boolean target: True where a movie's ROI is negative (it lost money).
# ROI lives on the full frame `df`, so it is selected by the index labels of
# `dataset` to align the labels with the rows that survived dropna().
# `.loc` is used because the `.ix` indexer is deprecated since pandas 0.20 and
# removed in 1.0; selecting by `dataset.index` is a label-based lookup.
failed = df.loc[dataset.index, 'ROI'] < 0

In [189]:
# Feature matrix and boolean target handed to scikit-learn.
X, Y = dataset, failed

Evaluating a logistic regression classifier with 5-fold cross-validation over our dataset:


In [190]:
# Logistic regression with scikit-learn's default settings, evaluated over
# 5 folds in a single job; with no `scoring` given, the estimator's own
# default score is used.
logreg = linear_model.LogisticRegression()
acc_scores = cross_val_score(estimator=logreg, X=X, y=Y, cv=5, n_jobs=1)

In [194]:
# Summarise the per-fold accuracies as mean and (population) standard deviation,
# both reported as percentages.
mean_accuracy = acc_scores.mean()
accuracy_std = acc_scores.std()
print("Accuracy is {:.2f}% ± {:.2f}%.".format(100 * mean_accuracy, 100 * accuracy_std))


Accuracy is 80.70% ± 1.51%.

In [195]:
# Same 5-fold evaluation of the same estimator, scored by recall instead.
recall_scores = cross_val_score(estimator=logreg, X=X, y=Y, scoring='recall', cv=5, n_jobs=1)

In [196]:
# Summarise the per-fold recalls as mean and (population) standard deviation.
# NOTE(review): unlike the accuracy message, the std is printed here without a
# trailing '%'; the message text is kept exactly as it was.
mean_recall = recall_scores.mean()
recall_std = recall_scores.std()
print("Recall = {:.2f}% ± {:.2f}".format(100 * mean_recall, 100 * recall_std))


Recall = 81.12% ± 7.76

In [ ]: