In [167]:
    
import os
import pandas as pd
import sklearn as skl
import holcrawl.shared
    
In [168]:
    
# Ask the project's shared helper for the dataset directory path.
# NOTE(review): `_get_dataset_dir_path` is underscore-prefixed, i.e. presumably
# private to holcrawl — confirm whether a public accessor exists.
dataset_dir = holcrawl.shared._get_dataset_dir_path()
    
In [169]:
    
# Path of the movies CSV file inside the dataset directory.
dataset_path = os.path.join(dataset_dir, 'movies_dataset.csv')
    
In [170]:
    
# Load the movies CSV into the working DataFrame, using pandas' default parsing options.
df = pd.read_csv(filepath_or_buffer=dataset_path)
    
Generating some additional basic features:
In [174]:
    
# ROI = (gross_income - budget) / budget, computed element-wise per row.
df['ROI'] = df['gross_income'].sub(df['budget']).div(df['budget'])
    
In [175]:
    
# Number of characters in each movie name.
# The vectorised `.str.len()` yields NaN for a missing name instead of raising
# TypeError the way `len(nan)` does; such rows are removed by the later
# `dropna` because 'name_length' is one of the kept features.
df['name_length'] = df['name'].str.len()
    
In [176]:
    
# Row count of the raw DataFrame.
df.shape[0]
    
    Out[176]:
The number of null values per column:
In [177]:
    
# Count missing entries column by column (summing the boolean mask down the rows).
df.isnull().sum(axis=0)
    
    Out[177]:
In [180]:
    
# Non-genre columns used as model features.
# NOTE: 'avg_mc_user_by_opening' is currently disabled and therefore not listed.
BASE_FEAT_TO_KEEP = [
    'duration',
    'budget',
    'opening_month',
    'opening_day',
    'opening_day_of_year',
    'year',
    'avg_mc_critic_by_opening',
    'num_mc_critic_by_opening',
    'name_length',
    'opening_weekend_income',
    'num_imdb_user_by_opening',
    'avg_imdb_user_by_opening',
    'opening_weekend_screens',
]
    
Keeping all genre dummy variables:
In [181]:
    
# Start from a copy of the base feature names, then append every column whose
# name contains 'genres' (in the order they appear in `df`).
FEAT_TO_KEEP = list(BASE_FEAT_TO_KEEP)
FEAT_TO_KEEP.extend(col for col in df.columns if 'genres' in col)
    
In [182]:
    
# Keep only the base (non-genre) feature columns by dropping every other column.
# NOTE(review): `features` is not referenced by any later cell visible here —
# confirm whether it is still needed.
features = df.drop(df.columns.difference(BASE_FEAT_TO_KEEP), axis='columns')
    
Dropping non-feature columns:
In [183]:
    
# Build the modelling table: drop every column that is not in FEAT_TO_KEEP,
# leaving the kept columns in their original order.
dataset = df.drop(df.columns.difference(FEAT_TO_KEEP), axis='columns')
    
Dropping all rows that still have null values:
In [184]:
    
# Discard each row that has at least one missing value in a kept column.
dataset = dataset.dropna(axis='index', how='any')
    
Now, making sure we have no null values:
In [185]:
    
# Total number of missing cells: per-column counts first, then their sum.
dataset.isnull().sum(axis=0).sum()
    
    Out[185]:
We end up with a dataset of size:
In [186]:
    
# Number of rows left after dropping rows with missing values.
dataset.shape[0]
    
    Out[186]:
In [187]:
    
import numpy as np
from sklearn import linear_model
from sklearn.model_selection import cross_val_score
    
We define a failed movie as a movie whose ROI (Return On Investment) is below zero, meaning the investors actually lost money making it:
In [188]:
    
# Label each remaining row: True when the movie's ROI is negative.
# `.loc` performs the label-based lookup; `.ix` was deprecated in pandas 0.20
# and removed in pandas 1.0. `dataset` keeps the index labels of `df`, so
# selecting by `dataset.index` picks exactly the rows that survived `dropna`.
# NOTE(review): a NaN ROI compares as False here, so a row with a missing
# 'gross_income' would be labelled "not failed" — confirm no such rows remain.
failed = df.loc[dataset.index, 'ROI'] < 0
    
In [189]:
    
# Feature matrix and boolean target used by the cross-validation cells below.
X, Y = dataset, failed
    
Evaluating logistic regression with 5-fold cross-validation on our dataset:
In [190]:
    
# Logistic regression with library-default hyper-parameters.
logreg = linear_model.LogisticRegression()
# One score per fold; no `scoring` is passed, so the estimator's own default
# scorer is used (the following cell reports it as accuracy).
acc_scores = cross_val_score(estimator=logreg, X=X, y=Y, cv=5, n_jobs=1)
    
In [194]:
    
# Summarise the per-fold scores as mean ± standard deviation, in percent.
mean_accuracy = acc_scores.mean()
accuracy_std = acc_scores.std()
print("Accuracy is {:.2f}% ± {:.2f}%.".format(100 * mean_accuracy, 100 * accuracy_std))
    
    
In [195]:
    
# Same 5-fold evaluation as for accuracy, but scored by recall.
recall_scores = cross_val_score(estimator=logreg, X=X, y=Y, scoring='recall', cv=5, n_jobs=1)
    
In [196]:
    
# Summarise the per-fold recall scores as mean ± standard deviation.
mean_recall = np.mean(recall_scores)
recall_std = np.std(recall_scores)
# Both values are scaled by 100, so both carry a percent sign (the original
# message omitted it after the standard deviation).
print("Recall = {:.2f}% ± {:.2f}%".format(mean_recall*100, recall_std*100))
    
    
In [ ]: