In [167]:
import os
import pandas as pd
import sklearn as skl
import holcrawl.shared
In [168]:
# Ask holcrawl for the dataset directory path.
# NOTE(review): this relies on an underscore-prefixed (private) helper of
# holcrawl.shared; its exact semantics are not visible from this notebook.
dataset_dir = holcrawl.shared._get_dataset_dir_path()
In [169]:
# Path of the movies CSV file inside the dataset directory.
dataset_path = os.path.join(dataset_dir, 'movies_dataset.csv')
In [170]:
# Load the movies CSV into a DataFrame (pandas' default parsing options).
df = pd.read_csv(dataset_path)
Generating some additional basic features:
In [174]:
# ROI = (gross income - budget) / budget, computed element-wise per movie.
df['ROI'] = df['gross_income'].sub(df['budget']).div(df['budget'])
In [175]:
# Number of characters in each movie title.
# The vectorised .str.len() yields NaN for a missing (or non-string) title
# instead of raising TypeError the way len() does; such rows are then handled
# like any other row with a missing feature value.
df['name_length'] = df['name'].str.len()
In [176]:
# Number of rows in the raw movies frame.
df.shape[0]
Out[176]:
The number of null values per column:
In [177]:
# Count of missing values in every column.
df.isna().sum()
Out[177]:
In [180]:
# Non-genre columns that are kept as model features.
BASE_FEAT_TO_KEEP = [
    'duration',
    'budget',
    'opening_month',
    'opening_day',
    'opening_day_of_year',
    'year',
    'avg_mc_critic_by_opening',
    'num_mc_critic_by_opening',
    'name_length',
    'opening_weekend_income',
    'num_imdb_user_by_opening',
    'avg_imdb_user_by_opening',
    'opening_weekend_screens',
    # 'avg_mc_user_by_opening' is currently disabled.
]
Keeping all genre dummy variables:
In [181]:
# Base features followed by every column whose name contains 'genres'.
FEAT_TO_KEEP = [
    *BASE_FEAT_TO_KEEP,
    *(column for column in df.columns if 'genres' in column),
]
In [182]:
# Frame restricted to the base (non-genre) feature columns.
features = df.drop(columns=df.columns[~df.columns.isin(BASE_FEAT_TO_KEEP)])
Dropping non-feature columns:
In [183]:
# Frame restricted to the columns listed in FEAT_TO_KEEP.
dataset = df.drop(columns=df.columns[~df.columns.isin(FEAT_TO_KEEP)])
Dropping all rows that still have null values:
In [184]:
# Discard every row that has at least one missing feature value.
dataset = dataset.dropna(axis='index', how='any')
Now, making sure we have no null values:
In [185]:
# Total number of missing cells left in the feature frame.
dataset.isna().sum().sum()
Out[185]:
We end up with a dataset of size:
In [186]:
# Number of rows remaining after dropping incomplete rows.
dataset.shape[0]
Out[186]:
In [187]:
import numpy as np
from sklearn import linear_model
from sklearn.model_selection import cross_val_score
We define a failed movie as a movie whose ROI (Return On Investment) is below zero, meaning the investors actually lost money making it:
In [188]:
# Label each retained movie as failed (True) when its ROI is negative.
# Rows are selected by index label with .loc; the former .ix indexer was
# deprecated and later removed from pandas.
# NOTE(review): a missing ROI compares as False here, i.e. "not failed" --
# confirm gross_income has no nulls among the rows kept in `dataset`.
failed = df.loc[dataset.index, 'ROI'] < 0
In [189]:
# Design matrix and binary target used by the classifiers below.
X, Y = dataset, failed
Running logistic regression over 5 folds of our dataset:
In [190]:
# Logistic regression with default settings, scored on 5 folds with the
# estimator's default metric.
logreg = linear_model.LogisticRegression()
acc_scores = cross_val_score(estimator=logreg, X=X, y=Y, cv=5, n_jobs=1)
In [194]:
# Summarise the per-fold accuracies as mean ± standard deviation, in percent.
mean_accuracy, accuracy_std = np.mean(acc_scores), np.std(acc_scores)
print("Accuracy is {:.2f}% ± {:.2f}%.".format(100 * mean_accuracy, 100 * accuracy_std))
In [195]:
# Same 5-fold evaluation as above, scored by recall instead.
recall_scores = cross_val_score(estimator=logreg, X=X, y=Y, scoring='recall', cv=5, n_jobs=1)
In [196]:
# Summarise the per-fold recalls as mean ± standard deviation, in percent.
mean_recall = np.mean(recall_scores)
recall_std = np.std(recall_scores)
# Both values are scaled by 100, so both carry a percent sign (the deviation
# was previously printed without its unit).
print("Recall = {:.2f}% ± {:.2f}%".format(mean_recall*100, recall_std*100))
In [ ]: