IMDB Score Prediction with Random Forests


In [ ]:
%matplotlib inline
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split

sns.set(color_codes=True)
sns.set(style="ticks")

data_frame_original = pd.read_csv("movie_metadata.csv")
#data_frame_original.loc[data_frame_original['imdb_score'] >= 7.5, 'imdb_score'].size / data_frame_original['imdb_score'].count()

In [ ]:
imdb_score_limit = 7.5

data_frame = data_frame_original.copy()
# Drop free-text and identifier columns that are not used as features
data_frame = data_frame.drop('genres', axis=1)
data_frame = data_frame.drop('plot_keywords', axis=1)
data_frame = data_frame.drop('movie_imdb_link', axis=1)
data_frame = data_frame.drop('movie_title', axis=1)
#data_frame = data_frame.drop('movie_facebook_likes', axis=1)
# imdb_score is binarized later, in the classification section, so the
# regression section can keep the continuous score
data_frame
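
Before thresholding at 7.5 it helps to check how the classes will balance; a minimal sketch, reusing the idea from the commented-out line in the first cell:


In [ ]:
# Fraction of movies at or above the score threshold
ratio = (data_frame['imdb_score'] >= imdb_score_limit).mean()
print('share of high-rated movies: %.3f' % ratio)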

In [ ]:
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.preprocessing import LabelEncoder

for column_name in data_frame.columns:
    column = data_frame[column_name]
    if column_name in ('plot_keywords', 'genres'):
        # Multi-valued columns are pipe-separated lists of labels
        mlb = MultiLabelBinarizer()
        keywords = column.apply(
            lambda x: x.split('|') if isinstance(x, str) else ['UNKNOWN'])
        binarized = mlb.fit_transform(keywords)
        data_frame[column_name] = pd.Series(list(binarized), index=column.index)
    elif column.dtype.kind in 'biufc':
        # Numeric column: replace missing values with 0
        data_frame[column_name] = column.fillna(0)
    else:
        # Categorical column: fill missing values, then label-encode
        le = LabelEncoder()
        data_frame[column_name] = le.fit_transform(column.fillna('UNKNOWN'))
    
data_frame
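
Label-encoding imposes an arbitrary ordering on categories, which tree models tolerate but do not require; one-hot encoding is a common alternative. A minimal sketch on a fresh copy, so it does not replace the encoding above (note that high-cardinality columns such as names can produce thousands of dummy columns):


In [ ]:
# One-hot encode the categorical columns instead of label-encoding them
alt_frame = data_frame_original.drop(
    ['genres', 'plot_keywords', 'movie_imdb_link', 'movie_title'], axis=1)
categorical = alt_frame.select_dtypes(include='object').columns
alt_frame = pd.get_dummies(alt_frame, columns=categorical, dummy_na=True)
alt_frame.shape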

In [ ]:
# Figures for exploring some features
sns.set(font_scale=2)
g = sns.jointplot(x='gross', y='imdb_score', data=data_frame, kind="kde",
                  height=12, color="#10275F")
plt.subplots_adjust(top=0.55)
g.fig.suptitle('KDE of Imdb_Score and Gross', size=20, weight='bold')
sns.set(font_scale=1)
# Note: the shape of this plot differs from the Kaggle kernel's result

In [ ]:
# Showing the pearson correlation of features
with sns.plotting_context(font_scale=1.25):
    f, ax = plt.subplots(figsize=(20, 20))
    plt.title('Pearson Correlation of Movie Features', {'weight': 'bold', 'size': 20})
    # plot_keywords/genres were dropped earlier; their array encoding would need extra handling here
    
    sns.heatmap(data_frame.astype(float).corr(), linewidths=0.25, vmax=1.0, square=True, annot=True)
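
The heatmap is dense; to read off which features co-vary most with the target, the same correlation matrix can be ranked directly. A minimal sketch over the encoded data_frame:


In [ ]:
# Rank features by absolute Pearson correlation with imdb_score
correlations = data_frame.astype(float).corr()['imdb_score'].drop('imdb_score')
print(correlations.abs().sort_values(ascending=False).head(10))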

Create training and test datasets, splitting each class separately with a 60/40 train/test ratio so both sets keep the class balance


In [ ]:
# Load scikit-learn's random forest classifier
from sklearn.ensemble import RandomForestClassifier

# Binarize the target: 1 if imdb_score >= imdb_score_limit, else 0
data_frame_temp = data_frame.copy()
data_frame_temp.loc[data_frame_temp['imdb_score'] < imdb_score_limit, 'imdb_score'] = 0
data_frame_temp.loc[data_frame_temp['imdb_score'] >= imdb_score_limit, 'imdb_score'] = 1

# Split each class separately so train and test keep the same balance
class0 = data_frame_temp[data_frame_temp['imdb_score'] == 0]
class1 = data_frame_temp[data_frame_temp['imdb_score'] == 1]
train0, test0 = train_test_split(class0, test_size=0.4)
train1, test1 = train_test_split(class1, test_size=0.4)
train = pd.concat([train0, train1])
test = pd.concat([test0, test1])
print(data_frame_temp)
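
The same balanced split is available in one call via the stratify parameter; a minimal sketch:


In [ ]:
# Equivalent stratified 60/40 split in a single call
train, test = train_test_split(data_frame_temp, test_size=0.4,
                               stratify=data_frame_temp['imdb_score'])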

Select the features to use in training, and the target class to predict


In [ ]:
## select the features: every column except the target
features = data_frame.columns
train_feat = features[features != 'imdb_score']
train_results = train.imdb_score

Fit a RandomForestClassifier to the training data

Predict on the test set and report per-class precision, recall, and the confusion matrix


In [ ]:
clf = RandomForestClassifier(n_jobs=-1, bootstrap=True, n_estimators=25)
clf.fit(train[train_feat], train_results)

prediction = clf.predict(test[train_feat])

#pd.crosstab(test['imdb_score'], prediction, rownames=['Actual Imdb >= 7.5'], colnames=['Predicted Imdb >= 7.5'])

from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix

print(classification_report(test.imdb_score, prediction))
print(confusion_matrix(test.imdb_score, prediction))
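
Which features drive the prediction? A minimal sketch listing the fitted forest's importances (using clf and train_feat from the cell above):


In [ ]:
# Rank features by the forest's impurity-based importance
importances = pd.Series(clf.feature_importances_, index=train_feat)
print(importances.sort_values(ascending=False).head(10))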

ROC Curve


In [ ]:
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_curve, auc
from itertools import cycle

fig = plt.figure()

# Use plain numpy arrays so the positional fold indices below work
X = train[train_feat].values
y = train.imdb_score.values

cv = StratifiedKFold(n_splits=6)
classifier = RandomForestClassifier(n_jobs=-1, bootstrap=True, n_estimators=25)

mean_tpr = 0.0
mean_fpr = np.linspace(0, 1, 100)

colors = cycle(['cyan', 'indigo', 'seagreen', 'yellow', 'blue', 'darkorange'])
lw = 0.3

i = 0
for (train_idx, test_idx), color in zip(cv.split(X, y), colors):
    probas_ = classifier.fit(X[train_idx], y[train_idx]).predict_proba(X[test_idx])
    # Compute ROC curve and area under the curve for this fold
    fpr, tpr, thresholds = roc_curve(y[test_idx], probas_[:, 1])
    mean_tpr += np.interp(mean_fpr, fpr, tpr)
    mean_tpr[0] = 0.0
    roc_auc = auc(fpr, tpr)
    plt.plot(fpr, tpr, lw=lw, color=color,
             label='ROC fold %d (area = %0.2f)' % (i, roc_auc))

    i += 1
plt.plot([0, 1], [0, 1], linestyle='--', lw=lw, color='k')
plt.plot([0, 1], [0, 1], linestyle='--', lw=lw, color='k')

lw=2.2
mean_tpr /= cv.get_n_splits(X, y)
mean_tpr[-1] = 1.0
mean_auc = auc(mean_fpr, mean_tpr)
plt.plot(mean_fpr, mean_tpr, color='g', linestyle='--',
         label='Mean ROC (area = %0.2f)' % mean_auc, lw=lw)

plt.xlim([-0.05, 1.05])
plt.ylim([-0.05, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver operating characteristic')
plt.legend(loc="lower right")
plt.show()
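
For a single summary number, the cross-validated AUC can also be computed directly; a minimal sketch with cross_val_score over the same folds:


In [ ]:
from sklearn.model_selection import cross_val_score

# Mean and spread of the out-of-fold AUC over the 6 stratified folds
scores = cross_val_score(classifier, X, y, cv=cv, scoring='roc_auc')
print(scores.mean(), scores.std())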

Random Forest Regression


In [ ]:
# Load scikit-learn's random forest regressor
from sklearn.ensemble import RandomForestRegressor

# Back to the continuous imdb_score for regression
data_frame_temp = data_frame.copy()
train, test = train_test_split(data_frame_temp, test_size=0.25)

Select the features to use in training, and the target value to predict


In [ ]:
## select the features: every column except the target
features = data_frame.columns
train_feat = features[features != 'imdb_score']
train_results = train.imdb_score

Fit a RandomForestRegressor to the training data

Predict on the test set and report the mean squared error and explained variance


In [ ]:
clf = RandomForestRegressor(n_jobs=-1, bootstrap=True, n_estimators=25)
clf.fit(train[train_feat], train_results)

prediction = clf.predict(test[train_feat])
print(prediction)

from sklearn.metrics import mean_squared_error
from sklearn.metrics import explained_variance_score

print(mean_squared_error(test.imdb_score, prediction))
print(explained_variance_score(test.imdb_score, prediction))
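
Raw error numbers are easier to judge against a baseline; a minimal sketch comparing against always predicting the training-set mean score:


In [ ]:
# Baseline: always predict the mean imdb_score seen in training
baseline = np.full(len(test), train.imdb_score.mean())
print(mean_squared_error(test.imdb_score, baseline))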

PCA


In [ ]:
from sklearn.decomposition import PCA

pca = PCA(n_components=20)
new_train = pca.fit_transform(train[train_feat])

#print(new_train)

print(sum(pca.explained_variance_ratio_))
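
PCA is scale-sensitive, so unscaled features with large magnitudes (budget, gross, and similar) will likely dominate the components. A minimal sketch of a standardized variant, also letting PCA choose the component count by a 95% explained-variance target:


In [ ]:
from sklearn.preprocessing import StandardScaler

# Standardize, then keep enough components for 95% of the variance
scaler = StandardScaler()
pca95 = PCA(n_components=0.95)
scaled_train = scaler.fit_transform(train[train_feat])
print(pca95.fit_transform(scaled_train).shape)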

In [ ]:
clf = RandomForestRegressor(n_jobs=-1, bootstrap=True, n_estimators=25)
clf.fit(new_train, train.imdb_score)

# transform (not fit_transform): project the test set onto the
# components fitted on the training set
prediction = clf.predict(pca.transform(test[train_feat]))
print(prediction)

from sklearn.metrics import mean_squared_error
from sklearn.metrics import explained_variance_score

print(mean_squared_error(test.imdb_score,prediction))
print(explained_variance_score(test.imdb_score,prediction))
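
Chaining PCA and the regressor in a Pipeline makes this fit/transform discipline automatic; a minimal sketch:


In [ ]:
from sklearn.pipeline import make_pipeline

# PCA is fitted on the training data only, then applied to the test set
model = make_pipeline(PCA(n_components=20),
                      RandomForestRegressor(n_jobs=-1, n_estimators=25))
model.fit(train[train_feat], train.imdb_score)
print(mean_squared_error(test.imdb_score, model.predict(test[train_feat])))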