In [ ]:
%matplotlib inline
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans
from sklearn.model_selection import train_test_split
sns.set(color_codes=True)
sns.set(style="ticks")
data_frame_original = pd.read_csv("movie_metadata.csv")
#data_frame.loc[data_frame['imdb_score'] >= 7.5, 'imdb_score'].size / data_frame['imdb_score'].count()
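The commented line above checks how many movies clear a given score; a minimal sketch (assuming imdb_score is fully populated) makes the class balance at the 7.5 threshold explicit:
In [ ]:
# Sketch: class balance at the threshold used later for classification
threshold = 7.5
positive_fraction = (data_frame_original['imdb_score'] >= threshold).mean()
print('Fraction of movies with imdb_score >= %.1f: %.3f' % (threshold, positive_fraction))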
In [ ]:
imdb_score_limit = 7.5
data_frame = data_frame_original.copy()
# Drop the free-text columns that are not used as features below
data_frame = data_frame.drop(['genres', 'plot_keywords', 'movie_imdb_link', 'movie_title'], axis=1)
#data_frame = data_frame.drop('movie_facebook_likes',axis=1)
# The imdb_score target is binarized later, inside the classification cell,
# so the regression cells further down can keep the raw scores:
#data_frame.loc[data_frame['imdb_score'] < imdb_score_limit, 'imdb_score'] = 0
#data_frame.loc[data_frame['imdb_score'] >= imdb_score_limit, 'imdb_score'] = 1
data_frame
In [ ]:
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.preprocessing import LabelEncoder
# Encode every remaining column: numeric columns keep their values (NaN -> 0),
# categorical columns are label-encoded (NaN -> 'UNKNOWN')
for column_name in data_frame.columns:
    le = LabelEncoder()
    column = data_frame[column_name]
    try:
        if column_name in ('plot_keywords', 'genres'):
            # Only reached if these pipe-separated columns are kept above
            mlb = MultiLabelBinarizer()
            keywords = list()
            column.apply(lambda x: keywords.append(x.split('|')) if x is not np.nan else keywords.append(['UNKNOWN']))
            binarized = mlb.fit_transform(keywords)
            plt.imshow(binarized)
            class_values = pd.Series(list(binarized))
            data_frame[column_name].update(class_values)
        else:
            # dtype kinds 'biufc' cover bool/int/unsigned/float/complex
            normalized_column = column.fillna(0) if column.dtype.kind in 'biufc' else column.fillna('UNKNOWN')
            class_values = normalized_column if column.dtype.kind in 'biufc' else pd.Series(list(le.fit_transform(normalized_column)))
            data_frame[column_name].update(class_values)
    except Exception:
        # Leave any column the encoders cannot handle untouched
        pass
data_frame
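As a quick sanity check (a minimal sketch, not part of the original pipeline), confirm that no object-dtype columns survived the encoding; any that did would make the .astype(float) call in the correlation cell below fail:
In [ ]:
# Sketch: columns still holding non-numeric data after encoding (expected: none)
remaining_object_columns = data_frame.select_dtypes(include=['object']).columns
print(list(remaining_object_columns))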
In [ ]:
# Figures for representing some features
sns.set(font_scale=2)
g = sns.jointplot(x='gross', y='imdb_score', height=12,  # 'height' was named 'size' before seaborn 0.9
                  data=data_frame, kind="kde", color="#10275F")
plt.subplots_adjust(top=0.55)
g.fig.suptitle('KDE of imdb_score and gross', size=20, weight='bold')
sns.set(font_scale=1)
# Note: the shape of this plot differs from the Kaggle kernel's result
In [ ]:
# Show the Pearson correlation of the features
with sns.plotting_context(font_scale=1.25):
    f, ax = plt.subplots(figsize=(20, 20))
    plt.title('Pearson Correlation of Movie Features', {'weight': 'bold', 'size': 20})
    # plot_keywords features are encoded as arrays, which need extra handling to plot this way
    sns.heatmap(data_frame.astype(float).corr(), linewidths=0.25, vmax=1.0, square=True, annot=True)
Create training and test datasets, splitting each class separately (60% train / 40% test) so both sets keep the original class balance.
In [ ]:
# Load scikit-learn's random forest classifier
from sklearn.ensemble import RandomForestClassifier
data_frame_temp = data_frame.copy()
# Binarize the target locally (the commented-out lines in the preprocessing cell would
# do this globally, which would break the regression cells further down)
data_frame_temp.loc[data_frame_temp['imdb_score'] < imdb_score_limit, 'imdb_score'] = 0
data_frame_temp.loc[data_frame_temp['imdb_score'] >= imdb_score_limit, 'imdb_score'] = 1
class0 = data_frame_temp[data_frame_temp['imdb_score'] == 0]
class1 = data_frame_temp[data_frame_temp['imdb_score'] == 1]
# Split each class separately so train and test keep the same class ratio
train0, test0, _, _ = train_test_split(class0, class0['imdb_score'], test_size=0.4)
train1, test1, _, _ = train_test_split(class1, class1['imdb_score'], test_size=0.4)
train = pd.concat([train0, train1])
test = pd.concat([test0, test1])
print(data_frame_temp)
Select the features to use for training, and the target class to predict.
In [ ]:
## select the features
features = data_frame.columns
train_feat = features[features != 'imdb_score']
train_results = train.imdb_score
Fit a RandomForestClassifier to the training data, then evaluate its predictions on the test set (precision, recall, confusion matrix).
In [ ]:
#print(len(set(data_frame.actor_1_name)))
clf = RandomForestClassifier(n_jobs=-1, bootstrap=True, n_estimators=25)
clf.fit(train[train_feat], train_results);
prediction = clf.predict(test[train_feat])
result = test.imdb_score - prediction
#pd.crosstab(test['imdb_score'], prediction, rownames=['Actual Imdb >= 7.5'], colnames=['Predicted Imdb >= 7.5'])
#list(zip(train[train_feat], clf.feature_importances_))
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
print(classification_report(test.imdb_score, prediction))
print(confusion_matrix(test.imdb_score, prediction))
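The commented feature_importances_ line above can be turned into a plot; a minimal sketch, assuming the clf fitted in this cell:
In [ ]:
# Sketch: horizontal bar plot of the random forest's feature importances
importances = pd.Series(clf.feature_importances_, index=train_feat).sort_values()
importances.plot(kind='barh', figsize=(8, 10), title='Random forest feature importances')
plt.show()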
In [ ]:
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_curve, auc
from itertools import cycle
fig = plt.figure()
X = train[train_feat]
y = train.imdb_score
cv = StratifiedKFold(n_splits=6)
classifier = RandomForestClassifier(n_jobs=-1, bootstrap=True, n_estimators=25)
mean_tpr = 0.0
mean_fpr = np.linspace(0, 1, 100)
colors = cycle(['cyan', 'indigo', 'seagreen', 'yellow', 'blue', 'darkorange'])
lw = 0.3
i = 0
for (train_idx, test_idx), color in zip(cv.split(X, y), colors):
    # cv.split yields positional indices, so index the DataFrame with .iloc
    probas_ = classifier.fit(X.iloc[train_idx], y.iloc[train_idx]).predict_proba(X.iloc[test_idx])
    # Compute the ROC curve and the area under it for this fold
    fpr, tpr, thresholds = roc_curve(y.iloc[test_idx], probas_[:, 1])
    mean_tpr += np.interp(mean_fpr, fpr, tpr)  # scipy.interp is deprecated; np.interp is equivalent here
    mean_tpr[0] = 0.0
    roc_auc = auc(fpr, tpr)
    plt.plot(fpr, tpr, lw=lw, color=color,
             label='ROC fold %d (area = %0.2f)' % (i, roc_auc))
    i += 1
plt.plot([0, 1], [0, 1], linestyle='--', lw=lw, color='k')
lw = 2.2
mean_tpr /= cv.get_n_splits(X, y)
mean_tpr[-1] = 1.0
mean_auc = auc(mean_fpr, mean_tpr)
plt.plot(mean_fpr, mean_tpr, color='g', linestyle='--',
         label='Mean ROC (area = %0.2f)' % mean_auc, lw=lw)
plt.xlim([-0.05, 1.05])
plt.ylim([-0.05, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver operating characteristic per fold')
plt.legend(loc="lower right")
plt.show()
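As a cross-check on the hand-rolled ROC loop, cross_val_score can compute the per-fold AUC directly; a minimal sketch reusing the same splitter and classifier:
In [ ]:
# Sketch: per-fold ROC AUC via cross_val_score (clones the classifier internally)
from sklearn.model_selection import cross_val_score
auc_scores = cross_val_score(classifier, X, y, cv=cv, scoring='roc_auc')
print('Per-fold AUC:', auc_scores)
print('Mean AUC: %0.2f' % auc_scores.mean())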
In [ ]:
# Now treat imdb_score as a continuous target with scikit-learn's random forest regressor
from sklearn.ensemble import RandomForestRegressor
data_frame_temp = data_frame.copy()
train, test, _, _ = train_test_split(data_frame_temp, data_frame_temp['imdb_score'], test_size=0.25)
Select the features to use for training, and the target to predict.
In [ ]:
## select the features
features = data_frame.columns
train_feat = features[features != 'imdb_score']
train_results = train.imdb_score
Fit a RandomForestRegressor to the training data, then evaluate its predictions on the test set with mean squared error and explained variance.
In [ ]:
clf = RandomForestRegressor(n_jobs=-1, bootstrap=True, n_estimators=25)
clf.fit(train[train_feat], train_results);
prediction = clf.predict(test[train_feat])
print(prediction)
result = test.imdb_score - prediction
from sklearn.metrics import mean_squared_error
from sklearn.metrics import explained_variance_score
print(mean_squared_error(test.imdb_score, prediction))
print(explained_variance_score(test.imdb_score, prediction))
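A minimal sketch of predicted versus actual scores for the regressor above; a perfect model would put every point on the dashed diagonal:
In [ ]:
# Sketch: predicted vs. actual IMDb scores
plt.figure(figsize=(6, 6))
plt.scatter(test.imdb_score, prediction, alpha=0.3)
plt.plot([0, 10], [0, 10], linestyle='--', color='k')  # y = x reference line
plt.xlabel('Actual imdb_score')
plt.ylabel('Predicted imdb_score')
plt.show()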
In [ ]:
from sklearn.decomposition import PCA
pca = PCA(n_components=20)
new_train = pca.fit_transform(train[train_feat])  # fit_transform fits and projects in one step
#print(new_train)
print(sum(pca.explained_variance_ratio_))
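To judge whether 20 components is enough, a minimal sketch plots the cumulative explained variance against the component count. Note the features are not standardized, so large-scale columns such as gross and budget will dominate the leading components:
In [ ]:
# Sketch: cumulative explained variance ratio for a full PCA on the training features
pca_full = PCA().fit(train[train_feat])
plt.plot(np.cumsum(pca_full.explained_variance_ratio_))
plt.xlabel('Number of components')
plt.ylabel('Cumulative explained variance ratio')
plt.show()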
In [ ]:
clf = RandomForestRegressor(n_jobs=-1, bootstrap=True, n_estimators=25)
clf.fit(new_train, train.imdb_score);
# Transform the test features with the PCA already fitted on the training data;
# calling fit_transform here would fit a new, different projection on the test set
prediction = clf.predict(pca.transform(test[train_feat]))
print(prediction)
result = test.imdb_score - prediction
from sklearn.metrics import mean_squared_error
from sklearn.metrics import explained_variance_score
print(mean_squared_error(test.imdb_score, prediction))
print(explained_variance_score(test.imdb_score, prediction))