IMDB Movie Analysis


In [ ]:
%matplotlib inline
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.cluster import KMeans
from sklearn.model_selection import train_test_split

from sklearn.metrics import explained_variance_score
from sklearn.metrics import mean_squared_error

from sklearn.metrics import precision_score
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score

sns.set(color_codes=True)
sns.set(style="ticks")

data_frame_base = pd.read_csv("movie_metadata.csv")
# Pull out the multi-valued text columns for separate feature extraction,
# then drop them (plus identifier columns) from the base frame.
plot_keywords_column = data_frame_base['plot_keywords']
genres_column = data_frame_base['genres']
data_frame_base = data_frame_base.drop(columns=['plot_keywords', 'genres', 'movie_imdb_link', 'movie_title'])
#data_frame_base.head()

IMDB Score Limit and IMDB Score Classes for Classification


In [ ]:
imdb_score_limit = 7.5

# Binarize imdb_score into a class label: 0 below the limit, 1 at or above it.
data_frame_base['imdb_score_class'] = data_frame_base['imdb_score'].copy()

data_frame_base.loc[data_frame_base['imdb_score_class'] < imdb_score_limit, 'imdb_score_class'] = 0
data_frame_base.loc[data_frame_base['imdb_score_class'] >= imdb_score_limit, 'imdb_score_class'] = 1
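
A quick sanity check (assuming the cell above has run) of how many titles fall into each class; the per-class split in the Training and Test Sets section below suggests the two classes are not balanced.


In [ ]:
print(data_frame_base['imdb_score_class'].value_counts())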

Introducing new features for plot_keywords and genres


In [ ]:
from sklearn.feature_extraction.text import CountVectorizer
plot_keywords_column = plot_keywords_column.fillna('UNKNOWN')
genres_column = genres_column.fillna('UNKNOWN')

def token(text):
    return(text.split("|"))

## For plot keywords
cv = CountVectorizer(max_features=200, tokenizer=token, token_pattern=None)
plot_keywords_words = cv.fit_transform(plot_keywords_column).toarray()

words = cv.get_feature_names_out()  # get_feature_names() was removed in newer scikit-learn
words = ["Keyword_" + w for w in words]

keywords = pd.DataFrame(plot_keywords_words, columns=words)
keys = [w for w in words if keywords[w].sum() > 80]  # keep only the most frequent keywords
keywords = keywords[keys]

# Join the keyword indicator columns onto the base frame (axis=1, aligned on the row index).
data_frame_base = pd.concat([data_frame_base, keywords], axis=1)

## For genres
cv = CountVectorizer(max_features=200, tokenizer=token, token_pattern=None)
genres_words = cv.fit_transform(genres_column).toarray()

words = cv.get_feature_names_out()
words = ["Genres_" + w for w in words]

genres = pd.DataFrame(genres_words, columns=words)
keys = [w for w in words if genres[w].sum() > 150]  # keep only the most frequent genres
genres = genres[keys]

data_frame_base = pd.concat([data_frame_base, genres], axis=1)
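
A quick check (assuming the two cells above have run) that the keyword and genre indicator columns were joined as new columns rather than appended as new rows:


In [ ]:
print(data_frame_base.shape)
print([c for c in data_frame_base.columns if c.startswith(('Keyword_', 'Genres_'))][:10])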

Preprocessing


In [ ]:
data_frame = data_frame_base.copy()

from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler


for column_name in data_frame.columns:
    # Leave the regression target and the class label untouched.
    if column_name == 'imdb_score': continue
    if column_name == 'imdb_score_class': continue

    le = LabelEncoder()
    scaler = StandardScaler()

    column = data_frame[column_name]
    is_numeric = column.dtype.kind in 'biufc'
    normalized_column = column.fillna(0) if is_numeric else column.fillna('UNKNOWN')

    if not is_numeric:
        # Encode categorical values as integers before scaling.
        normalized_column = pd.Series(le.fit_transform(normalized_column), index=column.index)

    # Standard-scale each feature column exactly once (zero mean, unit variance).
    scaled = scaler.fit_transform(normalized_column.astype(float).values.reshape(-1, 1))
    data_frame[column_name] = scaled.ravel()

Training and Test Sets


In [ ]:
data_frame_temp = data_frame.copy()
# Split each class separately so that train and test keep the same class proportions.
class0 = data_frame_temp[data_frame_temp['imdb_score_class'] == 0]
class1 = data_frame_temp[data_frame_temp['imdb_score_class'] == 1]
train0, test0 = train_test_split(class0, test_size=0.3)
train1, test1 = train_test_split(class1, test_size=0.3)
train = pd.concat([train0, train1])
test = pd.concat([test0, test1])
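
Because each class is split separately and then concatenated, train and test should keep the original class proportions; a short check (assuming the cell above has run):


In [ ]:
print(train['imdb_score_class'].value_counts(normalize=True))
print(test['imdb_score_class'].value_counts(normalize=True))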

Data Preparation Is Finished

Apply ML Methods


SVM Classification


In [ ]:
features_svc = data_frame.columns
features_svc = features_svc[features_svc != 'imdb_score']
features_svc = features_svc[features_svc != 'imdb_score_class']
r_svc_feature = 'imdb_score_class'

train_svc_A = np.array(train[features_svc])
train_svc_R = np.array(train[r_svc_feature])
test_svc_A = np.array(test[features_svc])
test_svc_R = np.array(test[r_svc_feature])

Tuning Hyperparameters C and gamma

In [ ]:
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC

C_range = [0.1]      # single value for a quick run; e.g. np.logspace(-1, 1, 3) for a wider search
gamma_range = [0.1]  # single value for a quick run; e.g. np.logspace(-1, 1, 3) for a wider search
param_grid = dict(gamma=gamma_range, C=C_range)
cv = StratifiedShuffleSplit(n_splits=5, test_size=0.2, random_state=42)
grid = GridSearchCV(SVC(), param_grid=param_grid, cv=cv)
grid.fit(train_svc_A, train_svc_R)

print("The best parameters are %s with a score of %0.2f" % (grid.best_params_, grid.best_score_))

In [ ]:
from sklearn.svm import SVC
svc_func = SVC(kernel='rbf',C=14,gamma = 0.012)
svc_func.fit(train_svc_A,train_svc_R)
y_svc = svc_func.predict(test_svc_A)

print(classification_report(test_svc_R,y_svc))
print(confusion_matrix(test_svc_R,y_svc))
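
accuracy_score is imported at the top of the notebook but never used; a one-line addition (assuming the cell above has run) to summarize the SVC results:


In [ ]:
print("Accuracy : " + str(accuracy_score(test_svc_R, y_svc)))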

In [ ]:
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_curve, auc
from itertools import cycle

fig = plt.figure()

X= train_svc_A
y= train_svc_R

cv = StratifiedKFold(n_splits=6)
classifier = SVC(kernel='rbf', probability=True, C=14,gamma=0.012)

mean_tpr = 0.0
mean_fpr = np.linspace(0, 1, 100)

colors = cycle(['cyan', 'indigo', 'seagreen', 'yellow', 'blue', 'darkorange'])
lw = 0.3

i = 0
# Use index variable names that do not shadow the train/test DataFrames defined earlier.
for (train_idx, test_idx), color in zip(cv.split(X, y), colors):
    probas_ = classifier.fit(X[train_idx], y[train_idx]).predict_proba(X[test_idx])
    # Compute the ROC curve and the area under the curve for this fold
    fpr, tpr, thresholds = roc_curve(y[test_idx], probas_[:, 1])
    mean_tpr += np.interp(mean_fpr, fpr, tpr)
    mean_tpr[0] = 0.0
    roc_auc = auc(fpr, tpr)
    plt.plot(fpr, tpr, lw=lw, color=color,
             label='ROC fold %d (area = %0.2f)' % (i, roc_auc))

    i += 1
plt.plot([0, 1], [0, 1], linestyle='--', lw=lw, color='k')

lw=2.2
mean_tpr /= cv.get_n_splits(X, y)
mean_tpr[-1] = 1.0
mean_auc = auc(mean_fpr, mean_tpr)
plt.plot(mean_fpr, mean_tpr, color='g', linestyle='--',
         label='Mean ROC (area = %0.2f)' % mean_auc, lw=lw)

plt.xlim([-0.05, 1.05])
plt.ylim([-0.05, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.legend(loc="lower right")
plt.show()

fig.savefig('svc.png',bbox_inches='tight')

SVM Regression


In [ ]:
features_svr = data_frame.columns
features_svr = features_svr[features_svr != 'imdb_score']
features_svr = features_svr[features_svr != 'imdb_score_class']
r_svr_feature = 'imdb_score'

train_svr_A = np.array(train[features_svr])
train_svr_R = np.array(train[r_svr_feature])
test_svr_A = np.array(test[features_svr])
test_svr_R = np.array(test[r_svr_feature])

Tuning Hyperparameters C and gamma

In [ ]:
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVR

C_range = [100,115,130] #np.logspace(-1, 1, 3)
gamma_range = [0.003,0.004,0.006,0.010] #np.logspace(-1, 1, 3)
param_grid = dict(gamma=gamma_range, C=C_range)

grid = GridSearchCV(SVR(epsilon=1e-3), param_grid=param_grid, cv=5)
grid.fit(train_svr_A, train_svr_R)

print("The best parameters are %s with a score of %0.2f" % (grid.best_params_, grid.best_score_))

In [ ]:
from sklearn.svm import SVR
svr_func= SVR(C=130,gamma=0.004,epsilon=1e-4)
svr_func.fit(train_svr_A,train_svr_R)
y_svr = svr_func.predict(test_svr_A)

print("Predicted scores : ", y_svr)
print("Actual scores    : ", test_svr_R)
print("Predictions >= 7.5 : " + str(len(y_svr[y_svr >= 7.5])))
print("Explained variance score : " + str(explained_variance_score(test_svr_R, y_svr)))
print("Mean squared error : " + str(mean_squared_error(test_svr_R, y_svr)))

# Fraction of predictions that miss the true score by more than the margin.
margin = 0.4
y_svr_margined = [(1 if np.abs(x - y) > margin else 0) for x, y in zip(test_svr_R, y_svr)]
print("Fraction of predictions outside the margin : " + str(np.sum(y_svr_margined) / len(test_svr_R)))

KNN Classification


In [ ]:
features_knnc = data_frame.columns
features_knnc = features_knnc[features_knnc != 'imdb_score']
features_knnc = features_knnc[features_knnc != 'imdb_score_class']
r_knnc_feature = 'imdb_score_class'

train_knnc_A = np.array(train[features_knnc])
train_knnc_R = np.array(train[r_knnc_feature])
test_knnc_A = np.array(test[features_knnc])
test_knnc_R = np.array(test[r_knnc_feature])

Selecting number of neighbors

In [ ]:
from sklearn.model_selection import GridSearchCV
from sklearn import neighbors

neigh_range = [5,11,12,13,20,50,100,150]
param_grid = dict(n_neighbors=neigh_range)

grid = GridSearchCV(neighbors.KNeighborsClassifier(weights='uniform'), param_grid=param_grid, cv=5)
grid.fit(train_knnc_A, train_knnc_R)

print("The best parameters are %s with a score of %0.2f" % (grid.best_params_, grid.best_score_))

In [ ]:
from sklearn import neighbors
n_neighbors=20
knn=neighbors.KNeighborsClassifier(n_neighbors,weights='uniform')
knn.fit(train_knnc_A,train_knnc_R)
y1_knn=knn.predict(test_knnc_A)


print(classification_report(test_knnc_R,y1_knn))
print(confusion_matrix(test_knnc_R,y1_knn))

In [ ]:
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_curve, auc
from itertools import cycle

fig = plt.figure()

X= train_knnc_A
y= train_knnc_R

cv = StratifiedKFold(n_splits=6)
classifier = neighbors.KNeighborsClassifier(n_neighbors=20)

mean_tpr = 0.0
mean_fpr = np.linspace(0, 1, 100)

colors = cycle(['cyan', 'indigo', 'seagreen', 'yellow', 'blue', 'darkorange'])
lw = 0.3

i = 0
# Use index variable names that do not shadow the train/test DataFrames defined earlier.
for (train_idx, test_idx), color in zip(cv.split(X, y), colors):
    probas_ = classifier.fit(X[train_idx], y[train_idx]).predict_proba(X[test_idx])
    # Compute the ROC curve and the area under the curve for this fold
    fpr, tpr, thresholds = roc_curve(y[test_idx], probas_[:, 1])
    mean_tpr += np.interp(mean_fpr, fpr, tpr)
    mean_tpr[0] = 0.0
    roc_auc = auc(fpr, tpr)
    plt.plot(fpr, tpr, lw=lw, color=color,
             label='ROC fold %d (area = %0.2f)' % (i, roc_auc))

    i += 1
plt.plot([0, 1], [0, 1], linestyle='--', lw=lw, color='k')

lw=2.2
mean_tpr /= cv.get_n_splits(X, y)
mean_tpr[-1] = 1.0
mean_auc = auc(mean_fpr, mean_tpr)
plt.plot(mean_fpr, mean_tpr, color='g', linestyle='--',
         label='Mean ROC (area = %0.2f)' % mean_auc, lw=lw)

plt.xlim([-0.05, 1.05])
plt.ylim([-0.05, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.legend(loc="lower right")
plt.show()

fig.savefig('knnc.png',bbox_inches='tight')

KNN Regression


In [ ]:
features_knnr = data_frame.columns
features_knnr = features_knnr[features_knnr != 'imdb_score']
features_knnr = features_knnr[features_knnr != 'imdb_score_class']
r_knnr_feature = 'imdb_score'

train_knnr_A = np.array(train[features_knnr])
train_knnr_R = np.array(train[r_knnr_feature])
test_knnr_A = np.array(test[features_knnr])
test_knnr_R = np.array(test[r_knnr_feature])

Selecting number of neighbors

In [ ]:
from sklearn.model_selection import GridSearchCV
from sklearn import neighbors

neigh_range = [16,18,20,22,30,50,100,150]
param_grid = dict(n_neighbors=neigh_range)

grid = GridSearchCV(neighbors.KNeighborsRegressor(weights='uniform'), param_grid=param_grid, cv=5)
grid.fit(train_knnr_A, train_knnr_R)

print("The best parameters are %s with a score of %0.2f" % (grid.best_params_, grid.best_score_))

In [ ]:
from sklearn import neighbors
n_neighbors=50
knn=neighbors.KNeighborsRegressor(n_neighbors,weights='uniform')
knn.fit(train_knnr_A,train_knnr_R)
y1_knn=knn.predict(test_knnr_A)

print("Predicted scores : ", y1_knn)
print("Actual scores    : ", test_knnr_R)
print("Predictions >= 7.5 : " + str(len(y1_knn[y1_knn >= 7.5])))
print("Explained variance score : " + str(explained_variance_score(test_knnr_R, y1_knn)))
print("Mean squared error : " + str(mean_squared_error(test_knnr_R, y1_knn)))

# Fraction of predictions that miss the true score by more than the margin.
margin = 1
y_knn_margined = [(1 if np.abs(x - y) > margin else 0) for x, y in zip(test_knnr_R, y1_knn)]
print("Fraction of predictions outside the margin : " + str(np.sum(y_knn_margined) / len(test_knnr_R)))

Random Forest Classifier ROC Curve


In [ ]:
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_curve, auc
from itertools import cycle
from sklearn.ensemble import RandomForestClassifier

fig = plt.figure()

X= train_knnc_A
y= train_knnc_R

cv = StratifiedKFold(n_splits=6)
classifier = RandomForestClassifier(n_jobs=-1,n_estimators =25,bootstrap=True)

mean_tpr = 0.0
mean_fpr = np.linspace(0, 1, 100)

colors = cycle(['cyan', 'indigo', 'seagreen', 'yellow', 'blue', 'darkorange'])
lw = 0.3

i = 0
# Use index variable names that do not shadow the train/test DataFrames defined earlier.
for (train_idx, test_idx), color in zip(cv.split(X, y), colors):
    probas_ = classifier.fit(X[train_idx], y[train_idx]).predict_proba(X[test_idx])
    # Compute the ROC curve and the area under the curve for this fold
    fpr, tpr, thresholds = roc_curve(y[test_idx], probas_[:, 1])
    mean_tpr += np.interp(mean_fpr, fpr, tpr)
    mean_tpr[0] = 0.0
    roc_auc = auc(fpr, tpr)
    plt.plot(fpr, tpr, lw=lw, color=color,
             label='ROC fold %d (area = %0.2f)' % (i, roc_auc))

    i += 1
plt.plot([0, 1], [0, 1], linestyle='--', lw=lw, color='k')

lw=2.2
mean_tpr /= cv.get_n_splits(X, y)
mean_tpr[-1] = 1.0
mean_auc = auc(mean_fpr, mean_tpr)
plt.plot(mean_fpr, mean_tpr, color='g', linestyle='--',
         label='Mean ROC (area = %0.2f)' % mean_auc, lw=lw)

plt.xlim([-0.05, 1.05])
plt.ylim([-0.05, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.legend(loc="lower right")
plt.show()

fig.savefig('rfc.png',bbox_inches='tight')
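
For parity with the SVC and KNN cells above, a minimal sketch (same features, same split, and the same hyperparameters as in the ROC cell) that fits the random forest once on the full training split and prints the usual reports:


In [ ]:
rfc = RandomForestClassifier(n_jobs=-1, n_estimators=25, bootstrap=True)
rfc.fit(train_knnc_A, train_knnc_R)
y_rfc = rfc.predict(test_knnc_A)

print(classification_report(test_knnc_R, y_rfc))
print(confusion_matrix(test_knnc_R, y_rfc))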