In [ ]:
%matplotlib inline
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans
from sklearn.model_selection import train_test_split
from sklearn.metrics import explained_variance_score
from sklearn.metrics import mean_squared_error
from sklearn.metrics import precision_score
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
sns.set(color_codes=True)
sns.set(style="ticks")
# Load the raw IMDb movie metadata; keep the two multi-label text columns
# aside (they are vectorized separately below) and drop columns that cannot
# serve as model features.
data_frame_base = pd.read_csv("movie_metadata.csv")
plot_keywords_column = data_frame_base['plot_keywords']
genres_column = data_frame_base['genres']
# FIX: the positional `axis` argument to drop() was removed in pandas 2.0;
# use the `columns=` keyword and drop everything in one call.
data_frame_base = data_frame_base.drop(
    columns=['plot_keywords', 'genres', 'movie_imdb_link', 'movie_title'])
#data_frame_base.head()
In [ ]:
# Derive the binary classification target: 1 for "good" movies
# (IMDb score >= 7.5), 0 for everything below the threshold.
imdb_score_limit = 7.5
scores = data_frame_base['imdb_score']
data_frame_base['imdb_score_class'] = scores.copy()
data_frame_base.loc[scores < imdb_score_limit, 'imdb_score_class'] = 0
data_frame_base.loc[scores >= imdb_score_limit, 'imdb_score_class'] = 1
In [ ]:
from sklearn.feature_extraction.text import CountVectorizer
# Replace missing multi-label text fields with a sentinel token so the
# vectorizers below never see NaN.
plot_keywords_column = plot_keywords_column.fillna('UNKNOWN')
genres_column = genres_column.fillna('UNKNOWN')
def token(text):
    """Split a pipe-delimited field (e.g. "Action|Comedy") into tokens."""
    return text.split("|")
## For plot keywords
# Bag-of-words over the pipe-delimited keyword field; keep only tokens
# frequent enough (> 80 movies) to be useful features.
# NOTE(review): get_feature_names() was removed in scikit-learn 1.2 in favour
# of get_feature_names_out(); kept as-is to match this notebook's sklearn API.
cv = CountVectorizer(max_features=200, tokenizer=token)
plot_keywords_words = cv.fit_transform(plot_keywords_column).toarray()
words = ["Keyword_" + w for w in cv.get_feature_names()]
keywords = pd.DataFrame(plot_keywords_words, columns=words)
keys = [w for w in words if keywords[w].sum() > 80]
keywords = keywords[keys]
# BUG FIX: concatenate along columns (axis=1); the original default (axis=0)
# stacked the dummy rows *below* the data, NaN-filling both halves.
data_frame_base = pd.concat([data_frame_base, keywords], axis=1)
## For genres
# Same treatment for genres, with a higher frequency cut-off (> 150 movies).
cv = CountVectorizer(max_features=200, tokenizer=token)
genres_words = cv.fit_transform(genres_column).toarray()
words = ["Genres_" + w for w in cv.get_feature_names()]
keywords = pd.DataFrame(genres_words, columns=words)
keys = [w for w in words if keywords[w].sum() > 150]
keywords = keywords[keys]
data_frame_base = pd.concat([data_frame_base, keywords], axis=1)
In [ ]:
data_frame = data_frame_base.copy()
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
# Standardize every feature column (the two targets are left untouched):
# numeric columns are zero-filled and scaled; everything else is
# label-encoded first, then scaled.
for column_name in data_frame.columns:
    if column_name in ('imdb_score', 'imdb_score_class'):
        continue
    column = data_frame[column_name]
    if column.dtype.kind in 'biufc':
        values = column.fillna(0).to_numpy(dtype=float)
    else:
        values = LabelEncoder().fit_transform(column.fillna('UNKNOWN')).astype(float)
    # BUG FIX: the original called fit_transform twice (re-standardizing
    # already-standardized data — a value-level no-op) and then stored the
    # result as 1-element arrays via index-aligned Series.update, which
    # leaves object-dtype cells and breaks on non-unique indexes.
    # Scale once and assign the flat float column directly.
    scaled = StandardScaler().fit_transform(values.reshape(-1, 1)).ravel()
    data_frame[column_name] = scaled
In [ ]:
# 70/30 split performed per class and then recombined, so train and test
# both preserve the original good/bad class balance.
data_frame_temp = data_frame.copy()
class0 = data_frame_temp[data_frame_temp['imdb_score_class'] == 0]
class1 = data_frame_temp[data_frame_temp['imdb_score_class'] == 1]
train0, test0, ign1, ign2 = train_test_split(class0, class0['imdb_score'], test_size=0.3)
train1, test1, ign1, ign2 = train_test_split(class1, class1['imdb_score'], test_size=0.3)
train = pd.concat([train0, train1])
test = pd.concat([test0, test1])
In [ ]:
# Classification design matrices for the SVC: every column except the
# regression target and the class label; the label is the response.
r_svc_feature = 'imdb_score_class'
features_svc = data_frame.columns.drop(['imdb_score', 'imdb_score_class'])
train_svc_A = train[features_svc].to_numpy()
train_svc_R = train[r_svc_feature].to_numpy()
test_svc_A = test[features_svc].to_numpy()
test_svc_R = test[r_svc_feature].to_numpy()
In [ ]:
# Grid search for the SVC hyper-parameters (the ranges were narrowed to a
# single point each after earlier, wider runs).
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC
C_range = [0.1] #np.logspace(-1, 1, 3)
gamma_range = [0.1] #np.logspace(-1, 1, 3)
param_grid = {'gamma': gamma_range, 'C': C_range}
cv = StratifiedShuffleSplit(n_splits=5, test_size=0.2, random_state=42)
grid = GridSearchCV(SVC(), param_grid=param_grid, cv=cv)
grid.fit(train_svc_A, train_svc_R)
print("The best parameters are %s with a score of %0.2f"
      % (grid.best_params_, grid.best_score_))
In [ ]:
# Fit the tuned RBF SVC and report held-out classification metrics.
from sklearn.svm import SVC
svc_func = SVC(kernel='rbf', C=14, gamma=0.012)
svc_func.fit(train_svc_A, train_svc_R)
y_svc = svc_func.predict(test_svc_A)
report = classification_report(test_svc_R, y_svc)
print(report)
matrix = confusion_matrix(test_svc_R, y_svc)
print(matrix)
In [ ]:
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_curve, auc
from itertools import cycle
# Per-fold ROC curves plus the mean ROC curve for the tuned RBF SVC.
# BUG FIX: `from scipy import interp` was removed in SciPy >= 1.10;
# np.interp is the drop-in replacement for this 1-D use.
fig = plt.figure()
X = train_svc_A
y = train_svc_R
cv = StratifiedKFold(n_splits=6)
classifier = SVC(kernel='rbf', probability=True, C=14, gamma=0.012)
mean_tpr = 0.0
mean_fpr = np.linspace(0, 1, 100)
colors = cycle(['cyan', 'indigo', 'seagreen', 'yellow', 'blue', 'darkorange'])
lw = 0.3
# BUG FIX: the loop variables were named `train`/`test`, clobbering the
# train/test DataFrames built earlier and breaking every later cell.
for i, ((train_idx, test_idx), color) in enumerate(zip(cv.split(X, y), colors)):
    probas_ = classifier.fit(X[train_idx], y[train_idx]).predict_proba(X[test_idx])
    # Compute ROC curve and area under the curve for this fold.
    fpr, tpr, thresholds = roc_curve(y[test_idx], probas_[:, 1])
    mean_tpr += np.interp(mean_fpr, fpr, tpr)
    mean_tpr[0] = 0.0
    roc_auc = auc(fpr, tpr)
    plt.plot(fpr, tpr, lw=lw, color=color,
             label='ROC fold %d (area = %0.2f)' % (i, roc_auc))
plt.plot([0, 1], [0, 1], linestyle='--', lw=lw, color='k')
lw = 2.2
mean_tpr /= cv.get_n_splits(X, y)
mean_tpr[-1] = 1.0
mean_auc = auc(mean_fpr, mean_tpr)
plt.plot(mean_fpr, mean_tpr, color='g', linestyle='--',
         label='Mean ROC (area = %0.2f)' % mean_auc, lw=lw)
plt.xlim([-0.05, 1.05])
plt.ylim([-0.05, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.legend(loc="lower right")
plt.show()
fig.savefig('svc.png', bbox_inches='tight')
In [ ]:
# Regression design matrices for the SVR: same predictors as before, but the
# response is the raw IMDb score.
r_svr_feature = 'imdb_score'
features_svr = data_frame.columns.drop(['imdb_score', 'imdb_score_class'])
train_svr_A = train[features_svr].to_numpy()
train_svr_R = train[r_svr_feature].to_numpy()
test_svr_A = test[features_svr].to_numpy()
test_svr_R = test[r_svr_feature].to_numpy()
In [ ]:
# Grid search for SVR hyper-parameters around the previously promising region.
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVR
C_range = [100,115,130] #np.logspace(-1, 1, 3)
gamma_range = [0.003,0.004,0.006,0.010] #np.logspace(-1, 1, 3)
param_grid = {'gamma': gamma_range, 'C': C_range}
grid = GridSearchCV(SVR(epsilon=1e-3), param_grid=param_grid, cv=5)
grid.fit(train_svr_A, train_svr_R)
print("The best parameters are %s with a score of %0.2f"
      % (grid.best_params_, grid.best_score_))
In [ ]:
# Fit the tuned SVR and report regression metrics on the held-out set.
from sklearn.svm import SVR
svr_func = SVR(C=130, gamma=0.004, epsilon=1e-4)
svr_func.fit(train_svr_A, train_svr_R)
y_svr = svr_func.predict(test_svr_A)
print(y_svr)
print(test_svr_R)
print(len(y_svr[y_svr >= 7.5]))  # predictions that land in the "good" class
print("Explained variance score : " + str(explained_variance_score(test_svr_R, y_svr)))
print("Mean squared error : " + str(mean_squared_error(test_svr_R, y_svr)))
# Fraction of predictions farther than `margin` points from the true score.
margin = 0.4
y_svr_margined = [int(np.abs(actual - predicted) > margin)
                  for actual, predicted in zip(test_svr_R, y_svr)]
print("Prediction is in margin or not error : " + str(np.sum(y_svr_margined)/len(test_svr_R)))
In [ ]:
# Classification design matrices for the k-NN classifier.
r_knnc_feature = 'imdb_score_class'
features_knnc = data_frame.columns.drop(['imdb_score', 'imdb_score_class'])
train_knnc_A = train[features_knnc].to_numpy()
train_knnc_R = train[r_knnc_feature].to_numpy()
test_knnc_A = test[features_knnc].to_numpy()
test_knnc_R = test[r_knnc_feature].to_numpy()
In [ ]:
# Grid search over the neighbour count for the k-NN classifier.
from sklearn.model_selection import GridSearchCV
from sklearn import neighbors
neigh_range = [5,11,12,13,20,50,100,150]
param_grid = {'n_neighbors': neigh_range}
knn_model = neighbors.KNeighborsClassifier(weights='uniform')
grid = GridSearchCV(knn_model, param_grid=param_grid, cv=5)
grid.fit(train_knnc_A, train_knnc_R)
print("The best parameters are %s with a score of %0.2f"
      % (grid.best_params_, grid.best_score_))
In [ ]:
# Fit the k-NN classifier with the chosen neighbour count and report
# held-out classification metrics.
from sklearn import neighbors
n_neighbors = 20
knn = neighbors.KNeighborsClassifier(n_neighbors=n_neighbors, weights='uniform')
knn.fit(train_knnc_A, train_knnc_R)
y1_knn = knn.predict(test_knnc_A)
report = classification_report(test_knnc_R, y1_knn)
print(report)
matrix = confusion_matrix(test_knnc_R, y1_knn)
print(matrix)
In [ ]:
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_curve, auc
from itertools import cycle
# Per-fold ROC curves plus the mean ROC curve for the k-NN classifier.
# BUG FIX: `from scipy import interp` was removed in SciPy >= 1.10;
# np.interp is the drop-in replacement for this 1-D use.
fig = plt.figure()
X = train_knnc_A
y = train_knnc_R
cv = StratifiedKFold(n_splits=6)
classifier = neighbors.KNeighborsClassifier(n_neighbors=20)
mean_tpr = 0.0
mean_fpr = np.linspace(0, 1, 100)
colors = cycle(['cyan', 'indigo', 'seagreen', 'yellow', 'blue', 'darkorange'])
lw = 0.3
# BUG FIX: the loop variables were named `train`/`test`, clobbering the
# train/test DataFrames built earlier and breaking every later cell.
for i, ((train_idx, test_idx), color) in enumerate(zip(cv.split(X, y), colors)):
    probas_ = classifier.fit(X[train_idx], y[train_idx]).predict_proba(X[test_idx])
    # Compute ROC curve and area under the curve for this fold.
    fpr, tpr, thresholds = roc_curve(y[test_idx], probas_[:, 1])
    mean_tpr += np.interp(mean_fpr, fpr, tpr)
    mean_tpr[0] = 0.0
    roc_auc = auc(fpr, tpr)
    plt.plot(fpr, tpr, lw=lw, color=color,
             label='ROC fold %d (area = %0.2f)' % (i, roc_auc))
plt.plot([0, 1], [0, 1], linestyle='--', lw=lw, color='k')
lw = 2.2
mean_tpr /= cv.get_n_splits(X, y)
mean_tpr[-1] = 1.0
mean_auc = auc(mean_fpr, mean_tpr)
plt.plot(mean_fpr, mean_tpr, color='g', linestyle='--',
         label='Mean ROC (area = %0.2f)' % mean_auc, lw=lw)
plt.xlim([-0.05, 1.05])
plt.ylim([-0.05, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.legend(loc="lower right")
plt.show()
fig.savefig('knnc.png', bbox_inches='tight')
In [ ]:
# Regression design matrices for the k-NN regressor.
r_knnr_feature = 'imdb_score'
features_knnr = data_frame.columns.drop(['imdb_score', 'imdb_score_class'])
train_knnr_A = train[features_knnr].to_numpy()
train_knnr_R = train[r_knnr_feature].to_numpy()
test_knnr_A = test[features_knnr].to_numpy()
test_knnr_R = test[r_knnr_feature].to_numpy()
In [ ]:
# Grid search over the neighbour count for the k-NN regressor.
from sklearn.model_selection import GridSearchCV
from sklearn import neighbors
neigh_range = [16,18,20,22,30,50,100,150]
param_grid = {'n_neighbors': neigh_range}
knn_regressor = neighbors.KNeighborsRegressor(weights='uniform')
grid = GridSearchCV(knn_regressor, param_grid=param_grid, cv=5)
grid.fit(train_knnr_A, train_knnr_R)
print("The best parameters are %s with a score of %0.2f"
      % (grid.best_params_, grid.best_score_))
In [ ]:
# Fit the k-NN regressor and report regression metrics on the held-out set.
from sklearn import neighbors
n_neighbors = 50
knn = neighbors.KNeighborsRegressor(n_neighbors=n_neighbors, weights='uniform')
knn.fit(train_knnr_A, train_knnr_R)
y1_knn = knn.predict(test_knnr_A)
print(y1_knn)
print(test_knnr_R)
print(len(y1_knn[y1_knn >= 7.5]))  # predictions that land in the "good" class
print("Explained variance score : " + str(explained_variance_score(test_knnr_R, y1_knn)))
print("Mean squared error : " + str(mean_squared_error(test_knnr_R, y1_knn)))
# Fraction of predictions farther than `margin` points from the true score.
margin = 1
y_knn_margined = [int(np.abs(actual - predicted) > margin)
                  for actual, predicted in zip(test_knnr_R, y1_knn)]
print("Prediction is in margin or not error : " + str(np.sum(y_knn_margined)/len(test_knnr_R)))
In [ ]:
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_curve, auc
from itertools import cycle
from sklearn.ensemble import RandomForestClassifier
# Per-fold ROC curves plus the mean ROC curve for a random-forest classifier
# on the same classification arrays used for the k-NN classifier.
# BUG FIX: `from scipy import interp` was removed in SciPy >= 1.10;
# np.interp is the drop-in replacement for this 1-D use.
fig = plt.figure()
X = train_knnc_A
y = train_knnc_R
cv = StratifiedKFold(n_splits=6)
classifier = RandomForestClassifier(n_jobs=-1, n_estimators=25, bootstrap=True)
mean_tpr = 0.0
mean_fpr = np.linspace(0, 1, 100)
colors = cycle(['cyan', 'indigo', 'seagreen', 'yellow', 'blue', 'darkorange'])
lw = 0.3
# BUG FIX: the loop variables were named `train`/`test`, clobbering the
# train/test DataFrames built earlier.
for i, ((train_idx, test_idx), color) in enumerate(zip(cv.split(X, y), colors)):
    probas_ = classifier.fit(X[train_idx], y[train_idx]).predict_proba(X[test_idx])
    # Compute ROC curve and area under the curve for this fold.
    fpr, tpr, thresholds = roc_curve(y[test_idx], probas_[:, 1])
    mean_tpr += np.interp(mean_fpr, fpr, tpr)
    mean_tpr[0] = 0.0
    roc_auc = auc(fpr, tpr)
    plt.plot(fpr, tpr, lw=lw, color=color,
             label='ROC fold %d (area = %0.2f)' % (i, roc_auc))
plt.plot([0, 1], [0, 1], linestyle='--', lw=lw, color='k')
lw = 2.2
mean_tpr /= cv.get_n_splits(X, y)
mean_tpr[-1] = 1.0
mean_auc = auc(mean_fpr, mean_tpr)
plt.plot(mean_fpr, mean_tpr, color='g', linestyle='--',
         label='Mean ROC (area = %0.2f)' % mean_auc, lw=lw)
plt.xlim([-0.05, 1.05])
plt.ylim([-0.05, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.legend(loc="lower right")
plt.show()
fig.savefig('rfc.png', bbox_inches='tight')