In [102]:
from altair import *
import csv
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from IPython.display import display # Allows the use of display() for DataFrames
%matplotlib inline
# Load the movie dataset from disk into a pandas DataFrame
df = pd.read_csv('movies.csv')
row_count, column_count = df.shape
print("Dataset has {} rows and {} columns.".format(row_count, column_count))
# Preview the first few records with the rich DataFrame display
display(df.head())
In [117]:
def histogram(data, column='Academy Awards, USA', **bin_kwds):
    """
    Create an Altair bar-chart histogram of one column of ``data``.

    Parameters
    ----------
    data : tabular data accepted by altair's ``Chart`` (the notebook passes
        a pandas DataFrame)
    column : str, optional
        Name of the column to bin on the x axis. Defaults to
        'Academy Awards, USA', the column this function was originally
        hard-coded to.
    **bin_kwds
        Passed unchanged to altair's ``Bin`` class (e.g. ``maxbins=20``).

    Returns
    -------
    The altair chart object: bars with the binned column on x and the row
    count on y.
    """
    # Use a namespaced local import rather than the star-imported names:
    # this notebook later assigns its feature matrix to a module-level
    # variable called ``X``, which shadows altair's ``X`` channel class and
    # would make this function fail once that cell has run.
    import altair as alt

    return alt.Chart(data).mark_bar().encode(
        x=alt.X(column, bin=alt.Bin(**bin_kwds)),
        y='count(*):Q'
    )
# Altair alternative for a single column: histogram(df, maxbins=20)

# Columns to plot, one count plot per subplot, left to right.
award_columns = [
    "Academy Awards, USA",
    "Screen Actors Guild Awards",
    "PGA Awards",
    "Directors Guild of America, USA",
]
# sharey=True keeps the count axis identical across the four panels so the
# bars can be compared directly.
fig, (ax1, ax2, ax3, ax4) = plt.subplots(ncols=4, figsize=(12, 6), sharey=True)
# Loop instead of four copy-pasted calls; a loop also leaves no trailing
# expression, so the cell does not echo a stray Axes repr as its output.
for award_column, award_ax in zip(award_columns, (ax1, ax2, ax3, ax4)):
    sns.countplot(x=award_column, data=df, ax=award_ax)
Out[117]:
In [104]:
from sklearn.model_selection import StratifiedShuffleSplit
# Feature matrix: every column except the title and the column used as target
X = df.drop(['Title', 'Academy Awards, USA'], axis=1)
# Target vector: the 'Academy Awards, USA' column
y = df['Academy Awards, USA']

# Ten stratified shuffles, each holding out 10% of the rows, seeded for
# reproducibility.
sss = StratifiedShuffleSplit(n_splits=10, test_size=0.1, random_state=42)
# NOTE(review): the train/test variables are rebound on every iteration, so
# after the loop they hold only the LAST of the ten splits; the earlier nine
# are printed and discarded. Confirm this is intended.
for train_ind, test_ind in sss.split(X, y):
    print("TRAIN: {} TEST: {}".format(train_ind, test_ind))
    X_train = X.iloc[train_ind]
    X_test = X.iloc[test_ind]
    y_train = y.iloc[train_ind]
    y_test = y.iloc[test_ind]
In [105]:
# Train model
from time import time
from pandas_ml import ConfusionMatrix
from sklearn.metrics import roc_auc_score
from sklearn.metrics import classification_report
from sklearn.svm import SVC
def train_classifier(clf, X_train, y_train):
    """Fit ``clf`` on the training data and print how long the fit took.

    ``clf`` is fitted in place via ``clf.fit(X_train, y_train)``; nothing is
    returned.
    """
    # Wall-clock timing around the fit call only
    fit_started = time()
    clf.fit(X_train, y_train)
    elapsed = time() - fit_started

    print("Trained model in {:.4f} seconds".format(elapsed))
def predict_labels(clf, features, target):
    """Predict with a fitted classifier and print an evaluation report.

    Prints the prediction time, the ROC AUC score and the classification
    report, then plots the normalized confusion matrix. Nothing is returned.

    Parameters
    ----------
    clf : fitted classifier exposing ``predict`` (and optionally
        ``decision_function``)
    features : feature rows to predict on
    target : true labels; must expose ``.values`` (e.g. a pandas Series)
    """
    # Time only the label prediction itself
    start = time()
    y_pred = clf.predict(features)
    end = time()

    y_true = target.values

    # ROC AUC is defined over a ranking of continuous scores. Feeding it the
    # hard class predictions collapses the ROC curve to a single operating
    # point, so use the decision values when the classifier provides them
    # and fall back to the predicted labels otherwise.
    if hasattr(clf, 'decision_function'):
        y_score = clf.decision_function(features)
    else:
        y_score = y_pred

    print("Made predictions in {:.4f} seconds.".format(end - start))
    print("AUC Score: {}".format(roc_auc_score(y_true, y_score)))
    print(classification_report(y_true, y_pred))
    plot_confusion_matrix(y_true, y_pred)
def train_predict(clf, X_train, y_train, X_test, y_test):
    """Fit ``clf`` on the training split, then report on both splits.

    Calls ``train_classifier`` to fit the model and ``predict_labels`` once
    for the training data and once for the test data. All results are
    printed/plotted by those helpers; nothing is returned.
    """
    # Indicate the classifier and the training set size
    print("Training a {} using a training set size of {}. . .".format(
        clf.__class__.__name__, len(X_train)))

    train_classifier(clf, X_train, y_train)

    # predict_labels prints its own report and returns None, so emit the
    # heading first and call it as a statement. Passing the call as a print
    # argument tangled the heading with the report and printed a stray
    # "None" afterwards.
    print("Report for training set: ")
    predict_labels(clf, X_train, y_train)
    print("Report for test set: ")
    predict_labels(clf, X_test, y_test)
def plot_confusion_matrix(y_true, y_pred):
    """Plot the normalized confusion matrix of true versus predicted labels."""
    # Build the pandas_ml confusion matrix and draw it in one expression
    ConfusionMatrix(y_true, y_pred).plot(normalized=True)
    plt.show()
# Alternative configuration:
#   SVC(C=100, kernel='sigmoid', class_weight={0: 1, 1: 9}, random_state=42)
svc_settings = {
    'C': 1,
    'kernel': 'rbf',
    'class_weight': {0: 1, 1: 9},
    'random_state': 42,
}
clf = SVC(**svc_settings)
# Fit on the training split and print reports for both splits
train_predict(clf, X_train, y_train, X_test, y_test)
In [72]:
# Search for optimal parameters
from sklearn.model_selection import GridSearchCV
# Parameters to do GridSearch on.
# SVC only uses `degree` for the polynomial kernel, so crossing it with the
# other kernels refits identical models four times each. Splitting the grid
# into two sub-grids keeps every distinct candidate (28 instead of 64).
cv_params = [
    {
        'C': [1, 10, 100, 1000],
        'kernel': ['rbf', 'linear', 'sigmoid']
    },
    {
        'C': [1, 10, 100, 1000],
        'kernel': ['poly'],
        'degree': [3, 2, 1, 4]
    }
]
# Static model parameters
ind_params = {
    'class_weight': {0: 1, 1: 9},
    'random_state': 42
}
# Initialize GridSearch with its parameters: 10-fold cross-validation,
# candidates ranked by F1, using all available cores.
optimized_SVC = GridSearchCV(estimator=SVC(**ind_params),
                             param_grid=cv_params,
                             scoring='f1',
                             cv=10,
                             n_jobs=-1)
optimized_SVC.fit(X_train, y_train)
# Full per-candidate results are available in optimized_SVC.cv_results_
print("Best score for training: {}".format(optimized_SVC.best_score_))
print("Best score parameters: {}".format(optimized_SVC.best_params_))
# score() applies the same 'f1' scorer to the held-out split
print("Score for testing: {}".format(optimized_SVC.score(X_test, y_test)))
In [106]:
# Train final model on full dataset (features X, target y)
start = time()
# Alternative configuration:
#   SVC(C=1, kernel='rbf', class_weight={0: 1, 1: 9}, random_state=42)
clf = SVC(C=100, kernel='sigmoid', class_weight={0: 1, 1: 9}, random_state=42)
clf.fit(X, y)
end = time()
print("Trained model in {:.4f} seconds".format(end - start))

# Saves model for future predictions.
# `sklearn.externals.joblib` was deprecated and later removed from
# scikit-learn; prefer the standalone package and fall back to the vendored
# copy only on old installations that lack it.
try:
    import joblib
except ImportError:
    from sklearn.externals import joblib
joblib.dump(clf, 'svc.pickle')
print("Model saved.")
# To reload the persisted model later: clf = joblib.load('svc.pickle')
In [107]:
# Read the movies to score
df_pred = pd.read_csv('movies_pred.csv')
pred_rows, pred_columns = df_pred.shape
print("Dataset has {} rows and {} columns.".format(pred_rows, pred_columns))
display(df_pred)

# The model is fed every column except the title
X_pred = df_pred.drop(['Title'], axis=1)

# Restore the persisted classifier from disk
clf_pred = joblib.load('svc.pickle')

# Time the prediction call only
start = time()
y_pred = clf_pred.predict(X_pred)
end = time()

# Print the timing, then one "title prediction" line per movie
print("Made predictions in {:.4f} seconds.".format(end - start))
print("")
print("Predictions for Best Picture:")
for title, pred in zip(df_pred['Title'], y_pred):
    print("%s %s" % (title, pred))
In [ ]: