In [102]:

    
from altair import *
import csv
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from IPython.display import display # Allows the use of display() for DataFrames
%matplotlib inline

# Open csv file and read rows into a pandas dataframe
df = pd.read_csv('movies.csv')
print "Dataset has {} rows and {} columns.".format(*df.shape)
display(df.head())









    



Dataset has 108 rows and 5 columns.






    






  
    
      
      Title
      Screen Actors Guild Awards
      PGA Awards
      Directors Guild of America, USA
      Academy Awards, USA
    
  
  
    
      0
      Michael Clayton
      0
      0
      0
      0
    
    
      1
      Chocolat
      1
      0
      0
      0
    
    
      2
      In The Bedroom
      0
      0
      0
      0
    
    
      3
      Life of Pi
      0
      0
      0
      0
    
    
      4
      Sideways
      1
      0
      0
      0

Data Visualization



In [117]:

    
def histogram(data, **bin_kwds):
    """
    Create a Histogram of a 1-dimensional array or series of data
    All parameters are passed to the altair's ``Bin`` class
    """
    return Chart(data).mark_bar().encode(
               x=X('Academy Awards, USA', bin=Bin(**bin_kwds)),
               y='count(*):Q'
           )

#histogram(df, maxbins=20)

fig, (ax1, ax2, ax3, ax4) = plt.subplots(ncols=4, figsize=(12, 6), sharey=True)
sns.countplot(x="Academy Awards, USA", data=df, ax=ax1)
sns.countplot(x="Screen Actors Guild Awards", data=df, ax=ax2)
sns.countplot(x="PGA Awards", data=df, ax=ax3)
sns.countplot(x="Directors Guild of America, USA", data=df, ax=ax4)









    Out[117]:





<matplotlib.axes._subplots.AxesSubplot at 0x12117ba8>

Preprocessing



In [104]:

    
from sklearn.model_selection import StratifiedShuffleSplit

X = df.drop(['Title', 'Academy Awards, USA'], axis=1, inplace=False)
y = df['Academy Awards, USA']

sss = StratifiedShuffleSplit(n_splits=10, test_size=0.1, random_state=42)
for train_ind, test_ind in sss.split(X, y):
    print "TRAIN:", train_ind, "TEST:", test_ind
    X_train, X_test = X.iloc[train_ind], X.iloc[test_ind]
    y_train, y_test = y.iloc[train_ind], y.iloc[test_ind]









    



TRAIN: [ 60  30  38  26  67  74  86 105  10  84  19  25  75  32  54  27  71  62
  34  46  14   1   6  95  70  77  85  11  61  40  33  29  37   7  48  47
  43  66 101  91  13  53  99   3  73  63  96  28   2 107  42  89   9  88
  68  87  57  76  94  64 103  81  21  56  51  31  65  23 106  44  58  20
 102  36   0  45   4  39  90  82  72 100  52  50  22  41  49  18  16  79
   5  35  55  93  78  12   8] TEST: [ 98  80 104  15  17  69  59  92  24  83  97]
TRAIN: [ 55   2  12  83  31  67  68  88  13  53  97   6 105  38  95  99  54  58
 106 101  85  17  45  47  37  15 100  22   4 103  61  20  59   0  56  28
  62  70  64  65  51  71  63  14  30  87  90  80  50  44  92  42  34  48
   1  86  74  52   9  79  93   5  21  82  23  24  25  35   3  46  41 102
  89  75 104  84  49  98  26  39  32   7  29  91  57  60   8 107  72  16
  69  76  36  40  73  10  33] TEST: [43 96 78 81 11 94 77 27 66 18 19]
TRAIN: [ 27 106  73  44  21  75  41  31  37  33   6  58  82  51  74   3  77  81
 104  55  69  56  14  91  48  99 103  45  90  60  62  50  36  89   2  59
  84  20  22  25  23  52  97  16  28  79  88 107  40   5  83  19  61   1
  34  76  30   4  87  85  35  94  46  29  80  86  64  43  66  63  92  95
 105  18  15  24  10  72  57  47  11  38  17  98   7  49  42  67  13  71
  54  96  78   8  32  93 102] TEST: [ 68  39  70 101  12  53   9   0  26  65 100]
TRAIN: [ 32  57   8  82  11  80  99   2  20   4  69   0  13  93  66  17  45  42
  51  77  83  68  61  59  33   1  26  30  73  39  81  25  75  62  58  76
  98 102   9  23  91  54 105  24  35  19  27  31  40  90  36 101   6 107
  48  79  47  96  64  12  41  74  14  89  94  18  95  78  53  85 106   7
  15  38  44  29  56  60  34 104  86  16 100  87  84  10  67  22  46  88
   3  55  21  97  72  52  92] TEST: [ 71  65  63  49   5  28  70  50 103  43  37]
TRAIN: [ 62  23  10   4  17  43  19  84 105  72  33  92   1  44  36  79  85  40
  47  91  88  18  34  20  70   3  51  71  31  78  74  94   6  14  57  38
  27  98  58  75  61  90   5  42  82   0   2  54  49 101 102  21  59  89
  11  97  28  48  46  99  41  53  26  73 106  35  15  83  24 103  12  80
 107  56  77   9  39  93  65  45  32  60  76  37  69 104   7  68  66  86
  95  22  25  64  67  16   8] TEST: [ 81  63  87  29  96 100  50  13  55  30  52]
TRAIN: [  2  92  73  48 103  97  46 101   3  13  90   1  85  54  26  52  61  59
 106  42  56   9  12  60  76  88  84  34  98  66  49  70  82  10  67  33
  86  18  28  94  53   5  24  96  22  63  68  77   7  50  39  71  20  40
  69  87  65  95  14  19  72  27  79  57  38  78  47   8 102  35   0  17
  74 100  75  25  99  58  31  80  55  81  45  91  21  16  11  89  29  51
  44  15  83   6   4  36  93] TEST: [ 32  41 104  23  43 105  37  64 107  30  62]
TRAIN: [ 62  21  42  24  16  61  94  26  69  17  87  78  31  85  51   7   5  65
 105  32  14  76  84  60  89  10  55  54  81  92  29  86  72  58 107  15
  36  33  74  95  53  91   4 101  64 102  63  48   8  20  79  49  30  68
 106  46  98  34  59  83  39  56  82  25   2  23  99  73 100  43  66  44
  38  37   9  67  28   3  40  90  27  47  57  35  80  19  45  50  41  71
  93 103  96  18  13  70  88] TEST: [  0  97  11   6  12 104  22  75  77  52   1]
TRAIN: [ 78  58  99  30  96  80  70  54   1  81  25  41 102   0  61  29  84  10
  95  77  32  48  91  31  82  63  52  74  37  72  89  38  44  51  28  42
  97  65  19 103 105  21  17  69 101 107  53   8  86  75  68  24  62  43
  35  11   9  92  12   5  49  64  66 100  36  23  98  45  18   7  40  87
  14 106  90  13  83  47  79  94  59  15  88  56  67  27 104   6  34  22
  71   3  85  39  46  26  60] TEST: [ 4 57 50 76 73 33 93  2 20 55 16]
TRAIN: [ 40  84  22 101  52  72  91  77  74  39  38  56   6  80  63   9   8  31
  30  71 102  11  41 107  65  26   1  33  57  70  50  94  81  88   0  12
  69  60  97  15  49  25  62  93  87  16  36  73   7  86  47  18  95  35
  89  29  79  42  67  27   5  37  54  34  20  13  96  46   2  55  19  64
  76  92 105  21  75 104  10  43  28  66  98  78  48  61  58 100  53  59
  23  85  82  24  45   4  68] TEST: [ 90 103  17  32  51  83  44   3  99 106  14]
TRAIN: [ 93  29   0 101  83   5  61  13  99  84  94 104  34  26  91 100  58  79
   4  23  38  59  64  97  81  30  68  51  88  96  24 106  92  85  67   7
   6  27  55  37  71  90  31  22  89  21  86  41  76  48  36  74  72   8
 102   9  54  25  65  10  40  57 105  80  18  45  60  19  33  52  12  95
  98  35  56  66  14   2  20  28  62  11  53  50  73  82   3  39  44  78
 107  63  17  15  69  75  32] TEST: [ 70  47  49   1  43  16  77  87 103  46  42]

Modelling & Evaluation



In [105]:

    
# Train model
from time import time
from pandas_ml import ConfusionMatrix
from sklearn.metrics import roc_auc_score
from sklearn.metrics import classification_report
from sklearn.svm import SVC

def train_classifier(clf, X_train, y_train):
    ''' Fits a classifier to the training data. '''
    
    # Start the clock, train the classifier, then stop the clock
    start = time()
    clf.fit(X_train, y_train)
    end = time()
    
    # Print the results
    print "Trained model in {:.4f} seconds".format(end - start)

def predict_labels(clf, features, target):
    ''' Makes predictions using a fit classifier based on F1 score. '''
    
    # Start the clock, make predictions, then stop the clock
    start = time()
    y_pred = clf.predict(features)
    end = time()
    
    # Print and return results
    print "Made predictions in {:.4f} seconds.".format(end - start)
    print "AUC Score:", roc_auc_score(target.values, y_pred)
    print classification_report(target.values, y_pred)
    plot_confusion_matrix(target.values, y_pred)


def train_predict(clf, X_train, y_train, X_test, y_test):
    ''' Train and predict using a classifer based on F1 score. '''
    
    # Indicate the classifier and the training set size
    print "Training a {} using a training set size of {}. . .".format(clf.__class__.__name__, len(X_train))
    
    # Train the classifier
    train_classifier(clf, X_train, y_train)
    
    # Print the results of prediction for both training and testing
    print "Report for training set: ", predict_labels(clf, X_train, y_train)
    print "Report for test set: ", predict_labels(clf, X_test, y_test)
    
def plot_confusion_matrix(y_true, y_pred):
    cm = ConfusionMatrix(y_true, y_pred)
    cm.plot(normalized=True)
    plt.show()
    
#clf = SVC(C=100, kernel='sigmoid', class_weight={0: 1, 1: 9}, random_state=42)
clf = SVC(C=1, kernel='rbf', class_weight={0: 1, 1: 9}, random_state=42)

train_predict(clf, X_train, y_train, X_test, y_test)









    



Training a SVC using a training set size of 97. . .
Trained model in 0.0020 seconds
Report for training set:  Made predictions in 0.0010 seconds.
AUC Score: 0.845137420719
             precision    recall  f1-score   support

          0       0.97      0.87      0.92        86
          1       0.45      0.82      0.58        11

avg / total       0.91      0.87      0.88        97







    












    



None
Report for test set:  Made predictions in 0.0000 seconds.
AUC Score: 1.0
             precision    recall  f1-score   support

          0       1.00      1.00      1.00        10
          1       1.00      1.00      1.00         1

avg / total       1.00      1.00      1.00        11







    












    



None



In [72]:

    
# Search for optimal parameters
from sklearn.model_selection import GridSearchCV

# Parameters to do GridSearch on
cv_params = {
            'C': [1, 10, 100, 1000],
            'kernel': ['rbf', 'linear', 'poly', 'sigmoid'],
            'degree': [3, 2, 1, 4]
            }

# Static model parameters
ind_params = {
            'class_weight': {0: 1, 1: 9},
            'random_state': 42
             }

# Initialize GridSearch with its parameters
optimized_SVC = GridSearchCV(estimator=SVC(**ind_params), 
                             param_grid=cv_params, 
                             scoring='f1',
                             cv=10,
                             n_jobs=-1)

optimized_SVC.fit(X_train, y_train)
#optimized_SVC.cv_results_
print "Best score for training:", optimized_SVC.best_score_
print "Best score parameters:", optimized_SVC.best_params_
print "Score for testing:", optimized_SVC.score(X_test, y_test)









    



Best score for training: 0.482474226804
Best score parameters: {'kernel': 'sigmoid', 'C': 100, 'degree': 3}
Score for testing: 1.0



In [106]:

    
# Train final model on full dataset
start = time()
#clf = SVC(C=1, kernel='rbf', class_weight={0: 1, 1: 9}, random_state=42)
clf = SVC(C=100, kernel='sigmoid', class_weight={0: 1, 1: 9}, random_state=42)
clf.fit(X, y)
end = time()

print "Trained model in {:.4f} seconds".format(end - start)

# Saves model for future predictions
from sklearn.externals import joblib
joblib.dump(clf, 'svc.pickle')
print "Model saved."

# Load model
#clf = joblib.load('filename.pickle')









    



Trained model in 0.0020 seconds
Model saved.



In [107]:

    
# Predict new labels
df_pred = pd.read_csv('movies_pred.csv')
print "Dataset has {} rows and {} columns.".format(*df_pred.shape)
display(df_pred)

X_pred = df_pred.drop(['Title'], axis=1, inplace=False)

# Load model
clf_pred = joblib.load('svc.pickle')
start = time()
y_pred = clf_pred.predict(X_pred)
end = time()

# Print and return results
print "Made predictions in {:.4f} seconds.".format(end - start)
print ""
print "Predictions for Best Picture:"
for title, pred in zip(df_pred['Title'], y_pred):
    print title, pred









    



Dataset has 9 rows and 4 columns.






    






  
    
      
      Title
      Screen Actors Guild Awards
      PGA Awards
      Directors Guild of America, USA
    
  
  
    
      0
      Moonlight
      1
      0
      0
    
    
      1
      Manchester by the Sea
      0
      0
      0
    
    
      2
      Fences
      1
      0
      0
    
    
      3
      Lion
      0
      0
      1
    
    
      4
      Hacksaw Ridge
      1
      0
      0
    
    
      5
      Hidden Figures
      1
      0
      0
    
    
      6
      La La Land
      1
      1
      1
    
    
      7
      Hell or High Water
      0
      0
      0
    
    
      8
      Arrival
      0
      0
      0
    
  








    



Made predictions in 0.0000 seconds.

Predictions for Best Picture:
Moonlight 0
Manchester by the Sea 0
Fences 0
Lion 1
Hacksaw Ridge 0
Hidden Figures 0
La La Land 1
Hell or High Water 0
Arrival 0



In [ ]:

	Title	Screen Actors Guild Awards
0	Michael Clayton	0
1	Chocolat	1
2	In The Bedroom	0
3	Life of Pi	0
4	Sideways	1

	Title	Screen Actors Guild Awards	PGA Awards	Directors Guild of America, USA
0	Moonlight	1	0	0
1	Manchester by the Sea	0	0	0
2	Fences	1	0	0
3	Lion	0	0	1
4	Hacksaw Ridge	1	0	0
5	Hidden Figures	1	0	0
6	La La Land	1	1	1
7	Hell or High Water	0	0	0
8	Arrival	0	0	0