Milestone 3: Genre Prediction Using Standard ML Classifiers


In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib
%matplotlib inline

from sklearn.model_selection import KFold
from sklearn.ensemble import RandomForestClassifier as RFC
from sklearn.ensemble import AdaBoostClassifier as AdaBoost
from sklearn.multiclass import OneVsRestClassifier
from sklearn.svm import SVC
from sklearn.metrics import hamming_loss
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import Imputer
from sklearn.decomposition import PCA
from sklearn import preprocessing

from stop_words import get_stop_words
from nltk.stem.porter import PorterStemmer
from nltk.stem.snowball import SnowballStemmer
from gensim import corpora, models
import gensim

import string
import time
from datetime import datetime
from __future__ import division

Let's begin by loading and examining our raw dataset, containing data obtained through the TMDB API and saved previously as a CSV file.


In [5]:
%cd ./Train_Data_Version_1/
features = pd.read_csv('features_V1.csv')


In [3]:
features.head(5)


Out[3]:
id imdb_id title original_title release_date overview tagline budget revenue popularity ... spoken_languages_ur spoken_languages_uz spoken_languages_vi spoken_languages_wo spoken_languages_xh spoken_languages_xx spoken_languages_yi spoken_languages_za spoken_languages_zh spoken_languages_zu
0 10895 tt0032910 Pinocchio Pinocchio 1940-02-23 Lonely toymaker Geppetto has his wishes answer... For anyone who has ever wished upon a star. 2600000 84300000 2.418732 ... 0 0 0 0 0 0 0 0 0 0
1 223 tt0032976 Rebecca Rebecca 1940-04-12 A self-conscious bride is tormented by the mem... The shadow of this woman darkened their love. 1288000 6000000 1.583448 ... 0 0 0 0 0 0 0 0 0 0
2 756 tt0032455 Fantasia Fantasia 1940-11-13 Fantasia is the adventurous 1940 experiment fr... Hear the pictures! See the music! 2280000 83320000 1.498824 ... 0 0 0 0 0 0 0 0 0 0
3 981 tt0032904 The Philadelphia Story The Philadelphia Story 1940-12-01 Philadelphia heiress Tracy Lord throws out her... Broadway's howling year-run comedy hit of the ... 0 0 1.113673 ... 0 0 0 0 0 0 0 0 0 0
4 914 tt0032553 The Great Dictator The Great Dictator 1940-10-15 Dictator Adenoid Hynkel tries to expand his em... Once again - the whole world laughs! 2000000 11000000 1.005436 ... 0 0 0 0 0 0 0 0 0 0

5 rows × 521 columns

Next we'll load our previously dummy-coded genre labels and verify they look as expected.


In [4]:
labels = pd.read_csv('multilabels.csv')

In [5]:
labels.head(5)


Out[5]:
Action Drama Comedy Family Romance Documentary Horror
0 0 0 0 1 0 0 0
1 1 1 0 0 1 0 0
2 1 1 0 1 0 0 0
3 0 0 1 0 1 0 0
4 0 0 1 0 0 0 0

Now, we begin preparing the data for model fitting. The first step in this process is to remove or reformat features that are not suitable for modeling, i.e. those that are not numeric (ID and free-text fields such as the title, overview, and tagline).


In [6]:
#delete text only data
features = features.drop(features.columns[[0, 1, 2, 3, 5, 6, 12]], axis=1)
features.head(5)


Out[6]:
release_date budget revenue popularity vote_count vote_average runtime adult n_actors n_crew ... spoken_languages_ur spoken_languages_uz spoken_languages_vi spoken_languages_wo spoken_languages_xh spoken_languages_xx spoken_languages_yi spoken_languages_za spoken_languages_zh spoken_languages_zu
0 1940-02-23 2600000 84300000 2.418732 933 6.8 88.0 False 12.0 75.0 ... 0 0 0 0 0 0 0 0 0 0
1 1940-04-12 1288000 6000000 1.583448 271 7.6 130.0 False 25.0 37.0 ... 0 0 0 0 0 0 0 0 0 0
2 1940-11-13 2280000 83320000 1.498824 615 7.1 124.0 False 7.0 24.0 ... 0 0 0 0 0 0 0 0 0 0
3 1940-12-01 0 0 1.113673 160 7.6 112.0 False 28.0 13.0 ... 0 0 0 0 0 0 0 0 0 0
4 1940-10-15 2000000 11000000 1.005436 563 8.0 125.0 False 59.0 22.0 ... 0 0 0 0 0 0 0 0 0 0

5 rows × 514 columns


In [7]:
#convert release date string to numeric year
rd = features['release_date'].str[:4]
rd = pd.to_numeric(rd)

features['release_date'] = rd
features.head()


Out[7]:
release_date budget revenue popularity vote_count vote_average runtime adult n_actors n_crew ... spoken_languages_ur spoken_languages_uz spoken_languages_vi spoken_languages_wo spoken_languages_xh spoken_languages_xx spoken_languages_yi spoken_languages_za spoken_languages_zh spoken_languages_zu
0 1940.0 2600000 84300000 2.418732 933 6.8 88.0 False 12.0 75.0 ... 0 0 0 0 0 0 0 0 0 0
1 1940.0 1288000 6000000 1.583448 271 7.6 130.0 False 25.0 37.0 ... 0 0 0 0 0 0 0 0 0 0
2 1940.0 2280000 83320000 1.498824 615 7.1 124.0 False 7.0 24.0 ... 0 0 0 0 0 0 0 0 0 0
3 1940.0 0 0 1.113673 160 7.6 112.0 False 28.0 13.0 ... 0 0 0 0 0 0 0 0 0 0
4 1940.0 2000000 11000000 1.005436 563 8.0 125.0 False 59.0 22.0 ... 0 0 0 0 0 0 0 0 0 0

5 rows × 514 columns

There are a decent number of movies that are missing data for some of the features. Since we can't fit our models with NaN values present, we can choose to either remove these movies, or impute values for the missing features. Let's see how many movies we lose by dropping observations with any missing values.


In [8]:
# Count missing data to see how many movies we lose by dropping NAs:
features.shape[0] - features.dropna().shape[0]


Out[8]:
11578

So we'll lose almost 12K movies by dropping those with any missing values. Given the large size of our dataset, this loss seems manageable.


In [39]:
#make a df with no important features missing
feat2 = pd.concat([features, labels], axis=1)
feat2 = feat2.dropna()
labels2 = feat2.ix[:,514:521]

#drop adult since it is always false
del feat2['adult']

We also need to worry about features with erroneous or nonsensical values, e.g. a budget of $0.00. If left in place, these values could have a negative influence on the model. So we'll create a second dataset in which the erroneous values are replaced by the column mean, and compare it against the unaltered dataset later.


In [10]:
#make a second df with imputed mean values for nonsensical data in quantitative variables
feat3 = feat2.iloc[:,0:9].values
feat3 = feat3.astype('float')
feat3[feat3 == 0] = np.nan
imp = Imputer(missing_values='NaN', strategy='mean', axis=0)
feat3 = imp.fit_transform(feat3)

In [11]:
feat3 = pd.DataFrame(feat3, columns = list(feat2)[0:9])
feat3 = pd.concat([feat3, feat2.iloc[:,10:520]], axis=1).dropna()
feat3.head()


Out[11]:
release_date budget revenue popularity vote_count vote_average runtime n_actors n_crew original_language_ab ... spoken_languages_za spoken_languages_zh spoken_languages_zu Action Drama Comedy Family Romance Documentary Horror
0 1940.0 2.600000e+06 8.430000e+07 2.418732 933.0 6.8 88.0 12.0 75.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 1.0 0.0 0.0 0.0
1 1940.0 1.288000e+06 6.000000e+06 1.583448 271.0 7.6 130.0 25.0 37.0 0.0 ... 0.0 0.0 0.0 1.0 1.0 0.0 0.0 1.0 0.0 0.0
2 1940.0 2.280000e+06 8.332000e+07 1.498824 615.0 7.1 124.0 7.0 24.0 0.0 ... 0.0 0.0 0.0 1.0 1.0 0.0 1.0 0.0 0.0 0.0
3 1940.0 1.523739e+07 6.234904e+07 1.113673 160.0 7.6 112.0 28.0 13.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 1.0 0.0 1.0 0.0 0.0
4 1940.0 2.000000e+06 1.100000e+07 1.005436 563.0 8.0 125.0 59.0 22.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 1.0 0.0 0.0 0.0 0.0

5 rows × 519 columns


In [12]:
# Pop labels off of imputed dataset
labels3 = feat3.ix[:,512:519]
feat3 = feat3.drop(feat3.columns[[512, 513, 514, 515, 516, 517, 518]], axis=1)
feat2 = feat2.drop(feat2.columns[[513, 514, 515, 516, 517, 518, 519]], axis=1)

Model fitting and comparison

Now that we've cleaned up our data sufficiently, it's finally time to begin model fitting and comparison. The first step is to split the full dataset into training and testing sets.


In [13]:
#split our data into training and testing
X_train, X_test, y_train, y_test = train_test_split(feat2, labels2, test_size=0.33, random_state=42)

# ----------------------------
# Standardize features
scaler = preprocessing.StandardScaler().fit(X_train)
# Call: scaler.transform(X_train) to scale
# ----------------------------

General strategy

We framed genre prediction as a multi-label task, in which a given movie may carry one or more genre labels. Rather than throwing out movies with more than one genre, or arbitrarily keeping a single genre per movie, we chose this framing because it preserves an important aspect of movie genres - that their boundaries are often blurry - at the cost of making the classification task more difficult.

Classification scheme: We first chose to tackle the multi-label task using a one-vs.-rest approach, essentially modeling each genre as the output of its own binary classifier, since this requires fitting only one classifier per genre. It's not the most powerful approach, but it is more efficient than other multi-label schemes and is expected to give decent first-pass results.

Model selection: We wound up fitting three different models to the data - Naive Bayes (NB), logistic regression (LR), and a Random Forest Classifier (RFC) - fitting the LR and RFC both with and without PCA.

Class imbalance: As we showed in previous milestones, the genres are not all equally common, so we have to contend with significant class imbalance. This could be handled in a number of ways; we chose to "balance" the groups by setting class weights inversely proportional to the class frequencies for the LR and RFC, and letting the NB priors reflect the observed class frequencies.

Performance Evaluation: To evaluate performance we chose to use Hamming loss. We chose Hamming loss, as opposed to other measures, because it gives label-by-label accuracy for our multi-label classification, rather than an "all or nothing" accuracy indicating whether we got the exact combination of genres right for a particular movie.
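
As a quick illustration of the metric (a toy example, not drawn from our data), Hamming loss counts every genre decision separately, so a single missed genre on one movie only costs 1/(n_movies * n_genres):


In [ ]:
# Toy illustration of Hamming loss: the fraction of individual label
# assignments that are wrong, averaged over movies and genres.
# Uses the numpy / hamming_loss imports from the top of the notebook.
y_true_toy = np.array([[1, 0, 1],   # movie 1: e.g. Action + Comedy
                       [0, 1, 0]])  # movie 2: e.g. Drama only
y_pred_toy = np.array([[1, 0, 0],   # missed Comedy -> 1 wrong label
                       [0, 1, 0]])  # all labels correct

# 1 wrong label out of 6 label decisions = 0.1667
print(hamming_loss(y_true_toy, y_pred_toy))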

Naive Bayes classifier

We decided to begin our modeling efforts with a simple Naive Bayes classifier. Naive Bayes is simple to implement and quick to fit on a dataset this large, so it's a good place to start.


In [14]:
from sklearn.naive_bayes import GaussianNB

# Do multi-label NB
nb_mdl = OneVsRestClassifier(GaussianNB(), n_jobs=-1)
nb_mdl.fit(X_train, y_train)


Out[14]:
OneVsRestClassifier(estimator=GaussianNB(priors=None), n_jobs=-1)

In [15]:
# Test uncalibrated NB performance
nb_pred = nb_mdl.predict(X_test)
hamming_loss(y_test, nb_pred)


Out[15]:
0.310183745244874

We can see that the NB model appears to do a decent job right out of the box, with a Hamming loss of 0.31. This means that roughly 69% of our genre labels are predicted correctly for the test set. Since a random classifier flipping a fair coin for each genre would expect a Hamming loss of about 0.5, this seems like a large improvement.

We thought perhaps performance could be improved by tuning the NB model. There isn't really a good way to 'tune' an NB classifier, but we can calibrate the class probabilities and look at how varying the class threshold affects our loss on the test set.


In [16]:
from sklearn.calibration import CalibratedClassifierCV

nb_sigmoid = CalibratedClassifierCV(GaussianNB(), method='sigmoid')
nb_mdl = OneVsRestClassifier(nb_sigmoid, n_jobs=-1)
nb_mdl.fit(X_train, y_train)

nb_cal_proba = nb_mdl.predict_proba(X_test)


loss = []
# Choose optimal threshold by manual ROC
for thresh in np.linspace(0.01, 0.99, 99):
    
    pred = np.array(nb_cal_proba > thresh).astype(int)
    loss.append(hamming_loss(y_test, pred))

    
plt.plot(np.linspace(0.01, 0.99, 99), loss)
plt.xlabel('threshold'); plt.ylabel('Hamming Loss')

print 'Minimum Hamming loss: {loss}'.format(loss=min(loss))
print 'Loss when zero: {loss}'.format(loss=hamming_loss(y_test, np.zeros(np.shape(y_test))))


Minimum Hamming loss: 0.212766466493
Loss when zero: 0.214970572051

Surprisingly, the NB classifier does best (Hamming loss = 0.213) at the highest thresholds - when it predicts everything as 0, i.e. assigns no genre at all. In fact, the best performance achievable here is essentially what we would get by assigning no genre to any movie, the "all null" model (Hamming loss = 0.215). This is clearly not the desired behavior, since predicting no genre is completely unhelpful. We could impose a constraint that at least one genre is always chosen (see the sketch below); a softmax transformation would do this, but it assumes labels are mutually exclusive and so can't handle a multi-label scenario like ours. To circumvent these issues, we decided to try something a bit different and turned to logistic regression.
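
For reference, here is a minimal sketch of what such a "predict at least one genre" constraint could look like: fall back to the single most probable genre whenever thresholding yields an empty label set. It assumes the calibrated probabilities nb_cal_proba computed above; we did not use this in the analysis that follows.


In [ ]:
# Sketch only: force at least one genre per movie by switching on the
# highest-probability genre whenever the thresholded prediction is empty.
def predict_at_least_one(proba, thresh=0.5):
    pred = (proba > thresh).astype(int)
    empty = pred.sum(axis=1) == 0
    # for empty rows, turn on the genre with the highest calibrated probability
    pred[empty, proba[empty].argmax(axis=1)] = 1
    return pred

# e.g. hamming_loss(y_test, predict_at_least_one(nb_cal_proba, 0.5))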

Logistic regression

Now we implement a logistic regression model, using the stochastic average gradient ('sag') solver to efficiently fit the large dataset. As above, we fit the model on the full training set using a one-vs.-rest scheme and report the Hamming loss. The model was previously tuned using the code in the Appendix, which identified C=100 as the optimal regularization parameter value; we use that value here.


In [17]:
from sklearn.linear_model import LogisticRegression

# Now do LR - we'll use the SGD solver since the dataset is so large
lr_mdl = OneVsRestClassifier(LogisticRegression(class_weight='balanced', C=100, solver='sag'), n_jobs=-1)
lr_mdl.fit(scaler.transform(X_train), y_train)


Out[17]:
OneVsRestClassifier(estimator=LogisticRegression(C=100, class_weight='balanced', dual=False,
          fit_intercept=True, intercept_scaling=1, max_iter=100,
          multi_class='ovr', n_jobs=1, penalty='l2', random_state=None,
          solver='sag', tol=0.0001, verbose=0, warm_start=False),
          n_jobs=-1)

In [18]:
# Test LR performance
lr_pred = lr_mdl.predict(scaler.transform(X_test))
hamming_loss(y_test, lr_pred)


Out[18]:
0.35005203722755224

In [19]:
# Let's look at the first handful of genre predictions
lr_pred[:10]


Out[19]:
array([[0, 0, 1, 0, 0, 0, 0],
       [0, 0, 1, 0, 1, 0, 0],
       [0, 0, 0, 0, 0, 1, 0],
       [1, 0, 0, 0, 0, 0, 1],
       [0, 0, 0, 1, 0, 1, 1],
       [1, 0, 0, 1, 0, 1, 0],
       [0, 0, 0, 1, 0, 1, 0],
       [0, 0, 1, 0, 0, 0, 0],
       [1, 0, 1, 0, 1, 0, 0],
       [1, 0, 1, 0, 1, 0, 1]])

We can see that our LR does worse than both the NB model and the all-zeros baseline (Hamming loss of 0.350 vs. 0.215), but at least it assigns each movie at least one genre. Presumably performance would improve with finer tuning of the regularization strength, but it seems doubtful that we would get much further with this model on this dataset.

PCA

To see whether we can improve performance by reducing the dimensionality of the feature space, we implemented PCA and kept the minimum number of PCs required to explain 90% of the total variance. This should make model fitting faster and will hopefully yield predictions of similar or better quality. We apply PCA to the logistic regression model above, and to the RFC below.


In [20]:
# implementation of LR using PCA to see if this results in better prediction
# start with 400 PCs, will only take the ones that explain ~90% of variance
pca = PCA(n_components=400)
pca.fit(scaler.transform(X_train))


Out[20]:
PCA(copy=True, iterated_power='auto', n_components=400, random_state=None,
  svd_solver='auto', tol=0.0, whiten=False)

In [21]:
# show the number of PCs that explain at least 90% of the variance
var90pcs = len(pca.explained_variance_ratio_[np.cumsum(pca.explained_variance_ratio_)<.9])
print var90pcs
print np.cumsum(pca.explained_variance_ratio_)[var90pcs]


327
0.900468664604

We can see that 90% of the variance is explained by the first 327 PCs, so we will fit our LR using the first 327 PCs and compare the results to the model using the entire dataset above.


In [22]:
# reduce data using PCA
X_train_reduced = pca.transform(scaler.transform(X_train))
print X_train_reduced.shape
X_test_reduced = pca.transform(scaler.transform(X_test))
print X_test_reduced.shape


(96981, 400)
(47768, 400)

In [23]:
# Now the same LR as above but with 327 principal components
lr_mdl_pca = OneVsRestClassifier(LogisticRegression(class_weight='balanced', C=100, solver='sag'), n_jobs=-1)
lr_mdl_pca.fit(X_train_reduced[:,0:var90pcs], y_train)


Out[23]:
OneVsRestClassifier(estimator=LogisticRegression(C=100, class_weight='balanced', dual=False,
          fit_intercept=True, intercept_scaling=1, max_iter=100,
          multi_class='ovr', n_jobs=1, penalty='l2', random_state=None,
          solver='sag', tol=0.0001, verbose=0, warm_start=False),
          n_jobs=-1)

In [24]:
# Test LR performance
lr_pred_pca = lr_mdl_pca.predict(X_test_reduced[:,0:var90pcs])
hamming_loss(y_test, lr_pred_pca)


Out[24]:
0.35687070842404955

We can see that the Hamming loss is quite similar to that achieved using the full dataset, only slightly worse. This suggests the features aren't very compressible, i.e. information is distributed widely across them. It is possible that we will see gains from a more flexible classifier, so we decided to fit an RFC and compare the results with those above.

Random Forest Classifier

As with the LR, we pre-tuned our RFC using the code in the Appendix and apply those values here.


In [25]:
#run a tuned RFC on full data set and calculate Hamming loss
# **tuning parameters chosen previously using code in the Appendix
rfc_mdl = RFC(n_estimators=120, max_depth=60, class_weight ='balanced', n_jobs=-1).fit(X_train,y_train)
rf_pred = rfc_mdl.predict(X_test)
hamming_loss(y_test, rf_pred)


Out[25]:
0.16755688207287606

We can see that the RFC achieves a substantially lower Hamming loss (0.17) than either NB or LR. This means that roughly 83% of our genre labels are predicted correctly for the test set.


In [26]:
#calculate the overall accuracy of our predictions
rfc_mdl.score(X_test, y_test)


Out[26]:
0.28720063640931165

We also compared the Hamming loss metric with a more traditional overall accuracy metric. From the sklearn documentation: "In multi-label classification, this is the subset accuracy which is a harsh metric since you require for each sample that each label set be correctly predicted." While interesting to compare with the Hamming loss, this gives a much coarser picture of how well our model assigns genres: 29% of films have their full genre set predicted exactly, while 83% of individual genre labels are correct. Since a random classifier would need to assign all seven binary genres correctly to get one movie right on this harsh metric, each movie would have a (0.5)^7, or roughly 0.8%, chance of being matched exactly.
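
A toy comparison (illustration only, not drawn from our data) makes the difference between the two metrics concrete: one wrong genre out of seven ruins the subset accuracy for that movie but barely moves the Hamming loss.


In [ ]:
from sklearn.metrics import accuracy_score  # subset accuracy for multi-label inputs

y_true_toy = np.array([[1, 0, 0, 0, 0, 0, 0],
                       [0, 1, 1, 0, 0, 0, 0]])
y_pred_toy = np.array([[1, 0, 0, 0, 0, 0, 0],   # exact match
                       [0, 1, 0, 0, 0, 0, 0]])  # one genre missed

print(accuracy_score(y_true_toy, y_pred_toy))   # subset accuracy: 0.5
print(hamming_loss(y_true_toy, y_pred_toy))     # 1 wrong label / 14 = ~0.071
print(0.5 ** 7)                                 # chance a random guesser matches all 7 labels: ~0.0078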

Next, we apply PCA and see whether dimensionality reduction improves RFC performance.


In [27]:
#run a tuned RFC on our data that has been reduced using PCA and calculate Hamming loss
rfc_mdl_pca = RFC(n_estimators=120, max_depth=60, class_weight = 'balanced').fit(X_train_reduced[:,0:var90pcs],y_train)
pred_pca = rfc_mdl_pca.predict(np.array(X_test_reduced[:,0:var90pcs]))
hamming_loss(y_test, pred_pca)


Out[27]:
0.17694750819436803

About 82% of the genres are predicted correctly using PCA with 327 PCs, which is quite comparable to the RFC above fit on the full-dimensional dataset. Perhaps the slight loss in accuracy is offset by the reduction in the time it takes to fit the random forest, although this is a subjective judgment; a rough timing sketch follows.
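
We did not record formal benchmarks, but the time module imported at the top of the notebook makes a rough wall-clock comparison easy; the sketch below simply re-fits both models and prints the elapsed time.


In [ ]:
# Rough timing sketch (results not recorded here): full feature set vs. 327 PCs.
start = time.time()
RFC(n_estimators=120, max_depth=60, class_weight='balanced', n_jobs=-1).fit(X_train, y_train)
print('full features: %.1f s' % (time.time() - start))

start = time.time()
RFC(n_estimators=120, max_depth=60, class_weight='balanced', n_jobs=-1).fit(X_train_reduced[:,0:var90pcs], y_train)
print('first %d PCs: %.1f s' % (var90pcs, time.time() - start))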

Model Performance by genre

It's important to consider how the different models are making their mistakes, so we decided to break down the prediction accuracy by genre for the LR and RFC.


In [29]:
#visualize by-genre accuracy of predictions
def plot_acc_by_genre(pred, y_test, clf_str=''):
    
    preds = pd.DataFrame(pred)
    acc = []

    for i in range(7):
        a = np.array(y_test[[i]])
        b = np.array(preds[[i]])
        acc.append(np.count_nonzero(a==b) / preds.shape[0] * 100)

    plt.figure(figsize=(10,5))
    index = [0,1,2,3,4,5,6]
    plt.bar(index,acc,align="center")
    plt.xticks(index, ('Action', 'Drama', 'Comedy', 'Family', 'Romance', 'Documentary', 'Horror'))
    plt.ylim([0,100])
    plt.grid(axis='y')
    plt.title(clf_str + ' Accuracy by Genre')
    plt.ylabel('Accuracy')
    plt.show()

In [32]:
plot_acc_by_genre(lr_pred, y_test, 'Logistic Regression')



In [30]:
plot_acc_by_genre(rf_pred, y_test, 'Random Forest')


For the LR, accuracy is pretty similar across action, drama, comedy, and horror, while family and documentary genres are predicted with slightly higher fidelity.

For RFC, we correctly identify 4 out of our 7 combined genres almost 90% of the time, while action and comedy are accurately labeled roughly 75% of the time. Drama is the only genre with a lower than 70% accuracy rating, indicating that we may need to rethink which of our original genres we are flagging for inclusion in this category. It is also a genre that appears with a wide variety of other film types, making its discrimination more difficult.

Overall, the per-genre patterns for the LR and RFC don't look drastically different, beyond the RFC's higher accuracy across the board.

Exploring the importance of categorical features and effects of missing data in RF model

We next wanted to evaluate the contributions of different categorical variables we chose to include, as well as the effects of imputing data. We first look at performance of our tuned RFC on the imputed dataset.


In [33]:
#split our imputed data df into training and testing
X_train3, X_test3, y_train3, y_test3 = train_test_split(feat3, labels3, test_size=0.33, random_state=42)

#run a RFC on our data with imputed data
imputedmdl = RFC(n_estimators=120, max_depth=60, class_weight = 'balanced', n_jobs=-1).fit(X_train3,y_train3)
pred3 = imputedmdl.predict(X_test3)
hamming_loss(y_test3, pred3)


Out[33]:
0.20405906794877168

This means that roughly 80% of our genre labels are predicted correctly for the test set when using the imputed dataset. Since this is worse than leaving the erroneous values in place, we will continue to test other methods of handling missing / nonsensical data; one alternative is sketched below.
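
A minimal sketch of one such alternative: reuse the imputation pipeline above but with a median strategy, which is less sensitive to the heavy right skew of budget and revenue. (The name feat_med is ours; this variant was not evaluated here.)


In [ ]:
# Sketch: median imputation of zero-valued quantitative features,
# mirroring the mean-imputation cell above.
feat_med = feat2.iloc[:,0:9].values.astype('float')
feat_med[feat_med == 0] = np.nan
feat_med = Imputer(missing_values='NaN', strategy='median', axis=0).fit_transform(feat_med)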

Next, we wondered whether including information about languages and production location significantly improved predictions.


In [34]:
#build several dfs with different predictors
a = X_train.ix[:,0:9] #just the basics
b = X_train.ix[:,0:132] #include original language
c = X_train.ix[:,0:365] #include production countries
d = X_train #include spoken language

e = X_test.ix[:,0:9] #just the basics
f = X_test.ix[:,0:132] #include original language
g = X_test.ix[:,0:365] #include production countries
h = X_test #include spoken language

In [35]:
#run models on each df
dfs_train = [a,b,c,d]
dfs_test = [e,f,g,h]
loss = []

for i in range(4):
    mdl = RFC(n_estimators=120, max_depth=60, class_weight = 'balanced', n_jobs=-1).fit(dfs_train[i],y_train)
    pred = mdl.predict(dfs_test[i])
    loss.append(hamming_loss(y_test, pred))
    
print loss


[0.17707311529535613, 0.1715434122066177, 0.16805931047682848, 0.16764959207598631]

In [36]:
plt.figure(figsize=(10,5))
index = [0,1,2,3]
plt.bar(index,loss,align="center")
plt.xticks(index, ('Basics', '+ Original language', '+ Production Countries', '+ Spoken Language'))
plt.ylim([0,0.2])
plt.grid(axis='y')
plt.title('Random Forest with Different Attributes of Films')
plt.ylabel('Hamming loss')
plt.show()


As we can see, adding all our language and country predictors as indicator variables yields roughly a one-point reduction in Hamming loss (from about 0.177 to 0.168). This is not a huge gain, but perhaps worth the added overhead.

Next, we wondered whether the different "basic" features had roughly similar importance for RFC predictions. We evaluated this by looking at the RFC importance scores.


In [37]:
importances = rfc_mdl.feature_importances_
std = np.std([tree.feature_importances_ for tree in rfc_mdl.estimators_],
             axis=0)
indices = np.argsort(importances)[::-1]

# Plot the feature importances of the forest
plt.figure()
plt.title("Feature importances")
plt.bar(range(X_train.shape[1]), importances[indices],
       color="r", yerr=std[indices], align="center")
plt.xticks(range(X_train.shape[1]), indices)
plt.xlim([-1, 15])
plt.show()


As we can see from the above visualization, a handful of features dominate in determining movie genres for our Random Forest Classifier. The top nine contributors all come from our "basic" core of quantitative data scraped from TMDb and are, in order: running time, popularity, release year, number of actors, number of crew, vote average, and vote count, with budget and revenue last. Budget and revenue likely trail the others because they suffer from a large amount of missing or zero data compared to our other features.
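
The full ranking printed below reports only bare column indices, so a small helper (a sketch, assuming X_train is still the pandas DataFrame used to fit rfc_mdl) can map the top indices back to their column names:


In [ ]:
# Map the top-ranked feature indices back to readable column names.
feat_names = list(X_train.columns)
for rank, idx in enumerate(indices[:10]):
    print('%d. %s (%.4f)' % (rank + 1, feat_names[idx], importances[idx]))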


In [38]:
# Print the full feature ranking
print("Feature ranking:")

for f in range(X_train.shape[1]):
    print("%d. feature %d (%f)" % (f + 1, indices[f], importances[indices[f]]))


Feature ranking:
1. feature 6 (0.136471)
2. feature 3 (0.131113)
3. feature 0 (0.117326)
4. feature 7 (0.113605)
5. feature 8 (0.084507)
6. feature 5 (0.082935)
7. feature 4 (0.069611)
8. feature 1 (0.024706)
9. feature 2 (0.014729)
10. feature 394 (0.012339)
11. feature 345 (0.011786)
12. feature 235 (0.008720)
13. feature 35 (0.008469)
14. feature 423 (0.007209)
15. feature 203 (0.006606)
16. feature 390 (0.006119)
17. feature 166 (0.006049)
18. feature 403 (0.005994)
19. feature 396 (0.005037)
20. feature 201 (0.004618)
21. feature 62 (0.004548)
22. feature 227 (0.004233)
23. feature 183 (0.004148)
24. feature 421 (0.003273)
25. feature 470 (0.003135)
26. feature 232 (0.002911)
27. feature 145 (0.002812)
28. feature 194 (0.002714)
29. feature 508 (0.002537)
30. feature 43 (0.002503)
31. feature 31 (0.002501)
32. feature 489 (0.002429)
33. feature 219 (0.002287)
34. feature 383 (0.002267)
35. feature 511 (0.002264)
36. feature 307 (0.002250)
37. feature 413 (0.002086)
38. feature 243 (0.002068)
39. feature 467 (0.002020)
40. feature 386 (0.001939)
41. feature 431 (0.001909)
42. feature 416 (0.001831)
43. feature 37 (0.001782)
44. feature 465 (0.001688)
45. feature 295 (0.001573)
46. feature 151 (0.001550)
47. feature 60 (0.001538)
48. feature 114 (0.001535)
49. feature 277 (0.001510)
50. feature 182 (0.001502)
51. feature 495 (0.001491)
52. feature 312 (0.001473)
53. feature 185 (0.001469)
54. feature 490 (0.001456)
55. feature 389 (0.001369)
56. feature 487 (0.001335)
57. feature 69 (0.001302)
58. feature 99 (0.001289)
59. feature 175 (0.001263)
60. feature 492 (0.001257)
61. feature 456 (0.001203)
62. feature 285 (0.001193)
63. feature 28 (0.001116)
64. feature 25 (0.001041)
65. feature 170 (0.001034)
66. feature 30 (0.001012)
67. feature 160 (0.001007)
68. feature 340 (0.000972)
69. feature 130 (0.000936)
70. feature 330 (0.000910)
71. feature 144 (0.000881)
72. feature 225 (0.000862)
73. feature 305 (0.000851)
74. feature 446 (0.000845)
75. feature 369 (0.000836)
76. feature 297 (0.000822)
77. feature 90 (0.000808)
78. feature 289 (0.000795)
79. feature 450 (0.000783)
80. feature 498 (0.000781)
81. feature 96 (0.000769)
82. feature 142 (0.000765)
83. feature 52 (0.000764)
84. feature 393 (0.000762)
85. feature 362 (0.000735)
86. feature 337 (0.000726)
87. feature 112 (0.000723)
88. feature 94 (0.000706)
89. feature 286 (0.000700)
90. feature 117 (0.000684)
91. feature 196 (0.000659)
92. feature 115 (0.000658)
93. feature 214 (0.000643)
94. feature 278 (0.000633)
95. feature 223 (0.000632)
96. feature 412 (0.000624)
97. feature 430 (0.000623)
98. feature 401 (0.000596)
99. feature 458 (0.000594)
100. feature 119 (0.000581)
101. feature 484 (0.000555)
102. feature 418 (0.000544)
103. feature 226 (0.000533)
104. feature 91 (0.000531)
105. feature 153 (0.000523)
106. feature 342 (0.000487)
107. feature 298 (0.000480)
108. feature 173 (0.000456)
109. feature 224 (0.000442)
110. feature 469 (0.000434)
111. feature 300 (0.000413)
112. feature 81 (0.000413)
113. feature 221 (0.000407)
114. feature 105 (0.000398)
115. feature 120 (0.000392)
116. feature 231 (0.000375)
117. feature 56 (0.000369)
118. feature 479 (0.000361)
119. feature 414 (0.000331)
120. feature 34 (0.000330)
121. feature 316 (0.000311)
122. feature 191 (0.000309)
123. feature 313 (0.000287)
124. feature 436 (0.000285)
125. feature 420 (0.000277)
126. feature 399 (0.000277)
127. feature 121 (0.000277)
128. feature 306 (0.000272)
129. feature 255 (0.000255)
130. feature 462 (0.000248)
131. feature 190 (0.000246)
132. feature 464 (0.000241)
133. feature 41 (0.000240)
134. feature 54 (0.000238)
135. feature 314 (0.000226)
136. feature 179 (0.000226)
137. feature 478 (0.000223)
138. feature 374 (0.000222)
139. feature 164 (0.000217)
140. feature 502 (0.000205)
141. feature 366 (0.000204)
142. feature 397 (0.000203)
143. feature 110 (0.000203)
144. feature 505 (0.000197)
145. feature 381 (0.000196)
146. feature 350 (0.000189)
147. feature 292 (0.000189)
148. feature 53 (0.000188)
149. feature 133 (0.000187)
150. feature 331 (0.000182)
151. feature 176 (0.000179)
152. feature 23 (0.000168)
153. feature 68 (0.000165)
154. feature 230 (0.000159)
155. feature 13 (0.000157)
156. feature 361 (0.000157)
157. feature 104 (0.000151)
158. feature 417 (0.000149)
159. feature 254 (0.000148)
160. feature 377 (0.000145)
161. feature 98 (0.000144)
162. feature 51 (0.000141)
163. feature 483 (0.000137)
164. feature 109 (0.000131)
165. feature 178 (0.000128)
166. feature 137 (0.000127)
167. feature 205 (0.000124)
168. feature 187 (0.000124)
169. feature 503 (0.000114)
170. feature 509 (0.000114)
171. feature 476 (0.000112)
172. feature 40 (0.000112)
173. feature 10 (0.000109)
174. feature 124 (0.000106)
175. feature 85 (0.000104)
176. feature 440 (0.000102)
177. feature 264 (0.000096)
178. feature 425 (0.000096)
179. feature 388 (0.000090)
180. feature 38 (0.000089)
181. feature 283 (0.000087)
182. feature 75 (0.000087)
183. feature 256 (0.000083)
184. feature 353 (0.000083)
185. feature 93 (0.000082)
186. feature 127 (0.000081)
187. feature 427 (0.000076)
188. feature 238 (0.000074)
189. feature 512 (0.000071)
190. feature 447 (0.000068)
191. feature 380 (0.000068)
192. feature 406 (0.000068)
193. feature 59 (0.000067)
194. feature 429 (0.000065)
195. feature 398 (0.000065)
196. feature 323 (0.000065)
197. feature 441 (0.000064)
198. feature 347 (0.000063)
199. feature 455 (0.000062)
200. feature 220 (0.000062)
201. feature 504 (0.000061)
202. feature 259 (0.000060)
203. feature 449 (0.000059)
204. feature 18 (0.000056)
205. feature 148 (0.000056)
206. feature 445 (0.000055)
207. feature 258 (0.000054)
208. feature 189 (0.000054)
209. feature 246 (0.000054)
210. feature 64 (0.000053)
211. feature 299 (0.000052)
212. feature 444 (0.000051)
213. feature 123 (0.000050)
214. feature 346 (0.000049)
215. feature 409 (0.000048)
216. feature 273 (0.000048)
217. feature 88 (0.000045)
218. feature 20 (0.000044)
219. feature 248 (0.000044)
220. feature 296 (0.000044)
221. feature 356 (0.000042)
222. feature 302 (0.000042)
223. feature 284 (0.000042)
224. feature 135 (0.000041)
225. feature 102 (0.000040)
226. feature 177 (0.000039)
227. feature 76 (0.000039)
228. feature 236 (0.000039)
229. feature 507 (0.000038)
230. feature 405 (0.000037)
231. feature 287 (0.000036)
232. feature 242 (0.000035)
233. feature 270 (0.000035)
234. feature 482 (0.000035)
235. feature 251 (0.000035)
236. feature 159 (0.000034)
237. feature 433 (0.000033)
238. feature 150 (0.000033)
239. feature 488 (0.000032)
240. feature 215 (0.000031)
241. feature 11 (0.000031)
242. feature 22 (0.000030)
243. feature 39 (0.000030)
244. feature 324 (0.000030)
245. feature 80 (0.000029)
246. feature 338 (0.000029)
247. feature 303 (0.000028)
248. feature 207 (0.000027)
249. feature 506 (0.000027)
250. feature 454 (0.000027)
251. feature 165 (0.000026)
252. feature 181 (0.000026)
253. feature 266 (0.000025)
254. feature 250 (0.000025)
255. feature 378 (0.000025)
256. feature 451 (0.000024)
257. feature 47 (0.000024)
258. feature 261 (0.000024)
259. feature 29 (0.000023)
260. feature 247 (0.000023)
261. feature 335 (0.000023)
262. feature 188 (0.000023)
263. feature 291 (0.000022)
264. feature 134 (0.000022)
265. feature 452 (0.000021)
266. feature 122 (0.000020)
267. feature 439 (0.000020)
268. feature 395 (0.000020)
269. feature 408 (0.000020)
270. feature 229 (0.000020)
271. feature 267 (0.000020)
272. feature 434 (0.000020)
273. feature 293 (0.000020)
274. feature 67 (0.000019)
275. feature 172 (0.000019)
276. feature 84 (0.000019)
277. feature 167 (0.000019)
278. feature 138 (0.000018)
279. feature 468 (0.000018)
280. feature 510 (0.000017)
281. feature 372 (0.000017)
282. feature 163 (0.000017)
283. feature 143 (0.000017)
284. feature 357 (0.000017)
285. feature 234 (0.000017)
286. feature 233 (0.000017)
287. feature 319 (0.000017)
288. feature 213 (0.000016)
289. feature 460 (0.000015)
290. feature 363 (0.000015)
291. feature 49 (0.000014)
292. feature 282 (0.000014)
293. feature 195 (0.000013)
294. feature 158 (0.000013)
295. feature 147 (0.000013)
296. feature 382 (0.000013)
297. feature 149 (0.000012)
298. feature 268 (0.000012)
299. feature 341 (0.000012)
300. feature 161 (0.000012)
301. feature 152 (0.000011)
302. feature 407 (0.000011)
303. feature 222 (0.000011)
304. feature 197 (0.000011)
305. feature 419 (0.000011)
306. feature 437 (0.000011)
307. feature 97 (0.000010)
308. feature 252 (0.000010)
309. feature 245 (0.000010)
310. feature 209 (0.000010)
311. feature 325 (0.000010)
312. feature 154 (0.000010)
313. feature 368 (0.000010)
314. feature 237 (0.000010)
315. feature 262 (0.000010)
316. feature 339 (0.000009)
317. feature 280 (0.000009)
318. feature 344 (0.000008)
319. feature 334 (0.000008)
320. feature 257 (0.000008)
321. feature 438 (0.000008)
322. feature 364 (0.000008)
323. feature 415 (0.000008)
324. feature 471 (0.000008)
325. feature 497 (0.000008)
326. feature 354 (0.000007)
327. feature 477 (0.000007)
328. feature 326 (0.000007)
329. feature 174 (0.000007)
330. feature 140 (0.000007)
331. feature 265 (0.000007)
332. feature 422 (0.000007)
333. feature 376 (0.000007)
334. feature 402 (0.000007)
335. feature 146 (0.000007)
336. feature 320 (0.000007)
337. feature 156 (0.000006)
338. feature 239 (0.000006)
339. feature 260 (0.000006)
340. feature 55 (0.000006)
341. feature 392 (0.000006)
342. feature 184 (0.000006)
343. feature 485 (0.000006)
344. feature 141 (0.000006)
345. feature 202 (0.000006)
346. feature 343 (0.000006)
347. feature 103 (0.000006)
348. feature 411 (0.000006)
349. feature 486 (0.000005)
350. feature 244 (0.000005)
351. feature 500 (0.000005)
352. feature 162 (0.000005)
353. feature 200 (0.000005)
354. feature 169 (0.000005)
355. feature 328 (0.000004)
356. feature 168 (0.000004)
357. feature 157 (0.000004)
358. feature 73 (0.000004)
359. feature 424 (0.000004)
360. feature 466 (0.000004)
361. feature 304 (0.000004)
362. feature 349 (0.000004)
363. feature 463 (0.000004)
364. feature 186 (0.000004)
365. feature 42 (0.000004)
366. feature 276 (0.000004)
367. feature 125 (0.000004)
368. feature 443 (0.000004)
369. feature 281 (0.000004)
370. feature 74 (0.000004)
371. feature 327 (0.000003)
372. feature 332 (0.000003)
373. feature 288 (0.000003)
374. feature 171 (0.000003)
375. feature 442 (0.000003)
376. feature 481 (0.000003)
377. feature 384 (0.000003)
378. feature 311 (0.000003)
379. feature 435 (0.000003)
380. feature 387 (0.000003)
381. feature 501 (0.000003)
382. feature 16 (0.000003)
383. feature 86 (0.000003)
384. feature 65 (0.000003)
385. feature 275 (0.000002)
386. feature 50 (0.000002)
387. feature 58 (0.000002)
388. feature 210 (0.000002)
389. feature 308 (0.000002)
390. feature 279 (0.000002)
391. feature 263 (0.000002)
392. feature 474 (0.000002)
393. feature 373 (0.000002)
394. feature 391 (0.000002)
395. feature 271 (0.000002)
396. feature 32 (0.000002)
397. feature 359 (0.000002)
398. feature 139 (0.000002)
399. feature 82 (0.000002)
400. feature 45 (0.000002)
401. feature 301 (0.000002)
402. feature 155 (0.000002)
403. feature 128 (0.000002)
404. feature 321 (0.000002)
405. feature 428 (0.000002)
406. feature 385 (0.000002)
407. feature 211 (0.000002)
408. feature 78 (0.000002)
409. feature 217 (0.000002)
410. feature 432 (0.000002)
411. feature 204 (0.000002)
412. feature 480 (0.000002)
413. feature 57 (0.000002)
414. feature 318 (0.000002)
415. feature 70 (0.000002)
416. feature 410 (0.000001)
417. feature 322 (0.000001)
418. feature 269 (0.000001)
419. feature 367 (0.000001)
420. feature 453 (0.000001)
421. feature 116 (0.000001)
422. feature 491 (0.000001)
423. feature 358 (0.000001)
424. feature 496 (0.000001)
425. feature 132 (0.000001)
426. feature 240 (0.000001)
427. feature 19 (0.000001)
428. feature 309 (0.000001)
429. feature 290 (0.000001)
430. feature 370 (0.000001)
431. feature 461 (0.000001)
432. feature 180 (0.000001)
433. feature 315 (0.000001)
434. feature 317 (0.000001)
435. feature 457 (0.000001)
436. feature 253 (0.000001)
437. feature 499 (0.000001)
438. feature 17 (0.000001)
439. feature 360 (0.000001)
440. feature 228 (0.000001)
441. feature 426 (0.000001)
442. feature 111 (0.000001)
443. feature 329 (0.000001)
444. feature 87 (0.000001)
445. feature 71 (0.000001)
446. feature 208 (0.000001)
447. feature 494 (0.000001)
448. feature 107 (0.000001)
449. feature 198 (0.000001)
450. feature 21 (0.000001)
451. feature 333 (0.000001)
452. feature 294 (0.000001)
453. feature 472 (0.000001)
454. feature 216 (0.000001)
455. feature 212 (0.000001)
456. feature 61 (0.000001)
457. feature 473 (0.000001)
458. feature 274 (0.000000)
459. feature 136 (0.000000)
460. feature 404 (0.000000)
461. feature 351 (0.000000)
462. feature 131 (0.000000)
463. feature 66 (0.000000)
464. feature 310 (0.000000)
465. feature 33 (0.000000)
466. feature 206 (0.000000)
467. feature 63 (0.000000)
468. feature 48 (0.000000)
469. feature 459 (0.000000)
470. feature 336 (0.000000)
471. feature 72 (0.000000)
472. feature 108 (0.000000)
473. feature 95 (0.000000)
474. feature 77 (0.000000)
475. feature 15 (0.000000)
476. feature 106 (0.000000)
477. feature 44 (0.000000)
478. feature 118 (0.000000)
479. feature 126 (0.000000)
480. feature 89 (0.000000)
481. feature 355 (0.000000)
482. feature 129 (0.000000)
483. feature 113 (0.000000)
484. feature 400 (0.000000)
485. feature 218 (0.000000)
486. feature 14 (0.000000)
487. feature 352 (0.000000)
488. feature 12 (0.000000)
489. feature 375 (0.000000)
490. feature 9 (0.000000)
491. feature 348 (0.000000)
492. feature 26 (0.000000)
493. feature 272 (0.000000)
494. feature 249 (0.000000)
495. feature 83 (0.000000)
496. feature 379 (0.000000)
497. feature 46 (0.000000)
498. feature 92 (0.000000)
499. feature 371 (0.000000)
500. feature 241 (0.000000)
501. feature 199 (0.000000)
502. feature 475 (0.000000)
503. feature 448 (0.000000)
504. feature 493 (0.000000)
505. feature 36 (0.000000)
506. feature 79 (0.000000)
507. feature 100 (0.000000)
508. feature 101 (0.000000)
509. feature 192 (0.000000)
510. feature 27 (0.000000)
511. feature 365 (0.000000)
512. feature 24 (0.000000)
513. feature 193 (0.000000)

Appendix: model tuning code


In [ ]:
# Tuning for LR model
loss = []

for tuning_param in np.array([0.001, 0.01, 0.05, 0.1, 0.2, 0.5, 1.0, 2.0, 5.0, 10.0, 100]):
    
    lr_mdl = OneVsRestClassifier(LogisticRegression(class_weight ='balanced', C=tuning_param, solver='sag'), n_jobs=-1)
    lr_mdl.fit(scaler.transform(X_train), y_train)

    # Test LR performance
    lr_pred = lr_mdl.predict(scaler.transform(X_test))
    loss.append(hamming_loss(y_test, lr_pred))
    
plt.plot([0.001, 0.01, 0.05, 0.1, 0.2, 0.5, 1.0, 2.0, 5.0, 10.0, 100], loss)
plt.xlabel('regularization tuning parameter'); plt.ylabel('Hamming loss')

In [ ]:
# Tuning for random forest model

n_folds = 5
kf = KFold(n_splits=n_folds)

# Parameters for tuning a random forest model
n_trees = np.arange(10, 100, 20)  # Coarse first-pass grid over trees and depth;
depths = np.arange(2, 10)         # both are expected to add accuracy with diminishing returns.

# To keep track of the best model
best_score = 1

# Run grid search for model with 5-fold cross validation
print '5-fold cross validation:'
    
for trees in n_trees:
    for depth in depths:
        loss = []
        for ktrain, ktest in kf.split(feat2):
            mdl = RFC(n_estimators=trees, max_depth=depth).fit(feat2.iloc[ktrain,:],labels2.iloc[ktrain,:])
            pred = mdl.predict(feat2.iloc[ktest,:])
            loss.append(hamming_loss(labels2.iloc[ktest,:], pred))
        # Average the loss across the folds
        average_loss = np.mean(loss)
                   
        # Report the loss for this parameter combination
        print "Trees:", trees, "Depth:", depth, "Loss:", average_loss
        
        # Update our record of the best parameters seen so far
        if average_loss < best_score:
            best_score = average_loss
            best_trees = trees
            best_depth = depth

print 'Best number of trees, depth:', best_trees, ',', best_depth

In [ ]:
# Go higher in trees and depth

# Parameters for tuning a random forest model
n_trees = np.array([160])       # a single, higher tree count than the first pass
depths = np.arange(50, 80, 10)  # much deeper trees

# To keep track of the best model
best_score = 1

kf = KFold(n_splits=3)

# Run grid search for model with 3-fold cross validation
print '3-fold cross validation:'
    
for trees in n_trees:
    for depth in depths:
        loss = []
        for ktrain, ktest in kf.split(feat2):
            mdl = RFC(n_estimators=trees, max_depth=depth, class_weight = 'balanced').fit(feat2.iloc[ktrain,:],labels2.iloc[ktrain,:])
            pred = mdl.predict(feat2.iloc[ktest,:])
            loss.append(hamming_loss(labels2.iloc[ktest,:], pred))
        # Average the loss across the folds
        average_loss = np.mean(loss)
                   
        # Report the loss for this parameter combination
        print "Trees:", trees, "Depth:", depth, "Loss:", average_loss
        
        # Update our record of the best parameters seen so far
        if average_loss < best_score:
            best_score = average_loss
            best_trees = trees
            best_depth = depth

print 'Best number of trees, depth:', best_trees, ',', best_depth