In this notebook, we explore different techniques for hyperparameter optimization, working with an IMDb dataset and training an XGBoost regressor to predict the IMDb score each movie received.


In [2]:
# Data analysis imports
import pandas as pd
import numpy as np

# Machine learning imports
import xgboost as xgb
from sklearn.model_selection import GridSearchCV, train_test_split, cross_val_score

# Hyperopt imports
from hyperopt import fmin, tpe, hp, STATUS_OK, Trials

# Plotting imports
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [5]:
# Some constants
SEED = 314
TEST_SIZE = 0.2
MAX_EVALS = 100

In [6]:
# Some utility functions
def compute_rmse(model, features, targets):
    prediction = model.predict(features)
    rmse = np.sqrt(np.mean((prediction - targets) ** 2))
    return rmse

def train_grid_search(cv_parameters, features, targets):
    # Squared-error objective; RMSE is computed separately via compute_rmse
    xgb_regressor = xgb.XGBRegressor(objective='reg:squarederror')
    grid_search = GridSearchCV(xgb_regressor, cv_parameters, cv=5, verbose=1, n_jobs=4)
    grid_search.fit(features, targets)
    return grid_search

Load processed data


In [7]:
imdb_df = pd.read_csv('../data/processed_movie_metadata.csv')

In [8]:
imdb_df.head()


Out[8]:
num_critic_for_reviews duration director_facebook_likes actor_3_facebook_likes actor_1_facebook_likes num_voted_users num_user_for_reviews title_year actor_2_facebook_likes imdb_score ... Not Rated PG PG-13 Passed R TV-14 TV-G TV-PG Unrated X
0 723.0 178.0 0.0 855.0 1000.0 886204 3054.0 2009.0 936.0 7.9 ... 0 0 1 0 0 0 0 0 0 0
1 302.0 169.0 563.0 1000.0 40000.0 471220 1238.0 2007.0 5000.0 7.1 ... 0 0 1 0 0 0 0 0 0 0
2 602.0 148.0 0.0 161.0 11000.0 275868 994.0 2015.0 393.0 6.8 ... 0 0 1 0 0 0 0 0 0 0
3 813.0 164.0 22000.0 23000.0 27000.0 1144337 2701.0 2012.0 23000.0 8.5 ... 0 0 1 0 0 0 0 0 0 0
4 462.0 132.0 475.0 530.0 640.0 212204 738.0 2012.0 632.0 6.6 ... 0 0 1 0 0 0 0 0 0 0

5 rows × 145 columns

Train - test split


In [9]:
train_df, test_df = train_test_split(imdb_df, test_size=TEST_SIZE, random_state=SEED)

In [10]:
train_features = train_df.drop('imdb_score', axis=1)
train_targets = train_df.loc[:, 'imdb_score']
test_features = test_df.drop('imdb_score', axis=1)
test_targets = test_df.loc[:, 'imdb_score']

Naive grid search


In [11]:
naive_cv_parameters = {'max_depth': [4, 6, 8, 10],
                       'n_estimators': [10, 15, 20, 25],
                       'learning_rate': [0.2, 0.4, 0.6, 0.8],
                       'gamma': [0.2, 0.4, 0.6, 0.8]
}

In [12]:
naive_gs = train_grid_search(naive_cv_parameters, train_features, train_targets)


Fitting 5 folds for each of 256 candidates, totalling 1280 fits
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    9.2s
[Parallel(n_jobs=4)]: Done 192 tasks      | elapsed:   56.5s
[Parallel(n_jobs=4)]: Done 442 tasks      | elapsed:  2.2min
[Parallel(n_jobs=4)]: Done 792 tasks      | elapsed:  3.7min
[Parallel(n_jobs=4)]: Done 1242 tasks      | elapsed:  5.4min
[Parallel(n_jobs=4)]: Done 1280 out of 1280 | elapsed:  5.6min finished

Expert grid search

This grid search is based on the recommendations of the following article:


In [13]:
expert_cv_parameters = {'max_depth': [4, 6, 10, 15],
                        'n_estimators': [10, 50, 100, 500],
                        'learning_rate': [0.01, 0.025, 0.05, 0.1],
                        'gamma': [0.05, 0.5, 0.9, 1.]
}

In [14]:
expert_gs = train_grid_search(expert_cv_parameters, train_features, train_targets)


Fitting 5 folds for each of 256 candidates, totalling 1280 fits
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:  1.1min
[Parallel(n_jobs=4)]: Done 192 tasks      | elapsed:  8.3min
[Parallel(n_jobs=4)]: Done 442 tasks      | elapsed: 18.7min
[Parallel(n_jobs=4)]: Done 792 tasks      | elapsed: 33.8min
[Parallel(n_jobs=4)]: Done 1242 tasks      | elapsed: 53.5min
[Parallel(n_jobs=4)]: Done 1280 out of 1280 | elapsed: 56.0min finished
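
As an illustrative check, the winning configuration and its mean cross-validation score can be read off each fitted GridSearchCV via its best_params_ and best_score_ attributes:

In [ ]:
# Inspect the best configuration found by each grid search
print('Naive grid search :', naive_gs.best_params_, naive_gs.best_score_)
print('Expert grid search:', expert_gs.best_params_, expert_gs.best_score_)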

Hyperopt

  • score => the objective function f(x) that Hyperopt minimizes (here the negated mean cross-validation score)
  • optimize => defines the hyperparameter space and the optimization strategy (here TPE)

In [15]:
def score(params):
    # hp.quniform returns floats, so cast the integer-valued hyperparameters
    params["n_estimators"] = int(params["n_estimators"])
    params["max_depth"] = int(params["max_depth"])
    xgb_regressor = xgb.XGBRegressor(silent=False, **params)
    # cross_val_score uses the regressor's default R^2 scoring;
    # negate the mean so that Hyperopt minimizes the loss
    score = cross_val_score(xgb_regressor, train_features, train_targets, 
                            cv=5, verbose=0, 
                            n_jobs=4).mean()
    loss = - score
    return {'loss': loss, 
            'status': STATUS_OK}

In [16]:
def optimize(trials):
    # hp.loguniform expects bounds in log space: values are sampled between exp(low) and exp(high)
    space = {'n_estimators' : hp.quniform('n_estimators', 10, 1000, 1),
             'learning_rate' : hp.loguniform('learning_rate', np.log(0.01), np.log(1)),
             'max_depth' : hp.quniform('max_depth', 3, 15, 1),
             'gamma': hp.loguniform('gamma', np.log(0.01), np.log(1))}
    best = fmin(score, space, algo=tpe.suggest, 
                trials=trials, 
                max_evals=MAX_EVALS)
    return best

In [17]:
trials = Trials()
optimal_param = optimize(trials)
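
The Trials object keeps a record of every evaluation, so the search can be inspected afterwards. As a minimal sketch, trials.losses() returns the loss of each evaluation and can be plotted with the matplotlib import from the top of the notebook:

In [ ]:
# Sketch: plot the loss (negated mean CV score) recorded for each of the
# MAX_EVALS evaluations performed by the TPE search
plt.figure(figsize=(8, 4))
plt.plot(trials.losses(), marker='.')
plt.xlabel('evaluation')
plt.ylabel('loss (- mean CV score)')
plt.title('Hyperopt TPE search trace')
plt.show()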

In [22]:
hyperopt_params = {'max_depth': int(optimal_param['max_depth']),
                   'n_estimators': int(optimal_param['n_estimators']),
                   'learning_rate': optimal_param['learning_rate'],
                   'gamma': optimal_param['gamma']
                   }

Results


In [25]:
def get_model_results(hyperparameters):
    xgb_regressor = xgb.XGBRegressor(**hyperparameters)
    # cv_scores use the regressor's default R^2 scoring, while train/test scores are RMSE
    cv_scores = cross_val_score(xgb_regressor, train_features, train_targets, cv=5, verbose=0, n_jobs=4)
    xgb_regressor.fit(train_features, train_targets)
    train_score = compute_rmse(xgb_regressor, train_features, train_targets)
    test_score = compute_rmse(xgb_regressor, test_features, test_targets)
    return {'hyperparameters': hyperparameters,
            'cv_scores': cv_scores, 
            'train_score': train_score,
            'mean_cv_score' : cv_scores.mean(),
            'test_score': test_score}

In [ ]:
configurations = {'model':['naive_grid', 'expert_grid', 'hyperopt_tpe'], 
                  'hyperparameters': [naive_gs.best_params_, expert_gs.best_params_, hyperopt_params]}

In [ ]:
results = [get_model_results(params) for params in configurations['hyperparameters']]

In [33]:
results_df = pd.DataFrame(results).assign(opt_method=configurations['model'])

Save the results


In [35]:
results_df.to_csv('../data/hyperparameters_selection_results.csv', index=False)

Load the results


In [35]:
results_df = pd.read_csv('../data/hyperparameters_selection_results.csv')

In [37]:
results_df


Out[37]:
cv_scores mean_cv_score hyperparameters test_score train_score opt_method
0 [0.62770871, 0.53916399, 0.59777503, 0.5928037... 0.583 {'n_estimators': 25, 'learning_rate': 0.2, 'ma... 0.706723 0.529957 naive_grid
1 [0.6539893, 0.56968, 0.63451029, 0.61879454, 0... 0.609 {'n_estimators': 500, 'learning_rate': 0.05, '... 0.692121 0.375190 expert_grid
2 [0.65268997, 0.55835934, 0.63858404, 0.6084628... 0.605 {'n_estimators': 199, 'learning_rate': 0.04851... 0.678374 0.392884 hyperopt_tpe
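
To compare the three strategies at a glance, a minimal seaborn sketch (using the results_df loaded above) plots the test RMSE per optimization method:

In [ ]:
# Sketch: bar plot of the held-out test RMSE for each optimization strategy
sns.barplot(x='opt_method', y='test_score', data=results_df)
plt.ylabel('test RMSE')
plt.show()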