In this notebook, we explore different techniques for hyperparameter optimization, working with an IMDb dataset and training an XGBoost regressor to predict the IMDb score each movie received.


In [2]:
# Data analysis imports
import pandas as pd
import numpy as np

# Machine learning imports
import xgboost as xgb
from sklearn.model_selection import GridSearchCV, train_test_split, cross_val_score

# Hyperopt imports
from hyperopt import fmin, tpe, hp, STATUS_OK, Trials

# Plotting imports
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [5]:
# Some constants
SEED = 314
TEST_SIZE = 0.2
MAX_EVALS = 100

In [6]:
# Some utility functions
def compute_rmse(model, features, targets):
    prediction = model.predict(features)
    rmse = np.sqrt(np.mean((prediction - targets) ** 2))
    return rmse

def train_grid_search(cv_parameters, features, targets):
    # Squared-error objective; RMSE is computed separately via compute_rmse
    xgb_regressor = xgb.XGBRegressor(objective='reg:squarederror')
    grid_search = GridSearchCV(xgb_regressor, cv_parameters, cv=5, verbose=1, n_jobs=4)
    grid_search.fit(features, targets)
    return grid_search

Load processed data


In [7]:
imdb_df = pd.read_csv('../data/processed_movie_metadata.csv')

In [8]:
imdb_df.head()


Out[8]:
num_critic_for_reviews duration director_facebook_likes actor_3_facebook_likes actor_1_facebook_likes num_voted_users num_user_for_reviews title_year actor_2_facebook_likes imdb_score ... Not Rated PG PG-13 Passed R TV-14 TV-G TV-PG Unrated X
0 723.0 178.0 0.0 855.0 1000.0 886204 3054.0 2009.0 936.0 7.9 ... 0 0 1 0 0 0 0 0 0 0
1 302.0 169.0 563.0 1000.0 40000.0 471220 1238.0 2007.0 5000.0 7.1 ... 0 0 1 0 0 0 0 0 0 0
2 602.0 148.0 0.0 161.0 11000.0 275868 994.0 2015.0 393.0 6.8 ... 0 0 1 0 0 0 0 0 0 0
3 813.0 164.0 22000.0 23000.0 27000.0 1144337 2701.0 2012.0 23000.0 8.5 ... 0 0 1 0 0 0 0 0 0 0
4 462.0 132.0 475.0 530.0 640.0 212204 738.0 2012.0 632.0 6.6 ... 0 0 1 0 0 0 0 0 0 0

5 rows × 145 columns

Train - test split


In [9]:
train_df, test_df = train_test_split(imdb_df, test_size=TEST_SIZE, random_state=SEED)

In [10]:
train_features = train_df.drop('imdb_score', axis=1)
train_targets = train_df.loc[:, 'imdb_score']
test_features = test_df.drop('imdb_score', axis=1)
test_targets = test_df.loc[:, 'imdb_score']

Naive grid search


In [11]:
naive_cv_parameters = {'max_depth': [4, 6, 8, 10],
                       'n_estimators': [10, 15, 20, 25],
                       'learning_rate': [0.2, 0.4, 0.6, 0.8],
                       'gamma': [0.2, 0.4, 0.6, 0.8]
}

In [12]:
naive_gs = train_grid_search(naive_cv_parameters, train_features, train_targets)


Fitting 5 folds for each of 256 candidates, totalling 1280 fits
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    9.2s
[Parallel(n_jobs=4)]: Done 192 tasks      | elapsed:   56.5s
[Parallel(n_jobs=4)]: Done 442 tasks      | elapsed:  2.2min
[Parallel(n_jobs=4)]: Done 792 tasks      | elapsed:  3.7min
[Parallel(n_jobs=4)]: Done 1242 tasks      | elapsed:  5.4min
[Parallel(n_jobs=4)]: Done 1280 out of 1280 | elapsed:  5.6min finished

Expert grid search

This grid search is based on the recommendations of the following article:


In [13]:
expert_cv_parameters = {'max_depth': [4, 6, 10, 15],
                        'n_estimators': [10, 50, 100, 500],
                        'learning_rate': [0.01, 0.025, 0.05, 0.1],
                        'gamma': [0.05, 0.5, 0.9, 1.]
}

In [14]:
expert_gs = train_grid_search(expert_cv_parameters, train_features, train_targets)


Fitting 5 folds for each of 256 candidates, totalling 1280 fits
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:  1.1min
[Parallel(n_jobs=4)]: Done 192 tasks      | elapsed:  8.3min
[Parallel(n_jobs=4)]: Done 442 tasks      | elapsed: 18.7min
[Parallel(n_jobs=4)]: Done 792 tasks      | elapsed: 33.8min
[Parallel(n_jobs=4)]: Done 1242 tasks      | elapsed: 53.5min
[Parallel(n_jobs=4)]: Done 1280 out of 1280 | elapsed: 56.0min finished
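
As an illustrative check, the winning configuration and its mean cross-validation score can be read off each fitted GridSearchCV via its best_params_ and best_score_ attributes:

In [ ]:
# Inspect the best configuration found by each grid search
print('Naive grid search :', naive_gs.best_params_, naive_gs.best_score_)
print('Expert grid search:', expert_gs.best_params_, expert_gs.best_score_)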

Hyperopt

  • score => the objective function f(x) that Hyperopt minimizes (here the negated mean cross-validation score)
  • optimize => defines the hyperparameter space and the optimization strategy (here TPE)

In [15]:
def score(params):
    # hp.quniform returns floats, so cast the integer-valued hyperparameters
    params["n_estimators"] = int(params["n_estimators"])
    params["max_depth"] = int(params["max_depth"])
    xgb_regressor = xgb.XGBRegressor(silent=False, **params)
    # cross_val_score uses the regressor's default R^2 scoring;
    # negate the mean so that Hyperopt minimizes the loss
    score = cross_val_score(xgb_regressor, train_features, train_targets, 
                            cv=5, verbose=0, 
                            n_jobs=4).mean()
    loss = - score
    return {'loss': loss, 
            'status': STATUS_OK}

In [16]:
def optimize(trials):
    # hp.loguniform expects bounds in log space: values are sampled between exp(low) and exp(high)
    space = {'n_estimators' : hp.quniform('n_estimators', 10, 1000, 1),
             'learning_rate' : hp.loguniform('learning_rate', np.log(0.01), np.log(1)),
             'max_depth' : hp.quniform('max_depth', 3, 15, 1),
             'gamma': hp.loguniform('gamma', np.log(0.01), np.log(1))}
    best = fmin(score, space, algo=tpe.suggest, 
                trials=trials, 
                max_evals=MAX_EVALS)
    return best

In [17]:
trials = Trials()
optimal_param = optimize(trials)
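
The Trials object keeps a record of every evaluation, so the search can be inspected afterwards. As a minimal sketch, trials.losses() returns the loss of each evaluation and can be plotted with the matplotlib import from the top of the notebook:

In [ ]:
# Sketch: plot the loss (negated mean CV score) recorded for each of the
# MAX_EVALS evaluations performed by the TPE search
plt.figure(figsize=(8, 4))
plt.plot(trials.losses(), marker='.')
plt.xlabel('evaluation')
plt.ylabel('loss (- mean CV score)')
plt.title('Hyperopt TPE search trace')
plt.show()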

In [22]:
hyperopt_params = {'max_depth': int(optimal_param['max_depth']),
                   'n_estimators': int(optimal_param['n_estimators']),
                   'learning_rate': optimal_param['learning_rate'],
                   'gamma': optimal_param['gamma']
                   }

Results


In [25]:
def get_model_results(hyperparameters):
    xgb_regressor = xgb.XGBRegressor(**hyperparameters)
    # cv_scores use the regressor's default R^2 scoring, while train/test scores are RMSE
    cv_scores = cross_val_score(xgb_regressor, train_features, train_targets, cv=5, verbose=0, n_jobs=4)
    xgb_regressor.fit(train_features, train_targets)
    train_score = compute_rmse(xgb_regressor, train_features, train_targets)
    test_score = compute_rmse(xgb_regressor, test_features, test_targets)
    return {'hyperparameters': hyperparameters,
            'cv_scores': cv_scores, 
            'train_score': train_score,
            'mean_cv_score' : cv_scores.mean(),
            'test_score': test_score}

In [ ]:
configurations = {'model':['naive_grid', 'expert_grid', 'hyperopt_tpe'], 
                  'hyperparameters': [naive_gs.best_params_, expert_gs.best_params_, hyperopt_params]}

In [ ]:
results = [get_model_results(params) for params in configurations['hyperparameters']]

In [33]:
results_df = pd.DataFrame(results).assign(opt_method=configurations['model'])

Save the results


In [35]:
results_df.to_csv('../data/hyperparameters_selection_results.csv', index=False)

Load the results


In [35]:
results_df = pd.read_csv('../data/hyperparameters_selection_results.csv')

In [37]:
results_df


Out[37]:
cv_scores mean_cv_score hyperparameters test_score train_score opt_method
0 [0.62770871, 0.53916399, 0.59777503, 0.5928037... 0.583 {'n_estimators': 25, 'learning_rate': 0.2, 'ma... 0.706723 0.529957 naive_grid
1 [0.6539893, 0.56968, 0.63451029, 0.61879454, 0... 0.609 {'n_estimators': 500, 'learning_rate': 0.05, '... 0.692121 0.375190 expert_grid
2 [0.65268997, 0.55835934, 0.63858404, 0.6084628... 0.605 {'n_estimators': 199, 'learning_rate': 0.04851... 0.678374 0.392884 hyperopt_tpe
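
To compare the three strategies at a glance, a minimal seaborn sketch (using the results_df loaded above) plots the test RMSE per optimization method:

In [ ]:
# Sketch: bar plot of the held-out test RMSE for each optimization strategy
sns.barplot(x='opt_method', y='test_score', data=results_df)
plt.ylabel('test RMSE')
plt.show()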