In this notebook, we explore different techniques for optimizing hyperparameters, using an IMDb dataset to predict the score each movie received.
In [2]:
# Data analysis imports
import pandas as pd
import numpy as np
# Machine learning imports
import xgboost as xgb
from sklearn.grid_search import GridSearchCV
from sklearn.model_selection import train_test_split, cross_val_score
# Hyperopt imports
from hyperopt import fmin, tpe, hp, STATUS_OK, Trials
# Plotting imports
import matplotlib.pyplot as plt
import seaborn as sns
% matplotlib inline
In [5]:
# Shared settings for the experiments in this notebook.
SEED = 314        # random_state used for the train/test split
TEST_SIZE = 0.2   # fraction of rows held out as the test set
MAX_EVALS = 100   # evaluation budget given to hyperopt's fmin
In [6]:
# Some utility functions
def compute_rmse(model, features, targets):
    """Return the root-mean-squared error of ``model`` on ``features``.

    Parameters
    ----------
    model : object exposing ``predict(features)``.
    features : inputs handed unchanged to ``model.predict``.
    targets : true values, compared element-wise with the predictions.

    Returns
    -------
    The square root of the mean squared difference between the
    predictions and ``targets``.
    """
    residuals = model.predict(features) - targets
    return np.sqrt(np.mean(np.square(residuals)))
def train_grid_search(cv_parameters, features, targets, cv=5, n_jobs=4):
    """Fit an exhaustive grid search over XGBoost regressor hyperparameters.

    Parameters
    ----------
    cv_parameters : dict mapping XGBRegressor parameter names to the list of
        candidate values to try.
    features : training inputs passed to ``GridSearchCV.fit``.
    targets : training targets passed to ``GridSearchCV.fit``.
    cv : number of cross-validation folds (defaults to 5).
    n_jobs : number of parallel jobs used by the search (defaults to 4).

    Returns
    -------
    The fitted ``GridSearchCV`` object. No ``scoring`` is given, so candidates
    are ranked with the estimator's default scorer.
    """
    # The regressor is built with its defaults. A dict such as
    # {'eval_score': 'rmse'} must not be passed positionally: it would bind to
    # the constructor's first parameter rather than configure a metric, and
    # 'eval_score' is not an XGBoost parameter name.
    xgb_regressor = xgb.XGBRegressor()
    grid_search = GridSearchCV(xgb_regressor, cv_parameters, cv=cv, verbose=1, n_jobs=n_jobs)
    grid_search.fit(features, targets)
    return grid_search
In [7]:
# Load the preprocessed movie metadata used throughout the notebook.
imdb_df = pd.read_csv(filepath_or_buffer='../data/processed_movie_metadata.csv')
In [8]:
# Preview the first rows of the loaded data.
imdb_df.head()
Out[8]:
In [9]:
# Hold out TEST_SIZE of the rows for testing; the split is seeded with SEED.
train_df, test_df = train_test_split(
    imdb_df,
    test_size=TEST_SIZE,
    random_state=SEED
)
In [10]:
# Separate the 'imdb_score' column (the prediction target) from the
# remaining columns (the model inputs), for both splits.
train_targets = train_df['imdb_score']
train_features = train_df.drop(['imdb_score'], axis=1)
test_targets = test_df['imdb_score']
test_features = test_df.drop(['imdb_score'], axis=1)
In [11]:
# Grid of candidate values for the first ("naive") grid search.
naive_cv_parameters = dict(
    max_depth=[4, 6, 8, 10],
    n_estimators=[10, 15, 20, 25],
    learning_rate=[0.2, 0.4, 0.6, 0.8],
    gamma=[0.2, 0.4, 0.6, 0.8]
)
In [12]:
# Run the grid search over the naive grid on the training split.
naive_gs = train_grid_search(
    cv_parameters=naive_cv_parameters,
    features=train_features,
    targets=train_targets
)
This grid search is based on the recommendations of the following article:
In [13]:
# Grid of candidate values for the second ("expert") grid search.
expert_cv_parameters = dict(
    max_depth=[4, 6, 10, 15],
    n_estimators=[10, 50, 100, 500],
    learning_rate=[0.01, 0.025, 0.05, 0.1],
    gamma=[0.05, 0.5, 0.9, 1.]
)
In [14]:
# Run the grid search over the expert grid on the training split.
expert_gs = train_grid_search(
    cv_parameters=expert_cv_parameters,
    features=train_features,
    targets=train_targets
)
In [15]:
def score(params):
    """Hyperopt objective: negated mean 5-fold CV score of an XGBoost regressor.

    Parameters
    ----------
    params : dict with the keys ``gamma``, ``learning_rate``,
        ``n_estimators`` and ``max_depth`` (one sample of the search space).
        ``gamma`` and ``learning_rate`` are passed through ``np.log`` and
        ``n_estimators`` / ``max_depth`` are cast to ``int`` before being
        given to the regressor. The dict itself is not modified.

    Returns
    -------
    dict with ``'loss'`` (the negated mean cross-validation score on the
    training split) and ``'status'`` (``STATUS_OK``).
    """
    # Work on a copy: transforming the caller's dict in place would apply the
    # log a second time if the same dict were ever scored again.
    model_params = dict(params)
    model_params["gamma"] = np.log(params["gamma"])
    model_params["learning_rate"] = np.log(params["learning_rate"])
    model_params["n_estimators"] = int(params["n_estimators"])
    model_params["max_depth"] = int(params["max_depth"])
    xgb_regressor = xgb.XGBRegressor(silent=False, **model_params)
    # No `scoring` is passed, so the estimator's default scorer is used.
    mean_cv_score = cross_val_score(xgb_regressor, train_features, train_targets,
                                    cv=5, verbose=0,
                                    n_jobs=4).mean()
    # fmin minimises the loss, so a higher score must map to a lower loss;
    # the plain negation is used here rather than 1 - score.
    loss = -mean_cv_score
    return {'loss': loss,
            'status': STATUS_OK}
In [16]:
def optimize(trials):
    """Minimise ``score`` over the XGBoost search space with hyperopt's TPE.

    Parameters
    ----------
    trials : object passed to ``fmin`` as its ``trials`` argument.

    Returns
    -------
    Whatever ``fmin`` returns after ``MAX_EVALS`` evaluations.
    """
    # hp.loguniform(label, low, high) draws exp(uniform(low, high)); `score`
    # applies np.log to 'learning_rate' and 'gamma' before using them.
    search_space = {
        'n_estimators': hp.quniform('n_estimators', 10, 1000, 1),
        'learning_rate': hp.loguniform('learning_rate', 0.01, 1),
        'max_depth': hp.quniform('max_depth', 3, 15, 1),
        'gamma': hp.loguniform('gamma', 0.01, 1),
    }
    return fmin(score, search_space, algo=tpe.suggest,
                trials=trials,
                max_evals=MAX_EVALS)
In [17]:
# Run the hyperopt search; `trials` is the Trials object handed to fmin.
trials = Trials()
optimal_param = optimize(trials=trials)
In [22]:
# Turn the values returned by the search into XGBRegressor arguments, using
# the same int casts and log transforms that `score` applies.
hyperopt_params = dict(
    max_depth=int(optimal_param['max_depth']),
    n_estimators=int(optimal_param['n_estimators']),
    learning_rate=np.log(optimal_param['learning_rate']),
    gamma=np.log(optimal_param['gamma'])
)
In [25]:
def get_model_results(hyperparameters):
    """Evaluate an XGBoost regressor built from ``hyperparameters``.

    The model is cross-validated (5 folds) on the training split, then fitted
    on the whole training split and evaluated with ``compute_rmse`` on both
    the training and the test split.

    Parameters
    ----------
    hyperparameters : dict of keyword arguments for ``xgb.XGBRegressor``.

    Returns
    -------
    dict with the keys ``'hyperparameters'``, ``'cv_scores'``,
    ``'train_score'``, ``'mean_cv_score'`` and ``'test_score'``. The CV
    entries come from ``cross_val_score`` with no ``scoring`` argument; the
    train/test entries are RMSE values.
    """
    model = xgb.XGBRegressor(**hyperparameters)
    fold_scores = cross_val_score(model, train_features, train_targets,
                                  cv=5, verbose=0, n_jobs=4)
    model.fit(train_features, train_targets)

    # Keys are inserted in the same order as the summary columns are expected.
    summary = {'hyperparameters': hyperparameters, 'cv_scores': fold_scores}
    summary['train_score'] = compute_rmse(model, train_features, train_targets)
    summary['mean_cv_score'] = fold_scores.mean()
    summary['test_score'] = compute_rmse(model, test_features, test_targets)
    return summary
In [ ]:
# Pair each optimisation method's label with the hyperparameters it selected.
configurations = dict(
    model=['naive_grid', 'expert_grid', 'hyperopt_tpe'],
    hyperparameters=[naive_gs.best_params_, expert_gs.best_params_, hyperopt_params]
)
In [ ]:
# Evaluate every selected configuration. The loop variable is deliberately not
# called `hp`: that name is the hyperopt module imported at the top of the
# notebook, and reusing it shadows the import (and overwrites it on Python 2,
# where comprehension variables leak into the enclosing scope).
results = [get_model_results(hyperparameters)
           for hyperparameters in configurations['hyperparameters']]
In [33]:
# One row per configuration, labelled with the optimisation method used.
opt_method_labels = pd.Series(['naive_grid', 'expert_grid', 'hyperopt_tpe'])
results_df = pd.DataFrame(results).assign(opt_method=opt_method_labels)
In [35]:
# Persist the comparison table. The path carries the `.csv` extension so that
# it is the same file the following cell reads back with pd.read_csv.
results_df.to_csv('../data/hyperparmaters_selection_results.csv', index=False)
In [35]:
# Reload the saved comparison table from disk.
results_df = pd.read_csv(filepath_or_buffer='../data/hyperparmaters_selection_results.csv')
In [37]:
# Display the comparison table.
results_df
Out[37]: