The accompanying notebook for the "How We Optimize Hyperparameters At Qucit" blog post.
Running the optimization part takes a long time (about a day).
In [2]:
# Data analysis imports
import pandas as pd
import numpy as np
# Machine learning imports
import xgboost as xgb
import sklearn
from sklearn.grid_search import GridSearchCV
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import mean_squared_error, make_scorer
# Hyperopt imports
from hyperopt import fmin, tpe, hp, STATUS_OK, Trials, rand
# Plotting imports
import matplotlib.pyplot as plt
import seaborn as sns
% matplotlib inline
In [3]:
# Raise pandas' column display width limit to 100 characters.
pd.set_option('max_colwidth',100)
# Scale seaborn's plot fonts by a factor of 1.5.
sns.set(font_scale=1.5)
In [4]:
# Passed as random_state to train_test_split so the train/test split is repeatable.
SEED = 314
# Fraction of the rows held out as the test set.
TEST_SIZE = 0.2
# Number of objective evaluations fmin may perform in one hyperopt search.
MAX_EVALS = 10
In [5]:
# Exhaustive grid handed to GridSearchCV: 4 candidate values for each of the
# 4 hyperparameters, i.e. 256 combinations.
gs_hp_grid = {'max_depth':[4, 6, 8, 10],
'n_estimators': [10, 15, 20, 25],
'learning_rate': [0.2, 0.4, 0.6, 0.8],
'gamma': [0.2, 0.4, 0.6, 0.8]
}
# Search space handed to hyperopt's fmin. Values sampled from it are not used
# directly: transform_params takes np.log of 'gamma' and 'learning_rate' and casts
# 'n_estimators' and 'max_depth' to int before they reach XGBRegressor.
# NOTE(review): hyperopt documents hp.loguniform(label, low, high) as
# exp(uniform(low, high)); followed by the np.log in transform_params this appears to
# give a plain uniform draw on [low, high] rather than a log-scaled one. Confirm that
# this is the intended distribution.
hyperopt_hp_grid = {'n_estimators' : hp.quniform('n_estimators', 10, 1000, 1),
'learning_rate' : hp.loguniform('learning_rate', 0.001, 0.1),
'max_depth' : hp.quniform('max_depth', 3, 15, 1),
'gamma': hp.loguniform('gamma', 0.01, 1)}
In [6]:
# Scorer built from mean_squared_error with make_scorer's default settings, so the
# scores it produces are plain (positive) MSE values; the cross-validation code in
# this notebook takes np.sqrt of them to obtain RMSE.
# NOTE(review): scikit-learn's search utilities treat larger scores as better, so
# verify any use of this scorer for model selection rather than for reporting.
mse_scorer = make_scorer(mean_squared_error)
In [7]:
def load_airlines_delays(path='../data/DelayedFlights.csv.zip'):
    """Load the delayed-flights dataset and split it into features and target.

    Rows containing any missing value are dropped before the split.

    Parameters
    ----------
    path : str, optional
        Location of the zip-compressed CSV file. Defaults to the path the
        notebook has always used, so existing calls keep working.

    Returns
    -------
    features : pandas.DataFrame
        The numeric and boolean columns that remain after removing the
        'Unnamed: 0' column, the target and the five ``*Delay`` columns
        listed below.
    targets : pandas.DataFrame
        Single-column frame holding 'ArrDelay'.
    """
    airlines_df = pd.read_csv(path, compression='zip').dropna()
    excluded_columns = ['Unnamed: 0', 'ArrDelay', 'CarrierDelay',
                        'WeatherDelay', 'NASDelay', 'SecurityDelay',
                        'LateAircraftDelay']
    # Keep only numeric/boolean columns. Whitelisting the usable dtypes (instead of
    # blacklisting 'object') also rejects text columns stored under a dedicated
    # string dtype, which the regressor cannot consume either.
    features = (airlines_df
                .drop(excluded_columns, axis=1)
                .select_dtypes(include=['number', 'bool']))
    targets = airlines_df[['ArrDelay']]
    return features, targets
In [8]:
def compute_rmse(model, features, targets):
    """Return the root mean squared error of ``model``'s predictions.

    ``model.predict(features)`` is compared against ``targets`` with
    ``mean_squared_error`` and the square root of that value is returned.
    """
    mse = mean_squared_error(targets, model.predict(features))
    return np.sqrt(mse)
def train_grid_search(cv_parameters, features, targets):
    """Fit a 5-fold cross-validated grid search over XGBRegressor hyperparameters.

    Parameters
    ----------
    cv_parameters : dict
        Mapping of XGBRegressor parameter names to the lists of values to try.
    features, targets
        Training data forwarded to ``GridSearchCV.fit``.

    Returns
    -------
    GridSearchCV
        The fitted search object; ``best_params_`` holds the combination with
        the lowest cross-validated MSE.
    """
    # GridSearchCV keeps the candidate with the HIGHEST score. A scorer built with
    # make_scorer's default greater_is_better=True would therefore select the
    # combination with the largest MSE, so the error is negated here.
    neg_mse_scorer = make_scorer(mean_squared_error, greater_is_better=False)
    # No positional arguments: a dict passed positionally would be bound to the
    # estimator's first parameter (or rejected) instead of configuring a metric.
    xgb_regressor = xgb.XGBRegressor()
    grid_search = GridSearchCV(xgb_regressor, cv_parameters, cv=5,
                               verbose=1,
                               n_jobs=4, scoring=neg_mse_scorer)
    grid_search.fit(features, targets)
    return grid_search
In [9]:
def transform_params(params):
    """Convert values sampled from the hyperopt space into XGBRegressor arguments.

    'gamma' and 'learning_rate' are replaced by their natural logarithm, and
    'n_estimators' and 'max_depth' are cast to int. Any other key is passed
    through unchanged.

    Parameters
    ----------
    params : dict
        Sampled hyperparameters; must contain the four keys named above.

    Returns
    -------
    dict
        A new dict with the converted values. The input dict is left untouched,
        so calling this function repeatedly on the same raw sample always gives
        the same result instead of compounding the logarithm.
    """
    transformed = dict(params)
    transformed["gamma"] = np.log(params["gamma"])
    transformed["learning_rate"] = np.log(params["learning_rate"])
    transformed["n_estimators"] = int(params["n_estimators"])
    transformed["max_depth"] = int(params["max_depth"])
    return transformed
In [10]:
def loss(params):
    """Objective minimised by hyperopt: cross-validated RMSE of an XGBRegressor.

    The sampled ``params`` are converted with ``transform_params``, a regressor is
    built from them and scored with 5-fold cross-validation on the module-level
    ``train_features`` / ``train_targets``. The reported loss is the square root
    of the mean fold MSE.
    """
    model_params = transform_params(params)
    regressor = xgb.XGBRegressor(silent=False, **model_params)
    fold_mse = cross_val_score(regressor, train_features, train_targets,
                               scoring=mse_scorer,
                               cv=5, n_jobs=4, verbose=0)
    return {'loss': np.sqrt(fold_mse.mean()), 'status': STATUS_OK}
def optimize(trials, space):
    """Minimise ``loss`` over ``space`` with hyperopt's TPE algorithm.

    Runs at most ``MAX_EVALS`` evaluations, records them in ``trials`` and
    returns whatever ``fmin`` returns.
    """
    return fmin(fn=loss, space=space,
                algo=tpe.suggest,
                max_evals=MAX_EVALS,
                trials=trials)
def random_optimize(trials, space):
    """Minimise ``loss`` over ``space`` with hyperopt's random search.

    Runs at most ``MAX_EVALS`` evaluations, records them in ``trials`` and
    returns whatever ``fmin`` returns.
    """
    return fmin(fn=loss, space=space,
                algo=rand.suggest,
                max_evals=MAX_EVALS,
                trials=trials)
In [11]:
def get_model_results(hyperparameters):
    """Evaluate one hyperparameter set with cross-validation and on the hold-out data.

    An XGBRegressor built from ``hyperparameters`` is scored with 5-fold
    cross-validation on the module-level training data, then refitted on the full
    training set and scored on both the training and the test sets.

    Parameters
    ----------
    hyperparameters : dict
        Keyword arguments for ``xgb.XGBRegressor``.

    Returns
    -------
    dict
        'optimal_hyperparameters' : the input dict,
        'train_rmse' / 'test_rmse' : RMSE of the refitted model,
        'mean_cv_rmse' : square root of the mean fold MSE,
        'std_cv_rmse' : standard deviation of the per-fold RMSE values divided
        by sqrt(number of folds).
    """
    xgb_regressor = xgb.XGBRegressor(**hyperparameters)
    mse_cv_scores = cross_val_score(xgb_regressor, train_features, train_targets,
                                    cv=5, verbose=0,
                                    n_jobs=4, scoring=mse_scorer)
    rmse_cv_scores = np.sqrt(mse_cv_scores)
    xgb_regressor.fit(train_features, train_targets)
    train_rmse = compute_rmse(xgb_regressor, train_features, train_targets)
    test_rmse = compute_rmse(xgb_regressor, test_features, test_targets)
    # The spread is computed from the per-fold RMSE values so that it is expressed
    # in the same unit as the other *_rmse entries (the MSE spread is in squared units).
    std_cv_rmse = rmse_cv_scores.std() / float(np.sqrt(len(rmse_cv_scores)))
    return {'optimal_hyperparameters': hyperparameters,
            'train_rmse': train_rmse,
            'mean_cv_rmse': np.sqrt(mse_cv_scores.mean()),
            'std_cv_rmse': std_cv_rmse,
            'test_rmse': test_rmse}
In [12]:
# Read the dataset once; `features` and `targets` feed the train/test split below.
features, targets = load_airlines_delays()
In [13]:
# Hold out TEST_SIZE of the rows as the test set; random_state=SEED fixes the split.
# The four resulting names are read as globals by loss() and get_model_results().
train_features, test_features, train_targets, test_targets = train_test_split(features, targets,
test_size=TEST_SIZE,
random_state=SEED)
In [ ]:
# Exhaustive search over gs_hp_grid on the training data; get_results_df() later
# reads grid_search.best_params_.
grid_search = train_grid_search(gs_hp_grid, train_features, train_targets)
In [159]:
# Trials object that fmin fills with the history of evaluated points.
trials = Trials()
# TPE search over hyperopt_hp_grid. The next cell applies transform_params to obtain
# the hyperparameters that are finally passed to XGBRegressor.
hyperopt_optimal_hp = optimize(trials, hyperopt_hp_grid)
In [ ]:
# Derive the final hyperparameters from the recorded trials rather than from
# `hyperopt_optimal_hp` itself: feeding the variable back into transform_params
# would apply np.log a second time whenever this cell is re-run.
# `trials.argmin` is the raw sampled point of the best trial, i.e. what fmin returned.
hyperopt_optimal_hp = transform_params(dict(trials.argmin))
In [165]:
def get_results_df():
    """Build the comparison table of the two optimisation methods.

    Evaluates ``grid_search.best_params_`` and ``hyperopt_optimal_hp`` (both read
    from module scope) with ``get_model_results`` and returns one row per method,
    labelled in the 'opt_method' column.
    """
    hp_by_method = [('grid_search', grid_search.best_params_),
                    ('hyperopt_tpe', hyperopt_optimal_hp)]
    column_order = ['optimal_hyperparameters', 'test_rmse',
                    'mean_cv_rmse', 'std_cv_rmse',
                    'train_rmse', 'opt_method']

    rows = [get_model_results(hp) for _, hp in hp_by_method]
    results = pd.DataFrame(rows)
    # Aligned on the default integer index, exactly one label per result row.
    results['opt_method'] = pd.Series([method for method, _ in hp_by_method])
    return results.loc[:, column_order]
In [166]:
# Re-evaluates both hyperparameter sets (cross-validation plus a full refit each).
results_df = get_results_df()
In [169]:
# Persist the comparison table next to the input data, without the index column.
results_df.to_csv('../data/airlines_opt_hp_results.csv', index=False)