The accompanying notebook for the "How We Optimize Hyperparameters At Qucit" blog post.
Running the optimization part takes a long time (about a day).

Load libraries and set constants

Load libraries


In [2]:
# Data analysis imports
import pandas as pd
import numpy as np

# Machine learning imports
import xgboost as xgb
import sklearn
from sklearn.model_selection import GridSearchCV, train_test_split, cross_val_score
from sklearn.metrics import mean_squared_error, make_scorer

# Hyperopt imports
from hyperopt import fmin, tpe, hp, STATUS_OK, Trials, rand

# Plotting imports
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [3]:
pd.set_option('max_colwidth',100)
sns.set(font_scale=1.5)

Some constants


In [4]:
SEED = 314
TEST_SIZE = 0.2
MAX_EVALS = 10

Hyperparameter grids


In [5]:
gs_hp_grid = {'max_depth': [4, 6, 8, 10],
              'n_estimators': [10, 15, 20, 25],
              'learning_rate': [0.2, 0.4, 0.6, 0.8],
              'gamma': [0.2, 0.4, 0.6, 0.8]}

hyperopt_hp_grid = {'n_estimators': hp.quniform('n_estimators', 10, 1000, 1),
                    'learning_rate': hp.loguniform('learning_rate', 0.001, 0.1),
                    'max_depth': hp.quniform('max_depth', 3, 15, 1),
                    'gamma': hp.loguniform('gamma', 0.01, 1)}
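
Note that hp.loguniform(label, low, high) draws exp(x) with x sampled uniformly from [low, high], and hp.quniform returns floats, so the raw draws are not directly usable as XGBoost parameters; the transform_params helper defined further down maps them back. The optional sketch below (not part of the original notebook) samples one raw configuration to make this convention visible.

In [ ]:
# Optional illustration (not in the original post): draw one raw configuration
# from the hyperopt search space. gamma and learning_rate come out as
# exp(uniform(low, high)) and n_estimators / max_depth come out as floats,
# which is why transform_params (defined below) applies np.log and int().
from hyperopt.pyll.stochastic import sample

raw_params = sample(hyperopt_hp_grid)
print(raw_params)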

Some utility functions

Score


In [6]:
# make_scorer defaults to greater_is_better=True, so mse_scorer returns plain
# (positive) MSE values. That is fine for cross_val_score, where the scores are
# only reported, but GridSearchCV maximizes its scoring function, so the grid
# search below uses 'neg_mean_squared_error' instead.
mse_scorer = make_scorer(mean_squared_error)

Data loading


In [7]:
def load_airlines_delays():
    airlines_df = pd.read_csv('../data/DelayedFlights.csv.zip', compression='zip').dropna()
    # Drop the index column and the delay columns that leak the target
    features = airlines_df.drop(['Unnamed: 0', 'ArrDelay', 'CarrierDelay',
                                 'WeatherDelay', 'NASDelay', 'SecurityDelay',
                                 'LateAircraftDelay'], axis=1)
    # Keep only the numeric columns (drop the object-typed ones)
    columns = features.columns[features.dtypes != 'object']
    targets = airlines_df[['ArrDelay']]
    return features.loc[:, columns], targets

In [8]:
def compute_rmse(model, features, targets):
    prediction = model.predict(features)
    rmse = np.sqrt(mean_squared_error(targets, prediction))
    return rmse

def train_grid_search(cv_parameters, features, targets):
    xgb_regressor = xgb.XGBRegressor()
    # GridSearchCV *maximizes* its scoring function, so use the negated MSE
    # rather than mse_scorer (which returns positive MSE values).
    grid_search = GridSearchCV(xgb_regressor, cv_parameters, cv=5,
                               verbose=1,
                               n_jobs=4, scoring='neg_mean_squared_error')
    grid_search.fit(features, targets)
    return grid_search

Hyperopt


In [9]:
def transform_params(params):
    # hp.loguniform returns exp(x) with x uniform on the given interval, so take
    # the log to recover the intended gamma and learning_rate ranges.
    params["gamma"] = np.log(params["gamma"])
    params["learning_rate"] = np.log(params["learning_rate"])
    # hp.quniform returns floats; XGBoost expects integers for these parameters.
    params["n_estimators"] = int(params["n_estimators"])
    params["max_depth"] = int(params["max_depth"])
    return params

In [10]:
def loss(params):
    params = transform_params(params)
    xgb_regressor = xgb.XGBRegressor(silent=False, **params)
    cv_mse = cross_val_score(xgb_regressor, train_features, train_targets,
                             cv=5, verbose=0, n_jobs=4,
                             scoring=mse_scorer)
    rmse = np.sqrt(cv_mse.mean())
    return {'loss': rmse,
            'status': STATUS_OK}

def optimize(trials, space):
    best = fmin(loss, space, algo=tpe.suggest,
                trials=trials,
                max_evals=MAX_EVALS)
    return best

def random_optimize(trials, space):
    best = fmin(loss, space, algo=rand.suggest,
                trials=trials,
                max_evals=MAX_EVALS)
    return best

Model results


In [11]:
def get_model_results(hyperparameters):
    xgb_regressor = xgb.XGBRegressor(**hyperparameters)
    mse_cv_scores = cross_val_score(xgb_regressor, train_features, train_targets,
                                    cv=5, verbose=0,
                                    n_jobs=4, scoring=mse_scorer)
    rmse_cv_scores = np.sqrt(mse_cv_scores)
    xgb_regressor.fit(train_features, train_targets)
    train_rmse = compute_rmse(xgb_regressor, train_features, train_targets)
    test_rmse = compute_rmse(xgb_regressor, test_features, test_targets)
    return {'optimal_hyperparameters': hyperparameters,
            'train_rmse': train_rmse,
            'mean_cv_rmse': rmse_cv_scores.mean(),
            # Standard error of the per-fold RMSE values
            'std_cv_rmse': rmse_cv_scores.std() / np.sqrt(len(rmse_cv_scores)),
            'test_rmse': test_rmse}

Load data and train/test split


In [12]:
features, targets = load_airlines_delays()
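
A quick optional check (not in the original notebook) confirms the size of the dataset and that only numeric columns survived the filtering.

In [ ]:
# Optional sanity check (not in the original post): the feature matrix should
# contain only numeric columns after dropping the object-typed ones.
print(features.shape, targets.shape)
print(features.dtypes.value_counts())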

In [13]:
train_features, test_features, train_targets, test_targets = train_test_split(features, targets, 
                                                                              test_size=TEST_SIZE, 
                                                                              random_state=SEED)

Optimize hyperparameters


In [ ]:
grid_search = train_grid_search(gs_hp_grid, train_features, train_targets)
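
Once the grid search has finished, its best configuration and score can be inspected directly; the sketch below is an optional addition, not part of the original notebook.

In [ ]:
# Optional check (not in the original post): with 'neg_mean_squared_error'
# scoring, best_score_ is a negative MSE, so the corresponding RMSE is
# np.sqrt(-grid_search.best_score_).
print(grid_search.best_params_)
print(np.sqrt(-grid_search.best_score_))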

In [159]:
trials = Trials()
hyperopt_optimal_hp = optimize(trials, hyperopt_hp_grid)
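
The Trials object records one entry per evaluation, which makes it easy to see how the cross-validated RMSE evolves over the search; the sketch below is an optional addition, not part of the original notebook.

In [ ]:
# Optional sketch (not in the original post): plot the cross-validated RMSE of
# each hyperopt evaluation. trials.losses() returns the 'loss' value recorded
# by the loss function for every trial.
plt.plot(trials.losses())
plt.xlabel('evaluation')
plt.ylabel('cross-validated RMSE')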

In [ ]:
# fmin returns the raw values drawn from the search space, so they need the
# same transformation as inside the loss function before being used to fit.
hyperopt_optimal_hp = transform_params(hyperopt_optimal_hp)

Results


In [165]:
def get_results_df():
    optimization_methods = ['grid_search', 'hyperopt_tpe']
    optimal_hyperparameters = [grid_search.best_params_,
                               hyperopt_optimal_hp]
    results = [get_model_results(optimal_hp) for optimal_hp in optimal_hyperparameters]
    return (pd.DataFrame(results)
              .assign(opt_method=lambda df: pd.Series(optimization_methods))
              .loc[:, ['optimal_hyperparameters', 'test_rmse',
                       'mean_cv_rmse', 'std_cv_rmse',
                       'train_rmse', 'opt_method']])

In [166]:
results_df = get_results_df()

Save results


In [169]:
results_df.to_csv('../data/airlines_opt_hp_results.csv', index=False)