The overview is that we will split the dataset into args.num_folds distinct partitions ("folds"). (This description uses args.num_folds=10.) Eight of the folds are used for training, one for validation, and one for testing. This is performed for each set of hyperparameters. Finally, the process is repeated so that each fold serves once as the validation set and once as the test set. In this work, when fold i is the validation set, fold (i+1) % 10 is the test set; the remaining folds form the training set. Other strategies are possible, but this one yields exactly one prediction for each data record in both the validation and test roles for each set of hyperparameters.
The rough purpose of each dataset type is as follows: the training set is used to fit the models, the validation set is used to compare hyperparameter settings, and the test set is used to estimate the performance of the finally-selected model on unseen data.
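To make the fold assignment concrete, the short sketch below is purely illustrative (it is not part of the pipeline); it prints which folds play which role for the first few validation fold indices, mirroring the args.num_folds=10 setting used later.
In [ ]:
# purely illustrative: role of each fold when fold i is the validation set
num_folds = 10
for i in range(3):
    test = (i + 1) % num_folds
    train = [f for f in range(num_folds) if f not in {i, test}]
    print("val fold: {}, test fold: {}, training folds: {}".format(i, test, train))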
In [1]:
%load_ext autoreload
%autoreload 2
%matplotlib inline
# initialize a logger for ipython
import pyllars.logging_utils as logging_utils
logger = logging_utils.get_ipython_logger()
# create an argparse namespace to hold parameters
from argparse import Namespace
args = Namespace()
# create (or connect to) a dask cluster
import pyllars.dask_utils as dask_utils
cluster_location = "LOCAL"
cluster_restart = False

dask_utils.add_dask_values_to_args(
    args,
    cluster_location=cluster_location,
    num_procs=3,
    num_threads_per_proc=1
)
dask_client, cluster = dask_utils.connect(args)
# machine learning imports
import sklearn.datasets
import sklearn.model_selection
import sklearn.pipeline
import sklearn.preprocessing
import xgboost
# other tools and helpers
import itertools
import json
In [2]:
import pyllars.ml_utils as ml_utils
In [3]:
import numpy as np
import pandas as pd
In [4]:
args.random_state = 8675309 # several steps require a seed; we use the same one throughout for reproducibility
args.num_folds = 10 # use 10-fold cross-validation
args.evaluation_metric = 'mean_squared_error'
args.selection_strategy = np.argmin
In [5]:
# standard pydata imports
import joblib
import matplotlib.pyplot as plt
import networkx as nx
import numpy as np
import pandas as pd
import pathlib
import seaborn as sns; sns.set(style='white', color_codes=True)
import tqdm
In [6]:
from typing import Any, Callable, Container, Dict, Iterable, List, NamedTuple, Optional, Set, Tuple
In [7]:
# this function is *not* part of ml_utils,
# so we are free to adapt it to our iterator
def evaluate_hyperparameters_helper(
        hv:List,
        args:Namespace,
        estimator_template:sklearn.base.BaseEstimator,
        data:pd.DataFrame,
        collect_metrics:Callable,
        train_folds:Optional[Any]=None,
        split_field:str='fold',
        target_field:str='target',
        target_transform:Optional[Callable]=None,
        target_inverse_transform:Optional[Callable]=None,
        collect_metrics_kwargs:Optional[Dict]=None,
        fields_to_ignore:Optional[Container[str]]=None) -> NamedTuple:

    # these come from our iterator
    hyperparameters = hv[0]
    validation_folds = hv[1]

    # the test fold always follows the validation fold
    test_folds = (validation_folds + 1) % args.num_folds

    res = ml_utils.evaluate_hyperparameters(
        estimator_template=estimator_template,
        hyperparameters=hyperparameters,
        validation_folds=validation_folds,
        test_folds=test_folds,
        data=data,
        collect_metrics=collect_metrics,
        train_folds=train_folds,
        split_field=split_field,
        target_field=target_field,
        target_transform=target_transform,
        target_inverse_transform=target_inverse_transform,
        collect_metrics_kwargs=collect_metrics_kwargs,
        fields_to_ignore=fields_to_ignore
    )

    return res
In [8]:
def load_dataset():
    # N.B. load_boston is deprecated and was removed in scikit-learn 1.2;
    # this example assumes an older version of scikit-learn
    data = sklearn.datasets.load_boston()
    df = pd.DataFrame(data['data'], columns=data['feature_names'])
    df['target'] = data['target']
    return df
In [9]:
df = load_dataset()
###
# Determine the fold of each row.
#
# N.B. This could be performed once in an "offline" step
# and stored as another column in the dataframe.
###
folds = ml_utils.get_cv_folds(
    df['target'],
    num_splits=args.num_folds,
    use_stratified=False, # stratified folds are not defined for regression targets
    shuffle=True,
    random_state=args.random_state # ensure we always shuffle the same way
)

# add a column to indicate the fold of each row
df['fold'] = folds
In [10]:
df.head()
Out[10]:
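As a quick sanity check (this cell is illustrative and not required by the rest of the notebook), we can verify that the folds are roughly equal in size by counting the rows assigned to each fold.
In [ ]:
# each of the 10 folds should contain roughly the same number of rows
df['fold'].value_counts().sort_index()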
In [11]:
# We could also include things like PCA here. However, one
# advantage of trees (and forests) is that we can assign
# importances to features; if we use dimensionality reduction
# or other techniques which significantly change the
# interpretation of the features, though, we largely lose
# that advantage.
estimator_template = sklearn.pipeline.Pipeline([
    ('scaler', sklearn.preprocessing.StandardScaler()),
    ('xgb', xgboost.XGBRegressor())
])
# In practice, this is a small hyperparameter grid; xgboost has
# many more hyperparameters that can be worth investigating.
#
# A quick overview of some of the most important hyperparameters and
# their interpretation is available here:
# https://www.analyticsvidhya.com/blog/2016/03/complete-guide-parameter-tuning-xgboost-with-codes-python/
hyperparam_grid = sklearn.model_selection.ParameterGrid({
    'xgb__n_estimators': [50, 100, 500],
    'xgb__learning_rate': [0.01, 0.1],
})
As described in the introduction, we will evaluate each set of hyperparameters in our grid on each fold (while training on eight of the other folds). For selecting the final set of hyperparameters, we will use performance on the validation fold. (More details on this procedure are given below.)
Concretely, we accomplish this by iterating over the cross product of the hyperparameter settings and the validation fold indices. (As described above, given the validation fold index, we can determine the training and test folds.)
In [12]:
# for simplicity, just create lists of everything
# In principle, a lazy generator or something more fancy could
# be used; in practice, this is usually not necessary unless
# the hyperparameter grid contains very large objects.
hyperparam_grid = list(hyperparam_grid)
folds = list(range(args.num_folds))
# an iterator over (hyperparameter, validation_fold) tuples
hp_fold_it = itertools.product(hyperparam_grid, folds)
hp_fold_it = list(hp_fold_it)
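Each element of hp_fold_it is a (hyperparameters, validation_fold) pair, and the iterator contains one such pair for every combination of hyperparameter setting and fold. The cell below is purely illustrative.
In [ ]:
# purely illustrative: inspect the size and the first element of the iterator
print(len(hp_fold_it)) # 6 hyperparameter settings * 10 folds = 60 combinations
hp_fold_it[0]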
In [13]:
df.head()
Out[13]:
In [14]:
hyperparameters, validation_fold = hp_fold_it[51]
test_fold = (validation_fold + 1) % args.num_folds

res = ml_utils.evaluate_hyperparameters(
    estimator_template=estimator_template,
    hyperparameters=hyperparameters,
    validation_folds=validation_fold,
    test_folds=test_fold,
    data=df,
    split_field='fold',
    target_field='target',
    collect_metrics=ml_utils.collect_regression_metrics,
    target_transform=np.log1p,
    target_inverse_transform=np.expm1
)
res.metrics_val
Out[14]:
In [15]:
res = evaluate_hyperparameters_helper(
    hp_fold_it[51],
    args=args,
    estimator_template=estimator_template,
    data=df,
    split_field='fold',
    target_field='target',
    collect_metrics=ml_utils.collect_regression_metrics,
    target_transform=np.log1p,
    target_inverse_transform=np.expm1
)
res.metrics_val
Out[15]:
In [16]:
res.hyperparameters_str
Out[16]:
In [17]:
f_res = dask_utils.apply_iter(
    hp_fold_it,
    dask_client,
    evaluate_hyperparameters_helper,
    args=args,
    estimator_template=estimator_template,
    data=df,
    split_field='fold',
    target_field='target',
    collect_metrics=ml_utils.collect_regression_metrics,
    target_transform=np.log1p,
    target_inverse_transform=np.expm1,
    return_futures=True
)
In [19]:
dask_utils.check_status(f_res)
Out[19]:
In [20]:
all_res = dask_utils.collect_results(f_res)
In [21]:
all_res[51].metrics_val
Out[21]:
In [22]:
def _get_res(res):
    """ Flatten the validation and test metrics from a single result
    into a flat dictionary suitable for building a data frame """
    ret_val = {
        'validation_{}'.format(k): v
        for k, v in res.metrics_val.items()
    }

    ret_test = {
        'test_{}'.format(k): v
        for k, v in res.metrics_test.items()
    }

    ret = ret_val
    ret.update(ret_test)

    hp_string = json.dumps(res.hyperparameters)
    ret['hyperparameters_str'] = hp_string
    ret['hyperparameters'] = res.hyperparameters
    ret['validation_fold'] = res.fold_val
    ret['test_fold'] = res.fold_test

    return ret
In [23]:
###
# Create the results data frame
###
results = [
    _get_res(res) for res in all_res
]
df_results = pd.DataFrame(results)
In [ ]:
df_results.head()
In [ ]:
In [24]:
###
# Based on the performance on the validation set, select
# the best hyperparameters.
###
hp_groups = df_results.groupby('hyperparameters_str')
validation_evaluation_metric = "validation_{}".format(args.evaluation_metric)
test_evaluation_metric = "test_{}".format(args.evaluation_metric)
val_performance = hp_groups[validation_evaluation_metric].mean()

# now, select the best. np.argmin returns a *position* with recent
# versions of pandas, so map it back to the hyperparameters string.
best_position = args.selection_strategy(val_performance.values)
val_best = val_performance.index[best_position]
val_best
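It can also be informative to compare the mean validation error of every configuration, not just the selected one; the following cell (purely illustrative) sorts the configurations by their mean validation error.
In [ ]:
# purely illustrative: mean validation error for each hyperparameter
# configuration, best (lowest) first
val_performance.sort_values()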
In [27]:
m_val_best = (df_results['hyperparameters_str'] == val_best)
In [29]:
df_results[m_val_best]
Out[29]:
In [30]:
# go back and select the predictions for the best hyperparameters
best_res = [
    res for res in all_res
    if res.hyperparameters_str == val_best
]
In [ ]:
len(best_res)
In [31]:
best_res[0].predictions_test
Out[31]:
In [32]:
best_res[0].true_test
Out[32]:
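As a quick, purely illustrative sanity check, we can pool the test-set predictions for the selected hyperparameters across all folds and plot them against the true target values.
In [ ]:
# purely illustrative: pooled test-set predictions vs. true values
# for the selected hyperparameters
y_true = np.concatenate([r.true_test for r in best_res])
y_pred = np.concatenate([r.predictions_test for r in best_res])

fig, ax = plt.subplots(figsize=(4, 4))
ax.scatter(y_true, y_pred, alpha=0.5)
ax.set_xlabel("true target value")
ax.set_ylabel("predicted target value")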
In [ ]:
hp_groups['test_mean_absolute_error'].mean()
In [ ]:
df_results.columns
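The remaining cells walk through, step by step, roughly what ml_utils.evaluate_hyperparameters does internally for a single (hyperparameters, validation fold) combination. The walkthrough needs concrete values for the validation fold, the test fold, and the hyperparameters; the cell below picks them arbitrarily, and these particular choices are only for illustration.
In [ ]:
# arbitrary choices for the walkthrough: validation fold 0, test fold 1,
# and the first hyperparameter configuration from the grid
validation_set = 0
test_set = (validation_set + 1) % args.num_folds
hyperparameters = hyperparam_grid[0]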
In [ ]:
###
# Extract masks for the training, validation, and testing
# sets.
###
splits = ml_utils.get_train_val_test_splits(
    df,
    validation_splits=validation_set,
    test_splits=test_set,
    split_field='fold'
)

###
# Create the data matrices necessary for the various
# sklearn operations we will perform later.
###

# we do not want to use metadata fields
fields_to_ignore = [
    'fold'
]

fold_data = ml_utils.get_fold_data(
    df,
    target_field='target',
    m_train=splits.training,
    m_test=splits.test,
    m_validation=splits.validation,
    fields_to_ignore=fields_to_ignore
)
In [ ]:
###
# Based on our estimator pipeline template and the chosen
# hyperparameters, create a concrete estimator with those
# hyperparameters set.
###
estimator = sklearn.clone(estimator_template)
estimator = estimator.set_params(**hyperparameters)
In [ ]:
###
# Transform the target variable by taking the log(1+y).
#
# This operation may not help in all domains.
###
y_train = np.log1p(fold_data.y_train)
In [ ]:
###
# Fit the estimator on the training set
###
estimator_fit = estimator.fit(fold_data.X_train, y_train)
In [ ]:
###
# Use the fit estimator to make predictions on *both* the
# validation and testing set. We will use both of these later.
###
y_pred = estimator_fit.predict(fold_data.X_test)
y_val = estimator_fit.predict(fold_data.X_validation)
# make sure to transform the predictions back to the original
# scale using exp(y) - 1
y_pred = np.expm1(y_pred)
y_val = np.expm1(y_val)
In [ ]:
###
# Collect various evaluation metrics of the trained model
# on the validation and testing data.
###
metrics_val = ml_utils.collect_regression_metrics(fold_data.y_validation, y_val, prefix="val_")
metrics_test = ml_utils.collect_regression_metrics(fold_data.y_test, y_pred, prefix="test_")
In [ ]:
metrics_test
In [ ]:
metrics_val
In [ ]:
###
# Construct a summary of the hyperparameters and the validation
# and testing sets, as well as the performance of the trained
# model on those sets. We will use the performance on the
# validation set across all folds in order to select the
# optimal hyperparameters for evaluation on the test set.
###
ret = {
    'val_set': validation_set,
    'test_set': test_set,
    'hyperparameter': str(hyperparameters),
}
ret.update(metrics_test)
ret.update(metrics_val)
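Finally, we can inspect the assembled summary for this single combination; collecting one such summary per (hyperparameters, validation fold) pair is analogous to how df_results was constructed above.
In [ ]:
# the summary dictionary for this single walkthrough combination
ret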
In [ ]: