Objective:
- This tutorial demonstrates how to build, cross-validate, and tune Gradient Boosting Machine (GBM) regression models with H2O, and how to compare their performance on a held-out test set.

Wine Quality Dataset:
- The white wine subset of the UCI Wine Quality dataset (winequality-white.csv): eleven physicochemical measurements (acidity, residual sugar, pH, alcohol, etc.) plus a sensory quality score, which serves as the regression target.

Steps:
1. Start and connect to a local H2O cluster.
2. Import the data and define the features and target.
3. Split the data into training and test sets.
4. Build GBM models: default settings, manual settings, cross-validation, and early stopping.
5. Tune hyper-parameters with full (Cartesian) and random grid search.
6. Compare the models' performance on the test set.

Full Technical Reference:
- H2O GBM and Python module documentation at docs.h2o.ai.
In [1]:
# Start and connect to a local H2O cluster
# (nthreads = -1 asks H2O to use all available CPU cores)
import h2o
h2o.init(nthreads = -1)
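Once the cluster is up, its status can be inspected; a minimal check, assuming a default local cluster:

# Optional: show the status of the running cluster (version, nodes, memory)
h2o.cluster().show_status()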
In [2]:
# Import wine quality data from a local CSV file
wine = h2o.import_file("winequality-white.csv")
wine.head(5)
Out[2]:
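Before modelling, it can help to look at summary statistics and the distribution of the target; a hedged sketch using standard H2OFrame methods:

# Summary statistics for every column
wine.describe()
# Distribution of the quality scores ('quality' is an integer column,
# so .table() should give per-value counts)
wine['quality'].table()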
In [3]:
# Define features (or predictors)
features = list(wine.columns) # we want to use all the information
features.remove('quality') # we need to exclude the target 'quality' (otherwise there is nothing to predict)
features
Out[3]:
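A quick sanity check, assuming the standard white wine file with eleven physicochemical columns:

# 'quality' should be gone, leaving the 11 physicochemical predictors
assert 'quality' not in features
print(len(features))  # expected: 11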
In [4]:
# Split the H2O data frame into training/test sets
# so we can evaluate out-of-sample (holdout) performance
wine_split = wine.split_frame(ratios = [0.8], seed = 1234)
wine_train = wine_split[0]  # using 80% for training
wine_test = wine_split[1]   # using the remaining 20% for holdout evaluation
In [5]:
wine_train.shape
Out[5]:
In [6]:
wine_test.shape
Out[6]:
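Note that split_frame samples rows at random, so the 80/20 split is approximate rather than exact; the realised fraction can be checked directly:

# The actual training fraction will be close to, but not exactly, 0.8
print(wine_train.nrow / wine.nrow)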
In [7]:
# Build a Gradient Boosting Machine (GBM) model with default settings
# Import the estimator class for GBM
from h2o.estimators.gbm import H2OGradientBoostingEstimator
# Set up GBM for regression
# Add a seed for reproducibility
gbm_default = H2OGradientBoostingEstimator(model_id = 'gbm_default',
                                           seed = 1234)
# Use .train() to build the model
gbm_default.train(x = features,
                  y = 'quality',
                  training_frame = wine_train)
In [8]:
# Check the model performance on test dataset
gbm_default.model_performance(wine_test)
Out[8]:
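The performance object also exposes individual metric accessors, which is handy when you only need one number; for example:

# Extract single metrics from the test-set performance
perf = gbm_default.model_performance(wine_test)
print('MSE :', perf.mse())
print('RMSE:', perf.rmse())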
In [9]:
# Build a GBM with manual settings
# Set up GBM for regression
# Add a seed for reproducibility
gbm_manual = H2OGradientBoostingEstimator(model_id = 'gbm_manual',
                                          seed = 1234,
                                          ntrees = 100,
                                          sample_rate = 0.9,
                                          col_sample_rate = 0.9)
# Use .train() to build the model
gbm_manual.train(x = features,
                 y = 'quality',
                 training_frame = wine_train)
In [10]:
# Check the model performance on test dataset
gbm_manual.model_performance(wine_test)
Out[10]:
In [11]:
# Build a GBM with manual settings & cross-validation
# Set up GBM for regression
# Add a seed for reproducibility
gbm_manual_cv = H2OGradientBoostingEstimator(model_id = 'gbm_manual_cv',
                                             seed = 1234,
                                             ntrees = 100,
                                             sample_rate = 0.9,
                                             col_sample_rate = 0.9,
                                             nfolds = 5)
# Use .train() to build the model
gbm_manual_cv.train(x = features,
                    y = 'quality',
                    training_frame = wine_train)
In [12]:
# Check the cross-validation model performance
gbm_manual_cv
Out[12]:
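Printing the whole model is verbose; to see just the per-fold metrics, models trained with nfolds also offer a cross-validation summary:

# Mean and standard deviation of each metric across the 5 folds
gbm_manual_cv.cross_validation_metrics_summary()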
In [13]:
# Check the model performance on test dataset
gbm_manual_cv.model_performance(wine_test)
# It should match gbm_manual above, as the final model is trained with the same parameters and seed
Out[13]:
In [14]:
# Build a GBM with manual settings, CV and early stopping
# Set up GBM for regression
# Add a seed for reproducibility
gbm_manual_cv_es = H2OGradientBoostingEstimator(model_id = 'gbm_manual_cv_es',
                                                seed = 1234,
                                                ntrees = 10000,          # increase the number of trees
                                                sample_rate = 0.9,
                                                col_sample_rate = 0.9,
                                                nfolds = 5,
                                                stopping_metric = 'mse', # let early stopping determine
                                                stopping_rounds = 15,    # the optimal number of trees
                                                score_tree_interval = 1) # by scoring MSE after every tree
# Use .train() to build the model
gbm_manual_cv_es.train(x = features,
                       y = 'quality',
                       training_frame = wine_train)
In [15]:
# Check the model summary
gbm_manual_cv_es.summary()
Out[15]:
In [16]:
# Check the cross-validation model performance
gbm_manual_cv_es
Out[16]:
In [17]:
# Check the model performance on test dataset
gbm_manual_cv_es.model_performance(wine_test)
Out[17]:
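To see where early stopping actually halted training, the scoring history records the metrics after every tree (score_tree_interval = 1); a minimal sketch, assuming the usual pandas output of scoring_history() (column names may vary slightly between H2O versions):

# scoring_history() returns a pandas DataFrame; the last rows show
# how many trees were kept before early stopping kicked in
sh = gbm_manual_cv_es.scoring_history()
print(sh[['number_of_trees', 'training_rmse']].tail())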
In [18]:
# import Grid Search
from h2o.grid.grid_search import H2OGridSearch
In [19]:
# define the criteria for full (Cartesian) grid search:
# every combination of the hyper-parameters below is evaluated
search_criteria = {'strategy': "Cartesian"}
In [20]:
# define the range of hyper-parameters for grid search
# 9 combinations in total
hyper_params = {'sample_rate': [0.7, 0.8, 0.9],
                'col_sample_rate': [0.7, 0.8, 0.9]}
In [21]:
# Set up GBM grid search
# Add a seed for reproducibility
gbm_full_grid = H2OGridSearch(
                    H2OGradientBoostingEstimator(
                        model_id = 'gbm_full_grid',
                        seed = 1234,
                        ntrees = 10000,
                        nfolds = 5,
                        stopping_metric = 'mse',
                        stopping_rounds = 15,
                        score_tree_interval = 1),
                    search_criteria = search_criteria, # full grid search
                    hyper_params = hyper_params)
In [22]:
# Use .train() to start the grid search
gbm_full_grid.train(x = features,
                    y = 'quality',
                    training_frame = wine_train)
In [23]:
# Sort and show the grid search results
gbm_full_grid_sorted = gbm_full_grid.get_grid(sort_by='mse', decreasing=False)
print(gbm_full_grid_sorted)
In [24]:
# Extract the best model from full grid search
best_model_id = gbm_full_grid_sorted.model_ids[0]
best_gbm_from_full_grid = h2o.get_model(best_model_id)
best_gbm_from_full_grid.summary()
Out[24]:
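To see which combination won, the chosen hyper-parameter values can be read off the best model; a sketch using the estimator's actual_params dictionary:

# Hyper-parameter values of the best model from the full grid
print('sample_rate    :', best_gbm_from_full_grid.actual_params['sample_rate'])
print('col_sample_rate:', best_gbm_from_full_grid.actual_params['col_sample_rate'])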
In [25]:
# Check the model performance on test dataset
best_gbm_from_full_grid.model_performance(wine_test)
Out[25]:
In [26]:
# define the criteria for random grid search:
# train at most 9 randomly sampled models out of the 27 combinations below
search_criteria = {'strategy': "RandomDiscrete",
                   'max_models': 9,
                   'seed': 1234}
In [27]:
# define the range of hyper-parameters for grid search
# 27 combinations in total
hyper_params = {'sample_rate': [0.7, 0.8, 0.9],
                'col_sample_rate': [0.7, 0.8, 0.9],
                'max_depth': [3, 5, 7]}
In [28]:
# Set up GBM grid search
# Add a seed for reproducibility
gbm_rand_grid = H2OGridSearch(
                    H2OGradientBoostingEstimator(
                        model_id = 'gbm_rand_grid',
                        seed = 1234,
                        ntrees = 10000,
                        nfolds = 5,
                        stopping_metric = 'mse',
                        stopping_rounds = 15,
                        score_tree_interval = 1),
                    search_criteria = search_criteria, # random grid search
                    hyper_params = hyper_params)
In [29]:
# Use .train() to start the grid search
gbm_rand_grid.train(x = features,
                    y = 'quality',
                    training_frame = wine_train)
In [30]:
# Sort and show the grid search results
gbm_rand_grid_sorted = gbm_rand_grid.get_grid(sort_by='mse', decreasing=False)
print(gbm_rand_grid_sorted)
In [31]:
# Extract the best model from random grid search
best_model_id = gbm_rand_grid_sorted.model_ids[0]
best_gbm_from_rand_grid = h2o.get_model(best_model_id)
best_gbm_from_rand_grid.summary()
Out[31]:
In [32]:
# Check the model performance on test dataset
best_gbm_from_rand_grid.model_performance(wine_test)
Out[32]:
In [33]:
# Compare the test-set MSE across all six models (lower is better)
print('GBM with Default Settings                         :', gbm_default.model_performance(wine_test).mse())
print('GBM with Manual Settings                          :', gbm_manual.model_performance(wine_test).mse())
print('GBM with Manual Settings & CV                     :', gbm_manual_cv.model_performance(wine_test).mse())
print('GBM with Manual Settings, CV & Early Stopping     :', gbm_manual_cv_es.model_performance(wine_test).mse())
print('GBM with CV, Early Stopping & Full Grid Search    :',
      best_gbm_from_full_grid.model_performance(wine_test).mse())
print('GBM with CV, Early Stopping & Random Grid Search  :',
      best_gbm_from_rand_grid.model_performance(wine_test).mse())
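When you are finished, the local cluster can be shut down to free its resources; a minimal sketch:

# Optional: shut down the local H2O cluster
# (prompt = False skips the interactive confirmation)
h2o.cluster().shutdown(prompt = False)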