Objective: train and compare H2O models (gradient boosting, random forest, and a stacked ensemble) for predicting Titanic passenger survival, tuning hyper-parameters with random grid search.
Titanic Dataset: local CSV file `kaggle_titanic.csv` (the Kaggle Titanic survival data).
Steps: import data → encode categorical columns → 80/20 train/test split → random grid search for GBM and DRF → stack the best models → compare hold-out AUC.
Full Technical Reference: H2O Python module documentation (docs.h2o.ai).
In [1]:
# Import all required modules
import h2o
from h2o.estimators.gbm import H2OGradientBoostingEstimator
from h2o.estimators.random_forest import H2ORandomForestEstimator
# NOTE(review): H2ODeepLearningEstimator is imported but never used below
from h2o.estimators.deeplearning import H2ODeepLearningEstimator
from h2o.estimators.stackedensemble import H2OStackedEnsembleEstimator
from h2o.grid.grid_search import H2OGridSearch
# Start and connect to a local H2O cluster
# nthreads = -1 asks H2O to use all available CPU cores
h2o.init(nthreads = -1)
In [2]:
# Load the Titanic passenger data from a local CSV into an H2OFrame
titanic = h2o.import_file("kaggle_titanic.csv")
# Peek at the first five rows to sanity-check the parse
titanic.head(rows=5)
Out[2]:
In [3]:
# Encode the response ('Survived') and the passenger class as categorical
# (enum) columns so H2O models them as classes, not numbers
for categorical_col in ['Survived', 'Pclass']:
    titanic[categorical_col] = titanic[categorical_col].asfactor()
In [4]:
# Define features (or predictors) manually
# NOTE(review): identifier-like columns (e.g. PassengerId, Name, Ticket, Cabin)
# are presumably excluded on purpose — confirm against the CSV schema
features = ['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Embarked']
In [5]:
# Split the H2O frame into an 80% training set and a 20% hold-out test set;
# the fixed seed makes the split reproducible across runs
titanic_split = titanic.split_frame(ratios = [0.8], seed = 1234)
titanic_train, titanic_test = titanic_split
In [6]:
titanic_train.shape  # (rows, columns) — expect roughly 80% of the rows here
Out[6]:
In [7]:
titanic_test.shape  # (rows, columns) — the remaining ~20% held out for evaluation
Out[7]:
In [8]:
# define the criteria for random grid search
# RandomDiscrete samples hyper-parameter combinations at random; at most
# max_models (9) models are built out of the combinations defined below,
# with a fixed seed so the sampled subset is reproducible
search_criteria = {'strategy': "RandomDiscrete",
'max_models': 9,
'seed': 1234}
In [9]:
# define the range of hyper-parameters for GBM grid search
# 27 combinations in total (3 x 3 x 3); the RandomDiscrete search
# samples at most 9 of them (max_models in search_criteria)
hyper_params = {'sample_rate': [0.7, 0.8, 0.9],
'col_sample_rate': [0.7, 0.8, 0.9],
'max_depth': [3, 5, 7]}
In [10]:
# Set up GBM grid search
# Add a seed for reproducibility
gbm_rand_grid = H2OGridSearch(
H2OGradientBoostingEstimator(
model_id = 'gbm_rand_grid',
seed = 1234,
# large cap on trees; early stopping below decides the actual count
ntrees = 10000,
nfolds = 5,
fold_assignment = "Modulo", # needed for stacked ensembles
keep_cross_validation_predictions = True, # needed for stacked ensembles
# stop adding trees once MSE has not improved for 15 scoring rounds
stopping_metric = 'mse',
stopping_rounds = 15,
score_tree_interval = 1), # score after every tree so early stopping is fine-grained
search_criteria = search_criteria, # random grid search (RandomDiscrete, max 9 models)
hyper_params = hyper_params)
In [11]:
# Kick off the random grid search: builds up to 9 GBMs, each with 5-fold CV
gbm_rand_grid.train(x=features, y='Survived', training_frame=titanic_train)
In [12]:
# Sort and show the grid search results
# Rank the grid's models by cross-validated AUC, best model first
gbm_rand_grid_sorted = gbm_rand_grid.get_grid(sort_by='auc', decreasing=True)
print(gbm_rand_grid_sorted)
In [13]:
# Extract the best model from random grid search
# model_ids[0] is the highest-AUC model because the grid was sorted decreasing
best_gbm_model_id = gbm_rand_grid_sorted.model_ids[0]
best_gbm_from_rand_grid = h2o.get_model(best_gbm_model_id)
best_gbm_from_rand_grid.summary()
Out[13]:
In [14]:
# define the range of hyper-parameters for DRF grid search
# 27 combinations in total (3 x 3 x 3)
# NOTE(review): this rebinds 'hyper_params', shadowing the GBM grid defined
# earlier — fine when cells run top-to-bottom, but re-running earlier cells
# out of order would pick up the wrong grid
hyper_params = {'sample_rate': [0.5, 0.6, 0.7],
'col_sample_rate_per_tree': [0.7, 0.8, 0.9],
'max_depth': [3, 5, 7]}
In [15]:
# Set up DRF grid search
# Add a seed for reproducibility
drf_rand_grid = H2OGridSearch(
H2ORandomForestEstimator(
model_id = 'drf_rand_grid',
seed = 1234,
# fixed forest size; DRF has no per-tree early stopping configured here
ntrees = 200,
nfolds = 5,
fold_assignment = "Modulo", # needed for stacked ensembles
keep_cross_validation_predictions = True), # needed for stacked ensembles
search_criteria = search_criteria, # random grid search (RandomDiscrete, max 9 models)
hyper_params = hyper_params)
In [16]:
# Kick off the random grid search: builds up to 9 DRFs, each with 5-fold CV
drf_rand_grid.train(x=features, y='Survived', training_frame=titanic_train)
In [17]:
# Sort and show the grid search results
# Rank the grid's models by cross-validated AUC, best model first
drf_rand_grid_sorted = drf_rand_grid.get_grid(sort_by='auc', decreasing=True)
print(drf_rand_grid_sorted)
In [18]:
# Extract the best model from random grid search
# model_ids[0] is the highest-AUC model because the grid was sorted decreasing
best_drf_model_id = drf_rand_grid_sorted.model_ids[0]
best_drf_from_rand_grid = h2o.get_model(best_drf_model_id)
best_drf_from_rand_grid.summary()
Out[18]:
In [19]:
# Define a list of models to be stacked
# i.e. best model from each grid
# Both base models kept their CV predictions (Modulo folds), which the
# stacked ensemble's metalearner requires
all_ids = [best_gbm_model_id, best_drf_model_id]
In [20]:
# Set up Stacked Ensemble
# The base models' cross-validation predictions (fold_assignment = "Modulo",
# keep_cross_validation_predictions = True) are what the metalearner trains on.
# seed added for reproducibility, consistent with the base models (1234)
ensemble = H2OStackedEnsembleEstimator(model_id = "my_ensemble",
base_models = all_ids,
seed = 1234)
In [21]:
# use .train to start model stacking
# GLM as the default metalearner
# The metalearner is fit on the base models' cross-validation predictions
ensemble.train(x = features,
y = 'Survived',
training_frame = titanic_train)
In [22]:
# Compare hold-out (test-set) AUC across both base learners and the ensemble
labeled_models = [('Best GBM model from Grid (AUC) : ', best_gbm_from_rand_grid),
                  ('Best DRF model from Grid (AUC) : ', best_drf_from_rand_grid),
                  ('Stacked Ensembles (AUC) : ', ensemble)]
for label, model in labeled_models:
    print(label, model.model_performance(titanic_test).auc())