Copyright (C) 2017 J. Patrick Hall, jphall@gwu.edu
Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
In [1]:
# imports
import h2o
import numpy as np
import pandas as pd
from h2o.estimators.gbm import H2OGradientBoostingEstimator
from h2o.estimators.random_forest import H2ORandomForestEstimator
from h2o.grid.grid_search import H2OGridSearch
In [2]:
# display matplotlib graphics in notebook
%matplotlib inline
In [3]:
# start and connect to h2o server
# launches a local H2O JVM (or attaches to an already-running one) on the default port
h2o.init()
In [4]:
# location of "dirty" file
# decision trees handle dirty data elegantly
# NOTE(review): absolute, user-specific path -- update this to point at your
# own copy of loan.csv before running
path = '/Users/phall/workspace/GWU_data_mining/02_analytical_data_prep/data/loan.csv'
In [5]:
# define input variable measurement levels
# strings automatically parsed as enums (nominal)
# numbers automatically parsed as numeric
# force the target column to categorical so H2O treats the task as classification
col_types = dict(bad_loan='enum')
In [6]:
# import the CSV into a distributed H2OFrame, applying the column-type overrides
frame = h2o.import_file(path=path, col_types=col_types) # multi-threaded import
In [7]:
# print per-column summary statistics (types, min/max, missing counts, etc.)
frame.describe()
In [8]:
# correct MORTGAGE/mortgage problem using gsub() and trim() functions
# the raw data mixes cases in home_ownership; gsub() performs a regex
# substitution in every cell, and trim() strips leading/trailing whitespace
# frequency table before cleanup
print(frame['home_ownership'].table())
frame['home_ownership'] = frame['home_ownership'].gsub(pattern='mortgage',
replacement='MORTGAGE')
frame['home_ownership'] = frame['home_ownership'].trim()
# frequency table after cleanup -- 'mortgage' rows should now fold into 'MORTGAGE'
print(frame['home_ownership'].table())
In [9]:
# split into 40% training, 30% validation, and 30% test
# split_frame takes ratios for the first n-1 partitions;
# the remainder (~30%) becomes the final (test) partition
train, valid, test = frame.split_frame([0.4, 0.3])
In [10]:
# assign target and inputs
# target: the bad-loan indicator; inputs: every other column except the row
# identifier and the parser warning column
y = 'bad_loan'
excluded = {'id', '_WARN_', y}
X = [column for column in frame.columns if column not in excluded]
print(y)
print(X)
In [11]:
# set target to factor - for binary classification
# asfactor() converts the numeric 0/1 column to categorical in each partition,
# which tells H2O estimators to fit a classifier rather than a regressor
train[y] = train[y].asfactor()
valid[y] = valid[y].asfactor()
test[y] = test[y].asfactor()
In [12]:
# random forest
# initialize rf model with early stopping on validation error
rf_params = {
    'ntrees': 500,                 # up to 500 decision trees in the forest
    'max_depth': 30,               # trees can grow to depth of 30
    'stopping_rounds': 5,          # stop after validation error does not decrease for 5 iterations/new trees
    'score_each_iteration': True,  # score validation error on every iteration/new tree
    'model_id': 'rf_model',        # for easy lookup in flow
}
rf_model = H2ORandomForestEstimator(**rf_params)
# train rf model
rf_model.train(x=X,
               y=y,
               training_frame=train,
               validation_frame=valid)
# print model information
rf_model
# view detailed results at http://localhost:54321/flow/index.html
Out[12]:
In [13]:
# measure rf AUC
# train/validation AUC are read from the model's stored training metrics;
# test AUC requires scoring the held-out test frame explicitly
print(rf_model.auc(train=True))
print(rf_model.auc(valid=True))
print(rf_model.model_performance(test_data=test).auc())
In [14]:
# GBM with random hyperparameter search
# train many different GBM models with random hyperparameters
# and select best model based on validation error

# define random grid search parameters
# NOTE: ranges start above zero -- the original grids began at 0, but
# ntrees=0 or max_depth=0 yields a degenerate GBM (no trees / no splits)
# and wastes random-search budget
hyper_parameters = {'ntrees': list(range(50, 501, 50)),
                    'max_depth': list(range(2, 21, 2)),
                    'sample_rate': [s / 10.0 for s in range(1, 11)],         # row sampling 0.1-1.0
                    'col_sample_rate': [s / 10.0 for s in range(1, 11)]}     # column sampling 0.1-1.0

# define search strategy: sample at most 20 random combinations,
# capped at 10 minutes of total runtime
search_criteria = {'strategy': 'RandomDiscrete',
                   'max_models': 20,
                   'max_runtime_secs': 600}

# initialize grid search
gsearch = H2OGridSearch(H2OGradientBoostingEstimator,
                        hyper_params=hyper_parameters,
                        search_criteria=search_criteria)

# execute training w/ grid search
gsearch.train(x=X,
              y=y,
              training_frame=train,
              validation_frame=valid)

# view detailed results at http://localhost:54321/flow/index.html
In [15]:
# show grid search results
gsearch.show()
# select best model
# sort the grid explicitly by validation AUC (descending) instead of relying
# on the grid's default ordering, so index 0 is truly the best classifier
gbm_model = gsearch.get_grid(sort_by='auc', decreasing=True)[0]
# print model information
gbm_model
Out[15]:
In [16]:
# measure gbm AUC
# train/validation AUC are read from the model's stored training metrics;
# test AUC requires scoring the held-out test frame explicitly
print(gbm_model.auc(train=True))
print(gbm_model.auc(valid=True))
print(gbm_model.model_performance(test_data=test).auc())
In [17]:
# partial dependence plots are a powerful machine learning interpretation tool
# to calculate partial dependence across the domain of a variable:
# hold column of interest at constant value
# find the mean prediction of the model with this column constant
# repeat for multiple values of the variable of interest
# h2o has a built-in function for partial dependence as well
# server=True avoids opening a separate window; plot=True renders inline via matplotlib
par_dep_dti1 = gbm_model.partial_plot(data=train, cols=['dti'], server=True, plot=True)
In [18]:
# shutdown h2o
# prompt=False skips the interactive confirmation; all frames and models
# held on the cluster are discarded
h2o.cluster().shutdown(prompt=False)