In [1]:
import h2o
from h2o.estimators.deeplearning import H2ODeepLearningEstimator
from h2o.estimators.deepwater import H2ODeepWaterEstimator
from h2o.estimators.gbm import H2OGradientBoostingEstimator
from h2o.estimators.random_forest import H2ORandomForestEstimator

import os.path

# Directory that holds the credit-card default CSV read later in the notebook.
PATH=os.path.expanduser("~/CC_default/")

# Connect to the H2O cluster (nthreads=-1 is passed through unchanged).
h2o.init(nthreads=-1)

# Stop here when Deep Water is unavailable. The previous bare `exit` only
# named the exit object without calling it, so the guard never stopped anything.
if not H2ODeepWaterEstimator.available():
    print("H2O Deep Water is not available - stopping.")
    raise SystemExit(0)


Checking whether there is an H2O instance running at http://localhost:54321. connected.
H2O cluster uptime: 11 mins 37 secs
H2O cluster version: 3.11.0.99999
H2O cluster version age: 2 hours and 5 minutes
H2O cluster name: arno
H2O cluster total nodes: 1
H2O cluster free memory: 13.55 Gb
H2O cluster total cores: 12
H2O cluster allowed cores: 12
H2O cluster status: locked, healthy
H2O connection url: http://localhost:54321
H2O connection proxy: None
Python version: 2.7.12 final

In [2]:
# Read the credit-card default data set from PATH into an H2O frame.
csv_path = os.path.join(PATH, "default of credit card clients.csv")
df = h2o.import_file(csv_path)

# Preview the first rows, then report the frame's [rows, columns].
df.show()
df.dim


Parse progress: |█████████████████████████████████████████████████████████████████████████████| 100%
ID LIMIT_BAL SEX EDUCATION MARRIAGE AGE PAY_0 PAY_2 PAY_3 PAY_4 PAY_5 PAY_6 BILL_AMT1 BILL_AMT2 BILL_AMT3 BILL_AMT4 BILL_AMT5 BILL_AMT6 PAY_AMT1 PAY_AMT2 PAY_AMT3 PAY_AMT4 PAY_AMT5 PAY_AMT6 default payment next month
1 20000 2 2 1 24 2 2 -1 -1 -2 -2 3913 3102 689 0 0 0 0 689 0 0 0 0 1
2 120000 2 2 2 26 -1 2 0 0 0 2 2682 1725 2682 3272 3455 3261 0 1000 1000 1000 0 2000 1
3 90000 2 2 2 34 0 0 0 0 0 0 29239 14027 13559 14331 14948 15549 1518 1500 1000 1000 1000 5000 0
4 50000 2 2 1 37 0 0 0 0 0 0 46990 48233 49291 28314 28959 29547 2000 2019 1200 1100 1069 1000 0
5 50000 1 2 1 57 -1 0 -1 0 0 0 8617 5670 35835 20940 19146 19131 2000 36681 10000 9000 689 679 0
6 50000 1 1 2 37 0 0 0 0 0 0 64400 57069 57608 19394 19619 20024 2500 1815 657 1000 1000 800 0
7 500000 1 1 2 29 0 0 0 0 0 0 367965 412023 445007 542653 483003 473944 55000 40000 38000 20239 13750 13770 0
8 100000 2 2 2 23 0 -1 -1 0 0 -1 11876 380 601 221 -159 567 380 601 0 581 1687 1542 0
9 140000 2 3 1 28 0 0 2 0 0 0 11285 14096 12108 12211 11793 3719 3329 0 432 1000 1000 1000 0
10 20000 1 3 2 35 -2 -2 -2 -2 -1 -1 0 0 0 0 13007 13912 0 0 0 13007 1122 0 0
Out[2]:
[30000, 25]

In [3]:
# Name of the target column.
response = "default payment next month"

# Convert the coded demographic columns and the target to factors.
for col in ('SEX', 'EDUCATION', 'MARRIAGE', response):
    df[col] = df[col].asfactor()

# Every column except the target and the row ID is used as a predictor.
excluded = set([response, 'ID'])
predictors = list(set(df.names).difference(excluded))

In [4]:
# A seeded random column drives the three-way row split.
rand = df.runif(seed=42)
in_valid = (rand >= 0.8) & (rand < 0.9)

train = df[rand < 0.8]     ## 80% for training
valid = df[in_valid]       ## 10% for early stopping (only enabled by default for Deep Water)
test  = df[rand >= 0.9]    ## 10% for final testing

# Report the dimensions of each partition.
for part in (train, valid, test):
    print(part.dim)


[24087, 25]
[2871, 25]
[3042, 25]

For the sake of simplicity, let's build 4 different models out of the box:

  • H2O Deep Water using MXNet GPU backend
  • H2O Deep Learning using H2O's Java multi-threaded backend
  • H2O Gradient Boosting
  • H2O Random Forest

In [26]:
# Deep Water model with a fixed seed and otherwise default settings.
dw = H2ODeepWaterEstimator(seed=1234)   ## GPU, manual learning rate and momentum, needs tuning
dw.train(training_frame=train, validation_frame=valid, x=predictors, y=response)

# AUC measured on the validation frame.
dw_valid_perf = dw.model_performance(valid=True)
dw_valid_perf.auc()


deepwater Model Build progress: |█████████████████████████████████████████████████████████████| 100%
Out[26]:
0.7183863806723174

In [25]:
# Deep Learning model with default settings. A fixed seed is now passed, using
# the same value as the Deep Water model, so the run is not seeded arbitrarily.
# NOTE(review): multi-threaded Deep Learning training may still vary between
# runs even with a seed - check the H2O docs for the `reproducible` option.
dl = H2ODeepLearningEstimator(seed=1234)   ## CPU, features adaptive learning rate
dl.train(x=predictors, y=response, training_frame=train, validation_frame=valid)

# AUC measured on the validation frame.
dl.model_performance(valid=True).auc()


deeplearning Model Build progress: |██████████████████████████████████████████████████████████| 100%
Out[25]:
0.7673301187799112

In [21]:
# Gradient Boosting model with default settings. A fixed seed is now passed,
# using the same value as the Deep Water model, so any sampling is repeatable.
gbm = H2OGradientBoostingEstimator(seed=1234)
gbm.train(x=predictors, y=response, training_frame=train, validation_frame=valid)

# AUC measured on the validation frame.
gbm.model_performance(valid=True).auc()


gbm Model Build progress: |███████████████████████████████████████████████████████████████████| 100%
Out[21]:
0.7792643285475408

In [22]:
# Random Forest model with default settings. A fixed seed is now passed, using
# the same value as the Deep Water model, so the forest's sampling is repeatable.
drf = H2ORandomForestEstimator(seed=1234)
drf.train(x=predictors, y=response, training_frame=train, validation_frame=valid)

# AUC measured on the validation frame.
drf.model_performance(valid=True).auc()


drf Model Build progress: |███████████████████████████████████████████████████████████████████| 100%
Out[22]:
0.7734810153952604

In [27]:
# Score the held-out test frame with the Deep Water model and display the result.
pdw = dw.predict(test_data=test)
pdw


deepwater prediction progress: |██████████████████████████████████████████████████████████████| 100%
predict p0 p1
0 0.797962 0.202038
0 0.995913 0.00408731
1 0.0855551 0.914445
0 0.901873 0.098127
0 0.883492 0.116508
1 0.135638 0.864362
1 0.559447 0.440553
0 0.983139 0.0168613
1 0.274472 0.725528
1 0.208343 0.791657
Out[27]:


In [28]:
# Score the test frame with the other three models: Deep Learning, GBM, then
# Random Forest, in that order.
pdl, pgbm, pdrf = [model.predict(test) for model in (dl, gbm, drf)]


deeplearning prediction progress: |███████████████████████████████████████████████████████████| 100%
gbm prediction progress: |████████████████████████████████████████████████████████████████████| 100%
drf prediction progress: |████████████████████████████████████████████████████████████████████| 100%

In [29]:
# Test-set AUC for Deep Water; column index 2 of its prediction frame is p1.
dw_scores = pdw[2]
dw_test_metrics = h2o.make_metrics(predicted=dw_scores, actual=test[response])
dw_test_metrics.auc()


Out[29]:
0.7120599602559889

In [30]:
# Test-set AUC for Deep Learning, scored from column index 2 of its predictions
# (presumably p1, as in the Deep Water prediction frame - confirm).
dl_scores = pdl[2]
dl_test_metrics = h2o.make_metrics(predicted=dl_scores, actual=test[response])
dl_test_metrics.auc()


Out[30]:
0.7583451470058991

In [31]:
# Test-set AUC for GBM, scored from column index 2 of its predictions
# (presumably p1, as in the Deep Water prediction frame - confirm).
gbm_scores = pgbm[2]
gbm_test_metrics = h2o.make_metrics(predicted=gbm_scores, actual=test[response])
gbm_test_metrics.auc()


Out[31]:
0.7701031153671627

In [32]:
# Test-set AUC for Random Forest, scored from column index 2 of its predictions
# (presumably p1, as in the Deep Water prediction frame - confirm).
drf_scores = pdrf[2]
drf_test_metrics = h2o.make_metrics(predicted=drf_scores, actual=test[response])
drf_test_metrics.auc()


Out[32]:
0.764227260636217

In [33]:
# Equal-weight blend of the GBM and Random Forest scores. Only column 2 of each
# prediction frame is averaged, instead of adding the whole frames (which also
# combines columns that are never used) and selecting column 2 afterwards.
blend_gbm_drf = 0.5*(pgbm[2] + pdrf[2])
h2o.make_metrics(actual=test[response], predicted=blend_gbm_drf).auc()


Out[33]:
0.7726921090926161

Let's make a simple ensemble that consists of a weighted blend of all 4 models (weights: 0.4 GBM, 0.3 Random Forest, 0.2 Deep Learning, 0.1 Deep Water):


In [35]:
# Weighted blend of all four models (4:3:1:2 for GBM, Random Forest, Deep Water
# and Deep Learning, scaled by 0.1). Only column 2 of each prediction frame is
# combined, instead of doing arithmetic on the whole frames (including columns
# that are never used) and selecting column 2 afterwards.
blend_all = 0.1*(4*pgbm[2] + 3*pdrf[2] + 1*pdw[2] + 2*pdl[2])
h2o.make_metrics(actual=test[response], predicted=blend_all).auc()


Out[35]:
0.7759119998748221