In [1]:
import h2o
from h2o.estimators.deeplearning import H2ODeepLearningEstimator
from h2o.estimators.deepwater import H2ODeepWaterEstimator
from h2o.estimators.gbm import H2OGradientBoostingEstimator
from h2o.estimators.random_forest import H2ORandomForestEstimator
import os.path
# Directory that holds the credit-card default dataset ("~" is expanded to the user's home).
PATH = os.path.expanduser("~/CC_default/")

# Connect to (or start) the H2O cluster; nthreads=-1 lets H2O use all available cores.
h2o.init(nthreads=-1)

# Stop right here with a clear message when Deep Water is missing. A bare `exit`
# expression only names the exit object without calling it, so it would not halt
# anything; raising also stops a "Run All" at this cell.
if not H2ODeepWaterEstimator.available():
    raise RuntimeError(
        "H2O Deep Water is not available in this H2O installation; "
        "the Deep Water cells of this notebook cannot run."
    )
In [2]:
# Build the full path of the CSV inside the data directory and load it into an H2OFrame.
csv_file = PATH + "default of credit card clients.csv"
df = h2o.import_file(csv_file)

# Preview the rows, then leave the frame's dimensions as the cell's displayed result.
df.show()
df.dim
Out[2]:
In [3]:
# Name of the target column.
response = "default payment next month"

# Convert the categorical inputs and the target to factor (categorical) columns.
for col in ['SEX', 'EDUCATION', 'MARRIAGE', response]:
    df[col] = df[col].asfactor()

# Every column except the target and the row identifier is a predictor.
# The frame's own column order is kept: going through a set would yield an order
# that can change between kernel sessions (string hash randomization), which makes
# runs harder to reproduce.
excluded = {response, 'ID'}
predictors = [name for name in df.names if name not in excluded]
In [4]:
# One uniform random number per row (fixed seed); the three splits are cut from it.
r = df.runif(seed=42)

train = df[r < 0.8]                 # 80% for training
valid = df[(r >= 0.8) & (r < 0.9)]  # 10% for early stopping (only enabled by default for Deep Water)
test = df[r >= 0.9]                 # 10% for final testing

# Report the dimensions of each split, in train / valid / test order.
for split in (train, valid, test):
    print(split.dim)
In [26]:
# Deep Water model: runs on GPU, uses a manual learning rate and momentum, needs tuning.
dw = H2ODeepWaterEstimator(seed=1234)
dw.train(
    x=predictors,
    y=response,
    training_frame=train,
    validation_frame=valid,
)

# AUC measured on the validation split.
dw_valid_perf = dw.model_performance(valid=True)
dw_valid_perf.auc()
Out[26]:
In [25]:
# Deep Learning model: runs on CPU, features an adaptive learning rate.
# A fixed seed is set, matching the Deep Water cell, so the run starts from the
# same random state each time.
# NOTE(review): H2O documents Deep Learning as exactly reproducible only when
# reproducible=True is also set (which restricts it to one thread) - confirm
# whether bit-for-bit repeatability is needed here.
dl = H2ODeepLearningEstimator(seed=1234)
dl.train(
    x=predictors,
    y=response,
    training_frame=train,
    validation_frame=valid,
)

# AUC measured on the validation split.
dl.model_performance(valid=True).auc()
Out[25]:
In [21]:
# Gradient boosting model with default hyper-parameters.
# The seed pins any randomized components of the algorithm and keeps this cell
# consistent with the seeded Deep Water cell.
gbm = H2OGradientBoostingEstimator(seed=1234)
gbm.train(
    x=predictors,
    y=response,
    training_frame=train,
    validation_frame=valid,
)

# AUC measured on the validation split.
gbm.model_performance(valid=True).auc()
Out[21]:
In [22]:
# Random forest model with default hyper-parameters.
# Random forests sample rows and columns while building trees, so a fixed seed is
# set to make the reported AUC repeatable across runs (same value as the Deep
# Water cell for consistency).
drf = H2ORandomForestEstimator(seed=1234)
drf.train(
    x=predictors,
    y=response,
    training_frame=train,
    validation_frame=valid,
)

# AUC measured on the validation split.
drf.model_performance(valid=True).auc()
Out[22]:
In [27]:
# Score the held-out test split with the Deep Water model and display the predictions.
pdw = dw.predict(test_data=test)
pdw
Out[27]:
In [28]:
# Score the test split with the remaining three models (Deep Learning, GBM, DRF), in that order.
pdl, pgbm, pdrf = (model.predict(test) for model in (dl, gbm, drf))
In [29]:
# Test-set AUC of the Deep Water model, computed from prediction column 2
# (presumably the positive-class probability - confirm against the displayed frame).
dw_test_metrics = h2o.make_metrics(predicted=pdw[2], actual=test[response])
dw_test_metrics.auc()
Out[29]:
In [30]:
# Test-set AUC of the Deep Learning model, computed from prediction column 2
# (presumably the positive-class probability - confirm against the prediction frame).
dl_test_metrics = h2o.make_metrics(predicted=pdl[2], actual=test[response])
dl_test_metrics.auc()
Out[30]:
In [31]:
# Test-set AUC of the GBM model, computed from prediction column 2
# (presumably the positive-class probability - confirm against the prediction frame).
gbm_test_metrics = h2o.make_metrics(predicted=pgbm[2], actual=test[response])
gbm_test_metrics.auc()
Out[31]:
In [32]:
# Test-set AUC of the random forest model, computed from prediction column 2
# (presumably the positive-class probability - confirm against the prediction frame).
drf_test_metrics = h2o.make_metrics(predicted=pdrf[2], actual=test[response])
drf_test_metrics.auc()
Out[32]:
In [33]:
# Equal-weight average of the GBM and random forest scores (prediction column 2),
# evaluated against the test labels.
gbm_drf_blend = 0.5 * (pgbm[2] + pdrf[2])
h2o.make_metrics(predicted=gbm_drf_blend, actual=test[response]).auc()
Out[33]:
Let's make a simple ensemble: a weighted blend of the predictions of all 4 models (weights 0.4 for GBM, 0.3 for DRF, 0.1 for Deep Water and 0.2 for Deep Learning).
In [35]:
# Weighted blend of all four models' scores (prediction column 2): the integer
# weights 4/3/1/2 sum to 10, and the 0.1 factor rescales the blend back to a mean.
all_models_blend = 0.1 * (4 * pgbm[2] + 3 * pdrf[2] + 1 * pdw[2] + 2 * pdl[2])
h2o.make_metrics(predicted=all_models_blend, actual=test[response]).auc()
Out[35]: