In [1]:
import h2o
from h2o.estimators.deeplearning import H2ODeepLearningEstimator
from h2o.estimators.deepwater import H2ODeepWaterEstimator
from h2o.estimators.gbm import H2OGradientBoostingEstimator
from h2o.estimators.random_forest import H2ORandomForestEstimator
import subprocess

# Start (or connect to) a local H2O cluster, using all available cores.
h2o.init(nthreads=-1)

# Bug fix: bare `exit` is a no-op expression — it must be *called* to
# actually stop execution when the Deep Water backend is unavailable.
if not H2ODeepWaterEstimator.available(): exit()
As an example we are going to use a dataset from a Kaggle competition.
In this challenge, BNP Paribas Cardif is providing an anonymized database with two categories of claims:
In terms of machine learning, it's a binary classification problem. As the performance metric we are going to use logarithmic loss.
Data can be downloaded here: https://www.kaggle.com/c/bnp-paribas-cardif-claims-management/download/train.csv.zip
In [2]:
# Upload the dataset into H2O and show some rows as an example.
df = h2o.import_file("train.csv")
df.show()
# (rows, columns) of the imported frame
df.dim
Out[2]:
In [3]:
# The column we want to predict.
response = "target"
# Any additional columns to convert to categorical (none besides the response here).
cols = []
# Encode the response (and any listed columns) as enum/factor so H2O
# treats the problem as classification rather than regression.
for col in cols + [response]:
    df[col] = df[col].asfactor()
# Every column except the response and the row identifier is a predictor.
predictors = list(set(df.names) - {response, 'ID'})
In [4]:
# Split the frame into train/valid/test using a seeded uniform random column,
# so the split is reproducible.
r = df.runif(seed=42)
train = df[r < 0.8]                  # 80% for training
valid = df[(r >= 0.8) & (r < 0.9)]   # 10% for early stopping (only enabled by default for Deep Water)
test = df[r >= 0.9]                  # 10% for final testing
for part in (train, valid, test):
    print(part.dim)
In [5]:
# Neural net definition. Please note how easily you can use Keras layers
# and TensorFlow layers in the same graph definition.
import tensorflow as tf
import json
from keras.layers.core import Dense, Activation, Dropout
from keras.layers.normalization import BatchNormalization
from keras import backend as K
from keras.objectives import categorical_crossentropy
from tensorflow.python.framework import ops
def keras_model(size, classes, n_gpus=1, layers=3, neurons=256):
    """Build a multi-GPU fully-connected TensorFlow graph (mixing Keras and
    raw TensorFlow layers) and export it as a .meta file for H2O Deep Water.

    Parameters
    ----------
    size : int
        Number of input features (width of the input placeholder).
    classes : int
        Number of output classes.
    n_gpus : int
        Number of GPU towers to split each mini-batch across.
    layers : int
        Number of hidden Dense/BatchNorm/ReLU/Dropout blocks per tower.
    neurons : int
        Width of each hidden Dense layer.

    Returns
    -------
    str
        Path of the exported meta-graph file.
    """
    # Always create a new graph inside IPython, or the default graph will be
    # reused across cells and can lead to unexpected behavior.
    graph = tf.Graph()
    with graph.as_default():
        # Input features, fed via H2O.
        inp = tf.placeholder(tf.float32, [None, size])
        # Actual labels used for training, fed via H2O.
        labels = tf.placeholder(tf.float32, [None, classes])
        # Split each mini-batch evenly across the GPU towers.
        if n_gpus > 1:
            inp_arr = tf.split(inp, n_gpus, axis=0)
            labels_arr = tf.split(labels, n_gpus, axis=0)
        else:
            inp_arr = [inp]
            labels_arr = [labels]
        logits_arr = [0.0] * n_gpus
        predictions_arr = [0.0] * n_gpus
        # One identical tower per GPU; data parallelism over the batch.
        for gpu in range(n_gpus):
            with tf.device('/gpu:' + str(gpu)):
                with tf.name_scope('tower_' + str(gpu)):
                    x = Dense(neurons)(inp_arr[gpu])
                    x = tf.contrib.layers.batch_norm(x)
                    x = Activation('relu')(x)
                    for _ in range(layers):
                        x = Dense(neurons)(x)
                        x = tf.contrib.layers.batch_norm(x)
                        x = Activation('relu')(x)
                        x = tf.nn.dropout(x, 0.5)
                    out = Dense(classes)(x)
                    logits_arr[gpu] = out
                    predictions_arr[gpu] = tf.nn.softmax(out)
        # Gather the per-tower outputs back on the CPU.
        with tf.device('/cpu:0'):
            out = tf.concat(logits_arr, 0)
            predictions = tf.concat(predictions_arr, 0)
            loss = tf.reduce_mean(tf.losses.softmax_cross_entropy(labels, out))
            train_step = tf.train.AdamOptimizer(1e-3).minimize(loss)
        init_op = tf.global_variables_initializer()
        # Metadata required by H2O to drive the graph.
        tf.add_to_collection(ops.GraphKeys.INIT_OP, init_op.name)
        tf.add_to_collection(ops.GraphKeys.TRAIN_OP, train_step)
        tf.add_to_collection("logits", out)
        tf.add_to_collection("predictions", predictions)
        meta = json.dumps({
            "inputs": {"batch_image_input": inp.name,
                       "categorical_labels": labels.name},
            "outputs": {"categorical_logits": out.name,
                        "layers": ','.join([m.name for m in tf.get_default_graph().get_operations()])},
            "parameters": {},
        })
        tf.add_to_collection("meta", meta)
        # Save the meta file with the graph.
        saver = tf.train.Saver()
        filename = "/tmp/keras_tensorflow.meta"
        tf.train.export_meta_graph(filename, saver_def=saver.as_saver_def())
    return filename
In [6]:
# Count the GPUs visible to the driver: `nvidia-smi -L` prints one
# "GPU N: ..." line per device. Invoking the binary directly (no shell
# pipeline, no shell=True) fails loudly if nvidia-smi is missing instead
# of silently reporting 0 GPUs.
NGPUS = len(subprocess.check_output(["nvidia-smi", "-L"]).splitlines())
print("GPUs:", NGPUS)
In [7]:
# Will take ~10 minutes to converge on 2 GPUs (GeForce 1080).
# 194 is the length of the input layer — all categoricals are encoded with
# H2O's "binary" encoding — and 2 is the number of target classes.
filename = keras_model(194, 2, NGPUS, layers = 5, neurons = 4096)
In [8]:
%%time
batch_size = 512
dw = H2ODeepWaterEstimator(
seed=1234,
backend = "tensorflow",
epochs = 100,
network_definition_file=filename,
mini_batch_size = batch_size*NGPUS,
categorical_encoding = "binary",
)
dw.train(
x=predictors,
y=response,
training_frame=train,
validation_frame=valid,
)
print("Validation Logloss:",dw.model_performance(valid=True).logloss())
In [9]:
# Score the held-out test set.
pdw = dw.predict(test)
# pdw[2] is presumably the predicted probability of the positive class
# (column 0 is the label, columns 1-2 the class probabilities) — verify
# against the prediction frame's column names.
print("Test LogLoss:", h2o.make_metrics(actual=test[response], predicted=pdw[2]).logloss())
In [ ]: