In [43]:
import h2o
import os
import time
h2o.init(max_mem_size="40G")  # cap the H2O cluster heap at 40 GB; uses all cores by default
h2o.remove_all()  # clear any frames/models left over from a previous session
# Import H2OFrame explicitly: the data-loading cell below wraps pandas frames
# with H2OFrame(...), which otherwise raises NameError on a fresh kernel.
from h2o.frame import H2OFrame
from h2o.estimators.gbm import H2OGradientBoostingEstimator
from h2o.estimators.deeplearning import H2ODeepLearningEstimator
from h2o.estimators.glm import H2OGeneralizedLinearEstimator
from h2o.estimators.random_forest import H2ORandomForestEstimator
from h2o.estimators.glrm import H2OGeneralizedLowRankEstimator
import pandas as pd
import numpy as np
from utils import get_allstate_train_valid_test_testids
from sklearn.metrics import explained_variance_score, r2_score, mean_absolute_error, mean_squared_error
# sklearn.cross_validation was deprecated in 0.18 and removed in 0.20;
# train_test_split now lives in sklearn.model_selection.
from sklearn.model_selection import train_test_split
In [45]:
# Read input data and push it to the H2O cluster.
# `shift` is the offset in the log(loss + shift) target transform (inverted
# later with np.exp(...) - shift); presumably tuned upstream — confirm against
# get_allstate_train_valid_test_testids.
shift = 203
train, valid, test, testids = get_allstate_train_valid_test_testids(0.15, shift, True)
# The test set has no labels; add a dummy loss column so its schema matches train/valid.
test["loss"] = 0.0
# Use h2o.H2OFrame: the bare name `H2OFrame` is never imported in this notebook,
# so the original `H2OFrame(train)` raises NameError under Restart & Run All.
trainframe = h2o.H2OFrame(train)
validframe = h2o.H2OFrame(valid)
testframe = h2o.H2OFrame(test)
# Drop the pandas copies — the data now lives in the H2O cluster.
del(train)
del(valid)
del(test)
In [49]:
#Perform training on Training set 1 and use these initial 'Set 1' models to add predictions to
#Training set 2
y = "loss"
# Predictor columns: everything except the response. The original passed ALL
# columns (including "loss") as x; H2O drops the response from x itself, but
# being explicit avoids relying on that behavior.
x = [col for col in trainframe.columns if col != y]
model = H2OGradientBoostingEstimator(
    model_id="gbm4",
    ntrees=600,
    learn_rate=0.3,
    max_depth=7,
    sample_rate=0.7,          # row subsample rate per tree
    col_sample_rate=0.7,      # column subsample rate per split
    stopping_rounds=2,        # stop after 2 scoring rounds without improvement
    stopping_tolerance=0.01,  # 10-fold increase in threshold as defined in rf_v1
    score_each_iteration=True,
    seed=200000,              # fixed seed for reproducibility
    nfolds=5,
    keep_cross_validation_predictions=True  # holdout preds kept for later stacking
)
# Validation frame drives the early-stopping metric above.
model.train(x, y, training_frame=trainframe, validation_frame=validframe)
In [54]:
# Validation MAE as reported by H2O — note this is in the transformed
# log(loss + shift) space, not original dollar units.
model.model_performance(validframe).mae()
Out[54]:
In [56]:
# Peek at the (log-transformed) response column.
trainframe["loss"]
Out[56]:
In [65]:
train_predictions = model.predict(trainframe)
valid_predictions = model.predict(validframe)
# Invert the log(loss + shift) target transform so MAE is reported on the
# original loss scale. Subtracting `shift` from BOTH arguments does not change
# the MAE (|a - b| is invariant under a common shift), but it keeps each series
# in true-loss units for any further use.
mae_train = mean_absolute_error(np.exp(trainframe["loss"].as_data_frame()) - shift, np.exp(train_predictions.as_data_frame()) - shift)
mae_valid = mean_absolute_error(np.exp(validframe["loss"].as_data_frame()) - shift, np.exp(valid_predictions.as_data_frame()) - shift)
print("MAE score on training data = {}".format(mae_train))
print("MAE score on validation data = {}".format(mae_valid))
In [62]:
# Inspect the raw (log-space) training predictions as a pandas frame.
train_predictions.as_data_frame()
Out[62]:
In [40]:
# NOTE(review): `combined_submission_test_predictions_df` is not defined anywhere
# in this notebook — it must come from cells that were deleted or executed out of
# order (the execution counts jump backwards to In[40] here). This cell fails
# under Restart & Run All until that frame is rebuilt; it presumably holds one
# column of log-space test predictions per ensembled model — confirm.
#combined_submission_test_predictions_df
# Ensemble by averaging the per-model predictions row-wise (still in log space)...
combined_submission_test_predictions_df["mean"] = combined_submission_test_predictions_df.mean(axis=1)
# ...then invert the log(loss + shift) transform back to the original loss scale.
combined_submission_test_predictions_df["loss"] = np.exp(combined_submission_test_predictions_df["mean"]) - shift
# Pair each test id with its predicted loss for the submission file.
submission_df = pd.concat([testids, combined_submission_test_predictions_df["loss"]], axis=1)
timestr = time.strftime("%Y%m%d-%H%M%S")
#combined_submission_test_predictions_df = combined_submission_test_predictions_df.drop("loss", axis=1)
submission_df
Out[40]:
In [42]:
# Write the timestamped submission CSV (id, loss). NOTE(review): "h20ensemble"
# uses the digit zero — presumably a typo for "h2o" — but it is left unchanged
# here because renaming it would alter the output path.
submission_df.to_csv("./data/allstate/sub_h20ensemble_{}.csv".format(timestr), index=False)
In [ ]: