In [43]:
import h2o
import os
import time
h2o.init(max_mem_size = "40G")             #max heap size as a string (here 40 GB); uses all cores by default
h2o.remove_all()  
from h2o.estimators.gbm import H2OGradientBoostingEstimator
from h2o.estimators.deeplearning import H2ODeepLearningEstimator
from h2o.estimators.glm import H2OGeneralizedLinearEstimator
from h2o.estimators.random_forest import H2ORandomForestEstimator
from h2o.estimators.glrm import H2OGeneralizedLowRankEstimator
import pandas as pd
import numpy as np
from utils import get_allstate_train_valid_test_testids
from sklearn.metrics import explained_variance_score,r2_score,mean_absolute_error,mean_squared_error
from sklearn.model_selection import train_test_split  #sklearn.cross_validation was deprecated in favor of model_selection


Checking whether there is an H2O instance running at http://localhost:54321. connected.
H2O cluster uptime: 3 days 2 hours 4 mins
H2O cluster version: 3.10.0.9
H2O cluster version age: 10 days
H2O cluster name: H2O_from_python_arvc_lgnmyd
H2O cluster total nodes: 1
H2O cluster free memory: 32.74 Gb
H2O cluster total cores: 16
H2O cluster allowed cores: 16
H2O cluster status: locked, healthy
H2O connection url: http://localhost:54321
H2O connection proxy: None
Python version: 3.5.2 final

In [45]:
#Read the input CSV files and convert them to H2O frames
shift = 203

train, valid, test, testids = get_allstate_train_valid_test_testids(0.15, shift, True)
test["loss"] = 0.0  #placeholder so the test frame has the same columns as train

trainframe = h2o.H2OFrame(train)
validframe = h2o.H2OFrame(valid)
testframe = h2o.H2OFrame(test)
#Free the pandas copies once the data lives in the H2O cluster
del(train)
del(valid)
del(test)


Train shape is: (188318, 132)
Test shape is: (125546, 131)
/home/arvc/t81_558_deep_learning/utils.py:139: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  train.drop("type", axis=1, inplace=True)
/home/arvc/t81_558_deep_learning/utils.py:140: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  test.drop("type", axis=1, inplace=True)
Final Train shape is: (160070, 131)
Final Valid shape is: (28248, 131)
Final Test shape is: (125546, 131)
Parse progress: |█████████████████████████████████████████████████████████| 100%
Parse progress: |█████████████████████████████████████████████████████████| 100%
Parse progress: |█████████████████████████████████████████████████████████| 100%
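The `get_allstate_train_valid_test_testids` helper lives in `utils.py` and is not shown in this notebook. Judging from the shift argument, the printed shapes, and the `np.exp(pred) - shift` inversion used later, it presumably log-transforms the loss and carves off a validation split. A minimal sketch under those assumptions (the real implementation, including the "type" column handling visible in the warnings above, may differ):

# Hypothetical reconstruction of the utils.py helper; inferred from how its
# outputs are used in this notebook, not the author's actual code.
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split

def get_allstate_train_valid_test_testids(valid_frac, shift, log_transform):
    train = pd.read_csv("./data/allstate/train.csv")   # (188318, 132)
    test = pd.read_csv("./data/allstate/test.csv")     # (125546, 131)
    testids = test["id"]
    train = train.drop("id", axis=1)
    if log_transform:
        # Compress the heavy right tail of the loss distribution; inverted
        # later with np.exp(pred) - shift
        train["loss"] = np.log(train["loss"] + shift)
    train, valid = train_test_split(train, test_size=valid_frac,
                                    random_state=42)
    return train, valid, test, testids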

In [49]:
#Train a first-level ('Set 1') GBM with 5-fold cross-validation; its
#out-of-fold predictions will later be added as features to the second-level
#('Set 2') training set
y = "loss"
x = list(trainframe.columns)
x.remove(y)  #exclude the response from the predictor list

model = H2OGradientBoostingEstimator(
                 model_id="gbm4",
                 ntrees=600,
                 learn_rate=0.3,
                 max_depth=7,
                 sample_rate=0.7,
                 col_sample_rate=0.7,
                 stopping_rounds=2,
                 stopping_tolerance=0.01, #10-fold increase in threshold as defined in rf_v1
                 score_each_iteration=True,
                 seed=200000,
                 nfolds=5,                 
                 keep_cross_validation_predictions=True
                )

model.train(x, y, training_frame=trainframe, validation_frame=validframe)


gbm Model Build progress: |███████████████████████████████████████████████| 100%
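`keep_cross_validation_predictions=True` is what makes this GBM usable as a first-level ensemble member: H2O retains each fold's holdout predictions, which can be stitched into leakage-free out-of-fold features for the second-level training set mentioned in the cell comment. A sketch of how that wiring might look (the `gbm4_oof` column name is an assumption):

# Out-of-fold predictions for every training row, assembled by H2O from the
# 5 CV folds; unlike model.predict(trainframe), these are not scored by a
# model that was fit on the same rows
oof = model.cross_validation_holdout_predictions()
stacked_train = trainframe.cbind(oof.set_names(["gbm4_oof"]))

# For the second-level test features, predictions from the full model suffice
stacked_test = testframe.cbind(model.predict(testframe).set_names(["gbm4_oof"]))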

In [54]:
model.model_performance(validframe).mae()  #MAE on the log(loss + shift) scale


Out[54]:
0.38491020838594814

In [56]:
trainframe["loss"]


Out[56]:
loss
7.29443
8.59782
7.11822
7.74877
7.99921
7.58744
7.08369
8.34475
7.39041
7.31987


In [65]:
train_predictions = model.predict(trainframe)
valid_predictions = model.predict(validframe)
#Invert the log(loss + shift) transform so MAE is on the original dollar scale
mae_train = mean_absolute_error(np.exp(trainframe["loss"].as_data_frame()) - shift, np.exp(train_predictions.as_data_frame()) - shift)
mae_valid = mean_absolute_error(np.exp(validframe["loss"].as_data_frame()) - shift, np.exp(valid_predictions.as_data_frame()) - shift)
print("MAE score on training data = {}".format(mae_train))
print("MAE score on validation data = {}".format(mae_valid))


gbm prediction progress: |████████████████████████████████████████████████| 100%
gbm prediction progress: |████████████████████████████████████████████████| 100%
MAE score on training data = 1159.0975833601667
MAE score on validation data = 1174.1963097874454

In [62]:
train_predictions.as_data_frame()


Out[62]:
predict
0 7.313045
1 8.715261
2 8.370818
3 7.485808
4 7.436798
5 7.716525
6 7.134688
7 7.594073
8 7.865664
9 7.143172
10 7.911610
11 7.750579
12 7.377628
13 7.877256
14 8.213953
15 8.823814
16 8.108171
17 7.148776
18 7.569802
19 7.703567
20 7.633688
21 7.199683
22 8.126220
23 7.735961
24 7.359045
25 7.344116
26 8.537987
27 7.674298
28 7.253800
29 7.654978
... ...
160040 7.287595
160041 7.909491
160042 8.116276
160043 8.201698
160044 7.690551
160045 7.510437
160046 7.400744
160047 7.812802
160048 7.327763
160049 7.834313
160050 7.405205
160051 6.838343
160052 7.375982
160053 7.567122
160054 7.999318
160055 7.931788
160056 7.839884
160057 8.821832
160058 7.421470
160059 7.950294
160060 7.782227
160061 7.143331
160062 9.179534
160063 7.266155
160064 7.740214
160065 7.794769
160066 7.903367
160067 7.965387
160068 7.435353
160069 7.558592

160070 rows × 1 columns
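The next cell references `combined_submission_test_predictions_df`, which is assembled in cells not shown here. It evidently holds one column of log-scale test predictions per first-level model; a hedged sketch of how it could be built (the `models` list is a stand-in for the full ensemble):

# Hypothetical assembly of the frame used below: one column of log-scale
# test predictions per ensemble member
models = [model]  # the full notebook would list every first-level model
combined_submission_test_predictions_df = pd.concat(
    [m.predict(testframe).as_data_frame().rename(columns={"predict": m.model_id})
     for m in models],
    axis=1)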


In [40]:
#Average the per-model log-scale test predictions, then invert the
#log(loss + shift) transform to recover dollar-scale losses
combined_submission_test_predictions_df["mean"] = combined_submission_test_predictions_df.mean(axis=1)
combined_submission_test_predictions_df["loss"] = np.exp(combined_submission_test_predictions_df["mean"]) - shift

submission_df = pd.concat([testids, combined_submission_test_predictions_df["loss"]], axis=1)
timestr = time.strftime("%Y%m%d-%H%M%S")
#combined_submission_test_predictions_df = combined_submission_test_predictions_df.drop("loss", axis=1)
submission_df


Out[40]:
id loss
0 4 1548.874424
1 6 2100.750091
2 9 7566.529585
3 12 6306.962277
4 15 759.593519
5 17 2084.184688
6 21 2240.450655
7 28 1023.063362
8 32 2132.554510
9 43 3120.678507
10 46 3000.490483
11 50 1019.210293
12 54 1195.716924
13 62 2157.487692
14 70 2126.469577
15 71 5299.369236
16 75 2105.586377
17 77 2559.207887
18 81 2103.052539
19 83 2324.650326
20 87 1750.666306
21 97 1842.003007
22 103 1264.914090
23 119 1092.417882
24 120 1916.063366
25 127 979.379476
26 138 3894.858683
27 141 2937.914150
28 148 845.135935
29 150 2348.530787
... ... ...
125516 587482 1378.506822
125517 587484 4209.170686
125518 587489 1797.861431
125519 587494 1509.837297
125520 587509 1324.169594
125521 587511 685.009485
125522 587515 1625.911429
125523 587517 2359.663723
125524 587519 1586.384250
125525 587524 377.263676
125526 587531 5996.625254
125527 587532 5764.036405
125528 587534 2253.014073
125529 587538 2668.270781
125530 587540 3242.710784
125531 587548 1161.646514
125532 587549 5458.486410
125533 587560 4066.903165
125534 587561 1683.231364
125535 587581 1361.224690
125536 587583 2303.201345
125537 587587 1811.183459
125538 587596 1718.971547
125539 587610 1844.916504
125540 587613 1490.942167
125541 587617 2365.861896
125542 587621 2858.633542
125543 587627 2477.302280
125544 587629 1051.178043
125545 587634 3449.172704

125546 rows × 2 columns


In [42]:
submission_df.to_csv("./data/allstate/sub_h20ensemble_{}.csv".format(timestr), index=False)
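A malformed file is a common reason for a rejected Kaggle submission, so a quick read-back check of the column names and row count is cheap insurance (optional; the expected shape comes from the outputs above):

# Optional sanity check on the written submission file
check = pd.read_csv("./data/allstate/sub_h20ensemble_{}.csv".format(timestr))
assert list(check.columns) == ["id", "loss"]
assert len(check) == 125546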

In [ ]: