In [43]:
import h2o
import os
import time
h2o.init(max_mem_size = "40G")             #max heap size as a string (here 40 GB); uses all cores by default
h2o.remove_all()  
from h2o.estimators.gbm import H2OGradientBoostingEstimator
from h2o.estimators.deeplearning import H2ODeepLearningEstimator
from h2o.estimators.glm import H2OGeneralizedLinearEstimator
from h2o.estimators.random_forest import H2ORandomForestEstimator
from h2o.estimators.glrm import H2OGeneralizedLowRankEstimator
import pandas as pd
import numpy as np
from utils import get_allstate_train_valid_test_testids
from sklearn.metrics import explained_variance_score,r2_score,mean_absolute_error,mean_squared_error
from sklearn.model_selection import train_test_split  #sklearn.cross_validation was deprecated in favor of model_selection


Checking whether there is an H2O instance running at http://localhost:54321. connected.
H2O cluster uptime: 3 days 2 hours 4 mins
H2O cluster version: 3.10.0.9
H2O cluster version age: 10 days
H2O cluster name: H2O_from_python_arvc_lgnmyd
H2O cluster total nodes: 1
H2O cluster free memory: 32.74 Gb
H2O cluster total cores: 16
H2O cluster allowed cores: 16
H2O cluster status: locked, healthy
H2O connection url: http://localhost:54321
H2O connection proxy: None
Python version: 3.5.2 final

In [45]:
#Read the input CSV files and convert them to H2O frames
shift = 203

train, valid, test, testids = get_allstate_train_valid_test_testids(0.15, shift, True)
test["loss"] = 0.0  #placeholder so the test frame has the same columns as train

trainframe = h2o.H2OFrame(train)
validframe = h2o.H2OFrame(valid)
testframe = h2o.H2OFrame(test)
#Free the pandas copies once the data lives in the H2O cluster
del(train)
del(valid)
del(test)


Train shape is: (188318, 132)
Test shape is: (125546, 131)
/home/arvc/t81_558_deep_learning/utils.py:139: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  train.drop("type", axis=1, inplace=True)
/home/arvc/t81_558_deep_learning/utils.py:140: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  test.drop("type", axis=1, inplace=True)
Final Train shape is: (160070, 131)
Final Valid shape is: (28248, 131)
Final Test shape is: (125546, 131)
Parse progress: |█████████████████████████████████████████████████████████| 100%
Parse progress: |█████████████████████████████████████████████████████████| 100%
Parse progress: |█████████████████████████████████████████████████████████| 100%
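The `get_allstate_train_valid_test_testids` helper lives in `utils.py` and is not shown in this notebook. Judging from the shift argument, the printed shapes, and the `np.exp(pred) - shift` inversion used later, it presumably log-transforms the loss and carves off a validation split. A minimal sketch under those assumptions (the real implementation, including the "type" column handling visible in the warnings above, may differ):

# Hypothetical reconstruction of the utils.py helper; inferred from how its
# outputs are used in this notebook, not the author's actual code.
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split

def get_allstate_train_valid_test_testids(valid_frac, shift, log_transform):
    train = pd.read_csv("./data/allstate/train.csv")   # (188318, 132)
    test = pd.read_csv("./data/allstate/test.csv")     # (125546, 131)
    testids = test["id"]
    train = train.drop("id", axis=1)
    if log_transform:
        # Compress the heavy right tail of the loss distribution; inverted
        # later with np.exp(pred) - shift
        train["loss"] = np.log(train["loss"] + shift)
    train, valid = train_test_split(train, test_size=valid_frac,
                                    random_state=42)
    return train, valid, test, testids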

In [49]:
#Train a first-level ('Set 1') GBM with 5-fold cross-validation; its
#out-of-fold predictions will later be added as features to the second-level
#('Set 2') training set
y = "loss"
x = list(trainframe.columns)
x.remove(y)  #exclude the response from the predictor list

model = H2OGradientBoostingEstimator(
                 model_id="gbm4",
                 ntrees=600,
                 learn_rate=0.3,
                 max_depth=7,
                 sample_rate=0.7,
                 col_sample_rate=0.7,
                 stopping_rounds=2,
                 stopping_tolerance=0.01, #10-fold increase in threshold as defined in rf_v1
                 score_each_iteration=True,
                 seed=200000,
                 nfolds=5,                 
                 keep_cross_validation_predictions=True
                )

model.train(x, y, training_frame=trainframe, validation_frame=validframe)


gbm Model Build progress: |███████████████████████████████████████████████| 100%
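`keep_cross_validation_predictions=True` is what makes this GBM usable as a first-level ensemble member: H2O retains each fold's holdout predictions, which can be stitched into leakage-free out-of-fold features for the second-level training set mentioned in the cell comment. A sketch of how that wiring might look (the `gbm4_oof` column name is an assumption):

# Out-of-fold predictions for every training row, assembled by H2O from the
# 5 CV folds; unlike model.predict(trainframe), these are not scored by a
# model that was fit on the same rows
oof = model.cross_validation_holdout_predictions()
stacked_train = trainframe.cbind(oof.set_names(["gbm4_oof"]))

# For the second-level test features, predictions from the full model suffice
stacked_test = testframe.cbind(model.predict(testframe).set_names(["gbm4_oof"]))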

In [54]:
model.model_performance(validframe).mae()  #MAE on the log(loss + shift) scale


Out[54]:
0.38491020838594814

In [56]:
trainframe["loss"]


Out[56]:
loss
7.29443
8.59782
7.11822
7.74877
7.99921
7.58744
7.08369
8.34475
7.39041
7.31987


In [65]:
train_predictions = model.predict(trainframe)
valid_predictions = model.predict(validframe)
#Invert the log(loss + shift) transform so MAE is on the original dollar scale
mae_train = mean_absolute_error(np.exp(trainframe["loss"].as_data_frame()) - shift, np.exp(train_predictions.as_data_frame()) - shift)
mae_valid = mean_absolute_error(np.exp(validframe["loss"].as_data_frame()) - shift, np.exp(valid_predictions.as_data_frame()) - shift)
print("MAE score on training data = {}".format(mae_train))
print("MAE score on validation data = {}".format(mae_valid))


gbm prediction progress: |████████████████████████████████████████████████| 100%
gbm prediction progress: |████████████████████████████████████████████████| 100%
MAE score on training data = 1159.0975833601667
MAE score on validation data = 1174.1963097874454

In [62]:
train_predictions.as_data_frame()


Out[62]:
predict
0 7.313045
1 8.715261
2 8.370818
3 7.485808
4 7.436798
5 7.716525
6 7.134688
7 7.594073
8 7.865664
9 7.143172
10 7.911610
11 7.750579
12 7.377628
13 7.877256
14 8.213953
15 8.823814
16 8.108171
17 7.148776
18 7.569802
19 7.703567
20 7.633688
21 7.199683
22 8.126220
23 7.735961
24 7.359045
25 7.344116
26 8.537987
27 7.674298
28 7.253800
29 7.654978
... ...
160040 7.287595
160041 7.909491
160042 8.116276
160043 8.201698
160044 7.690551
160045 7.510437
160046 7.400744
160047 7.812802
160048 7.327763
160049 7.834313
160050 7.405205
160051 6.838343
160052 7.375982
160053 7.567122
160054 7.999318
160055 7.931788
160056 7.839884
160057 8.821832
160058 7.421470
160059 7.950294
160060 7.782227
160061 7.143331
160062 9.179534
160063 7.266155
160064 7.740214
160065 7.794769
160066 7.903367
160067 7.965387
160068 7.435353
160069 7.558592

160070 rows × 1 columns
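The next cell references `combined_submission_test_predictions_df`, which is assembled in cells not shown here. It evidently holds one column of log-scale test predictions per first-level model; a hedged sketch of how it could be built (the `models` list is a stand-in for the full ensemble):

# Hypothetical assembly of the frame used below: one column of log-scale
# test predictions per ensemble member
models = [model]  # the full notebook would list every first-level model
combined_submission_test_predictions_df = pd.concat(
    [m.predict(testframe).as_data_frame().rename(columns={"predict": m.model_id})
     for m in models],
    axis=1)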


In [40]:
#Average the per-model log-scale test predictions, then invert the
#log(loss + shift) transform to recover dollar-scale losses
combined_submission_test_predictions_df["mean"] = combined_submission_test_predictions_df.mean(axis=1)
combined_submission_test_predictions_df["loss"] = np.exp(combined_submission_test_predictions_df["mean"]) - shift

submission_df = pd.concat([testids, combined_submission_test_predictions_df["loss"]], axis=1)
timestr = time.strftime("%Y%m%d-%H%M%S")
#combined_submission_test_predictions_df = combined_submission_test_predictions_df.drop("loss", axis=1)
submission_df


Out[40]:
id loss
0 4 1548.874424
1 6 2100.750091
2 9 7566.529585
3 12 6306.962277
4 15 759.593519
5 17 2084.184688
6 21 2240.450655
7 28 1023.063362
8 32 2132.554510
9 43 3120.678507
10 46 3000.490483
11 50 1019.210293
12 54 1195.716924
13 62 2157.487692
14 70 2126.469577
15 71 5299.369236
16 75 2105.586377
17 77 2559.207887
18 81 2103.052539
19 83 2324.650326
20 87 1750.666306
21 97 1842.003007
22 103 1264.914090
23 119 1092.417882
24 120 1916.063366
25 127 979.379476
26 138 3894.858683
27 141 2937.914150
28 148 845.135935
29 150 2348.530787
... ... ...
125516 587482 1378.506822
125517 587484 4209.170686
125518 587489 1797.861431
125519 587494 1509.837297
125520 587509 1324.169594
125521 587511 685.009485
125522 587515 1625.911429
125523 587517 2359.663723
125524 587519 1586.384250
125525 587524 377.263676
125526 587531 5996.625254
125527 587532 5764.036405
125528 587534 2253.014073
125529 587538 2668.270781
125530 587540 3242.710784
125531 587548 1161.646514
125532 587549 5458.486410
125533 587560 4066.903165
125534 587561 1683.231364
125535 587581 1361.224690
125536 587583 2303.201345
125537 587587 1811.183459
125538 587596 1718.971547
125539 587610 1844.916504
125540 587613 1490.942167
125541 587617 2365.861896
125542 587621 2858.633542
125543 587627 2477.302280
125544 587629 1051.178043
125545 587634 3449.172704

125546 rows × 2 columns


In [42]:
submission_df.to_csv("./data/allstate/sub_h20ensemble_{}.csv".format(timestr), index=False)
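A malformed file is a common reason for a rejected Kaggle submission, so a quick read-back check of the column names and row count is cheap insurance (optional; the expected shape comes from the outputs above):

# Optional sanity check on the written submission file
check = pd.read_csv("./data/allstate/sub_h20ensemble_{}.csv".format(timestr))
assert list(check.columns) == ["id", "loss"]
assert len(check) == 125546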

In [ ]: