In [69]:
# This tutorial is loosely based on https://xiaoxiaowang87.github.io/monotonicity_constraint/

import h2o
from h2o.estimators import H2OXGBoostEstimator
from h2o.estimators import H2OGradientBoostingEstimator

import numpy as np
import pandas as pd

import matplotlib
import matplotlib as mpl
import matplotlib.pyplot as plt

%matplotlib inline

# Import the loader from the public `sklearn.datasets` namespace. The private
# module path `sklearn.datasets.california_housing` was deprecated in
# scikit-learn 0.22 and removed in 0.24, while this form works on old and new
# releases alike.
from sklearn.datasets import fetch_california_housing

In [48]:
# Load the California housing dataset through scikit-learn's fetcher. The
# returned object's `.data`, `.feature_names` and `.target` attributes are what
# the H2OFrame construction further down consumes.
cal_housing = fetch_california_housing()

In [49]:
# Attach this Python session to an H2O cluster. The log printed below shows it
# probing http://localhost:54321 for an already-running instance and reporting
# the cluster details once connected.
h2o.init()


Checking whether there is an H2O instance running at http://localhost:54321. connected.
H2O cluster uptime: 45 mins 51 secs
H2O cluster timezone: America/Los_Angeles
H2O data parsing timezone: UTC
H2O cluster version: 3.22.0.99999
H2O cluster version age: 1 hour and 57 minutes
H2O cluster name: mkurka
H2O cluster total nodes: 1
H2O cluster free memory: 11.25 Gb
H2O cluster total cores: 8
H2O cluster allowed cores: 8
H2O cluster status: locked, healthy
H2O connection url: http://localhost:54321
H2O connection proxy: None
H2O internal security: False
H2O API Extensions: XGBoost, Algos, AutoML, Core V3, Core V4
Python version: 2.7.14 final

In [50]:
# Upload the feature matrix to H2O, labelling the columns with the dataset's
# own feature names.
data = h2o.H2OFrame(
    cal_housing.data,
    column_names=cal_housing.feature_names
)
# Upload the response separately and attach it to the same frame as "target",
# so features and response live side by side for training.
data["target"] = h2o.H2OFrame(
    cal_housing.target
)


Parse progress: |█████████████████████████████████████████████████████████| 100%
Parse progress: |█████████████████████████████████████████████████████████| 100%

In [51]:
# Split the frame into a training part (ratio 0.6) and a remainder used as the
# hold-out set; the fixed seed keeps the split repeatable between runs.
train, test = data.split_frame(ratios=[0.6], seed=123)

In [52]:
# Sanity-check the training split: per-column type, min / mean / max, sigma,
# zero and missing counts, followed by the first rows of the frame.
train.summary()


MedInc HouseAge AveRooms AveBedrms Population AveOccup Latitude Longitude target
type real int real real int real real real real
mins 0.4999 1.0 0.8461538461540.375 3.0 0.69230769230832.54 -124.35 0.14999
mean 3.8808057743428.592896835 5.45159062686 1.09888842652 1426.404123383.1598055712 35.6381468954-119.5720600792.06838035113
maxs 15.0001 52.0 132.533333333 34.0666666667 35682.0 1243.33333333 41.95 -114.31 5.00001
sigma 1.9013827314712.59437023142.51012114666 0.5030624226741160.4098169613.3502637259 2.141498258752.00443036004 1.15487252367
zeros 0 0 0 0 0 0 0 0 0
missing0 0 0 0 0 0 0 0 0
0 8.3252 41.0 6.98412698413 1.02380952381 322.0 2.55555555556 37.88 -122.23 4.526
1 8.3014 21.0 6.2381370826 0.9718804920912401.0 2.10984182777 37.86 -122.22 3.585
2 7.2574 52.0 8.28813559322 1.07344632768 496.0 2.80225988701 37.85 -122.24 3.521
3 3.8462 52.0 6.28185328185 1.08108108108 565.0 2.18146718147 37.85 -122.25 3.422
4 4.0368 52.0 4.76165803109 1.10362694301 413.0 2.13989637306 37.85 -122.25 2.697
5 3.12 52.0 4.79752704791 1.06182380216 1157.0 1.78825347759 37.84 -122.25 2.414
6 3.2705 52.0 4.77247956403 1.02452316076 1504.0 2.04904632153 37.85 -122.26 2.418
7 3.075 52.0 5.32264957265 1.01282051282 1098.0 2.34615384615 37.85 -122.26 2.135
8 2.1202 52.0 4.05280528053 0.96699669967 648.0 2.13861386139 37.85 -122.27 1.555
9 1.9911 50.0 5.34367541766 1.08591885442 990.0 2.36276849642 37.84 -122.26 1.587

In [53]:
# Same summary for the hold-out split, to confirm its column statistics look
# comparable to the training split's.
test.summary()


MedInc HouseAge AveRooms AveBedrms Population AveOccup Latitude Longitude target
type real int real real int real real real real
mins 0.4999 1.0 1.0 0.3333333333336.0 1.1693290734832.56 -124.3 0.14999
mean 3.8553671652728.70983825855.394886764571.09333302879 1424.076371152.9360351104435.6223701812-119.5661473912.06882668004
maxs 15.0001 52.0 141.90909090925.6363636364 15507.0 83.171428571441.95 -114.56 5.00001
sigma 1.8974750829 12.57267694472.418629934250.4261364696911088.969955671.266283334932.127645430912.00229059869 1.15264118508
zeros 0 0 0 0 0 0 0 0 0
missing0 0 0 0 0 0 0 0 0
0 5.6431 52.0 5.817351598171.07305936073 558.0 2.5479452054837.85 -122.25 3.413
1 3.6591 52.0 4.931906614790.9513618677041094.0 2.1284046692637.84 -122.25 2.992
2 2.0804 42.0 4.294117647061.11764705882 1206.0 2.0268907563 37.84 -122.26 2.267
3 3.6912 52.0 4.970588235290.9901960784311551.0 2.1722689075637.84 -122.25 2.611
4 3.2031 52.0 5.4776119403 1.07960199005 910.0 2.2636815920437.85 -122.26 2.815
5 2.6736 52.0 4.0 1.09770114943 345.0 1.9827586206937.84 -122.26 1.913
6 1.9167 52.0 4.262903225811.00967741935 1212.0 1.9548387096837.85 -122.26 1.592
7 2.125 50.0 4.242424242421.07196969697 697.0 2.6401515151537.85 -122.26 1.4
8 2.775 52.0 5.939577039271.04833836858 793.0 2.3957703927537.85 -122.27 1.525
9 1.808 52.0 4.780856423171.0604534005 1102.0 2.7758186398 37.85 -122.28 1.055

In [54]:
# Predictors used by both models below.
feature_names = [
    "MedInc",
    "AveOccup",
    "HouseAge",
]

# Direction constraint per predictor: 1 asks for a response that does not
# decrease as the feature grows, -1 for one that does not increase. The
# partial dependence tables at the end of the notebook show the effect.
monotone_constraints = {
    "MedInc": 1,
    "AveOccup": -1,
    "HouseAge": 1,
}

In [55]:
# XGBoost regressor restricted by the monotonicity constraints; all other
# hyper-parameters are left at the estimator's defaults.
xgb_mono = H2OXGBoostEstimator(
    monotone_constraints=monotone_constraints
)
# Fit on the selected predictors only. The hold-out split is supplied as the
# validation frame so that validation metrics are recorded with the model.
xgb_mono.train(
    x=feature_names,
    y="target",
    training_frame=train,
    validation_frame=test
)


xgboost Model Build progress: |███████████████████████████████████████████| 100%

In [56]:
# Regression metrics of the constrained XGBoost model. Called without
# arguments, the report is on the training data (see the output header).
xgb_mono.model_performance()


ModelMetricsRegression: xgboost
** Reported on train data. **

MSE: 0.450582394315
RMSE: 0.671254343983
MAE: 0.493647644764
RMSLE: 0.217638157557
Mean Residual Deviance: 0.450582394315
Out[56]:


In [57]:
# Same metrics for the XGBoost model, this time on the validation frame that
# was passed to train().
xgb_mono.model_performance(valid=True)


ModelMetricsRegression: xgboost
** Reported on validation data. **

MSE: 0.491813275045
RMSE: 0.701294000434
MAE: 0.518155694956
RMSLE: 0.22897543713
Mean Residual Deviance: 0.491813275045
Out[57]:


In [58]:
# H2O GBM counterpart of the XGBoost model: identical constraints, default
# hyper-parameters otherwise, so the two can be compared like for like.
gbm_mono = H2OGradientBoostingEstimator(
    monotone_constraints=monotone_constraints
)
# Train on the same predictors, response and train / validation frames as the
# XGBoost model.
gbm_mono.train(
    x=feature_names,
    y="target",
    training_frame=train,
    validation_frame=test
)


gbm Model Build progress: |███████████████████████████████████████████████| 100%

In [59]:
# Regression metrics of the constrained GBM model. Called without arguments,
# the report is on the training data (see the output header).
gbm_mono.model_performance()


ModelMetricsRegression: gbm
** Reported on train data. **

MSE: 0.449986623705
RMSE: 0.670810423074
MAE: 0.494184218288
RMSLE: 0.21741684144
Mean Residual Deviance: 0.449986623705
Out[59]:


In [60]:
# Same metrics for the GBM model, this time on the validation frame that was
# passed to train().
gbm_mono.model_performance(valid=True)


ModelMetricsRegression: gbm
** Reported on validation data. **

MSE: 0.481829630737
RMSE: 0.694139489395
MAE: 0.515524158621
RMSLE: 0.226177098203
Mean Residual Deviance: 0.481829630737
Out[60]:


In [61]:
# Variable importance chart for the constrained XGBoost model.
xgb_mono.varimp_plot()



In [62]:
# Variable importance chart for the constrained GBM model, for comparison with
# the XGBoost chart.
gbm_mono.varimp_plot()



In [63]:
# Side-by-side RMSE comparison of the two constrained models: one row per
# model, with the training RMSE and the RMSE on the hold-out (validation) frame.
#
# Built with the plain DataFrame constructor: `pd.DataFrame.from_items` was
# deprecated in pandas 0.23 and removed in pandas 1.0, whereas explicit
# `index=` / `columns=` arguments work on every pandas version and keep the
# row order fixed.
pd.DataFrame(
    [[xgb_mono.rmse(), xgb_mono.rmse(valid=True)],
     [gbm_mono.rmse(), gbm_mono.rmse(valid=True)]],
    index=['H2O XGBoost', 'H2O GBM'],
    columns=['Train RMSE', 'Test RMSE'])


Out[63]:
Train RMSE Test RMSE
H2O XGBoost 0.671254 0.701294
H2O GBM 0.670810 0.694139

In [64]:
# Partial dependence of the XGBoost prediction on each constrained feature,
# evaluated over the training frame at 100 grid points per feature. The
# printed tables let you verify that the mean response moves only in the
# direction requested by the constraints.
xgb_mono.partial_plot(data=train, cols=feature_names, nbins=100)


PartialDependencePlot progress: |█████████████████████████████████████████| 100%
PartialDependence: Partial Dependence Plot of model XGBoost_model_python_1543517619333_7 on column 'MedInc'

medinc mean_response stddev_response std_error_mean_response
0.4999 0.9042832 0.4050025 0.0036345
0.6463667 1.0116680 0.3172816 0.0028473
0.7928333 1.0117064 0.3174680 0.0028490
0.9393 1.0137833 0.3192623 0.0028651
1.0857667 1.0144802 0.3185222 0.0028585
--- --- --- ---
14.4142333 4.8378433 0.4268355 0.0038305
14.5607 4.8378433 0.4268355 0.0038305
14.7071667 4.8378433 0.4268355 0.0038305
14.8536333 4.8379129 0.4268688 0.0038308
15.0001 4.8379129 0.4268688 0.0038308
See the whole table with table.as_data_frame()
PartialDependence: Partial Dependence Plot of model XGBoost_model_python_1543517619333_7 on column 'AveOccup'

aveoccup mean_response stddev_response std_error_mean_response
0.6923077 4.7145734 1.6517448 0.0148229
13.2442372 1.5843531 0.8072629 0.0072445
25.7961668 1.5843531 0.8072629 0.0072445
38.3480963 1.5843531 0.8072629 0.0072445
50.9000259 1.5843531 0.8072629 0.0072445
--- --- --- ---
1193.1256151 -1.1011310 0.7744430 0.0069499
1205.6775447 -1.1011310 0.7744430 0.0069499
1218.2294742 -1.1011310 0.7744430 0.0069499
1230.7814038 -1.1011310 0.7744430 0.0069499
1243.3333333 -1.1011310 0.7744430 0.0069499
See the whole table with table.as_data_frame()
PartialDependence: Partial Dependence Plot of model XGBoost_model_python_1543517619333_7 on column 'HouseAge'

houseage mean_response stddev_response std_error_mean_response
1.0 1.4583174 0.8679680 0.0077892
2.0 1.4583174 0.8679680 0.0077892
3.0 1.6582169 0.8473530 0.0076042
4.0 1.7134339 0.8369214 0.0075106
5.0 1.7160654 0.8339999 0.0074844
--- --- --- ---
48.0 2.3312915 0.9915394 0.0088982
49.0 2.3327401 0.9897468 0.0088821
50.0 2.3413390 0.9806541 0.0088005
51.0 2.4553383 0.9887533 0.0088732
52.0 2.5113788 0.9656211 0.0086656
See the whole table with table.as_data_frame()
Out[64]:
[, , ]

In [65]:
# Same partial dependence check for the GBM model: training frame, 100 grid
# points per feature, one table per constrained feature.
gbm_mono.partial_plot(data=train, cols=feature_names, nbins=100)


PartialDependencePlot progress: |█████████████████████████████████████████| 100%
PartialDependence: Partial Dependence Plot of model GBM_model_python_1543517619333_8 on column 'MedInc'

medinc mean_response stddev_response std_error_mean_response
0.4999 1.0842446 0.2019811 0.0018126
0.6463667 1.0842446 0.2019811 0.0018126
0.7928333 1.0842446 0.2019811 0.0018126
0.9393 1.0842446 0.2019811 0.0018126
1.0857667 1.0842446 0.2019811 0.0018126
--- --- --- ---
14.4142333 4.7717826 0.2258950 0.0020272
14.5607 4.7717826 0.2258950 0.0020272
14.7071667 4.7717826 0.2258950 0.0020272
14.8536333 4.7717826 0.2258950 0.0020272
15.0001 4.7717826 0.2258950 0.0020272
See the whole table with table.as_data_frame()
PartialDependence: Partial Dependence Plot of model GBM_model_python_1543517619333_8 on column 'AveOccup'

aveoccup mean_response stddev_response std_error_mean_response
0.6923077 3.1477957 0.9659091 0.0086682
13.2442372 1.6114909 0.7182429 0.0064456
25.7961668 1.6114909 0.7182429 0.0064456
38.3480963 1.6114909 0.7182429 0.0064456
50.9000259 1.6114909 0.7182429 0.0064456
--- --- --- ---
1193.1256151 1.6114909 0.7182429 0.0064456
1205.6775447 1.6114909 0.7182429 0.0064456
1218.2294742 1.6114909 0.7182429 0.0064456
1230.7814038 1.6114909 0.7182429 0.0064456
1243.3333333 1.6114909 0.7182429 0.0064456
See the whole table with table.as_data_frame()
PartialDependence: Partial Dependence Plot of model GBM_model_python_1543517619333_8 on column 'HouseAge'

houseage mean_response stddev_response std_error_mean_response
1.0 1.7299145 0.7237023 0.0064946
2.0 1.7299145 0.7237023 0.0064946
3.0 1.7659286 0.7296043 0.0065476
4.0 1.7807129 0.7446737 0.0066828
5.0 1.7826570 0.7470610 0.0067042
--- --- --- ---
48.0 2.3636941 0.9871769 0.0088590
49.0 2.3636941 0.9871769 0.0088590
50.0 2.3636941 0.9871769 0.0088590
51.0 2.4644215 0.9782047 0.0087785
52.0 2.5585947 0.9338781 0.0083807
See the whole table with table.as_data_frame()
Out[65]:
[, , ]

In [ ]: