In [1]:
# Credit Card Example

In [2]:
# Start and connect to a local H2O cluster.
# nthreads=-1 lets the JVM backend use every available CPU core.
import h2o

h2o.init(nthreads=-1)


Checking whether there is an H2O instance running at http://localhost:54321..... not found.
Attempting to start a local H2O server...
  Java Version: java version "1.8.0_72"; Java(TM) SE Runtime Environment (build 1.8.0_72-b15); Java HotSpot(TM) 64-Bit Server VM (build 25.72-b15, mixed mode)
  Starting server from /Users/jofaichow/anaconda/lib/python3.6/site-packages/h2o/backend/bin/h2o.jar
  Ice root: /var/folders/4z/p7yt7_4n4fj1jlyq6g4qhfbw0000gn/T/tmpxu48ip3w
  JVM stdout: /var/folders/4z/p7yt7_4n4fj1jlyq6g4qhfbw0000gn/T/tmpxu48ip3w/h2o_jofaichow_started_from_python.out
  JVM stderr: /var/folders/4z/p7yt7_4n4fj1jlyq6g4qhfbw0000gn/T/tmpxu48ip3w/h2o_jofaichow_started_from_python.err
  Server is running at http://127.0.0.1:54321
Connecting to H2O server at http://127.0.0.1:54321... successful.
H2O cluster uptime: 02 secs
H2O cluster version: 3.14.0.2
H2O cluster version age: 15 days
H2O cluster name: H2O_from_python_jofaichow_d3n4et
H2O cluster total nodes: 1
H2O cluster free memory: 3.556 Gb
H2O cluster total cores: 8
H2O cluster allowed cores: 8
H2O cluster status: accepting new members, healthy
H2O connection url: http://127.0.0.1:54321
H2O connection proxy: None
H2O internal security: False
H2O API Extensions: XGBoost, Algos, AutoML, Core V3, Core V4
Python version: 3.6.1 final

In [3]:
# Download the credit-card train/test CSVs from the tutorial's GitHub
# repository and parse them into H2O frames.
base_url = "https://github.com/woobe/h2o_tutorials/raw/master/datasets"
df_train = h2o.import_file(base_url + "/credit_card_train.csv")
df_test = h2o.import_file(base_url + "/credit_card_test.csv")


Parse progress: |█████████████████████████████████████████████████████████| 100%
Parse progress: |█████████████████████████████████████████████████████████| 100%

In [4]:
# Look at datasets
# Column-wise summaries (type, min/mean/max, sigma, zeros, missing) for
# both frames. Note from the printed headers: df_test does NOT contain
# the DEFAULT_PAYMENT_NEXT_MONTH target column.
df_train.summary()
df_test.summary()


LIMIT_BAL SEX EDUCATION MARRIAGE AGE PAY_0 PAY_2 PAY_3 PAY_4 PAY_5 PAY_6 BILL_AMT1 BILL_AMT2 BILL_AMT3 BILL_AMT4 BILL_AMT5 BILL_AMT6 PAY_AMT1 PAY_AMT2 PAY_AMT3 PAY_AMT4 PAY_AMT5 PAY_AMT6 DEFAULT_PAYMENT_NEXT_MONTH
type int enum int int int int int int int int int int int int int int int int int int int int int enum
mins 10000.0 0.0 0.0 21.0 -2.0 -2.0 -2.0 -2.0 -2.0 -2.0 -165580.0 -69777.0 -157264.0 -170000.0 -81334.0 -339603.0 0.0 0.0 0.0 0.0 0.0 0.0
mean 165471.4666666669 1.84999999999999631.555787037037035735.40532407407389-0.005231481481481603-0.12236111111111114-0.15537037037037077-0.21060185185185143-0.2499537037037024-0.278055555555552850566.4499999998148656.5150462965646412.0668518519842411.40537037039 40025.8481944442638615.211805555455591.28000000003255827.203842592559 4991.376666666621 4750.457175925917 4797.252500000018 5142.894444444431
maxs 1000000.0 6.0 3.0 79.0 8.0 8.0 8.0 8.0 8.0 8.0 964511.0 983931.0 1664089.0 891586.0 927171.0 961664.0 505000.0 1684259.0 896040.0 497000.0 417990.0 527143.0
sigma 128853.31483927186 0.77955969627804230.52250507847644629.2767542164099341.1266896421092234 1.200868545026891 1.2072703090074455 1.1721763980004694 1.1449731299849761 1.163634002402579 72759.4177653826570553.0142590343368567.1706606219463313.39709620146560523.1932269094259526.3094847417915306.32939941038221146.62110875425416320.31134623795915125.60052869915315201.57368714549217229.362092294108
zeros 0 9 37 0 10563 11284 11309 11905 12148 11548 1439 1848 2093 2284 2557 2931 3850 3932 4351 4709 4887 5291
missing0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
0 20000.0 Female2.0 1.0 24.0 2.0 2.0 -1.0 -1.0 -2.0 -2.0 3913.0 3102.0 689.0 0.0 0.0 0.0 0.0 689.0 0.0 0.0 0.0 0.0 Yes
1 120000.0 Female2.0 2.0 26.0 -1.0 2.0 0.0 0.0 0.0 2.0 2682.0 1725.0 2682.0 3272.0 3455.0 3261.0 0.0 1000.0 1000.0 1000.0 0.0 2000.0 Yes
2 90000.0 Female2.0 2.0 34.0 0.0 0.0 0.0 0.0 0.0 0.0 29239.0 14027.0 13559.0 14331.0 14948.0 15549.0 1518.0 1500.0 1000.0 1000.0 1000.0 5000.0 No
3 50000.0 Female2.0 1.0 37.0 0.0 0.0 0.0 0.0 0.0 0.0 46990.0 48233.0 49291.0 28314.0 28959.0 29547.0 2000.0 2019.0 1200.0 1100.0 1069.0 1000.0 No
4 50000.0 Male 2.0 1.0 57.0 -1.0 0.0 -1.0 0.0 0.0 0.0 8617.0 5670.0 35835.0 20940.0 19146.0 19131.0 2000.0 36681.0 10000.0 9000.0 689.0 679.0 No
5 100000.0 Female2.0 2.0 23.0 0.0 -1.0 -1.0 0.0 0.0 -1.0 11876.0 380.0 601.0 221.0 -159.0 567.0 380.0 601.0 0.0 581.0 1687.0 1542.0 No
6 140000.0 Female3.0 1.0 28.0 0.0 0.0 2.0 0.0 0.0 0.0 11285.0 14096.0 12108.0 12211.0 11793.0 3719.0 3329.0 0.0 432.0 1000.0 1000.0 1000.0 No
7 20000.0 Male 3.0 2.0 35.0 -2.0 -2.0 -2.0 -2.0 -1.0 -1.0 0.0 0.0 0.0 0.0 13007.0 13912.0 0.0 0.0 0.0 13007.0 1122.0 0.0 No
8 200000.0 Female3.0 2.0 34.0 0.0 0.0 2.0 0.0 0.0 -1.0 11073.0 9787.0 5535.0 2513.0 1828.0 3731.0 2306.0 12.0 50.0 300.0 3738.0 66.0 No
9 630000.0 Female2.0 2.0 41.0 -1.0 0.0 -1.0 -1.0 -1.0 -1.0 12137.0 6500.0 6500.0 6500.0 6500.0 2870.0 1000.0 6500.0 6500.0 6500.0 2870.0 0.0 No
LIMIT_BAL SEX EDUCATION MARRIAGE AGE PAY_0 PAY_2 PAY_3 PAY_4 PAY_5 PAY_6 BILL_AMT1 BILL_AMT2 BILL_AMT3 BILL_AMT4 BILL_AMT5 BILL_AMT6 PAY_AMT1 PAY_AMT2 PAY_AMT3 PAY_AMT4 PAY_AMT5 PAY_AMT6
type int enum int int int int int int int int int int int int int int int int int int int int int
mins 10000.0 0.0 0.0 21.0 -2.0 -2.0 -2.0 -2.0 -2.0 -2.0 -4370.0 -22960.0 -20320.0 -20320.0 -23003.0 -51443.0 0.0 0.0 0.0 0.0 0.0 0.0
mean 165744.06002501058 1.82451021258858041.570237598999581 35.160483534806160.015839933305544 -0.13338891204668596-0.1492288453522303-0.2213422259274701-0.2792830345977483-0.277615673197165650891.3584827010548571.807002918 45980.2717799083 41995.9241350563639790.60900375154638115.5727386412255109.279699874966 5710.406836181736 4771.841600666943 4682.428511879945 4661.113380575236 5609.861192163388
maxs 760000.0 6.0 3.0 75.0 8.0 7.0 7.0 7.0 6.0 6.0 581775.0 572677.0 565550.0 572805.0 823540.0 501370.0 235728.0 361560.0 221876.0 158556.0 326889.0 528666.0
sigma 131629.37146225898 0.78378021535370310.51819076599440999.2184862915528521.09388284753263251.1982952076029703 1.1749711322139944 1.1150399668262556 1.0624535248275457 1.1082200941910225 71673.6348750480868667.3831969489364757.20825323980560861.2765336150458735.62514893364 55726.16980178843 12727.41650943824217343.14630772328413915.07872228156912497.38576018279315882.34507641809820899.679013349487
zeros 0 2 4 0 1175 1259 1267 1345 1372 1328 168 201 232 263 283 327 420 430 502 491 520 555
missing0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
0 50000.0 Male 1.0 2.0 37.0 0.0 0.0 0.0 0.0 0.0 0.0 64400.0 57069.0 57608.0 19394.0 19619.0 20024.0 2500.0 1815.0 657.0 1000.0 1000.0 800.0
1 500000.0 Male 1.0 2.0 29.0 0.0 0.0 0.0 0.0 0.0 0.0 367965.0 412023.0 445007.0 542653.0 483003.0 473944.0 55000.0 40000.0 38000.0 20239.0 13750.0 13770.0
2 260000.0 Female1.0 2.0 51.0 -1.0 -1.0 -1.0 -1.0 -1.0 2.0 12261.0 21670.0 9966.0 8517.0 22287.0 13668.0 21818.0 9966.0 8583.0 22301.0 0.0 3640.0
3 50000.0 Male 2.0 2.0 33.0 2.0 0.0 0.0 0.0 0.0 0.0 30518.0 29618.0 22102.0 22734.0 23217.0 23680.0 1718.0 1500.0 1000.0 1000.0 1000.0 716.0
4 150000.0 Female5.0 2.0 46.0 0.0 0.0 -1.0 0.0 0.0 -2.0 4463.0 3034.0 1170.0 1170.0 0.0 0.0 1013.0 1170.0 0.0 0.0 0.0 0.0
5 20000.0 Male 1.0 2.0 24.0 0.0 0.0 0.0 0.0 0.0 0.0 17447.0 18479.0 19476.0 19865.0 20480.0 20063.0 1318.0 1315.0 704.0 928.0 912.0 1069.0
6 130000.0 Female2.0 1.0 51.0 -1.0 -1.0 -2.0 -2.0 -1.0 -1.0 99.0 0.0 0.0 0.0 2353.0 0.0 0.0 0.0 0.0 2353.0 0.0 0.0
7 320000.0 Male 2.0 2.0 29.0 2.0 2.0 2.0 2.0 2.0 2.0 58267.0 59246.0 60184.0 58622.0 62307.0 63526.0 2500.0 2500.0 0.0 4800.0 2400.0 1600.0
8 50000.0 Male 3.0 2.0 25.0 -1.0 0.0 0.0 0.0 0.0 0.0 42838.0 37225.0 36087.0 9636.0 9590.0 10030.0 1759.0 1779.0 320.0 500.0 1000.0 1000.0
9 130000.0 Female1.0 1.0 35.0 0.0 0.0 0.0 -1.0 -1.0 -1.0 81313.0 117866.0 17740.0 1330.0 7095.0 1190.0 40000.0 5000.0 1330.0 7095.0 1190.0 2090.0

In [5]:
# Define features and target
# Derive the predictor list from the TRAINING frame and drop the target
# explicitly, instead of relying on the test frame coincidentally lacking
# the target column (the original used list(df_test.columns)).
# The resulting list is identical, so downstream cells are unaffected.
target = "DEFAULT_PAYMENT_NEXT_MONTH"
features = [col for col in df_train.columns if col != target]

In [6]:
# Train a GBM model
# Fit an H2O Gradient Boosting Machine classifier on the training frame,
# using `features`/`target` defined in the previous cell. The fixed seed
# makes the stochastic boosting run reproducible.
from h2o.estimators.gbm import H2OGradientBoostingEstimator
model_gbm = H2OGradientBoostingEstimator(seed = 1234)
model_gbm.train(x = features, y = target, training_frame = df_train)
# Show the model report (training metrics, confusion matrix,
# gains/lift table, scoring history, variable importances — see output).
print(model_gbm)


gbm Model Build progress: |███████████████████████████████████████████████| 100%
Model Details
=============
H2OGradientBoostingEstimator :  Gradient Boosting Machine
Model Key:  GBM_model_python_1504729446296_1


ModelMetricsBinomial: gbm
** Reported on train data. **

MSE: 0.12702139321348918
RMSE: 0.35640060776251375
LogLoss: 0.40828487643875344
Mean Per-Class Error: 0.26254617593150686
AUC: 0.8120294350665728
Gini: 0.6240588701331455
Confusion Matrix (Act/Pred) for max f1 @ threshold = 0.24157847787281111: 
No Yes Error Rate
No 14020.0 2747.0 0.1638 (2747.0/16767.0)
Yes 1786.0 3047.0 0.3695 (1786.0/4833.0)
Total 15806.0 5794.0 0.2099 (4533.0/21600.0)
Maximum Metrics: Maximum metrics at their respective thresholds

metric threshold value idx
max f1 0.2415785 0.5734450 234.0
max f2 0.1414777 0.6624305 306.0
max f0point5 0.4410041 0.6204729 148.0
max accuracy 0.4410041 0.8307870 148.0
max precision 0.8956553 1.0 0.0
max recall 0.0405048 1.0 394.0
max specificity 0.8956553 1.0 0.0
max absolute_mcc 0.3760018 0.4566250 172.0
max min_per_class_accuracy 0.1878690 0.7279123 269.0
max mean_per_class_accuracy 0.2229026 0.7374538 245.0
Gains/Lift Table: Avg response rate: 22.38 %

group cumulative_data_fraction lower_threshold lift cumulative_lift response_rate cumulative_response_rate capture_rate cumulative_capture_rate gain cumulative_gain
1 0.01 0.7940448 4.2830540 4.2830540 0.9583333 0.9583333 0.0428305 0.0428305 328.3054004 328.3054004
2 0.02 0.7652384 4.0140699 4.1485620 0.8981481 0.9282407 0.0401407 0.0829712 301.4069936 314.8561970
3 0.03 0.7436861 3.6416305 3.9795848 0.8148148 0.8904321 0.0364163 0.1193875 264.1630457 297.9584799
4 0.04 0.7264950 3.6209394 3.8899234 0.8101852 0.8703704 0.0362094 0.1555969 262.0939375 288.9923443
5 0.05 0.7063101 3.3933375 3.7906062 0.7592593 0.8481481 0.0339334 0.1895303 239.3337472 279.0606249
6 0.1 0.5883743 3.0002069 3.3954066 0.6712963 0.7597222 0.1500103 0.3395407 200.0206911 239.5406580
7 0.15 0.4069486 2.2636044 3.0181392 0.5064815 0.6753086 0.1131802 0.4527209 126.3604387 201.8139182
8 0.2 0.3143049 1.7256363 2.6950134 0.3861111 0.6030093 0.0862818 0.5390027 72.5636251 169.5013449
9 0.3 0.2198094 1.2911235 2.2270501 0.2888889 0.4983025 0.1291124 0.6681150 29.1123526 122.7050141
10 0.4000463 0.1759793 0.8293284 1.8774984 0.1855622 0.4200903 0.0829712 0.7510863 -17.0671554 87.7498401
11 0.5 0.1491582 0.7452240 1.6511484 0.1667439 0.3694444 0.0744879 0.8255742 -25.4776032 65.1148355
12 0.6 0.1268450 0.5731430 1.4714808 0.1282407 0.3292438 0.0573143 0.8828885 -42.6857025 47.1480792
13 0.7 0.1057468 0.4800331 1.3298454 0.1074074 0.2975529 0.0480033 0.9308918 -51.9966894 32.9845408
14 0.8 0.0878685 0.3517484 1.2075833 0.0787037 0.2701968 0.0351748 0.9660666 -64.8251604 20.7583282
15 0.9 0.0720348 0.2255328 1.0984666 0.0504630 0.2457819 0.0225533 0.9886199 -77.4467205 9.8466561
16 1.0 0.0269546 0.1138010 1.0 0.0254630 0.22375 0.0113801 1.0 -88.6199048 0.0
Scoring History: 
timestamp duration number_of_trees training_rmse training_logloss training_auc training_lift training_classification_error
2017-09-06 21:24:15 0.090 sec 0.0 0.4167564 0.5316134 0.5 1.0 0.77625
2017-09-06 21:24:16 0.479 sec 1.0 0.4077295 0.5112276 0.7619704 3.4901396 0.1968981
2017-09-06 21:24:16 0.627 sec 2.0 0.4004482 0.4961908 0.7703013 3.4843390 0.2026389
2017-09-06 21:24:16 0.709 sec 3.0 0.3945478 0.4846704 0.7727510 3.4862484 0.2039815
2017-09-06 21:24:16 0.788 sec 4.0 0.3895914 0.4752930 0.7772916 3.7880833 0.2036111
--- --- --- --- --- --- --- --- ---
2017-09-06 21:24:18 3.085 sec 21.0 0.3638493 0.4255805 0.7913738 4.2839122 0.2024537
2017-09-06 21:24:18 3.247 sec 22.0 0.3634567 0.4247080 0.7920916 4.2633164 0.2012037
2017-09-06 21:24:19 3.429 sec 23.0 0.3630635 0.4237819 0.7930764 4.2633164 0.2006481
2017-09-06 21:24:19 3.695 sec 24.0 0.3627059 0.4229763 0.7937797 4.2416718 0.2006481
2017-09-06 21:24:21 5.936 sec 50.0 0.3564006 0.4082849 0.8120294 4.2830540 0.2098611
See the whole table with table.as_data_frame()
Variable Importances: 
variable relative_importance scaled_importance percentage
PAY_0 2932.4401855 1.0 0.5636116
PAY_2 531.0946045 0.1811101 0.1020758
LIMIT_BAL 208.8082428 0.0712063 0.0401327
BILL_AMT1 196.2221069 0.0669143 0.0377137
PAY_3 187.8168335 0.0640480 0.0360982
--- --- --- ---
BILL_AMT4 40.7689896 0.0139028 0.0078358
BILL_AMT3 32.8795280 0.0112123 0.0063194
BILL_AMT5 31.7188263 0.0108165 0.0060963
MARRIAGE 29.6317406 0.0101048 0.0056952
SEX 20.2798748 0.0069157 0.0038978
See the whole table with table.as_data_frame()


In [7]:
# Use GBM model for making predictions
# Score the test frame; the result is an H2O frame with a `predict`
# label column plus per-class probability columns ("No"/"Yes").
yhat_test = model_gbm.predict(df_test)
# Preview the first five predictions.
yhat_test.head(5)


gbm prediction progress: |████████████████████████████████████████████████| 100%
predict No Yes
No 0.8563570.143643
No 0.9165290.0834714
No 0.92938 0.0706196
Yes 0.38585 0.61415
No 0.92741 0.07259
Out[7]:


In [8]:
# (Extra) Use H2O's AutoML
# Let AutoML search over candidate models for at most 60 seconds;
# the fixed seed keeps the search reproducible.
from h2o.automl import H2OAutoML

aml = H2OAutoML(max_runtime_secs=60, seed=1234)
aml.train(x=features, y=target, training_frame=df_train)


AutoML progress: |████████████████████████████████████████████████████████| 100%
Parse progress: |█████████████████████████████████████████████████████████| 100%

In [9]:
# Print leaderboard
# Candidate models ranked by AUC (per the printed table below); the
# stacked ensemble is on top. NOTE(review): ranking metric inferred from
# the output columns — confirm against the H2OAutoML documentation.
print(aml.leaderboard)


model_id auc logloss
StackedEnsemble_0_AutoML_20170906_212422 0.773794 0.438848
GBM_grid_0_AutoML_20170906_212422_model_00.771119 0.441382
DRF_0_AutoML_20170906_212422 0.763554 0.449797
XRT_0_AutoML_20170906_212422 0.759662 0.448924
GLM_grid_0_AutoML_20170906_212422_model_00.71226 0.474675
GLM_grid_0_AutoML_20170906_212422_model_10.71226 0.474675


In [10]:
# Use best model for making predictions
# `aml.leader` is the top-ranked model from the leaderboard
# (here the stacked ensemble, per the progress line below).
best_model = aml.leader
# NOTE(review): this rebinds `yhat_test`, clobbering the GBM predictions
# from cell In[7] — rename one of them if both are needed later.
yhat_test = best_model.predict(df_test)
yhat_test.head(5)


stackedensemble prediction progress: |████████████████████████████████████| 100%
predict No Yes
No 0.8498410.150159
No 0.8844430.115557
No 0.8938770.106123
Yes 0.3072240.692776
No 0.8689060.131094
Out[10]: