In [1]:
# Credit Card Example
In [2]:
# Start (or attach to) a local H2O cluster.
# nthreads=-1 tells H2O to use every available CPU core.
import h2o

h2o.init(nthreads=-1)
Checking whether there is an H2O instance running at http://localhost:54321..... not found.
Attempting to start a local H2O server...
Java Version: java version "1.8.0_72"; Java(TM) SE Runtime Environment (build 1.8.0_72-b15); Java HotSpot(TM) 64-Bit Server VM (build 25.72-b15, mixed mode)
Starting server from /Users/jofaichow/anaconda/lib/python3.6/site-packages/h2o/backend/bin/h2o.jar
Ice root: /var/folders/4z/p7yt7_4n4fj1jlyq6g4qhfbw0000gn/T/tmpxu48ip3w
JVM stdout: /var/folders/4z/p7yt7_4n4fj1jlyq6g4qhfbw0000gn/T/tmpxu48ip3w/h2o_jofaichow_started_from_python.out
JVM stderr: /var/folders/4z/p7yt7_4n4fj1jlyq6g4qhfbw0000gn/T/tmpxu48ip3w/h2o_jofaichow_started_from_python.err
Server is running at http://127.0.0.1:54321
Connecting to H2O server at http://127.0.0.1:54321... successful.
H2O cluster uptime:
02 secs
H2O cluster version:
3.14.0.2
H2O cluster version age:
15 days
H2O cluster name:
H2O_from_python_jofaichow_d3n4et
H2O cluster total nodes:
1
H2O cluster free memory:
3.556 Gb
H2O cluster total cores:
8
H2O cluster allowed cores:
8
H2O cluster status:
accepting new members, healthy
H2O connection url:
http://127.0.0.1:54321
H2O connection proxy:
None
H2O internal security:
False
H2O API Extensions:
XGBoost, Algos, AutoML, Core V3, Core V4
Python version:
3.6.1 final
In [3]:
# Import train/test datasets from the tutorial's GitHub repository
# (the original comment said "s3", but the files are served from GitHub).
# The base URL is factored out so the data source can be changed in one place.
DATA_URL = "https://github.com/woobe/h2o_tutorials/raw/master/datasets"
df_train = h2o.import_file(DATA_URL + "/credit_card_train.csv")
df_test = h2o.import_file(DATA_URL + "/credit_card_test.csv")
Parse progress: |█████████████████████████████████████████████████████████| 100%
Parse progress: |█████████████████████████████████████████████████████████| 100%
In [4]:
# Look at datasets
# summary() prints, for each column of an H2OFrame: type, min/mean/max/sigma,
# zero and missing counts, plus a preview of the first rows (both tables are
# shown in the output below; note df_test has no DEFAULT_PAYMENT_NEXT_MONTH).
df_train.summary()
df_test.summary()
LIMIT_BAL SEX EDUCATION MARRIAGE AGE PAY_0 PAY_2 PAY_3 PAY_4 PAY_5 PAY_6 BILL_AMT1 BILL_AMT2 BILL_AMT3 BILL_AMT4 BILL_AMT5 BILL_AMT6 PAY_AMT1 PAY_AMT2 PAY_AMT3 PAY_AMT4 PAY_AMT5 PAY_AMT6 DEFAULT_PAYMENT_NEXT_MONTH
type int enum int int int int int int int int int int int int int int int int int int int int int enum
mins 10000.0 0.0 0.0 21.0 -2.0 -2.0 -2.0 -2.0 -2.0 -2.0 -165580.0 -69777.0 -157264.0 -170000.0 -81334.0 -339603.0 0.0 0.0 0.0 0.0 0.0 0.0
mean 165471.4666666669 1.8499999999999963 1.5557870370370357 35.40532407407389 -0.005231481481481603 -0.12236111111111114 -0.15537037037037077 -0.21060185185185143 -0.2499537037037024 -0.2780555555555528 50566.44999999981 48656.51504629656 46412.06685185198 42411.40537037039 40025.84819444426 38615.21180555545 5591.2800000000325 5827.203842592559 4991.376666666621 4750.457175925917 4797.252500000018 5142.894444444431
maxs 1000000.0 6.0 3.0 79.0 8.0 8.0 8.0 8.0 8.0 8.0 964511.0 983931.0 1664089.0 891586.0 927171.0 961664.0 505000.0 1684259.0 896040.0 497000.0 417990.0 527143.0
sigma 128853.31483927186 0.7795596962780423 0.5225050784764462 9.276754216409934 1.1266896421092234 1.200868545026891 1.2072703090074455 1.1721763980004694 1.1449731299849761 1.163634002402579 72759.41776538265 70553.01425903433 68567.17066062194 63313.397096201465 60523.19322690942 59526.30948474179 15306.329399410382 21146.621108754254 16320.311346237959 15125.600528699153 15201.573687145492 17229.362092294108
zeros 0 9 37 0 10563 11284 11309 11905 12148 11548 1439 1848 2093 2284 2557 2931 3850 3932 4351 4709 4887 5291
missing 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
0 20000.0 Female 2.0 1.0 24.0 2.0 2.0 -1.0 -1.0 -2.0 -2.0 3913.0 3102.0 689.0 0.0 0.0 0.0 0.0 689.0 0.0 0.0 0.0 0.0 Yes
1 120000.0 Female 2.0 2.0 26.0 -1.0 2.0 0.0 0.0 0.0 2.0 2682.0 1725.0 2682.0 3272.0 3455.0 3261.0 0.0 1000.0 1000.0 1000.0 0.0 2000.0 Yes
2 90000.0 Female 2.0 2.0 34.0 0.0 0.0 0.0 0.0 0.0 0.0 29239.0 14027.0 13559.0 14331.0 14948.0 15549.0 1518.0 1500.0 1000.0 1000.0 1000.0 5000.0 No
3 50000.0 Female 2.0 1.0 37.0 0.0 0.0 0.0 0.0 0.0 0.0 46990.0 48233.0 49291.0 28314.0 28959.0 29547.0 2000.0 2019.0 1200.0 1100.0 1069.0 1000.0 No
4 50000.0 Male 2.0 1.0 57.0 -1.0 0.0 -1.0 0.0 0.0 0.0 8617.0 5670.0 35835.0 20940.0 19146.0 19131.0 2000.0 36681.0 10000.0 9000.0 689.0 679.0 No
5 100000.0 Female 2.0 2.0 23.0 0.0 -1.0 -1.0 0.0 0.0 -1.0 11876.0 380.0 601.0 221.0 -159.0 567.0 380.0 601.0 0.0 581.0 1687.0 1542.0 No
6 140000.0 Female 3.0 1.0 28.0 0.0 0.0 2.0 0.0 0.0 0.0 11285.0 14096.0 12108.0 12211.0 11793.0 3719.0 3329.0 0.0 432.0 1000.0 1000.0 1000.0 No
7 20000.0 Male 3.0 2.0 35.0 -2.0 -2.0 -2.0 -2.0 -1.0 -1.0 0.0 0.0 0.0 0.0 13007.0 13912.0 0.0 0.0 0.0 13007.0 1122.0 0.0 No
8 200000.0 Female 3.0 2.0 34.0 0.0 0.0 2.0 0.0 0.0 -1.0 11073.0 9787.0 5535.0 2513.0 1828.0 3731.0 2306.0 12.0 50.0 300.0 3738.0 66.0 No
9 630000.0 Female 2.0 2.0 41.0 -1.0 0.0 -1.0 -1.0 -1.0 -1.0 12137.0 6500.0 6500.0 6500.0 6500.0 2870.0 1000.0 6500.0 6500.0 6500.0 2870.0 0.0 No
LIMIT_BAL SEX EDUCATION MARRIAGE AGE PAY_0 PAY_2 PAY_3 PAY_4 PAY_5 PAY_6 BILL_AMT1 BILL_AMT2 BILL_AMT3 BILL_AMT4 BILL_AMT5 BILL_AMT6 PAY_AMT1 PAY_AMT2 PAY_AMT3 PAY_AMT4 PAY_AMT5 PAY_AMT6
type int enum int int int int int int int int int int int int int int int int int int int int int
mins 10000.0 0.0 0.0 21.0 -2.0 -2.0 -2.0 -2.0 -2.0 -2.0 -4370.0 -22960.0 -20320.0 -20320.0 -23003.0 -51443.0 0.0 0.0 0.0 0.0 0.0 0.0
mean 165744.06002501058 1.8245102125885804 1.570237598999581 35.16048353480616 0.015839933305544 -0.13338891204668596 -0.1492288453522303 -0.2213422259274701 -0.2792830345977483 -0.2776156731971656 50891.35848270105 48571.807002918 45980.2717799083 41995.92413505636 39790.609003751546 38115.572738641225 5109.279699874966 5710.406836181736 4771.841600666943 4682.428511879945 4661.113380575236 5609.861192163388
maxs 760000.0 6.0 3.0 75.0 8.0 7.0 7.0 7.0 6.0 6.0 581775.0 572677.0 565550.0 572805.0 823540.0 501370.0 235728.0 361560.0 221876.0 158556.0 326889.0 528666.0
sigma 131629.37146225898 0.7837802153537031 0.5181907659944099 9.218486291552852 1.0938828475326325 1.1982952076029703 1.1749711322139944 1.1150399668262556 1.0624535248275457 1.1082200941910225 71673.63487504808 68667.38319694893 64757.208253239805 60861.27653361504 58735.62514893364 55726.16980178843 12727.416509438242 17343.146307723284 13915.078722281569 12497.385760182793 15882.345076418098 20899.679013349487
zeros 0 2 4 0 1175 1259 1267 1345 1372 1328 168 201 232 263 283 327 420 430 502 491 520 555
missing 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
0 50000.0 Male 1.0 2.0 37.0 0.0 0.0 0.0 0.0 0.0 0.0 64400.0 57069.0 57608.0 19394.0 19619.0 20024.0 2500.0 1815.0 657.0 1000.0 1000.0 800.0
1 500000.0 Male 1.0 2.0 29.0 0.0 0.0 0.0 0.0 0.0 0.0 367965.0 412023.0 445007.0 542653.0 483003.0 473944.0 55000.0 40000.0 38000.0 20239.0 13750.0 13770.0
2 260000.0 Female 1.0 2.0 51.0 -1.0 -1.0 -1.0 -1.0 -1.0 2.0 12261.0 21670.0 9966.0 8517.0 22287.0 13668.0 21818.0 9966.0 8583.0 22301.0 0.0 3640.0
3 50000.0 Male 2.0 2.0 33.0 2.0 0.0 0.0 0.0 0.0 0.0 30518.0 29618.0 22102.0 22734.0 23217.0 23680.0 1718.0 1500.0 1000.0 1000.0 1000.0 716.0
4 150000.0 Female 5.0 2.0 46.0 0.0 0.0 -1.0 0.0 0.0 -2.0 4463.0 3034.0 1170.0 1170.0 0.0 0.0 1013.0 1170.0 0.0 0.0 0.0 0.0
5 20000.0 Male 1.0 2.0 24.0 0.0 0.0 0.0 0.0 0.0 0.0 17447.0 18479.0 19476.0 19865.0 20480.0 20063.0 1318.0 1315.0 704.0 928.0 912.0 1069.0
6 130000.0 Female 2.0 1.0 51.0 -1.0 -1.0 -2.0 -2.0 -1.0 -1.0 99.0 0.0 0.0 0.0 2353.0 0.0 0.0 0.0 0.0 2353.0 0.0 0.0
7 320000.0 Male 2.0 2.0 29.0 2.0 2.0 2.0 2.0 2.0 2.0 58267.0 59246.0 60184.0 58622.0 62307.0 63526.0 2500.0 2500.0 0.0 4800.0 2400.0 1600.0
8 50000.0 Male 3.0 2.0 25.0 -1.0 0.0 0.0 0.0 0.0 0.0 42838.0 37225.0 36087.0 9636.0 9590.0 10030.0 1759.0 1779.0 320.0 500.0 1000.0 1000.0
9 130000.0 Female 1.0 1.0 35.0 0.0 0.0 0.0 -1.0 -1.0 -1.0 81313.0 117866.0 17740.0 1330.0 7095.0 1190.0 40000.0 5000.0 1330.0 7095.0 1190.0 2090.0
In [5]:
# Define the target and the feature columns used for modelling.
# Derive features from the TRAINING frame and exclude the target explicitly,
# rather than relying on the test frame happening to lack the target column
# (the original `list(df_test.columns)` silently depended on that). The
# resulting lists are identical, but this version stays correct even if the
# test set ever gains labels.
target = "DEFAULT_PAYMENT_NEXT_MONTH"
features = [col for col in df_train.columns if col != target]
In [6]:
# Fit a Gradient Boosting Machine with default hyper-parameters.
# A fixed seed makes row/column sampling reproducible across runs.
from h2o.estimators.gbm import H2OGradientBoostingEstimator

model_gbm = H2OGradientBoostingEstimator(seed=1234)
model_gbm.train(x=features, y=target, training_frame=df_train)

# Print the model summary (training metrics, confusion matrix,
# gains/lift table, scoring history and variable importances).
print(model_gbm)
gbm Model Build progress: |███████████████████████████████████████████████| 100%
Model Details
=============
H2OGradientBoostingEstimator : Gradient Boosting Machine
Model Key: GBM_model_python_1504729446296_1
ModelMetricsBinomial: gbm
** Reported on train data. **
MSE: 0.12702139321348918
RMSE: 0.35640060776251375
LogLoss: 0.40828487643875344
Mean Per-Class Error: 0.26254617593150686
AUC: 0.8120294350665728
Gini: 0.6240588701331455
Confusion Matrix (Act/Pred) for max f1 @ threshold = 0.24157847787281111:
No
Yes
Error
Rate
No
14020.0
2747.0
0.1638
(2747.0/16767.0)
Yes
1786.0
3047.0
0.3695
(1786.0/4833.0)
Total
15806.0
5794.0
0.2099
(4533.0/21600.0)
Maximum Metrics: Maximum metrics at their respective thresholds
metric
threshold
value
idx
max f1
0.2415785
0.5734450
234.0
max f2
0.1414777
0.6624305
306.0
max f0point5
0.4410041
0.6204729
148.0
max accuracy
0.4410041
0.8307870
148.0
max precision
0.8956553
1.0
0.0
max recall
0.0405048
1.0
394.0
max specificity
0.8956553
1.0
0.0
max absolute_mcc
0.3760018
0.4566250
172.0
max min_per_class_accuracy
0.1878690
0.7279123
269.0
max mean_per_class_accuracy
0.2229026
0.7374538
245.0
Gains/Lift Table: Avg response rate: 22.38 %
group
cumulative_data_fraction
lower_threshold
lift
cumulative_lift
response_rate
cumulative_response_rate
capture_rate
cumulative_capture_rate
gain
cumulative_gain
1
0.01
0.7940448
4.2830540
4.2830540
0.9583333
0.9583333
0.0428305
0.0428305
328.3054004
328.3054004
2
0.02
0.7652384
4.0140699
4.1485620
0.8981481
0.9282407
0.0401407
0.0829712
301.4069936
314.8561970
3
0.03
0.7436861
3.6416305
3.9795848
0.8148148
0.8904321
0.0364163
0.1193875
264.1630457
297.9584799
4
0.04
0.7264950
3.6209394
3.8899234
0.8101852
0.8703704
0.0362094
0.1555969
262.0939375
288.9923443
5
0.05
0.7063101
3.3933375
3.7906062
0.7592593
0.8481481
0.0339334
0.1895303
239.3337472
279.0606249
6
0.1
0.5883743
3.0002069
3.3954066
0.6712963
0.7597222
0.1500103
0.3395407
200.0206911
239.5406580
7
0.15
0.4069486
2.2636044
3.0181392
0.5064815
0.6753086
0.1131802
0.4527209
126.3604387
201.8139182
8
0.2
0.3143049
1.7256363
2.6950134
0.3861111
0.6030093
0.0862818
0.5390027
72.5636251
169.5013449
9
0.3
0.2198094
1.2911235
2.2270501
0.2888889
0.4983025
0.1291124
0.6681150
29.1123526
122.7050141
10
0.4000463
0.1759793
0.8293284
1.8774984
0.1855622
0.4200903
0.0829712
0.7510863
-17.0671554
87.7498401
11
0.5
0.1491582
0.7452240
1.6511484
0.1667439
0.3694444
0.0744879
0.8255742
-25.4776032
65.1148355
12
0.6
0.1268450
0.5731430
1.4714808
0.1282407
0.3292438
0.0573143
0.8828885
-42.6857025
47.1480792
13
0.7
0.1057468
0.4800331
1.3298454
0.1074074
0.2975529
0.0480033
0.9308918
-51.9966894
32.9845408
14
0.8
0.0878685
0.3517484
1.2075833
0.0787037
0.2701968
0.0351748
0.9660666
-64.8251604
20.7583282
15
0.9
0.0720348
0.2255328
1.0984666
0.0504630
0.2457819
0.0225533
0.9886199
-77.4467205
9.8466561
16
1.0
0.0269546
0.1138010
1.0
0.0254630
0.22375
0.0113801
1.0
-88.6199048
0.0
Scoring History:
timestamp
duration
number_of_trees
training_rmse
training_logloss
training_auc
training_lift
training_classification_error
2017-09-06 21:24:15
0.090 sec
0.0
0.4167564
0.5316134
0.5
1.0
0.77625
2017-09-06 21:24:16
0.479 sec
1.0
0.4077295
0.5112276
0.7619704
3.4901396
0.1968981
2017-09-06 21:24:16
0.627 sec
2.0
0.4004482
0.4961908
0.7703013
3.4843390
0.2026389
2017-09-06 21:24:16
0.709 sec
3.0
0.3945478
0.4846704
0.7727510
3.4862484
0.2039815
2017-09-06 21:24:16
0.788 sec
4.0
0.3895914
0.4752930
0.7772916
3.7880833
0.2036111
---
---
---
---
---
---
---
---
---
2017-09-06 21:24:18
3.085 sec
21.0
0.3638493
0.4255805
0.7913738
4.2839122
0.2024537
2017-09-06 21:24:18
3.247 sec
22.0
0.3634567
0.4247080
0.7920916
4.2633164
0.2012037
2017-09-06 21:24:19
3.429 sec
23.0
0.3630635
0.4237819
0.7930764
4.2633164
0.2006481
2017-09-06 21:24:19
3.695 sec
24.0
0.3627059
0.4229763
0.7937797
4.2416718
0.2006481
2017-09-06 21:24:21
5.936 sec
50.0
0.3564006
0.4082849
0.8120294
4.2830540
0.2098611
See the whole table with table.as_data_frame()
Variable Importances:
variable
relative_importance
scaled_importance
percentage
PAY_0
2932.4401855
1.0
0.5636116
PAY_2
531.0946045
0.1811101
0.1020758
LIMIT_BAL
208.8082428
0.0712063
0.0401327
BILL_AMT1
196.2221069
0.0669143
0.0377137
PAY_3
187.8168335
0.0640480
0.0360982
---
---
---
---
BILL_AMT4
40.7689896
0.0139028
0.0078358
BILL_AMT3
32.8795280
0.0112123
0.0063194
BILL_AMT5
31.7188263
0.0108165
0.0060963
MARRIAGE
29.6317406
0.0101048
0.0056952
SEX
20.2798748
0.0069157
0.0038978
See the whole table with table.as_data_frame()
In [7]:
# Score the held-out test frame with the trained GBM and preview
# the first few predictions (class label + per-class probabilities).
yhat_test = model_gbm.predict(df_test)
yhat_test.head(rows=5)
gbm prediction progress: |████████████████████████████████████████████████| 100%
predict No Yes
No 0.856357 0.143643
No 0.916529 0.0834714
No 0.92938 0.0706196
Yes 0.38585 0.61415
No 0.92741 0.07259
Out[7]:
In [8]:
# (Extra) Let H2O AutoML search over algorithms and hyper-parameters,
# capped at 60 seconds of total runtime; seed fixed for reproducibility.
from h2o.automl import H2OAutoML

aml = H2OAutoML(max_runtime_secs=60, seed=1234)
aml.train(x=features, y=target, training_frame=df_train)
AutoML progress: |████████████████████████████████████████████████████████| 100%
Parse progress: |█████████████████████████████████████████████████████████| 100%
In [9]:
# Print leaderboard
# One row per model AutoML built, ranked best-first by the default metric
# (AUC here, descending — see the output below).
print(aml.leaderboard)
model_id auc logloss
StackedEnsemble_0_AutoML_20170906_212422 0.773794 0.438848
GBM_grid_0_AutoML_20170906_212422_model_0 0.771119 0.441382
DRF_0_AutoML_20170906_212422 0.763554 0.449797
XRT_0_AutoML_20170906_212422 0.759662 0.448924
GLM_grid_0_AutoML_20170906_212422_model_0 0.71226 0.474675
GLM_grid_0_AutoML_20170906_212422_model_1 0.71226 0.474675
In [10]:
# Score the test frame with the top-ranked model from the AutoML
# leaderboard and preview the first few predictions.
best_model = aml.leader
yhat_test = best_model.predict(df_test)
yhat_test.head(rows=5)
stackedensemble prediction progress: |████████████████████████████████████| 100%
predict No Yes
No 0.849841 0.150159
No 0.884443 0.115557
No 0.893877 0.106123
Yes 0.307224 0.692776
No 0.868906 0.131094
Out[10]:
Content source: woobe/h2o_tutorials
Similar notebooks: