License

Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.



In [1]:

    
# imports
import h2o 
import numpy as np
import pandas as pd
from h2o.estimators.gbm import H2OGradientBoostingEstimator 
from h2o.estimators.random_forest import H2ORandomForestEstimator
from h2o.grid.grid_search import H2OGridSearch



In [2]:

    
# display matplotlib graphics in notebook
%matplotlib inline



In [3]:

    
# start and connect to h2o server
h2o.init()









    



Checking whether there is an H2O instance running at http://localhost:54321..... not found.
Attempting to start a local H2O server...
  Java Version: java version "1.8.0_112"; Java(TM) SE Runtime Environment (build 1.8.0_112-b16); Java HotSpot(TM) 64-Bit Server VM (build 25.112-b16, mixed mode)
  Starting server from /Users/phall/anaconda/lib/python3.5/site-packages/h2o/backend/bin/h2o.jar
  Ice root: /var/folders/tc/0ss1l73113j3wdyjsxmy1j2r0000gn/T/tmp5g1s2ls0
  JVM stdout: /var/folders/tc/0ss1l73113j3wdyjsxmy1j2r0000gn/T/tmp5g1s2ls0/h2o_phall_started_from_python.out
  JVM stderr: /var/folders/tc/0ss1l73113j3wdyjsxmy1j2r0000gn/T/tmp5g1s2ls0/h2o_phall_started_from_python.err
  Server is running at http://127.0.0.1:54321
Connecting to H2O server at http://127.0.0.1:54321... successful.






    




H2O cluster uptime:
03 secs
H2O cluster version:
3.10.3.4
H2O cluster version age:
1 month 
H2O cluster name:
H2O_from_python_phall_zsyxfd
H2O cluster total nodes:
1
H2O cluster free memory:
3.556 Gb
H2O cluster total cores:
8
H2O cluster allowed cores:
8
H2O cluster status:
accepting new members, healthy
H2O connection url:
http://127.0.0.1:54321
H2O connection proxy:
None
Python version:
3.5.2 final



In [4]:

    
# location of "dirty" file
# decision trees handle dirty data elegantly
path = '/Users/phall/workspace/GWU_data_mining/02_analytical_data_prep/data/loan.csv'



In [5]:

    
# define input variable measurement levels 
# strings automatically parsed as enums (nominal)
# numbers automatically parsed as numeric
col_types = {'bad_loan': 'enum'}



In [6]:

    
frame = h2o.import_file(path=path, col_types=col_types) # multi-threaded import









    



Parse progress: |█████████████████████████████████████████████████████████| 100%



In [7]:

    
frame.describe()









    



Rows:163987
Cols:16








    






       id               bad_loan  loan_amnt         int_rate          emp_length       home_ownership  annual_inc        purpose           addr_state  dti               delinq_2yrs        revol_util        total_acc         longest_credit_length  verification_status  term_length       


type   int              enum      int               real              int              enum            real              enum              enum        real              int                real              int               int                    enum                 int               
mins   10001.0                    500.0             5.42              0.0                              1896.0                                          0.0               0.0                0.0               1.0               0.0                                         36.0              
mean   91994.0                    13073.209220415742 13.717143207254315 5.686200649105202                 71931.19588595249                               15.880794152061497 0.22746713629788037 54.07622244747627 24.577910646698616 14.8582097058084                            40.980679245283056
maxs   173987.0                   35000.0           26.060000000000002 10.0                             7141778.0                                       39.93             29.0               150.70000000000002 118.0             65.0                                        60.0              
sigma  47339.11363414683           7992.3993793601785 4.3935679462170425 3.610039811481059                 59464.026648950334                               7.583636421364416 0.6949139713078192 25.284135504932134 11.685003948632696 6.949793041523766                           9.732920010298912 
zeros  0                          0                 0                 13810                            0                                               263               135210             1515              0                 11                                          0                 
missing 0                0         4992              4854              10545            2571            4983              2488              2484        5025              4997               5154              4933              4907                   2426                 4987              
0      10001.0          0         5000.0            10.65             10.0             RENT            24000.0           credit_card       AZ          27.650000000000002 0.0                83.7              9.0               26.0                   verified             36.0              
1      10002.0          1         2500.0            15.27             0.0              RENT            30000.0           car               GA          1.0               0.0                9.4               4.0               12.0                   verified             60.0              
2      10003.0          0         2400.0            15.96             10.0             RENT            12252.0           small_business    IL          8.72              0.0                98.5              10.0              10.0                   not verified         36.0              
3      10004.0          0         10000.0           13.49             10.0             RENT            nan               other             CA          20.0              0.0                nan               37.0              15.0                   verified             36.0              
4      10005.0          0         5000.0            7.9               3.0              RENT            36000.0           wedding           AZ          11.200000000000001 0.0                28.3              12.0              nan                    verified             36.0              
5      10006.0          0         3000.0            18.64             9.0              RENT            48000.0           car               CA          5.3500000000000005 0.0                87.5              4.0               4.0                    verified             36.0              
6      10007.0          1         5600.0            21.28             4.0              OWN             40000.0           small_business    CA          5.55              0.0                32.6              13.0              7.0                    verified             60.0              
7      10008.0          1         5375.0            12.69             0.0              RENT            15000.0           other             TX          18.080000000000002 0.0                36.5              3.0               7.0                    verified             60.0              
8      10009.0          0         6500.0            14.65             5.0              OWN             72000.0           debt_consolidation AZ          16.12             0.0                20.6              23.0              13.0                   not verified         60.0              
9      10010.0          0         12000.0           12.69             10.0             OWN             75000.0           debt_consolidation CA          10.78             0.0                67.1              34.0              22.0                   verified             36.0



In [8]:

    
# correct MORTGAGE/mortgage problem using gsub() and trim() functions
print(frame['home_ownership'].table())

frame['home_ownership'] = frame['home_ownership'].gsub(pattern='mortgage',
                                                       replacement='MORTGAGE')
frame['home_ownership'] = frame['home_ownership'].trim()

print(frame['home_ownership'].table())









    






home_ownership    Count


ANY                   1
MORTGAGE          74209
NONE                 30
OTHER               151
OWN               13369
RENT              69416
mortgage           4240








    









    






home_ownership    Count


ANY                   1
MORTGAGE          78449
NONE                 30
OTHER               151
OWN               13369
RENT              69416



In [9]:

    
# split into 40% training, 30% validation, and 30% test
train, valid, test = frame.split_frame([0.4, 0.3])



In [10]:

    
# assign target and inputs
y = 'bad_loan'
X = [name for name in frame.columns if name not in ['id', '_WARN_', y]]
print(y)
print(X)









    



bad_loan
['loan_amnt', 'int_rate', 'emp_length', 'home_ownership', 'annual_inc', 'purpose', 'addr_state', 'dti', 'delinq_2yrs', 'revol_util', 'total_acc', 'longest_credit_length', 'verification_status', 'term_length']



In [11]:

    
# set target to factor - for binary classification
train[y] = train[y].asfactor()
valid[y] = valid[y].asfactor()
test[y] = test[y].asfactor()



In [12]:

    
# random forest

# initialize rf model
rf_model = H2ORandomForestEstimator(
    ntrees=500,                      # Up to 500 decision trees in the forest 
    max_depth=30,                    # trees can grow to depth of 30
    stopping_rounds=5,               # stop after validation error does not decrease for 5 iterations/new trees
    score_each_iteration=True,       # score validation error on every iteration/new tree
    model_id='rf_model')             # for easy lookup in flow

# train rf model
rf_model.train(
    x=X,
    y=y,
    training_frame=train,
    validation_frame=valid)

# print model information
rf_model

# view detailed results at http://localhost:54321/flow/index.html









    



drf Model Build progress: |███████████████████████████████████████████████| 100%
Model Details
=============
H2ORandomForestEstimator :  Distributed Random Forest
Model Key:  rf_model


ModelMetricsBinomial: drf
** Reported on train data. **

MSE: 0.1489042863556249
RMSE: 0.3858811816552148
LogLoss: 0.5180827157657203
Mean Per-Class Error: 0.3884354978859006
AUC: 0.6524497477686478
Gini: 0.3048994955372957
Confusion Matrix (Act/Pred) for max f1 @ threshold = 0.18628626443066934: 






    





0
1
Error
Rate
0
30044.0
23039.0
0.434
 (23039.0/53083.0)
1
4286.0
8215.0
0.3429
 (4286.0/12501.0)
Total
34330.0
31254.0
0.4166
 (27325.0/65584.0)






    



Maximum Metrics: Maximum metrics at their respective thresholds







    




metric
threshold
value
idx
max f1
0.1862863
0.3754999
258.0
max f2
0.0833365
0.5525161
342.0
max f0point5
0.3056381
0.3285712
168.0
max accuracy
0.6294603
0.8096030
25.0
max precision
0.6402050
0.5441176
22.0
max recall
0.0000035
1.0
399.0
max specificity
0.875
0.9999812
0.0
max absolute_mcc
0.2442978
0.1786798
211.0
max min_per_class_accuracy
0.2013003
0.6060315
247.0
max mean_per_class_accuracy
0.1862863
0.6115645
258.0






    



Gains/Lift Table: Avg response rate: 19.06 %







    





group
cumulative_data_fraction
lower_threshold
lift
cumulative_lift
response_rate
cumulative_response_rate
capture_rate
cumulative_capture_rate
gain
cumulative_gain

1
0.0100024
0.8226434
5.2463003
5.2463003
1.0
1.0
0.0524758
0.0524758
424.6300296
424.6300296

2
0.0200049
0.7987747
5.2463003
5.2463003
1.0
1.0
0.0524758
0.1049516
424.6300296
424.6300296

3
0.0300073
0.7860458
5.2463003
5.2463003
1.0
1.0
0.0524758
0.1574274
424.6300296
424.6300296

4
0.0403147
0.7696629
5.2463003
5.2463003
1.0
1.0
0.0540757
0.2115031
424.6300296
424.6300296

5
0.0500732
0.7584270
5.2463003
5.2463003
1.0
1.0
0.0511959
0.2626990
424.6300296
424.6300296

6
0.1045072
0.7078652
5.2463003
5.2463003
1.0
1.0
0.2855772
0.5482761
424.6300296
424.6300296

7
0.1500061
0.6570082
5.2463003
5.2463003
1.0
1.0
0.2387009
0.7869770
424.6300296
424.6300296

8
0.2019395
0.2022472
4.1018490
4.9519783
0.7818555
0.9438991
0.2130230
1.0
310.1848998
395.1978254

9
0.3141162
0.1235955
0.0
3.1835348
0.0
0.6068152
0.0
1.0
-100.0
218.3534780

10
0.4004940
0.1011236
0.0
2.4969162
0.0
0.4759385
0.0
1.0
-100.0
149.6916165

11
0.5261192
0.0786517
0.0
1.9007100
0.0
0.3622953
0.0
1.0
-100.0
90.0710042

12
0.6011832
0.0674157
0.0
1.6633864
0.0
0.3170589
0.0
1.0
-100.0
66.3386426

13
0.7000030
0.0516497
0.0
1.4285652
0.0
0.2722995
0.0
1.0
-100.0
42.8565205

14
0.8015522
0.0374532
0.0
1.2475794
0.0
0.2378017
0.0
1.0
-100.0
24.7579372

15
0.9233350
0.0224719
0.0
1.0830306
0.0
0.2064370
0.0
1.0
-100.0
8.3030583

16
1.0
0.0
0.0
1.0
0.0
0.1906105
0.0
1.0
-100.0
0.0






    




ModelMetricsBinomial: drf
** Reported on validation data. **

MSE: 0.14777166398397243
RMSE: 0.38441080107610454
LogLoss: 0.4665519053428598
Mean Per-Class Error: 0.37716586320128553
AUC: 0.6683742817869399
Gini: 0.33674856357387983
Confusion Matrix (Act/Pred) for max f1 @ threshold = 0.19475064146786594: 






    





0
1
Error
Rate
0
23607.0
16029.0
0.4044
 (16029.0/39636.0)
1
3339.0
6203.0
0.3499
 (3339.0/9542.0)
Total
26946.0
22232.0
0.3938
 (19368.0/49178.0)






    



Maximum Metrics: Maximum metrics at their respective thresholds







    




metric
threshold
value
idx
max f1
0.1947506
0.3904450
232.0
max f2
0.1113515
0.5613968
314.0
max f0point5
0.3059112
0.3536758
145.0
max accuracy
0.7021051
0.8060718
2.0
max precision
0.7640449
1.0
0.0
max recall
0.0000496
1.0
399.0
max specificity
0.7640449
1.0
0.0
max absolute_mcc
0.2303148
0.1998727
202.0
max min_per_class_accuracy
0.2032390
0.6158038
224.0
max mean_per_class_accuracy
0.1947506
0.6228341
232.0






    



Gains/Lift Table: Avg response rate: 19.40 %







    





group
cumulative_data_fraction
lower_threshold
lift
cumulative_lift
response_rate
cumulative_response_rate
capture_rate
cumulative_capture_rate
gain
cumulative_gain

1
0.0112855
0.5168539
2.4144144
2.4144144
0.4684685
0.4684685
0.0272480
0.0272480
141.4414414
141.4414414

2
0.0200089
0.4791547
2.3426573
2.3831301
0.4545455
0.4623984
0.0204360
0.0476839
134.2657343
138.3130081

3
0.0328196
0.4494382
2.0369963
2.2480221
0.3952381
0.4361834
0.0260952
0.0737791
103.6996337
124.8022114

4
0.0400586
0.4325843
2.0847018
2.2185084
0.4044944
0.4304569
0.0150912
0.0888703
108.4701815
121.8508395

5
0.0524625
0.4157303
2.0699874
2.1833930
0.4016393
0.4236434
0.0256760
0.1145462
106.9987390
118.3392964

6
0.1045183
0.3595506
1.8159255
2.0003741
0.3523438
0.3881323
0.0945294
0.2090757
81.5925481
100.0374139

7
0.1500468
0.3230337
1.7102759
1.9123500
0.3318446
0.3710530
0.0778663
0.2869419
71.0275879
91.2350016

8
0.2060271
0.2921348
1.4639694
1.7905190
0.2840538
0.3474141
0.0819535
0.3688954
46.3969376
79.0518995

9
0.3079019
0.2471910
1.2416552
1.6089176
0.2409182
0.3121780
0.1264934
0.4953888
24.1655151
60.8917631

10
0.4016633
0.2134831
1.0741371
1.4840823
0.2084147
0.2879563
0.1007126
0.5961014
7.4137097
48.4082262

11
0.5130546
0.1797753
0.9906900
1.3769600
0.1922234
0.2671713
0.1103542
0.7064557
-0.9309967
37.6959967

12
0.6000041
0.1529599
0.8196014
1.2961905
0.1590271
0.2514996
0.0712639
0.7777196
-18.0398647
29.6190474

13
0.6999878
0.1237442
0.7274292
1.2149507
0.1411430
0.2357367
0.0727311
0.8504506
-27.2570830
21.4950661

14
0.7999919
0.0998383
0.6025745
1.1383998
0.1169174
0.2208835
0.0602599
0.9107105
-39.7425470
13.8399753

15
0.9065029
0.0674157
0.5057421
1.0640646
0.0981291
0.2064603
0.0538671
0.9645777
-49.4257938
6.4064603

16
1.0
0.0
0.3788604
1.0
0.0735102
0.1940299
0.0354223
1.0
-62.1139626
0.0






    



Scoring History: 






    





timestamp
duration
number_of_trees
training_rmse
training_logloss
training_auc
training_lift
training_classification_error
validation_rmse
validation_logloss
validation_auc
validation_lift
validation_classification_error

2017-03-03 20:28:27
 0.019 sec
0.0
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan

2017-03-03 20:28:28
 0.830 sec
1.0
0.5376840
9.8706286
0.5350765
3.7246827
0.8118898
0.5382119
9.8741835
0.5325894
1.2833840
0.8059701

2017-03-03 20:28:28
 1.424 sec
2.0
0.5213499
8.7506121
0.5397091
5.0012721
0.8122780
0.4686261
5.1233924
0.5528048
1.5258930
0.8059701

2017-03-03 20:28:29
 1.836 sec
3.0
0.5075518
7.7599338
0.5428677
5.2162156
0.8117777
0.4415145
3.4290300
0.5681898
1.8955085
0.4428403

2017-03-03 20:28:29
 2.173 sec
4.0
0.4948075
6.9156472
0.5463457
5.2397933
0.8101394
0.4270037
2.6534140
0.5804040
1.7463446
0.4851763
---
---
---
---
---
---
---
---
---
---
---
---
---
---

2017-03-03 20:30:43
 2 min 15.662 sec
85.0
0.3861816
0.5237025
0.6513079
5.2463003
0.4052055
0.3845064
0.4668363
0.6679149
2.4178538
0.4003416

2017-03-03 20:30:46
 2 min 18.822 sec
86.0
0.3861129
0.5220731
0.6515432
5.2463003
0.4176628
0.3844789
0.4667840
0.6681149
2.4302689
0.3714873

2017-03-03 20:30:49
 2 min 22.102 sec
87.0
0.3860695
0.5195898
0.6516770
5.2463003
0.4166413
0.3844831
0.4667076
0.6680309
2.4408015
0.3860873

2017-03-03 20:30:52
 2 min 25.561 sec
88.0
0.3859759
0.5192995
0.6519903
5.2463003
0.4173274
0.3844690
0.4666676
0.6681041
2.4223077
0.4132336

2017-03-03 20:30:56
 2 min 29.101 sec
89.0
0.3858812
0.5180827
0.6524497
5.2463003
0.4166413
0.3844108
0.4665519
0.6683743
2.4144144
0.3938346






    



See the whole table with table.as_data_frame()
Variable Importances: 






    




variable
relative_importance
scaled_importance
percentage
addr_state
98091.9843750
1.0
0.1760907
int_rate
63342.5156250
0.6457461
0.1137099
dti
53355.0156250
0.5439284
0.0957807
revol_util
51702.1562500
0.5270783
0.0928136
loan_amnt
47027.6718750
0.4794242
0.0844221
total_acc
45383.2265625
0.4626599
0.0814701
longest_credit_length
43434.3593750
0.4427921
0.0779716
annual_inc
41564.2109375
0.4237269
0.0746144
emp_length
35672.1015625
0.3636597
0.0640371
purpose
32235.9804688
0.3286301
0.0578687
home_ownership
14451.3808594
0.1473248
0.0259425
delinq_2yrs
12333.5097656
0.1257341
0.0221406
term_length
9890.6123047
0.1008300
0.0177552
verification_status
8568.9960938
0.0873567
0.0153827






    Out[12]:



In [13]:

    
# measure rf AUC
print(rf_model.auc(train=True))
print(rf_model.auc(valid=True))
print(rf_model.model_performance(test_data=test).auc())









    



0.6524497477686478
0.6683742817869399
0.668192405655755



In [14]:

    
# GBM with random hyperparameter search
# train many different GBM models with random hyperparameters
# and select best model based on validation error

# define random grid search parameters
hyper_parameters = {'ntrees':list(range(0, 500, 50)),
                    'max_depth':list(range(0, 20, 2)),
                    'sample_rate':[s/float(10) for s in range(1, 11)],
                    'col_sample_rate':[s/float(10) for s in range(1, 11)]}

# define search strategy
search_criteria = {'strategy':'RandomDiscrete',
                   'max_models':20,
                   'max_runtime_secs':600}

# initialize grid search
gsearch = H2OGridSearch(H2OGradientBoostingEstimator,
                        hyper_params=hyper_parameters,
                        search_criteria=search_criteria)

# execute training w/ grid search
gsearch.train(x=X,
              y=y,
              training_frame=train,
              validation_frame=valid)

# view detailed results at http://localhost:54321/flow/index.html









    



gbm Grid Build progress: |████████████████████████████████████████████████| 100%



In [15]:

    
# show grid search results
gsearch.show()

# select best model
gbm_model = gsearch.get_grid()[0]

# print model information
gbm_model









    



     col_sample_rate max_depth ntrees sample_rate  \
0                0.7         2    450         0.5   
1                0.3         4    450         1.0   
2                0.2         8    300         0.9   
3                0.7         4    400         0.3   
4                0.2         8    450         0.6   
5                0.4        10    100         0.9   
6                0.8         6    400         0.6   
7                0.4        10    450         0.9   
8                0.2        10      0         0.7   
9                0.9        12    100         0.7   
10               0.2        14    200         0.8   
11               0.2        12    350         0.6   
12               0.5        18     50         0.1   
13               0.9        14    100         1.0   
14               0.2        12    400         0.3   
15               0.2        14    300         0.2   
16               0.7        12    350         0.5   
17               0.9        14    300         1.0   
18               0.9        18    450         0.1   

                                                          model_ids  \
0    Grid_GBM_py_11_sid_a8d9_model_python_1488219708058_445_model_9   
1   Grid_GBM_py_11_sid_a8d9_model_python_1488219708058_445_model_13   
2   Grid_GBM_py_11_sid_a8d9_model_python_1488219708058_445_model_16   
3    Grid_GBM_py_11_sid_a8d9_model_python_1488219708058_445_model_2   
4   Grid_GBM_py_11_sid_a8d9_model_python_1488219708058_445_model_10   
5   Grid_GBM_py_11_sid_a8d9_model_python_1488219708058_445_model_11   
6    Grid_GBM_py_11_sid_a8d9_model_python_1488219708058_445_model_0   
7    Grid_GBM_py_11_sid_a8d9_model_python_1488219708058_445_model_4   
8    Grid_GBM_py_11_sid_a8d9_model_python_1488219708058_445_model_8   
9   Grid_GBM_py_11_sid_a8d9_model_python_1488219708058_445_model_17   
10   Grid_GBM_py_11_sid_a8d9_model_python_1488219708058_445_model_7   
11   Grid_GBM_py_11_sid_a8d9_model_python_1488219708058_445_model_3   
12  Grid_GBM_py_11_sid_a8d9_model_python_1488219708058_445_model_19   
13  Grid_GBM_py_11_sid_a8d9_model_python_1488219708058_445_model_15   
14   Grid_GBM_py_11_sid_a8d9_model_python_1488219708058_445_model_5   
15   Grid_GBM_py_11_sid_a8d9_model_python_1488219708058_445_model_1   
16  Grid_GBM_py_11_sid_a8d9_model_python_1488219708058_445_model_18   
17  Grid_GBM_py_11_sid_a8d9_model_python_1488219708058_445_model_14   
18  Grid_GBM_py_11_sid_a8d9_model_python_1488219708058_445_model_12   

                logloss  
0    0.4559512893895453  
1    0.4562727884725051  
2    0.4614463446372809  
3   0.46462647035317917  
4   0.47144556802198706  
5   0.47211417794373167  
6    0.4740696699423142  
7    0.4794322361742395  
8    0.4913730519047276  
9    0.4987913315785933  
10   0.5044728833438659  
11    0.505131725225945  
12   0.5098834443023351  
13   0.5319261396763075  
14   0.5490384012904712  
15   0.5534834522702895  
16   0.6470349677925481  
17   0.8179871783126142  
18   0.8379398656052428  
Model Details
=============
H2OGradientBoostingEstimator :  Gradient Boosting Machine
Model Key:  Grid_GBM_py_11_sid_a8d9_model_python_1488219708058_445_model_9


ModelMetricsBinomial: gbm
** Reported on train data. **

MSE: 0.14070909049434702
RMSE: 0.37511210390274935
LogLoss: 0.4440167753159748
Mean Per-Class Error: 0.3439550239972745
AUC: 0.7130151031503764
Gini: 0.42603020630075283
Confusion Matrix (Act/Pred) for max f1 @ threshold = 0.2264380977076581: 






    





0
1
Error
Rate
0
39108.0
13824.0
0.2612
 (13824.0/52932.0)
1
5478.0
7171.0
0.4331
 (5478.0/12649.0)
Total
44586.0
20995.0
0.2943
 (19302.0/65581.0)






    



Maximum Metrics: Maximum metrics at their respective thresholds







    




metric
threshold
value
idx
max f1
0.2264381
0.4262870
214.0
max f2
0.1229990
0.5796178
310.0
max f0point5
0.3062214
0.3986584
150.0
max accuracy
0.4505209
0.8105549
68.0
max precision
0.8905700
1.0
0.0
max recall
0.0212808
1.0
397.0
max specificity
0.8905700
1.0
0.0
max absolute_mcc
0.2394033
0.2600720
203.0
max min_per_class_accuracy
0.1968022
0.6549913
240.0
max mean_per_class_accuracy
0.2006910
0.6560450
236.0






    



Gains/Lift Table: Avg response rate: 19.29 %







    





group
cumulative_data_fraction
lower_threshold
lift
cumulative_lift
response_rate
cumulative_response_rate
capture_rate
cumulative_capture_rate
gain
cumulative_gain

1
0.0100029
0.5216692
3.2878450
3.2878450
0.6341463
0.6341463
0.0328880
0.0328880
228.7844985
228.7844985

2
0.0200058
0.4785063
2.7425053
3.0151751
0.5289634
0.5815549
0.0274330
0.0603210
174.2505312
201.5175149

3
0.0300087
0.4506796
2.6160497
2.8821333
0.5045732
0.5558943
0.0261681
0.0864891
161.6049736
188.2133345

4
0.0400116
0.4264394
2.5370150
2.7958538
0.4893293
0.5392530
0.0253775
0.1118666
153.7015001
179.5853759

5
0.0500145
0.4088266
2.1734552
2.6713741
0.4192073
0.5152439
0.0217408
0.1336074
117.3455219
167.1374051

6
0.1000137
0.3492801
2.1108710
2.3911652
0.4071363
0.4611984
0.1055419
0.2391493
111.0870989
139.1165247

7
0.1500130
0.3096653
1.7519439
2.1781131
0.3379079
0.4201057
0.0875959
0.3267452
75.1943862
117.8113110

8
0.2000122
0.2798218
1.5795956
2.0284951
0.3046661
0.3912480
0.0789786
0.4057238
57.9595594
102.8495139

9
0.3000107
0.2338298
1.3740426
1.8103554
0.2650198
0.3491741
0.1374022
0.5431259
37.4042614
81.0355385

10
0.4000091
0.1978020
1.0862684
1.6293405
0.2095151
0.3142607
0.1086252
0.6517511
8.6268441
62.9340549

11
0.5000076
0.1683097
0.8562072
1.4747186
0.1651418
0.2844378
0.0856194
0.7373705
-14.3792779
47.4718599

12
0.6000061
0.1432828
0.8064002
1.3633350
0.1555352
0.2629546
0.0806388
0.8180093
-19.3599847
36.3335022

13
0.7000046
0.1200325
0.6522354
1.2617516
0.1258005
0.2433616
0.0652225
0.8832319
-34.7764582
26.1751577

14
0.8000030
0.0970259
0.5376001
1.1712344
0.1036901
0.2259030
0.0537592
0.9369911
-46.2399898
17.1234368

15
0.9000015
0.0716112
0.4158495
1.0873041
0.0802074
0.2097149
0.0415843
0.9785754
-58.4150509
8.7304137

16
1.0
0.0075980
0.2142495
1.0
0.0413236
0.1928760
0.0214246
1.0
-78.5750548
0.0






    




ModelMetricsBinomial: gbm
** Reported on validation data. **

MSE: 0.14468902453070914
RMSE: 0.3803801053297992
LogLoss: 0.4559512893895453
Mean Per-Class Error: 0.3659766565292937
AUC: 0.687276546121273
Gini: 0.37455309224254596
Confusion Matrix (Act/Pred) for max f1 @ threshold = 0.20744746822045296: 






    





0
1
Error
Rate
0
26838.0
12800.0
0.3229
 (12800.0/39638.0)
1
3892.0
5623.0
0.409
 (3892.0/9515.0)
Total
30730.0
18423.0
0.3396
 (16692.0/49153.0)






    



Maximum Metrics: Maximum metrics at their respective thresholds







    




metric
threshold
value
idx
max f1
0.2074475
0.4025342
226.0
max f2
0.1118529
0.5701701
320.0
max f0point5
0.2880076
0.3725038
161.0
max accuracy
0.5084437
0.8078245
45.0
max precision
0.8906333
1.0
0.0
max recall
0.0198176
1.0
397.0
max specificity
0.8906333
1.0
0.0
max absolute_mcc
0.2467105
0.2252884
193.0
max min_per_class_accuracy
0.1931899
0.6323730
238.0
max mean_per_class_accuracy
0.2060035
0.6340233
227.0






    



Gains/Lift Table: Avg response rate: 19.36 %







    





group
cumulative_data_fraction
lower_threshold
lift
cumulative_lift
response_rate
cumulative_response_rate
capture_rate
cumulative_capture_rate
gain
cumulative_gain

1
0.0100096
0.5176575
2.8244144
2.8244144
0.5467480
0.5467480
0.0282712
0.0282712
182.4414382
182.4414382

2
0.0200191
0.4753013
2.4254265
2.6249204
0.4695122
0.5081301
0.0242775
0.0525486
142.5426477
162.4920429

3
0.0300083
0.4458125
2.3461977
2.5321388
0.4541752
0.4901695
0.0234367
0.0759853
134.6197718
153.2138835

4
0.0400179
0.4243443
2.2784309
2.4686796
0.4410569
0.4778851
0.0228061
0.0987914
127.8430933
146.8679614

5
0.0500071
0.4071185
2.2515081
2.4252983
0.4358452
0.4694874
0.0224908
0.1212822
125.1508124
142.5298328

6
0.1000142
0.3487728
1.9503266
2.1878124
0.3775427
0.4235151
0.0975302
0.2188124
95.0326558
118.7812443

7
0.1500010
0.3098027
1.6714878
2.0157509
0.3235653
0.3902075
0.0835523
0.3023647
67.1487793
101.5750912

8
0.2000081
0.2791177
1.5615222
1.9021822
0.3022783
0.3682230
0.0780872
0.3804519
56.1522234
90.2182192

9
0.3000020
0.2334581
1.2654477
1.6899518
0.2449644
0.3271396
0.1265370
0.5069890
26.5447703
68.9951755

10
0.3999959
0.1985471
1.0731081
1.5357487
0.2077314
0.2972890
0.1073043
0.6142932
7.3108060
53.5748675

11
0.5000102
0.1694927
0.9625534
1.4210956
0.1863303
0.2750946
0.0962690
0.7105623
-3.7446591
42.1095629

12
0.6000041
0.1443919
0.8397780
1.3242159
0.1625636
0.2563407
0.0839727
0.7945349
-16.0221998
32.4215928

13
0.6999980
0.1207058
0.7041943
1.2356466
0.1363174
0.2391955
0.0704151
0.8649501
-29.5805680
23.5646561

14
0.7999919
0.0973341
0.5948865
1.1555556
0.1151577
0.2236916
0.0594850
0.9244351
-40.5113455
15.5555633

15
0.8999858
0.0714734
0.4708643
1.0794823
0.0911495
0.2089653
0.0470836
0.9715187
-52.9135738
7.9482253

16
1.0
0.0089131
0.2847729
1.0
0.0551261
0.1935792
0.0284813
1.0
-71.5227103
0.0






    



Scoring History: 






    





timestamp
duration
number_of_trees
training_rmse
training_logloss
training_auc
training_lift
training_classification_error
validation_rmse
validation_logloss
validation_auc
validation_lift
validation_classification_error

2017-02-27 13:27:21
 2 min  7.644 sec
0.0
0.3945565
0.4903664
0.5
1.0
0.8071240
0.3951035
0.4913731
0.5
1.0
0.8064208

2017-02-27 13:27:21
 2 min  7.677 sec
1.0
0.3929673
0.4863895
0.6329222
1.9691157
0.4259008
0.3935590
0.4875034
0.6295053
1.9523460
0.4287226

2017-02-27 13:27:21
 2 min  7.713 sec
2.0
0.3916405
0.4831560
0.6418231
1.9691157
0.4259008
0.3922361
0.4842771
0.6395990
1.9523460
0.3676276

2017-02-27 13:27:21
 2 min  7.759 sec
3.0
0.3905311
0.4804667
0.6521669
2.2899248
0.3842576
0.3911425
0.4816275
0.6507951
2.2156255
0.3819909

2017-02-27 13:27:21
 2 min  7.806 sec
4.0
0.3896000
0.4782076
0.6603972
2.5194389
0.3471127
0.3902049
0.4793561
0.6592532
2.3743628
0.3451264
---
---
---
---
---
---
---
---
---
---
---
---
---
---

2017-02-27 13:27:23
 2 min 10.457 sec
23.0
0.3825886
0.4613529
0.6795715
2.6792775
0.3575121
0.3834098
0.4630676
0.6763330
2.5680488
0.3404675

2017-02-27 13:27:24
 2 min 10.754 sec
24.0
0.3824463
0.4610017
0.6803146
2.6555671
0.3882679
0.3833040
0.4628041
0.6768144
2.6029444
0.3894778

2017-02-27 13:27:24
 2 min 11.044 sec
25.0
0.3823382
0.4606860
0.6806952
2.6310309
0.4075266
0.3832088
0.4625291
0.6770913
2.6502122
0.3907798

2017-02-27 13:27:24
 2 min 11.358 sec
26.0
0.3822191
0.4603409
0.6812002
2.6544313
0.3620103
0.3830831
0.4621609
0.6777876
2.6354201
0.3918581

2017-02-27 13:27:28
 2 min 14.646 sec
450.0
0.3751121
0.4440168
0.7130151
3.2878450
0.2943231
0.3803801
0.4559513
0.6872765
2.8244144
0.3395927






    



See the whole table with table.as_data_frame()
Variable Importances: 






    




variable
relative_importance
scaled_importance
percentage
int_rate
1484.6195068
1.0
0.3178513
addr_state
1155.0074463
0.7779821
0.2472827
term_length
775.4829102
0.5223446
0.1660279
annual_inc
357.4823608
0.2407906
0.0765356
dti
239.3826904
0.1612418
0.0512509
revol_util
176.2075195
0.1186887
0.0377254
purpose
168.4001465
0.1134298
0.0360538
loan_amnt
98.8574371
0.0665877
0.0211650
total_acc
82.5610504
0.0556109
0.0176760
emp_length
52.2061844
0.0351647
0.0111771
home_ownership
36.8080368
0.0247929
0.0078805
longest_credit_length
26.6282063
0.0179360
0.0057010
delinq_2yrs
9.8602972
0.0066416
0.0021111
verification_status
7.2944059
0.0049133
0.0015617






    Out[15]:



In [16]:

    
# measure gbm AUC
print(gbm_model.auc(train=True))
print(gbm_model.auc(valid=True))
print(gbm_model.model_performance(test_data=test).auc())









    



0.7130151031503764
0.687276546121273
0.6824627358010742



In [17]:

    
# partial dependence plots are a powerful machine learning interpretation tool
# to calculate partial dependence across the domain a variable
# hold column of interest at constant value
# find the mean prediction of the model with this column constant
# repeat for multiple values of the variable of interest
# h2o has a built-in function for partial dependence as well
par_dep_dti1 = gbm_model.partial_plot(data=train, cols=['dti'], server=True, plot=True)









    



PartialDependencePlot progress: |█████████████████████████████████████████| 100%



In [18]:

    
# shutdown h2o
h2o.cluster().shutdown(prompt=False)









    



H2O session _sid_a8d9 closed.

H2O cluster uptime:	03 secs
H2O cluster version:	3.10.3.4
H2O cluster version age:	1 month
H2O cluster name:	H2O_from_python_phall_zsyxfd
H2O cluster total nodes:	1
H2O cluster free memory:	3.556 Gb
H2O cluster total cores:	8
H2O cluster allowed cores:	8
H2O cluster status:	accepting new members, healthy
H2O connection url:	http://127.0.0.1:54321
H2O connection proxy:	None
Python version:	3.5.2 final

	id	bad_loan	loan_amnt	int_rate	emp_length	home_ownership	annual_inc	purpose	addr_state	dti	delinq_2yrs	revol_util	total_acc	longest_credit_length	verification_status	term_length
type	int	enum	int	real	int	enum	real	enum	enum	real	int	real	int	int	enum	int
mins	10001.0		500.0	5.42	0.0		1896.0			0.0	0.0	0.0	1.0	0.0		36.0
mean	91994.0		13073.209220415742	13.717143207254315	5.686200649105202		71931.19588595249			15.880794152061497	0.22746713629788037	54.07622244747627	24.577910646698616	14.8582097058084		40.980679245283056
maxs	173987.0		35000.0	26.060000000000002	10.0		7141778.0			39.93	29.0	150.70000000000002	118.0	65.0		60.0
sigma	47339.11363414683		7992.3993793601785	4.3935679462170425	3.610039811481059		59464.026648950334			7.583636421364416	0.6949139713078192	25.284135504932134	11.685003948632696	6.949793041523766		9.732920010298912
zeros	0		0	0	13810		0			263	135210	1515	0	11		0
missing	0	0	4992	4854	10545	2571	4983	2488	2484	5025	4997	5154	4933	4907	2426	4987
0	10001.0	0	5000.0	10.65	10.0	RENT	24000.0	credit_card	AZ	27.650000000000002	0.0	83.7	9.0	26.0	verified	36.0
1	10002.0	1	2500.0	15.27	0.0	RENT	30000.0	car	GA	1.0	0.0	9.4	4.0	12.0	verified	60.0
2	10003.0	0	2400.0	15.96	10.0	RENT	12252.0	small_business	IL	8.72	0.0	98.5	10.0	10.0	not verified	36.0
3	10004.0	0	10000.0	13.49	10.0	RENT	nan	other	CA	20.0	0.0	nan	37.0	15.0	verified	36.0
4	10005.0	0	5000.0	7.9	3.0	RENT	36000.0	wedding	AZ	11.200000000000001	0.0	28.3	12.0	nan	verified	36.0
5	10006.0	0	3000.0	18.64	9.0	RENT	48000.0	car	CA	5.3500000000000005	0.0	87.5	4.0	4.0	verified	36.0
6	10007.0	1	5600.0	21.28	4.0	OWN	40000.0	small_business	CA	5.55	0.0	32.6	13.0	7.0	verified	60.0
7	10008.0	1	5375.0	12.69	0.0	RENT	15000.0	other	TX	18.080000000000002	0.0	36.5	3.0	7.0	verified	60.0
8	10009.0	0	6500.0	14.65	5.0	OWN	72000.0	debt_consolidation	AZ	16.12	0.0	20.6	23.0	13.0	not verified	60.0
9	10010.0	0	12000.0	12.69	10.0	OWN	75000.0	debt_consolidation	CA	10.78	0.0	67.1	34.0	22.0	verified	36.0

home_ownership	Count
ANY	1
MORTGAGE	74209
NONE	30
OTHER	151
OWN	13369
RENT	69416
mortgage	4240

	0	1	Error	Rate
0	30044.0	23039.0	0.434	(23039.0/53083.0)
1	4286.0	8215.0	0.3429	(4286.0/12501.0)
Total	34330.0	31254.0	0.4166	(27325.0/65584.0)

metric	threshold	value	idx
max f1	0.1862863	0.3754999	258.0
max f2	0.0833365	0.5525161	342.0
max f0point5	0.3056381	0.3285712	168.0
max accuracy	0.6294603	0.8096030	25.0
max precision	0.6402050	0.5441176	22.0
max recall	0.0000035	1.0	399.0
max specificity	0.875	0.9999812	0.0
max absolute_mcc	0.2442978	0.1786798	211.0
max min_per_class_accuracy	0.2013003	0.6060315	247.0
max mean_per_class_accuracy	0.1862863	0.6115645	258.0

group	cumulative_data_fraction	lower_threshold	lift	cumulative_lift	response_rate	cumulative_response_rate	capture_rate	cumulative_capture_rate	gain	cumulative_gain
1	0.0100024	0.8226434	5.2463003	5.2463003	1.0	1.0	0.0524758	0.0524758	424.6300296	424.6300296
2	0.0200049	0.7987747	5.2463003	5.2463003	1.0	1.0	0.0524758	0.1049516	424.6300296	424.6300296
3	0.0300073	0.7860458	5.2463003	5.2463003	1.0	1.0	0.0524758	0.1574274	424.6300296	424.6300296
4	0.0403147	0.7696629	5.2463003	5.2463003	1.0	1.0	0.0540757	0.2115031	424.6300296	424.6300296
5	0.0500732	0.7584270	5.2463003	5.2463003	1.0	1.0	0.0511959	0.2626990	424.6300296	424.6300296
6	0.1045072	0.7078652	5.2463003	5.2463003	1.0	1.0	0.2855772	0.5482761	424.6300296	424.6300296
7	0.1500061	0.6570082	5.2463003	5.2463003	1.0	1.0	0.2387009	0.7869770	424.6300296	424.6300296
8	0.2019395	0.2022472	4.1018490	4.9519783	0.7818555	0.9438991	0.2130230	1.0	310.1848998	395.1978254
9	0.3141162	0.1235955	0.0	3.1835348	0.0	0.6068152	0.0	1.0	-100.0	218.3534780
10	0.4004940	0.1011236	0.0	2.4969162	0.0	0.4759385	0.0	1.0	-100.0	149.6916165
11	0.5261192	0.0786517	0.0	1.9007100	0.0	0.3622953	0.0	1.0	-100.0	90.0710042
12	0.6011832	0.0674157	0.0	1.6633864	0.0	0.3170589	0.0	1.0	-100.0	66.3386426
13	0.7000030	0.0516497	0.0	1.4285652	0.0	0.2722995	0.0	1.0	-100.0	42.8565205
14	0.8015522	0.0374532	0.0	1.2475794	0.0	0.2378017	0.0	1.0	-100.0	24.7579372
15	0.9233350	0.0224719	0.0	1.0830306	0.0	0.2064370	0.0	1.0	-100.0	8.3030583
16	1.0	0.0	0.0	1.0	0.0	0.1906105	0.0	1.0	-100.0	0.0

	0	1	Error	Rate
0	23607.0	16029.0	0.4044	(16029.0/39636.0)
1	3339.0	6203.0	0.3499	(3339.0/9542.0)
Total	26946.0	22232.0	0.3938	(19368.0/49178.0)

	timestamp	duration	number_of_trees	training_rmse	training_logloss	training_auc	training_lift	training_classification_error	validation_rmse	validation_logloss	validation_auc	validation_lift	validation_classification_error
	2017-03-03 20:28:27	0.019 sec	0.0	nan	nan	nan	nan	nan	nan	nan	nan	nan	nan
	2017-03-03 20:28:28	0.830 sec	1.0	0.5376840	9.8706286	0.5350765	3.7246827	0.8118898	0.5382119	9.8741835	0.5325894	1.2833840	0.8059701
	2017-03-03 20:28:28	1.424 sec	2.0	0.5213499	8.7506121	0.5397091	5.0012721	0.8122780	0.4686261	5.1233924	0.5528048	1.5258930	0.8059701
	2017-03-03 20:28:29	1.836 sec	3.0	0.5075518	7.7599338	0.5428677	5.2162156	0.8117777	0.4415145	3.4290300	0.5681898	1.8955085	0.4428403
	2017-03-03 20:28:29	2.173 sec	4.0	0.4948075	6.9156472	0.5463457	5.2397933	0.8101394	0.4270037	2.6534140	0.5804040	1.7463446	0.4851763
---	---	---	---	---	---	---	---	---	---	---	---	---	---
	2017-03-03 20:30:43	2 min 15.662 sec	85.0	0.3861816	0.5237025	0.6513079	5.2463003	0.4052055	0.3845064	0.4668363	0.6679149	2.4178538	0.4003416
	2017-03-03 20:30:46	2 min 18.822 sec	86.0	0.3861129	0.5220731	0.6515432	5.2463003	0.4176628	0.3844789	0.4667840	0.6681149	2.4302689	0.3714873
	2017-03-03 20:30:49	2 min 22.102 sec	87.0	0.3860695	0.5195898	0.6516770	5.2463003	0.4166413	0.3844831	0.4667076	0.6680309	2.4408015	0.3860873
	2017-03-03 20:30:52	2 min 25.561 sec	88.0	0.3859759	0.5192995	0.6519903	5.2463003	0.4173274	0.3844690	0.4666676	0.6681041	2.4223077	0.4132336
	2017-03-03 20:30:56	2 min 29.101 sec	89.0	0.3858812	0.5180827	0.6524497	5.2463003	0.4166413	0.3844108	0.4665519	0.6683743	2.4144144	0.3938346

variable	relative_importance	scaled_importance	percentage
addr_state	98091.9843750	1.0	0.1760907
int_rate	63342.5156250	0.6457461	0.1137099
dti	53355.0156250	0.5439284	0.0957807
revol_util	51702.1562500	0.5270783	0.0928136
loan_amnt	47027.6718750	0.4794242	0.0844221
total_acc	45383.2265625	0.4626599	0.0814701
longest_credit_length	43434.3593750	0.4427921	0.0779716
annual_inc	41564.2109375	0.4237269	0.0746144
emp_length	35672.1015625	0.3636597	0.0640371
purpose	32235.9804688	0.3286301	0.0578687
home_ownership	14451.3808594	0.1473248	0.0259425
delinq_2yrs	12333.5097656	0.1257341	0.0221406
term_length	9890.6123047	0.1008300	0.0177552
verification_status	8568.9960938	0.0873567	0.0153827

	0	1	Error	Rate
0	39108.0	13824.0	0.2612	(13824.0/52932.0)
1	5478.0	7171.0	0.4331	(5478.0/12649.0)
Total	44586.0	20995.0	0.2943	(19302.0/65581.0)

	0	1	Error	Rate
0	26838.0	12800.0	0.3229	(12800.0/39638.0)
1	3892.0	5623.0	0.409	(3892.0/9515.0)
Total	30730.0	18423.0	0.3396	(16692.0/49153.0)