notebook.community

Edit and run



In [1]:

    
import h2o
from h2o.estimators.gbm import H2OGradientBoostingEstimator



In [2]:

    
h2o.init()









    



Warning: Version mismatch. H2O is version 3.5.0.99999, but the python package is version UNKNOWN.






    




H2O cluster uptime: 
52 minutes 26 seconds 170 milliseconds 
H2O cluster version: 
3.5.0.99999
H2O cluster name: 
ludirehak
H2O cluster total nodes: 
1
H2O cluster total memory: 
4.44 GB
H2O cluster total cores: 
8
H2O cluster allowed cores: 
8
H2O cluster healthy: 
True
H2O Connection ip: 
127.0.0.1
H2O Connection port: 
54321



In [3]:

    
from h2o.utils.shared_utils import _locate # private function. used to find files within h2o git project directory.

# Airlines dataset
air = h2o.import_file(path=_locate("smalldata/airlines/AirlinesTrain.csv.zip"))









    



Parse Progress: [##################################################] 100%
Imported /Users/ludirehak/h2o-3/smalldata/airlines/AirlinesTrain.csv.zip. Parsed 24,421 rows and 12 cols



In [4]:

    
# Construct validation and training datasets by sampling (20/80)
r = air[0].runif()
air_train = air[r < 0.8]
air_valid = air[r >= 0.8]

myX = ["Origin", "Dest", "Distance", "UniqueCarrier", "fMonth", "fDayofMonth", "fDayOfWeek"]
myY = "IsDepDelayed"



In [5]:

    
# Build gbm
gbm = H2OGradientBoostingEstimator(distribution="bernoulli", 
                                   ntrees=100, 
                                   max_depth=3, 
                                   learn_rate=0.01)

gbm.train(x               =myX, 
          y               =myY, 
          training_frame  =air_train,
          validation_frame=air_valid)









    



gbm Model Build Progress: [##################################################] 100%



In [6]:

    
# Show various confusion matrices for training dataset (based on metric(s))
print(gbm.confusion_matrix()) # maximum f1 threshold chosen by default

print(gbm.confusion_matrix(metrics="f2"))

print(gbm.confusion_matrix(metrics="precision"))

cms = gbm.confusion_matrix(metrics=["accuracy", "f0point5"])
print(cms[0])
print(cms[1])









    



Confusion Matrix (Act/Pred) for max f1 @ threshold = 0.438866890551:






    





NO
YES
Error
Rate
NO
2172.0
6695.0
0.755
 (6695.0/8867.0)
YES
790.0
9867.0
0.0741
 (790.0/10657.0)
Total
2962.0
16562.0
0.3834
 (7485.0/19524.0)






    




Confusion Matrix (Act/Pred) for max f2 @ threshold = 0.381490353472:






    





NO
YES
Error
Rate
NO
172.0
8695.0
0.9806
 (8695.0/8867.0)
YES
23.0
10634.0
0.0022
 (23.0/10657.0)
Total
195.0
19329.0
0.4465
 (8718.0/19524.0)






    




Confusion Matrix (Act/Pred) for max precision @ threshold = 0.685762034833:






    





NO
YES
Error
Rate
NO
8866.0
1.0
0.0001
 (1.0/8867.0)
YES
10630.0
27.0
0.9975
 (10630.0/10657.0)
Total
19496.0
28.0
0.5445
 (10631.0/19524.0)






    




Confusion Matrix (Act/Pred) for max accuracy @ threshold = 0.509389999822:






    





NO
YES
Error
Rate
NO
4671.0
4196.0
0.4732
 (4196.0/8867.0)
YES
2557.0
8100.0
0.2399
 (2557.0/10657.0)
Total
7228.0
12296.0
0.3459
 (6753.0/19524.0)






    




Confusion Matrix (Act/Pred) for max f0point5 @ threshold = 0.54046757144:






    





NO
YES
Error
Rate
NO
5378.0
3489.0
0.3935
 (3489.0/8867.0)
YES
3297.0
7360.0
0.3094
 (3297.0/10657.0)
Total
8675.0
10849.0
0.3476
 (6786.0/19524.0)



In [7]:

    
# Show various confusion matrices for training dataset (based on threshold(s))
print(gbm.confusion_matrix(thresholds=0.77))

cms = gbm.confusion_matrix(thresholds=[0.1, 0.5, 0.99])
print(cms[0])
print(cms[1])
print(cms[2])









    



Could not find exact threshold 0.77; using closest threshold found 0.685762034833.

Confusion Matrix (Act/Pred) @ threshold = 0.685762034833:






    





NO
YES
Error
Rate
NO
8866.0
1.0
0.0001
 (1.0/8867.0)
YES
10630.0
27.0
0.9975
 (10630.0/10657.0)
Total
19496.0
28.0
0.5445
 (10631.0/19524.0)






    



Could not find exact threshold 0.1; using closest threshold found 0.373879538649.
Could not find exact threshold 0.5; using closest threshold found 0.49962104911.
Could not find exact threshold 0.99; using closest threshold found 0.685762034833.

Confusion Matrix (Act/Pred) @ threshold = 0.373879538649:






    





NO
YES
Error
Rate
NO
0.0
8867.0
1.0
 (8867.0/8867.0)
YES
0.0
10657.0
0.0
 (0.0/10657.0)
Total
0.0
19524.0
0.4542
 (8867.0/19524.0)






    




Confusion Matrix (Act/Pred) @ threshold = 0.49962104911:






    





NO
YES
Error
Rate
NO
4463.0
4404.0
0.4967
 (4404.0/8867.0)
YES
2400.0
8257.0
0.2252
 (2400.0/10657.0)
Total
6863.0
12661.0
0.3485
 (6804.0/19524.0)






    




Confusion Matrix (Act/Pred) @ threshold = 0.685762034833:






    





NO
YES
Error
Rate
NO
8866.0
1.0
0.0001
 (1.0/8867.0)
YES
10630.0
27.0
0.9975
 (10630.0/10657.0)
Total
19496.0
28.0
0.5445
 (10631.0/19524.0)



In [8]:

    
# Show various confusion matrices for validation dataset (based on metric(s))
print(gbm.confusion_matrix(metrics="f2", valid=True))

print(gbm.confusion_matrix(metrics="precision", valid=True))

cms = gbm.confusion_matrix(metrics=["accuracy", "f0point5"], valid=True)
print(cms[0])
print(cms[1])









    



Confusion Matrix (Act/Pred) for max f2 @ threshold = 0.385734623697:






    





NO
YES
Error
Rate
NO
80.0
2119.0
0.9636
 (2119.0/2199.0)
YES
13.0
2685.0
0.0048
 (13.0/2698.0)
Total
93.0
4804.0
0.4354
 (2132.0/4897.0)






    




Confusion Matrix (Act/Pred) for max precision @ threshold = 0.683022938978:






    





NO
YES
Error
Rate
NO
2191.0
8.0
0.0036
 (8.0/2199.0)
YES
2632.0
66.0
0.9755
 (2632.0/2698.0)
Total
4823.0
74.0
0.5391
 (2640.0/4897.0)






    




Confusion Matrix (Act/Pred) for max accuracy @ threshold = 0.518825062343:






    





NO
YES
Error
Rate
NO
1188.0
1011.0
0.4598
 (1011.0/2199.0)
YES
684.0
2014.0
0.2535
 (684.0/2698.0)
Total
1872.0
3025.0
0.3461
 (1695.0/4897.0)






    




Confusion Matrix (Act/Pred) for max f0point5 @ threshold = 0.540424490283:






    





NO
YES
Error
Rate
NO
1316.0
883.0
0.4015
 (883.0/2199.0)
YES
818.0
1880.0
0.3032
 (818.0/2698.0)
Total
2134.0
2763.0
0.3474
 (1701.0/4897.0)



In [9]:

    
# Show various confusion matrices for validation dataset (based on threshold(s))
print(gbm.confusion_matrix(thresholds=0.77))

cms = gbm.confusion_matrix(thresholds=[0.25, 0.33, 0.44])
print(cms[0])
print(cms[1])
print(cms[2])









    



Could not find exact threshold 0.77; using closest threshold found 0.685762034833.

Confusion Matrix (Act/Pred) @ threshold = 0.685762034833:






    





NO
YES
Error
Rate
NO
8866.0
1.0
0.0001
 (1.0/8867.0)
YES
10630.0
27.0
0.9975
 (10630.0/10657.0)
Total
19496.0
28.0
0.5445
 (10631.0/19524.0)






    



Could not find exact threshold 0.25; using closest threshold found 0.373879538649.
Could not find exact threshold 0.33; using closest threshold found 0.373879538649.
Could not find exact threshold 0.44; using closest threshold found 0.44006560762.

Confusion Matrix (Act/Pred) @ threshold = 0.373879538649:






    





NO
YES
Error
Rate
NO
0.0
8867.0
1.0
 (8867.0/8867.0)
YES
0.0
10657.0
0.0
 (0.0/10657.0)
Total
0.0
19524.0
0.4542
 (8867.0/19524.0)






    




Confusion Matrix (Act/Pred) @ threshold = 0.373879538649:






    





NO
YES
Error
Rate
NO
0.0
8867.0
1.0
 (8867.0/8867.0)
YES
0.0
10657.0
0.0
 (0.0/10657.0)
Total
0.0
19524.0
0.4542
 (8867.0/19524.0)






    




Confusion Matrix (Act/Pred) @ threshold = 0.44006560762:






    





NO
YES
Error
Rate
NO
2235.0
6632.0
0.7479
 (6632.0/8867.0)
YES
856.0
9801.0
0.0803
 (856.0/10657.0)
Total
3091.0
16433.0
0.3835
 (7488.0/19524.0)



In [10]:

    
# Show various confusion matrices for validation dataset (based on metric(s) AND threshold(s))
cms = gbm.confusion_matrix(thresholds=0.77, metrics="f1") 
print(cms[0])
print(cms[1])

cms = gbm.confusion_matrix(thresholds=[0.25, 0.33], metrics=["f2", "f0point5"])
print(cms[0])
print(cms[1])
print(cms[2])
print(cms[3])









    



Could not find exact threshold 0.77; using closest threshold found 0.685762034833.

Confusion Matrix (Act/Pred) @ threshold = 0.685762034833:






    





NO
YES
Error
Rate
NO
8866.0
1.0
0.0001
 (1.0/8867.0)
YES
10630.0
27.0
0.9975
 (10630.0/10657.0)
Total
19496.0
28.0
0.5445
 (10631.0/19524.0)






    




Confusion Matrix (Act/Pred) for max f1 @ threshold = 0.438866890551:






    





NO
YES
Error
Rate
NO
2172.0
6695.0
0.755
 (6695.0/8867.0)
YES
790.0
9867.0
0.0741
 (790.0/10657.0)
Total
2962.0
16562.0
0.3834
 (7485.0/19524.0)






    



Could not find exact threshold 0.25; using closest threshold found 0.373879538649.
Could not find exact threshold 0.33; using closest threshold found 0.373879538649.

Confusion Matrix (Act/Pred) @ threshold = 0.373879538649:






    





NO
YES
Error
Rate
NO
0.0
8867.0
1.0
 (8867.0/8867.0)
YES
0.0
10657.0
0.0
 (0.0/10657.0)
Total
0.0
19524.0
0.4542
 (8867.0/19524.0)






    




Confusion Matrix (Act/Pred) @ threshold = 0.373879538649:






    





NO
YES
Error
Rate
NO
0.0
8867.0
1.0
 (8867.0/8867.0)
YES
0.0
10657.0
0.0
 (0.0/10657.0)
Total
0.0
19524.0
0.4542
 (8867.0/19524.0)






    




Confusion Matrix (Act/Pred) for max f2 @ threshold = 0.381490353472:






    





NO
YES
Error
Rate
NO
172.0
8695.0
0.9806
 (8695.0/8867.0)
YES
23.0
10634.0
0.0022
 (23.0/10657.0)
Total
195.0
19329.0
0.4465
 (8718.0/19524.0)






    




Confusion Matrix (Act/Pred) for max f0point5 @ threshold = 0.54046757144:






    





NO
YES
Error
Rate
NO
5378.0
3489.0
0.3935
 (3489.0/8867.0)
YES
3297.0
7360.0
0.3094
 (3297.0/10657.0)
Total
8675.0
10849.0
0.3476
 (6786.0/19524.0)



In [11]:

    
# Test dataset
air_test = h2o.import_file(path=_locate("smalldata/airlines/AirlinesTest.csv.zip"))

# Test performance
gbm_perf = gbm.model_performance(air_test)









    



Parse Progress: [##################################################] 100%
Imported /Users/ludirehak/h2o-3/smalldata/airlines/AirlinesTest.csv.zip. Parsed 2,691 rows and 12 cols



In [12]:

    
# Show various confusion matrices for test dataset (based on metric(s))
print(gbm_perf.confusion_matrix(metrics="f0point5"))

print(gbm_perf.confusion_matrix(metrics="min_per_class_accuracy"))

cms = gbm_perf.confusion_matrix(metrics=["accuracy", "f0point5"])
print(cms[0])
print(cms[1])









    



Confusion Matrix (Act/Pred) for max f0point5 @ threshold = 0.532641218074:






    





NO
YES
Error
Rate
NO
694.0
523.0
0.4297
 (523.0/1217.0)
YES
398.0
1076.0
0.27
 (398.0/1474.0)
Total
1092.0
1599.0
0.3423
 (921.0/2691.0)






    




Confusion Matrix (Act/Pred) for max min_per_class_accuracy @ threshold = 0.550904005776:






    





NO
YES
Error
Rate
NO
779.0
438.0
0.3599
 (438.0/1217.0)
YES
530.0
944.0
0.3596
 (530.0/1474.0)
Total
1309.0
1382.0
0.3597
 (968.0/2691.0)






    




Confusion Matrix (Act/Pred) for max accuracy @ threshold = 0.532641218074:






    





NO
YES
Error
Rate
NO
694.0
523.0
0.4297
 (523.0/1217.0)
YES
398.0
1076.0
0.27
 (398.0/1474.0)
Total
1092.0
1599.0
0.3423
 (921.0/2691.0)






    




Confusion Matrix (Act/Pred) for max accuracy @ threshold = 0.532641218074:






    





NO
YES
Error
Rate
NO
694.0
523.0
0.4297
 (523.0/1217.0)
YES
398.0
1076.0
0.27
 (398.0/1474.0)
Total
1092.0
1599.0
0.3423
 (921.0/2691.0)



In [13]:

    
# Show various confusion matrices for test dataset (based on threshold(s))
print(gbm_perf.confusion_matrix(thresholds=0.5))

cms = gbm_perf.confusion_matrix(thresholds=[0.01, 0.75, .88])
print(cms[0])
print(cms[1])
print(cms[2])









    



Could not find exact threshold 0.5; using closest threshold found 0.499551746996.

Confusion Matrix (Act/Pred) @ threshold = 0.499551746996:






    





NO
YES
Error
Rate
NO
576.0
641.0
0.5267
 (641.0/1217.0)
YES
311.0
1163.0
0.211
 (311.0/1474.0)
Total
887.0
1804.0
0.3538
 (952.0/2691.0)






    



Could not find exact threshold 0.01; using closest threshold found 0.37382486349.
Could not find exact threshold 0.75; using closest threshold found 0.6857620914.
Could not find exact threshold 0.88; using closest threshold found 0.6857620914.

Confusion Matrix (Act/Pred) @ threshold = 0.37382486349:






    





NO
YES
Error
Rate
NO
0.0
1217.0
1.0
 (1217.0/1217.0)
YES
0.0
1474.0
0.0
 (0.0/1474.0)
Total
0.0
2691.0
0.4522
 (1217.0/2691.0)






    




Confusion Matrix (Act/Pred) @ threshold = 0.6857620914:






    





NO
YES
Error
Rate
NO
1216.0
1.0
0.0008
 (1.0/1217.0)
YES
1473.0
1.0
0.9993
 (1473.0/1474.0)
Total
2689.0
2.0
0.5478
 (1474.0/2691.0)






    




Confusion Matrix (Act/Pred) @ threshold = 0.6857620914:






    





NO
YES
Error
Rate
NO
1216.0
1.0
0.0008
 (1.0/1217.0)
YES
1473.0
1.0
0.9993
 (1473.0/1474.0)
Total
2689.0
2.0
0.5478
 (1474.0/2691.0)



In [14]:

    
# Convert a ConfusionMatrix to a python list of lists: [ [tns,fps], [fns,tps] ]
cm = gbm.confusion_matrix()
print(cm.to_list())

cm = gbm_perf.confusion_matrix()
print(cm.to_list())









    



[[2172, 6695], [790, 9867]]
[[389, 828], [172, 1302]]

H2O cluster uptime:	52 minutes 26 seconds 170 milliseconds
H2O cluster version:	3.5.0.99999
H2O cluster name:	ludirehak
H2O cluster total nodes:	1
H2O cluster total memory:	4.44 GB
H2O cluster total cores:	8
H2O cluster allowed cores:	8
H2O cluster healthy:	True
H2O Connection ip:	127.0.0.1
H2O Connection port:	54321

	NO	YES	Error	Rate
NO	2172.0	6695.0	0.755	(6695.0/8867.0)
YES	790.0	9867.0	0.0741	(790.0/10657.0)
Total	2962.0	16562.0	0.3834	(7485.0/19524.0)