notebook.community

Edit and run



In [1]:

    
import h2o



In [2]:

    
h2o.init()









    




H2O cluster uptime: 
3 minutes 37 seconds 60 milliseconds 
H2O cluster version: 
3.5.0.99999
H2O cluster name: 
ece
H2O cluster total nodes: 
1
H2O cluster total memory: 
10.67 GB
H2O cluster total cores: 
8
H2O cluster allowed cores: 
8
H2O cluster healthy: 
True
H2O Connection ip: 
127.0.0.1
H2O Connection port: 
54321



In [3]:

    
from h2o.h2o import _locate # private function. used to find files within h2o git project directory.

# Airlines dataset
air = h2o.import_file(path=_locate("smalldata/airlines/AirlinesTrain.csv.zip"))









    



Parse Progress: [##################################################] 100%
Imported /Users/ece/0xdata/h2o-dev/smalldata/airlines/AirlinesTrain.csv.zip. Parsed 24,421 rows and 12 cols



In [4]:

    
# Construct validation and training datasets by sampling (20/80)
r = air[0].runif()
air_train = air[r < 0.8]
air_valid = air[r >= 0.8]

myX = ["Origin", "Dest", "Distance", "UniqueCarrier", "fMonth", "fDayofMonth", "fDayOfWeek"]
myY = "IsDepDelayed"



In [5]:

    
# Build gbm
gbm = h2o.gbm(x=air_train[myX], 
              y=air_train[myY], 
              validation_x=air_valid[myX],
              validation_y=air_valid[myY],
              distribution="bernoulli", 
              ntrees=100, 
              max_depth=3, 
              learn_rate=0.01)









    



gbm Model Build Progress: [##################################################] 100%



In [6]:

    
# Show various confusion matrices for training dataset (based on metric(s))
print gbm.confusion_matrix() # maximum f1 threshold chosen by default

print gbm.confusion_matrix(metrics="f2")

print gbm.confusion_matrix(metrics="precision")

cms = gbm.confusion_matrix(metrics=["accuracy", "f0point5"])
print cms[0]
print cms[1]









    



Confusion Matrix (Act/Pred) for max f1 @ threshold = 0.459414927132:






    





NO
YES
Error
Rate
NO
3220.0
5687.0
0.6385
 (5687.0/8907.0)
YES
1357.0
9404.0
0.1261
 (1357.0/10761.0)
Total
4577.0
15091.0
0.3581
 (7044.0/19668.0)






    




Confusion Matrix (Act/Pred) for max f2 @ threshold = 0.381066670162:






    





NO
YES
Error
Rate
NO
127.0
8780.0
0.9857
 (8780.0/8907.0)
YES
21.0
10740.0
0.002
 (21.0/10761.0)
Total
148.0
19520.0
0.4475
 (8801.0/19668.0)






    




Confusion Matrix (Act/Pred) for max precision @ threshold = 0.684200366538:






    





NO
YES
Error
Rate
NO
8892.0
15.0
0.0017
 (15.0/8907.0)
YES
10640.0
121.0
0.9888
 (10640.0/10761.0)
Total
19532.0
136.0
0.5417
 (10655.0/19668.0)






    




Confusion Matrix (Act/Pred) for max accuracy @ threshold = 0.505407976974:






    





NO
YES
Error
Rate
NO
4678.0
4229.0
0.4748
 (4229.0/8907.0)
YES
2554.0
8207.0
0.2373
 (2554.0/10761.0)
Total
7232.0
12436.0
0.3449
 (6783.0/19668.0)






    




Confusion Matrix (Act/Pred) for max f0point5 @ threshold = 0.552458757447:






    





NO
YES
Error
Rate
NO
5693.0
3214.0
0.3608
 (3214.0/8907.0)
YES
3595.0
7166.0
0.3341
 (3595.0/10761.0)
Total
9288.0
10380.0
0.3462
 (6809.0/19668.0)



In [7]:

    
# Show various confusion matrices for training dataset (based on threshold(s))
print gbm.confusion_matrix(thresholds=0.77) 

cms = gbm.confusion_matrix(thresholds=[0.1, 0.5, 0.99])
print cms[0]
print cms[1]
print cms[2]









    



Could not find exact threshold 0.77; using closest threshold found 0.684200366538.

Confusion Matrix (Act/Pred) @ threshold = 0.684200366538:






    





NO
YES
Error
Rate
NO
8892.0
15.0
0.0017
 (15.0/8907.0)
YES
10640.0
121.0
0.9888
 (10640.0/10761.0)
Total
19532.0
136.0
0.5417
 (10655.0/19668.0)






    



Could not find exact threshold 0.1; using closest threshold found 0.376167626413.
Could not find exact threshold 0.5; using closest threshold found 0.499996937724.
Could not find exact threshold 0.99; using closest threshold found 0.684200366538.

Confusion Matrix (Act/Pred) @ threshold = 0.376167626413:






    





NO
YES
Error
Rate
NO
0.0
8907.0
1.0
 (8907.0/8907.0)
YES
0.0
10761.0
0.0
 (0.0/10761.0)
Total
0.0
19668.0
0.4529
 (8907.0/19668.0)






    




Confusion Matrix (Act/Pred) @ threshold = 0.499996937724:






    





NO
YES
Error
Rate
NO
4483.0
4424.0
0.4967
 (4424.0/8907.0)
YES
2411.0
8350.0
0.224
 (2411.0/10761.0)
Total
6894.0
12774.0
0.3475
 (6835.0/19668.0)






    




Confusion Matrix (Act/Pred) @ threshold = 0.684200366538:






    





NO
YES
Error
Rate
NO
8892.0
15.0
0.0017
 (15.0/8907.0)
YES
10640.0
121.0
0.9888
 (10640.0/10761.0)
Total
19532.0
136.0
0.5417
 (10655.0/19668.0)



In [8]:

    
# Show various confusion matrices for validation dataset (based on metric(s))
print gbm.confusion_matrix(metrics="f2", valid=True)

print gbm.confusion_matrix(metrics="precision", valid=True)

cms = gbm.confusion_matrix(metrics=["accuracy", "f0point5"], valid=True)
print cms[0]
print cms[1]









    



Confusion Matrix (Act/Pred) for max f2 @ threshold = 0.389262598446:






    





NO
YES
Error
Rate
NO
52.0
2107.0
0.9759
 (2107.0/2159.0)
YES
4.0
2590.0
0.0015
 (4.0/2594.0)
Total
56.0
4697.0
0.4441
 (2111.0/4753.0)






    




Confusion Matrix (Act/Pred) for max precision @ threshold = 0.684273290486:






    





NO
YES
Error
Rate
NO
2159.0
0.0
0.0
 (0.0/2159.0)
YES
2578.0
16.0
0.9938
 (2578.0/2594.0)
Total
4737.0
16.0
0.5424
 (2578.0/4753.0)






    




Confusion Matrix (Act/Pred) for max accuracy @ threshold = 0.522340383226:






    





NO
YES
Error
Rate
NO
1169.0
990.0
0.4585
 (990.0/2159.0)
YES
684.0
1910.0
0.2637
 (684.0/2594.0)
Total
1853.0
2900.0
0.3522
 (1674.0/4753.0)






    




Confusion Matrix (Act/Pred) for max accuracy @ threshold = 0.522340383226:






    





NO
YES
Error
Rate
NO
1169.0
990.0
0.4585
 (990.0/2159.0)
YES
684.0
1910.0
0.2637
 (684.0/2594.0)
Total
1853.0
2900.0
0.3522
 (1674.0/4753.0)



In [9]:

    
# Show various confusion matrices for validation dataset (based on threshold(s))
print gbm.confusion_matrix(thresholds=0.77) 

cms = gbm.confusion_matrix(thresholds=[0.25, 0.33, 0.44])
print cms[0]
print cms[1]
print cms[2]









    



Could not find exact threshold 0.77; using closest threshold found 0.684200366538.

Confusion Matrix (Act/Pred) @ threshold = 0.684200366538:






    





NO
YES
Error
Rate
NO
8892.0
15.0
0.0017
 (15.0/8907.0)
YES
10640.0
121.0
0.9888
 (10640.0/10761.0)
Total
19532.0
136.0
0.5417
 (10655.0/19668.0)






    



Could not find exact threshold 0.25; using closest threshold found 0.376167626413.
Could not find exact threshold 0.33; using closest threshold found 0.376167626413.
Could not find exact threshold 0.44; using closest threshold found 0.440166828406.

Confusion Matrix (Act/Pred) @ threshold = 0.376167626413:






    





NO
YES
Error
Rate
NO
0.0
8907.0
1.0
 (8907.0/8907.0)
YES
0.0
10761.0
0.0
 (0.0/10761.0)
Total
0.0
19668.0
0.4529
 (8907.0/19668.0)






    




Confusion Matrix (Act/Pred) @ threshold = 0.376167626413:






    





NO
YES
Error
Rate
NO
0.0
8907.0
1.0
 (8907.0/8907.0)
YES
0.0
10761.0
0.0
 (0.0/10761.0)
Total
0.0
19668.0
0.4529
 (8907.0/19668.0)






    




Confusion Matrix (Act/Pred) @ threshold = 0.440166828406:






    





NO
YES
Error
Rate
NO
2299.0
6608.0
0.7419
 (6608.0/8907.0)
YES
919.0
9842.0
0.0854
 (919.0/10761.0)
Total
3218.0
16450.0
0.3827
 (7527.0/19668.0)



In [10]:

    
# Show various confusion matrices for validation dataset (based on metric(s) AND threshold(s))
cms = gbm.confusion_matrix(thresholds=0.77, metrics="f1") 
print cms[0]
print cms[1]

cms = gbm.confusion_matrix(thresholds=[0.25, 0.33], metrics=["f2", "f0point5"])
print cms[0]
print cms[1]
print cms[2]
print cms[3]









    



Could not find exact threshold 0.77; using closest threshold found 0.684200366538.

Confusion Matrix (Act/Pred) @ threshold = 0.684200366538:






    





NO
YES
Error
Rate
NO
8892.0
15.0
0.0017
 (15.0/8907.0)
YES
10640.0
121.0
0.9888
 (10640.0/10761.0)
Total
19532.0
136.0
0.5417
 (10655.0/19668.0)






    




Confusion Matrix (Act/Pred) for max f1 @ threshold = 0.459414927132:






    





NO
YES
Error
Rate
NO
3220.0
5687.0
0.6385
 (5687.0/8907.0)
YES
1357.0
9404.0
0.1261
 (1357.0/10761.0)
Total
4577.0
15091.0
0.3581
 (7044.0/19668.0)






    



Could not find exact threshold 0.25; using closest threshold found 0.376167626413.
Could not find exact threshold 0.33; using closest threshold found 0.376167626413.

Confusion Matrix (Act/Pred) @ threshold = 0.376167626413:






    





NO
YES
Error
Rate
NO
0.0
8907.0
1.0
 (8907.0/8907.0)
YES
0.0
10761.0
0.0
 (0.0/10761.0)
Total
0.0
19668.0
0.4529
 (8907.0/19668.0)






    




Confusion Matrix (Act/Pred) @ threshold = 0.376167626413:






    





NO
YES
Error
Rate
NO
0.0
8907.0
1.0
 (8907.0/8907.0)
YES
0.0
10761.0
0.0
 (0.0/10761.0)
Total
0.0
19668.0
0.4529
 (8907.0/19668.0)






    




Confusion Matrix (Act/Pred) for max f2 @ threshold = 0.381066670162:






    





NO
YES
Error
Rate
NO
127.0
8780.0
0.9857
 (8780.0/8907.0)
YES
21.0
10740.0
0.002
 (21.0/10761.0)
Total
148.0
19520.0
0.4475
 (8801.0/19668.0)






    




Confusion Matrix (Act/Pred) for max f0point5 @ threshold = 0.552458757447:






    





NO
YES
Error
Rate
NO
5693.0
3214.0
0.3608
 (3214.0/8907.0)
YES
3595.0
7166.0
0.3341
 (3595.0/10761.0)
Total
9288.0
10380.0
0.3462
 (6809.0/19668.0)



In [11]:

    
# Test dataset
air_test = h2o.import_file(path=_locate("smalldata/airlines/AirlinesTest.csv.zip"))

# Test performance
gbm_perf = gbm.model_performance(air_test)









    



Parse Progress: [##################################################] 100%
Imported /Users/ece/0xdata/h2o-dev/smalldata/airlines/AirlinesTest.csv.zip. Parsed 2,691 rows and 12 cols



In [12]:

    
# Show various confusion matrices for test dataset (based on metric(s))
print gbm_perf.confusion_matrix(metrics="f0point5")

print gbm_perf.confusion_matrix(metrics="min_per_class_accuracy")

cms = gbm_perf.confusion_matrix(metrics=["accuracy", "f0point5"])
print cms[0]
print cms[1]









    



Confusion Matrix (Act/Pred) for max f0point5 @ threshold = 0.552672913274:






    





NO
YES
Error
Rate
NO
759.0
458.0
0.3763
 (458.0/1217.0)
YES
478.0
996.0
0.3243
 (478.0/1474.0)
Total
1237.0
1454.0
0.3478
 (936.0/2691.0)






    




Confusion Matrix (Act/Pred) for max min_per_class_accuracy @ threshold = 0.55367524629:






    





NO
YES
Error
Rate
NO
786.0
431.0
0.3541
 (431.0/1217.0)
YES
524.0
950.0
0.3555
 (524.0/1474.0)
Total
1310.0
1381.0
0.3549
 (955.0/2691.0)






    




Confusion Matrix (Act/Pred) for max accuracy @ threshold = 0.54604440874:






    





NO
YES
Error
Rate
NO
738.0
479.0
0.3936
 (479.0/1217.0)
YES
454.0
1020.0
0.308
 (454.0/1474.0)
Total
1192.0
1499.0
0.3467
 (933.0/2691.0)






    




Confusion Matrix (Act/Pred) for max f0point5 @ threshold = 0.552672913274:






    





NO
YES
Error
Rate
NO
759.0
458.0
0.3763
 (458.0/1217.0)
YES
478.0
996.0
0.3243
 (478.0/1474.0)
Total
1237.0
1454.0
0.3478
 (936.0/2691.0)



In [13]:

    
# Show various confusion matrices for test dataset (based on threshold(s))
print gbm_perf.confusion_matrix(thresholds=0.5) 

cms = gbm_perf.confusion_matrix(thresholds=[0.01, 0.75, .88])
print cms[0]
print cms[1]
print cms[2]









    



Could not find exact threshold 0.5; using closest threshold found 0.500410605232.

Confusion Matrix (Act/Pred) @ threshold = 0.500410605232:






    





NO
YES
Error
Rate
NO
577.0
640.0
0.5259
 (640.0/1217.0)
YES
312.0
1162.0
0.2117
 (312.0/1474.0)
Total
889.0
1802.0
0.3538
 (952.0/2691.0)






    



Could not find exact threshold 0.01; using closest threshold found 0.376146323381.
Could not find exact threshold 0.75; using closest threshold found 0.684273290486.
Could not find exact threshold 0.88; using closest threshold found 0.684273290486.

Confusion Matrix (Act/Pred) @ threshold = 0.376146323381:






    





NO
YES
Error
Rate
NO
0.0
1217.0
1.0
 (1217.0/1217.0)
YES
0.0
1474.0
0.0
 (0.0/1474.0)
Total
0.0
2691.0
0.4522
 (1217.0/2691.0)






    




Confusion Matrix (Act/Pred) @ threshold = 0.684273290486:






    





NO
YES
Error
Rate
NO
1215.0
2.0
0.0016
 (2.0/1217.0)
YES
1461.0
13.0
0.9912
 (1461.0/1474.0)
Total
2676.0
15.0
0.5437
 (1463.0/2691.0)






    




Confusion Matrix (Act/Pred) @ threshold = 0.684273290486:






    





NO
YES
Error
Rate
NO
1215.0
2.0
0.0016
 (2.0/1217.0)
YES
1461.0
13.0
0.9912
 (1461.0/1474.0)
Total
2676.0
15.0
0.5437
 (1463.0/2691.0)



In [14]:

    
# Convert a ConfusionMatrix to a python list of lists: [ [tns,fps], [fns,tps] ]
cm = gbm.confusion_matrix()
print cm.to_list()

cm = gbm_perf.confusion_matrix()
print cm.to_list()









    



[[3220, 5687], [1357, 9404]]
[[394, 823], [175, 1299]]

H2O cluster uptime:	3 minutes 37 seconds 60 milliseconds
H2O cluster version:	3.5.0.99999
H2O cluster name:	ece
H2O cluster total nodes:	1
H2O cluster total memory:	10.67 GB
H2O cluster total cores:	8
H2O cluster allowed cores:	8
H2O cluster healthy:	True
H2O Connection ip:	127.0.0.1
H2O Connection port:	54321

	NO	YES	Error	Rate
NO	3220.0	5687.0	0.6385	(5687.0/8907.0)
YES	1357.0	9404.0	0.1261	(1357.0/10761.0)
Total	4577.0	15091.0	0.3581	(7044.0/19668.0)

	NO	YES	Error	Rate
NO	127.0	8780.0	0.9857	(8780.0/8907.0)
YES	21.0	10740.0	0.002	(21.0/10761.0)
Total	148.0	19520.0	0.4475	(8801.0/19668.0)

	NO	YES	Error	Rate
NO	8892.0	15.0	0.0017	(15.0/8907.0)
YES	10640.0	121.0	0.9888	(10640.0/10761.0)
Total	19532.0	136.0	0.5417	(10655.0/19668.0)

	NO	YES	Error	Rate
NO	4678.0	4229.0	0.4748	(4229.0/8907.0)
YES	2554.0	8207.0	0.2373	(2554.0/10761.0)
Total	7232.0	12436.0	0.3449	(6783.0/19668.0)

	NO	YES	Error	Rate
NO	5693.0	3214.0	0.3608	(3214.0/8907.0)
YES	3595.0	7166.0	0.3341	(3595.0/10761.0)
Total	9288.0	10380.0	0.3462	(6809.0/19668.0)

	NO	YES	Error	Rate
NO	0.0	8907.0	1.0	(8907.0/8907.0)
YES	0.0	10761.0	0.0	(0.0/10761.0)
Total	0.0	19668.0	0.4529	(8907.0/19668.0)

	NO	YES	Error	Rate
NO	4483.0	4424.0	0.4967	(4424.0/8907.0)
YES	2411.0	8350.0	0.224	(2411.0/10761.0)
Total	6894.0	12774.0	0.3475	(6835.0/19668.0)

	NO	YES	Error	Rate
NO	52.0	2107.0	0.9759	(2107.0/2159.0)
YES	4.0	2590.0	0.0015	(4.0/2594.0)
Total	56.0	4697.0	0.4441	(2111.0/4753.0)

	NO	YES	Error	Rate
NO	2159.0	0.0	0.0	(0.0/2159.0)
YES	2578.0	16.0	0.9938	(2578.0/2594.0)
Total	4737.0	16.0	0.5424	(2578.0/4753.0)

	NO	YES	Error	Rate
NO	1169.0	990.0	0.4585	(990.0/2159.0)
YES	684.0	1910.0	0.2637	(684.0/2594.0)
Total	1853.0	2900.0	0.3522	(1674.0/4753.0)

	NO	YES	Error	Rate
NO	2299.0	6608.0	0.7419	(6608.0/8907.0)
YES	919.0	9842.0	0.0854	(919.0/10761.0)
Total	3218.0	16450.0	0.3827	(7527.0/19668.0)

	NO	YES	Error	Rate
NO	759.0	458.0	0.3763	(458.0/1217.0)
YES	478.0	996.0	0.3243	(478.0/1474.0)
Total	1237.0	1454.0	0.3478	(936.0/2691.0)

	NO	YES	Error	Rate
NO	786.0	431.0	0.3541	(431.0/1217.0)
YES	524.0	950.0	0.3555	(524.0/1474.0)
Total	1310.0	1381.0	0.3549	(955.0/2691.0)

	NO	YES	Error	Rate
NO	738.0	479.0	0.3936	(479.0/1217.0)
YES	454.0	1020.0	0.308	(454.0/1474.0)
Total	1192.0	1499.0	0.3467	(933.0/2691.0)

	NO	YES	Error	Rate
NO	577.0	640.0	0.5259	(640.0/1217.0)
YES	312.0	1162.0	0.2117	(312.0/1474.0)
Total	889.0	1802.0	0.3538	(952.0/2691.0)

	NO	YES	Error	Rate
NO	0.0	1217.0	1.0	(1217.0/1217.0)
YES	0.0	1474.0	0.0	(0.0/1474.0)
Total	0.0	2691.0	0.4522	(1217.0/2691.0)

	NO	YES	Error	Rate
NO	1215.0	2.0	0.0016	(2.0/1217.0)
YES	1461.0	13.0	0.9912	(1461.0/1474.0)
Total	2676.0	15.0	0.5437	(1463.0/2691.0)