notebook.community

Edit and run



In [1]:

    
import h2o



In [2]:

    
h2o.init()









    




H2O cluster uptime: 
2 minutes 13 seconds 669 milliseconds 
H2O cluster version: 
3.1.0.99999
H2O cluster name: 
ece
H2O cluster total nodes: 
1
H2O cluster total memory: 
4.44 GB
H2O cluster total cores: 
8
H2O cluster allowed cores: 
8
H2O cluster healthy: 
True
H2O Connection ip: 
127.0.0.1
H2O Connection port: 
54321



In [3]:

    
# Airlines dataset
air = h2o.import_frame(path=h2o.locate("smalldata/airlines/AirlinesTrain.csv.zip"))









    



Parse Progress: [##################################################] 100%
Imported  /Users/ece/0xdata/h2o-dev/smalldata/airlines/AirlinesTrain.csv.zip . Parsed 24,421 rows and 12 cols



In [4]:

    
# Construct validation and training datasets by sampling (20/80)
r = air[0].runif()
air_train = air[r < 0.8]
air_valid = air[r >= 0.8]

myX = ["Origin", "Dest", "Distance", "UniqueCarrier", "fMonth", "fDayofMonth", "fDayOfWeek"]
myY = "IsDepDelayed"



In [5]:

    
# Build gbm
gbm = h2o.gbm(x=air_train[myX], 
              y=air_train[myY], 
              validation_x=air_valid[myX],
              validation_y=air_valid[myY],
              distribution="bernoulli", 
              ntrees=100, 
              max_depth=3, 
              learn_rate=0.01)









    



gbm Model Build Progress: [##################################################] 100%



In [6]:

    
# Show various confusion matrices for training dataset (based on metric(s))
print gbm.confusion_matrix() # maximum f1 threshold chosen by default

print gbm.confusion_matrix(metrics="f2")

print gbm.confusion_matrix(metrics="precision")

cms = gbm.confusion_matrix(metrics=["accuracy", "f0point5"])
print cms[0]
print cms[1]









    



Confusion Matrix (Act/Pred) for max f1 @ threshold = 0.439860523006:







    





NO
YES
Error
Rate
NO
2473.0
6377.0
0.7206
 (6377.0/8850.0)
YES
964.0
9749.0
0.09
 (964.0/10713.0)
Total
3437.0
16126.0
0.8106
 (0.8106/19563.0)






    




Confusion Matrix (Act/Pred) for max f2 @ threshold = 0.398529984517:







    





NO
YES
Error
Rate
NO
355.0
8495.0
0.9599
 (8495.0/8850.0)
YES
60.0
10653.0
0.0056
 (60.0/10713.0)
Total
415.0
19148.0
0.9655
 (0.9655/19563.0)






    




Confusion Matrix (Act/Pred) for max precision @ threshold = 0.684320673544:







    





NO
YES
Error
Rate
NO
8832.0
18.0
0.002
 (18.0/8850.0)
YES
10562.0
151.0
0.9859
 (10562.0/10713.0)
Total
19394.0
169.0
0.9879
 (0.9879/19563.0)






    




Confusion Matrix (Act/Pred) for max accuracy @ threshold = 0.533140769284:







    





NO
YES
Error
Rate
NO
5093.0
3757.0
0.4245
 (3757.0/8850.0)
YES
3076.0
7637.0
0.2871
 (3076.0/10713.0)
Total
8169.0
11394.0
0.7116
 (0.7116/19563.0)






    




Confusion Matrix (Act/Pred) for max f0point5 @ threshold = 0.535870076134:







    





NO
YES
Error
Rate
NO
5141.0
3709.0
0.4191
 (3709.0/8850.0)
YES
3128.0
7585.0
0.292
 (3128.0/10713.0)
Total
8269.0
11294.0
0.7111
 (0.7111/19563.0)



In [7]:

    
# Show various confusion matrices for training dataset (based on threshold(s))
print gbm.confusion_matrix(thresholds=0.77) 

cms = gbm.confusion_matrix(thresholds=[0.1, 0.5, 0.99])
print cms[0]
print cms[1]
print cms[2]









    



Could not find exact threshold 0.77; using closest threshold found 0.6869435993.

Confusion Matrix (Act/Pred) @ threshold = 0.6869435993:







    





NO
YES
Error
Rate
NO
8836.0
14.0
0.0016
 (14.0/8850.0)
YES
10597.0
116.0
0.9892
 (10597.0/10713.0)
Total
19433.0
130.0
0.9908
 (0.9908/19563.0)






    



Could not find exact threshold 0.1; using closest threshold found 0.383907658296.
Could not find exact threshold 0.5; using closest threshold found 0.500172069127.
Could not find exact threshold 0.99; using closest threshold found 0.6869435993.

Confusion Matrix (Act/Pred) @ threshold = 0.383907658296:







    





NO
YES
Error
Rate
NO
0.0
8850.0
1.0
 (8850.0/8850.0)
YES
0.0
10713.0
0.0
 (0.0/10713.0)
Total
0.0
19563.0
1.0
 (1.0/19563.0)






    




Confusion Matrix (Act/Pred) @ threshold = 0.500172069127:







    





NO
YES
Error
Rate
NO
4223.0
4627.0
0.5228
 (4627.0/8850.0)
YES
2258.0
8455.0
0.2108
 (2258.0/10713.0)
Total
6481.0
13082.0
0.7336
 (0.7336/19563.0)






    




Confusion Matrix (Act/Pred) @ threshold = 0.6869435993:







    





NO
YES
Error
Rate
NO
8836.0
14.0
0.0016
 (14.0/8850.0)
YES
10597.0
116.0
0.9892
 (10597.0/10713.0)
Total
19433.0
130.0
0.9908
 (0.9908/19563.0)



In [8]:

    
# Show various confusion matrices for validation dataset (based on metric(s))
print gbm.confusion_matrix(metrics="f2", valid=True)

print gbm.confusion_matrix(metrics="precision", valid=True)

cms = gbm.confusion_matrix(metrics=["accuracy", "f0point5"], valid=True)
print cms[0]
print cms[1]









    



Confusion Matrix (Act/Pred) for max f2 @ threshold = 0.398134501203:







    





NO
YES
Error
Rate
NO
73.0
2143.0
0.9671
 (2143.0/2216.0)
YES
12.0
2630.0
0.0045
 (12.0/2642.0)
Total
85.0
4773.0
0.9716
 (0.9716/4858.0)






    




Confusion Matrix (Act/Pred) for max precision @ threshold = 0.676912109388:







    





NO
YES
Error
Rate
NO
2192.0
24.0
0.0108
 (24.0/2216.0)
YES
2488.0
154.0
0.9417
 (2488.0/2642.0)
Total
4680.0
178.0
0.9525
 (0.9525/4858.0)






    




Confusion Matrix (Act/Pred) for max accuracy @ threshold = 0.539970628106:







    





NO
YES
Error
Rate
NO
1308.0
908.0
0.4097
 (908.0/2216.0)
YES
803.0
1839.0
0.3039
 (803.0/2642.0)
Total
2111.0
2747.0
0.7136
 (0.7136/4858.0)






    




Confusion Matrix (Act/Pred) for max f0point5 @ threshold = 0.548587112251:







    





NO
YES
Error
Rate
NO
1412.0
804.0
0.3628
 (804.0/2216.0)
YES
919.0
1723.0
0.3478
 (919.0/2642.0)
Total
2331.0
2527.0
0.7106
 (0.7106/4858.0)



In [9]:

    
# Show various confusion matrices for validation dataset (based on threshold(s))
print gbm.confusion_matrix(thresholds=0.77) 

cms = gbm.confusion_matrix(thresholds=[0.25, 0.33, 0.44])
print cms[0]
print cms[1]
print cms[2]









    



Could not find exact threshold 0.77; using closest threshold found 0.6869435993.

Confusion Matrix (Act/Pred) @ threshold = 0.6869435993:







    





NO
YES
Error
Rate
NO
8836.0
14.0
0.0016
 (14.0/8850.0)
YES
10597.0
116.0
0.9892
 (10597.0/10713.0)
Total
19433.0
130.0
0.9908
 (0.9908/19563.0)






    



Could not find exact threshold 0.25; using closest threshold found 0.383907658296.
Could not find exact threshold 0.33; using closest threshold found 0.383907658296.
Could not find exact threshold 0.44; using closest threshold found 0.439860523006.

Confusion Matrix (Act/Pred) @ threshold = 0.383907658296:







    





NO
YES
Error
Rate
NO
0.0
8850.0
1.0
 (8850.0/8850.0)
YES
0.0
10713.0
0.0
 (0.0/10713.0)
Total
0.0
19563.0
1.0
 (1.0/19563.0)






    




Confusion Matrix (Act/Pred) @ threshold = 0.383907658296:







    





NO
YES
Error
Rate
NO
0.0
8850.0
1.0
 (8850.0/8850.0)
YES
0.0
10713.0
0.0
 (0.0/10713.0)
Total
0.0
19563.0
1.0
 (1.0/19563.0)






    




Confusion Matrix (Act/Pred) @ threshold = 0.439860523006:







    





NO
YES
Error
Rate
NO
2473.0
6377.0
0.7206
 (6377.0/8850.0)
YES
964.0
9749.0
0.09
 (964.0/10713.0)
Total
3437.0
16126.0
0.8106
 (0.8106/19563.0)



In [10]:

    
# Show various confusion matrices for validation dataset (based on metric(s) AND threshold(s))
cms = gbm.confusion_matrix(thresholds=0.77, metrics="f1") 
print cms[0]
print cms[1]

cms = gbm.confusion_matrix(thresholds=[0.25, 0.33], metrics=["f2", "f0point5"])
print cms[0]
print cms[1]
print cms[2]
print cms[3]









    



Could not find exact threshold 0.77; using closest threshold found 0.6869435993.

Confusion Matrix (Act/Pred) @ threshold = 0.6869435993:







    





NO
YES
Error
Rate
NO
8836.0
14.0
0.0016
 (14.0/8850.0)
YES
10597.0
116.0
0.9892
 (10597.0/10713.0)
Total
19433.0
130.0
0.9908
 (0.9908/19563.0)






    




Confusion Matrix (Act/Pred) for max f1 @ threshold = 0.439860523006:







    





NO
YES
Error
Rate
NO
2473.0
6377.0
0.7206
 (6377.0/8850.0)
YES
964.0
9749.0
0.09
 (964.0/10713.0)
Total
3437.0
16126.0
0.8106
 (0.8106/19563.0)






    



Could not find exact threshold 0.25; using closest threshold found 0.383907658296.
Could not find exact threshold 0.33; using closest threshold found 0.383907658296.

Confusion Matrix (Act/Pred) @ threshold = 0.383907658296:







    





NO
YES
Error
Rate
NO
0.0
8850.0
1.0
 (8850.0/8850.0)
YES
0.0
10713.0
0.0
 (0.0/10713.0)
Total
0.0
19563.0
1.0
 (1.0/19563.0)






    




Confusion Matrix (Act/Pred) @ threshold = 0.383907658296:







    





NO
YES
Error
Rate
NO
0.0
8850.0
1.0
 (8850.0/8850.0)
YES
0.0
10713.0
0.0
 (0.0/10713.0)
Total
0.0
19563.0
1.0
 (1.0/19563.0)






    




Confusion Matrix (Act/Pred) for max f2 @ threshold = 0.398529984517:







    





NO
YES
Error
Rate
NO
355.0
8495.0
0.9599
 (8495.0/8850.0)
YES
60.0
10653.0
0.0056
 (60.0/10713.0)
Total
415.0
19148.0
0.9655
 (0.9655/19563.0)






    




Confusion Matrix (Act/Pred) for max f0point5 @ threshold = 0.535870076134:







    





NO
YES
Error
Rate
NO
5141.0
3709.0
0.4191
 (3709.0/8850.0)
YES
3128.0
7585.0
0.292
 (3128.0/10713.0)
Total
8269.0
11294.0
0.7111
 (0.7111/19563.0)



In [11]:

    
# Test dataset
air_test = h2o.import_frame(path=h2o.locate("smalldata/airlines/AirlinesTest.csv.zip"))

# Test performance
gbm_perf = gbm.model_performance(air_test)









    



Parse Progress: [##################################################] 100%
Imported  /Users/ece/0xdata/h2o-dev/smalldata/airlines/AirlinesTest.csv.zip . Parsed 2,691 rows and 12 cols



In [12]:

    
# Show various confusion matrices for test dataset (based on metric(s))
print gbm_perf.confusion_matrix(metrics="f0point5")

print gbm_perf.confusion_matrix(metrics="min_per_class_accuracy")

cms = gbm_perf.confusion_matrix(metrics=["accuracy", "f0point5"])
print cms[0]
print cms[1]









    



Confusion Matrix (Act/Pred) for max f0point5 @ threshold = 0.532779731095:







    





NO
YES
Error
Rate
NO
681.0
536.0
0.4404
 (536.0/1217.0)
YES
400.0
1074.0
0.2714
 (400.0/1474.0)
Total
1081.0
1610.0
0.7118
 (0.7118/2691.0)






    




Confusion Matrix (Act/Pred) for max min_per_class_accuracy @ threshold = 0.554747145173:







    





NO
YES
Error
Rate
NO
776.0
441.0
0.3624
 (441.0/1217.0)
YES
537.0
937.0
0.3643
 (537.0/1474.0)
Total
1313.0
1378.0
0.7267
 (0.7267/2691.0)






    




Confusion Matrix (Act/Pred) for max accuracy @ threshold = 0.523011194769:







    





NO
YES
Error
Rate
NO
648.0
569.0
0.4675
 (569.0/1217.0)
YES
363.0
1111.0
0.2463
 (363.0/1474.0)
Total
1011.0
1680.0
0.7138
 (0.7138/2691.0)






    




Confusion Matrix (Act/Pred) for max f0point5 @ threshold = 0.532779731095:







    





NO
YES
Error
Rate
NO
681.0
536.0
0.4404
 (536.0/1217.0)
YES
400.0
1074.0
0.2714
 (400.0/1474.0)
Total
1081.0
1610.0
0.7118
 (0.7118/2691.0)



In [13]:

    
# Show various confusion matrices for test dataset (based on threshold(s))
print gbm_perf.confusion_matrix(thresholds=0.5) 

cms = gbm_perf.confusion_matrix(thresholds=[0.01, 0.75, .88])
print cms[0]
print cms[1]
print cms[2]









    



Could not find exact threshold 0.5; using closest threshold found 0.500253589848.

Confusion Matrix (Act/Pred) @ threshold = 0.500253589848:







    





NO
YES
Error
Rate
NO
563.0
654.0
0.5374
 (654.0/1217.0)
YES
288.0
1186.0
0.1954
 (288.0/1474.0)
Total
851.0
1840.0
0.7328
 (0.7328/2691.0)






    



Could not find exact threshold 0.01; using closest threshold found 0.383989388848.
Could not find exact threshold 0.75; using closest threshold found 0.686943579735.
Could not find exact threshold 0.88; using closest threshold found 0.686943579735.

Confusion Matrix (Act/Pred) @ threshold = 0.383989388848:







    





NO
YES
Error
Rate
NO
0.0
1217.0
1.0
 (1217.0/1217.0)
YES
0.0
1474.0
0.0
 (0.0/1474.0)
Total
0.0
2691.0
1.0
 (1.0/2691.0)






    




Confusion Matrix (Act/Pred) @ threshold = 0.686943579735:







    





NO
YES
Error
Rate
NO
1215.0
2.0
0.0016
 (2.0/1217.0)
YES
1458.0
16.0
0.9891
 (1458.0/1474.0)
Total
2673.0
18.0
0.9907
 (0.9907/2691.0)






    




Confusion Matrix (Act/Pred) @ threshold = 0.686943579735:







    





NO
YES
Error
Rate
NO
1215.0
2.0
0.0016
 (2.0/1217.0)
YES
1458.0
16.0
0.9891
 (1458.0/1474.0)
Total
2673.0
18.0
0.9907
 (0.9907/2691.0)



In [14]:

    
# Convert a ConfusionMatrix to a python list of lists: [ [tns,fps], [fns,tps] ]
cm = gbm.confusion_matrix()
print cm.to_list()

cm = gbm_perf.confusion_matrix()
print cm.to_list()









    



[[2473, 6377], [964, 9749]]
[[248, 969], [91, 1383]]

H2O cluster uptime:	2 minutes 13 seconds 669 milliseconds
H2O cluster version:	3.1.0.99999
H2O cluster name:	ece
H2O cluster total nodes:	1
H2O cluster total memory:	4.44 GB
H2O cluster total cores:	8
H2O cluster allowed cores:	8
H2O cluster healthy:	True
H2O Connection ip:	127.0.0.1
H2O Connection port:	54321

	NO	YES	Error	Rate
NO	2473.0	6377.0	0.7206	(6377.0/8850.0)
YES	964.0	9749.0	0.09	(964.0/10713.0)
Total	3437.0	16126.0	0.8106	(0.8106/19563.0)

	NO	YES	Error	Rate
NO	355.0	8495.0	0.9599	(8495.0/8850.0)
YES	60.0	10653.0	0.0056	(60.0/10713.0)
Total	415.0	19148.0	0.9655	(0.9655/19563.0)

	NO	YES	Error	Rate
NO	8832.0	18.0	0.002	(18.0/8850.0)
YES	10562.0	151.0	0.9859	(10562.0/10713.0)
Total	19394.0	169.0	0.9879	(0.9879/19563.0)

	NO	YES	Error	Rate
NO	5093.0	3757.0	0.4245	(3757.0/8850.0)
YES	3076.0	7637.0	0.2871	(3076.0/10713.0)
Total	8169.0	11394.0	0.7116	(0.7116/19563.0)

	NO	YES	Error	Rate
NO	5141.0	3709.0	0.4191	(3709.0/8850.0)
YES	3128.0	7585.0	0.292	(3128.0/10713.0)
Total	8269.0	11294.0	0.7111	(0.7111/19563.0)

	NO	YES	Error	Rate
NO	8836.0	14.0	0.0016	(14.0/8850.0)
YES	10597.0	116.0	0.9892	(10597.0/10713.0)
Total	19433.0	130.0	0.9908	(0.9908/19563.0)

	NO	YES	Error	Rate
NO	0.0	8850.0	1.0	(8850.0/8850.0)
YES	0.0	10713.0	0.0	(0.0/10713.0)
Total	0.0	19563.0	1.0	(1.0/19563.0)

	NO	YES	Error	Rate
NO	4223.0	4627.0	0.5228	(4627.0/8850.0)
YES	2258.0	8455.0	0.2108	(2258.0/10713.0)
Total	6481.0	13082.0	0.7336	(0.7336/19563.0)

	NO	YES	Error	Rate
NO	73.0	2143.0	0.9671	(2143.0/2216.0)
YES	12.0	2630.0	0.0045	(12.0/2642.0)
Total	85.0	4773.0	0.9716	(0.9716/4858.0)

	NO	YES	Error	Rate
NO	2192.0	24.0	0.0108	(24.0/2216.0)
YES	2488.0	154.0	0.9417	(2488.0/2642.0)
Total	4680.0	178.0	0.9525	(0.9525/4858.0)

	NO	YES	Error	Rate
NO	1308.0	908.0	0.4097	(908.0/2216.0)
YES	803.0	1839.0	0.3039	(803.0/2642.0)
Total	2111.0	2747.0	0.7136	(0.7136/4858.0)

	NO	YES	Error	Rate
NO	1412.0	804.0	0.3628	(804.0/2216.0)
YES	919.0	1723.0	0.3478	(919.0/2642.0)
Total	2331.0	2527.0	0.7106	(0.7106/4858.0)

	NO	YES	Error	Rate
NO	681.0	536.0	0.4404	(536.0/1217.0)
YES	400.0	1074.0	0.2714	(400.0/1474.0)
Total	1081.0	1610.0	0.7118	(0.7118/2691.0)

	NO	YES	Error	Rate
NO	776.0	441.0	0.3624	(441.0/1217.0)
YES	537.0	937.0	0.3643	(537.0/1474.0)
Total	1313.0	1378.0	0.7267	(0.7267/2691.0)

	NO	YES	Error	Rate
NO	648.0	569.0	0.4675	(569.0/1217.0)
YES	363.0	1111.0	0.2463	(363.0/1474.0)
Total	1011.0	1680.0	0.7138	(0.7138/2691.0)

	NO	YES	Error	Rate
NO	563.0	654.0	0.5374	(654.0/1217.0)
YES	288.0	1186.0	0.1954	(288.0/1474.0)
Total	851.0	1840.0	0.7328	(0.7328/2691.0)

	NO	YES	Error	Rate
NO	0.0	1217.0	1.0	(1217.0/1217.0)
YES	0.0	1474.0	0.0	(0.0/1474.0)
Total	0.0	2691.0	1.0	(1.0/2691.0)

	NO	YES	Error	Rate
NO	1215.0	2.0	0.0016	(2.0/1217.0)
YES	1458.0	16.0	0.9891	(1458.0/1474.0)
Total	2673.0	18.0	0.9907	(0.9907/2691.0)