In [1]:
import h2o

In [2]:
h2o.init()


H2O cluster uptime: 3 minutes 37 seconds 60 milliseconds
H2O cluster version: 3.5.0.99999
H2O cluster name: ece
H2O cluster total nodes: 1
H2O cluster total memory: 10.67 GB
H2O cluster total cores: 8
H2O cluster allowed cores: 8
H2O cluster healthy: True
H2O Connection ip: 127.0.0.1
H2O Connection port: 54321

In [3]:
from h2o.h2o import _locate # private function. used to find files within h2o git project directory.

# Airlines dataset
air = h2o.import_file(path=_locate("smalldata/airlines/AirlinesTrain.csv.zip"))


Parse Progress: [##################################################] 100%
Imported /Users/ece/0xdata/h2o-dev/smalldata/airlines/AirlinesTrain.csv.zip. Parsed 24,421 rows and 12 cols

In [4]:
# Construct validation and training datasets by sampling (20/80)
r = air[0].runif()
air_train = air[r < 0.8]
air_valid = air[r >= 0.8]

myX = ["Origin", "Dest", "Distance", "UniqueCarrier", "fMonth", "fDayofMonth", "fDayOfWeek"]
myY = "IsDepDelayed"

In [5]:
# Build gbm
gbm = h2o.gbm(x=air_train[myX], 
              y=air_train[myY], 
              validation_x=air_valid[myX],
              validation_y=air_valid[myY],
              distribution="bernoulli", 
              ntrees=100, 
              max_depth=3, 
              learn_rate=0.01)


gbm Model Build Progress: [##################################################] 100%

In [6]:
# Show various confusion matrices for training dataset (based on metric(s))
print gbm.confusion_matrix() # maximum f1 threshold chosen by default

print gbm.confusion_matrix(metrics="f2")

print gbm.confusion_matrix(metrics="precision")

cms = gbm.confusion_matrix(metrics=["accuracy", "f0point5"])
print cms[0]
print cms[1]


Confusion Matrix (Act/Pred) for max f1 @ threshold = 0.459414927132:
NO YES Error Rate
NO 3220.0 5687.0 0.6385 (5687.0/8907.0)
YES 1357.0 9404.0 0.1261 (1357.0/10761.0)
Total 4577.0 15091.0 0.3581 (7044.0/19668.0)

Confusion Matrix (Act/Pred) for max f2 @ threshold = 0.381066670162:
NO YES Error Rate
NO 127.0 8780.0 0.9857 (8780.0/8907.0)
YES 21.0 10740.0 0.002 (21.0/10761.0)
Total 148.0 19520.0 0.4475 (8801.0/19668.0)

Confusion Matrix (Act/Pred) for max precision @ threshold = 0.684200366538:
NO YES Error Rate
NO 8892.0 15.0 0.0017 (15.0/8907.0)
YES 10640.0 121.0 0.9888 (10640.0/10761.0)
Total 19532.0 136.0 0.5417 (10655.0/19668.0)

Confusion Matrix (Act/Pred) for max accuracy @ threshold = 0.505407976974:
NO YES Error Rate
NO 4678.0 4229.0 0.4748 (4229.0/8907.0)
YES 2554.0 8207.0 0.2373 (2554.0/10761.0)
Total 7232.0 12436.0 0.3449 (6783.0/19668.0)

Confusion Matrix (Act/Pred) for max f0point5 @ threshold = 0.552458757447:
NO YES Error Rate
NO 5693.0 3214.0 0.3608 (3214.0/8907.0)
YES 3595.0 7166.0 0.3341 (3595.0/10761.0)
Total 9288.0 10380.0 0.3462 (6809.0/19668.0)


In [7]:
# Show various confusion matrices for training dataset (based on threshold(s))
print gbm.confusion_matrix(thresholds=0.77) 

cms = gbm.confusion_matrix(thresholds=[0.1, 0.5, 0.99])
print cms[0]
print cms[1]
print cms[2]


Could not find exact threshold 0.77; using closest threshold found 0.684200366538.

Confusion Matrix (Act/Pred) @ threshold = 0.684200366538:
NO YES Error Rate
NO 8892.0 15.0 0.0017 (15.0/8907.0)
YES 10640.0 121.0 0.9888 (10640.0/10761.0)
Total 19532.0 136.0 0.5417 (10655.0/19668.0)
Could not find exact threshold 0.1; using closest threshold found 0.376167626413.
Could not find exact threshold 0.5; using closest threshold found 0.499996937724.
Could not find exact threshold 0.99; using closest threshold found 0.684200366538.

Confusion Matrix (Act/Pred) @ threshold = 0.376167626413:
NO YES Error Rate
NO 0.0 8907.0 1.0 (8907.0/8907.0)
YES 0.0 10761.0 0.0 (0.0/10761.0)
Total 0.0 19668.0 0.4529 (8907.0/19668.0)

Confusion Matrix (Act/Pred) @ threshold = 0.499996937724:
NO YES Error Rate
NO 4483.0 4424.0 0.4967 (4424.0/8907.0)
YES 2411.0 8350.0 0.224 (2411.0/10761.0)
Total 6894.0 12774.0 0.3475 (6835.0/19668.0)

Confusion Matrix (Act/Pred) @ threshold = 0.684200366538:
NO YES Error Rate
NO 8892.0 15.0 0.0017 (15.0/8907.0)
YES 10640.0 121.0 0.9888 (10640.0/10761.0)
Total 19532.0 136.0 0.5417 (10655.0/19668.0)


In [8]:
# Show various confusion matrices for validation dataset (based on metric(s))
print gbm.confusion_matrix(metrics="f2", valid=True)

print gbm.confusion_matrix(metrics="precision", valid=True)

cms = gbm.confusion_matrix(metrics=["accuracy", "f0point5"], valid=True)
print cms[0]
print cms[1]


Confusion Matrix (Act/Pred) for max f2 @ threshold = 0.389262598446:
NO YES Error Rate
NO 52.0 2107.0 0.9759 (2107.0/2159.0)
YES 4.0 2590.0 0.0015 (4.0/2594.0)
Total 56.0 4697.0 0.4441 (2111.0/4753.0)

Confusion Matrix (Act/Pred) for max precision @ threshold = 0.684273290486:
NO YES Error Rate
NO 2159.0 0.0 0.0 (0.0/2159.0)
YES 2578.0 16.0 0.9938 (2578.0/2594.0)
Total 4737.0 16.0 0.5424 (2578.0/4753.0)

Confusion Matrix (Act/Pred) for max accuracy @ threshold = 0.522340383226:
NO YES Error Rate
NO 1169.0 990.0 0.4585 (990.0/2159.0)
YES 684.0 1910.0 0.2637 (684.0/2594.0)
Total 1853.0 2900.0 0.3522 (1674.0/4753.0)

Confusion Matrix (Act/Pred) for max accuracy @ threshold = 0.522340383226:
NO YES Error Rate
NO 1169.0 990.0 0.4585 (990.0/2159.0)
YES 684.0 1910.0 0.2637 (684.0/2594.0)
Total 1853.0 2900.0 0.3522 (1674.0/4753.0)


In [9]:
# Show various confusion matrices for validation dataset (based on threshold(s))
print gbm.confusion_matrix(thresholds=0.77) 

cms = gbm.confusion_matrix(thresholds=[0.25, 0.33, 0.44])
print cms[0]
print cms[1]
print cms[2]


Could not find exact threshold 0.77; using closest threshold found 0.684200366538.

Confusion Matrix (Act/Pred) @ threshold = 0.684200366538:
NO YES Error Rate
NO 8892.0 15.0 0.0017 (15.0/8907.0)
YES 10640.0 121.0 0.9888 (10640.0/10761.0)
Total 19532.0 136.0 0.5417 (10655.0/19668.0)
Could not find exact threshold 0.25; using closest threshold found 0.376167626413.
Could not find exact threshold 0.33; using closest threshold found 0.376167626413.
Could not find exact threshold 0.44; using closest threshold found 0.440166828406.

Confusion Matrix (Act/Pred) @ threshold = 0.376167626413:
NO YES Error Rate
NO 0.0 8907.0 1.0 (8907.0/8907.0)
YES 0.0 10761.0 0.0 (0.0/10761.0)
Total 0.0 19668.0 0.4529 (8907.0/19668.0)

Confusion Matrix (Act/Pred) @ threshold = 0.376167626413:
NO YES Error Rate
NO 0.0 8907.0 1.0 (8907.0/8907.0)
YES 0.0 10761.0 0.0 (0.0/10761.0)
Total 0.0 19668.0 0.4529 (8907.0/19668.0)

Confusion Matrix (Act/Pred) @ threshold = 0.440166828406:
NO YES Error Rate
NO 2299.0 6608.0 0.7419 (6608.0/8907.0)
YES 919.0 9842.0 0.0854 (919.0/10761.0)
Total 3218.0 16450.0 0.3827 (7527.0/19668.0)


In [10]:
# Show various confusion matrices for validation dataset (based on metric(s) AND threshold(s))
cms = gbm.confusion_matrix(thresholds=0.77, metrics="f1") 
print cms[0]
print cms[1]

cms = gbm.confusion_matrix(thresholds=[0.25, 0.33], metrics=["f2", "f0point5"])
print cms[0]
print cms[1]
print cms[2]
print cms[3]


Could not find exact threshold 0.77; using closest threshold found 0.684200366538.

Confusion Matrix (Act/Pred) @ threshold = 0.684200366538:
NO YES Error Rate
NO 8892.0 15.0 0.0017 (15.0/8907.0)
YES 10640.0 121.0 0.9888 (10640.0/10761.0)
Total 19532.0 136.0 0.5417 (10655.0/19668.0)

Confusion Matrix (Act/Pred) for max f1 @ threshold = 0.459414927132:
NO YES Error Rate
NO 3220.0 5687.0 0.6385 (5687.0/8907.0)
YES 1357.0 9404.0 0.1261 (1357.0/10761.0)
Total 4577.0 15091.0 0.3581 (7044.0/19668.0)
Could not find exact threshold 0.25; using closest threshold found 0.376167626413.
Could not find exact threshold 0.33; using closest threshold found 0.376167626413.

Confusion Matrix (Act/Pred) @ threshold = 0.376167626413:
NO YES Error Rate
NO 0.0 8907.0 1.0 (8907.0/8907.0)
YES 0.0 10761.0 0.0 (0.0/10761.0)
Total 0.0 19668.0 0.4529 (8907.0/19668.0)

Confusion Matrix (Act/Pred) @ threshold = 0.376167626413:
NO YES Error Rate
NO 0.0 8907.0 1.0 (8907.0/8907.0)
YES 0.0 10761.0 0.0 (0.0/10761.0)
Total 0.0 19668.0 0.4529 (8907.0/19668.0)

Confusion Matrix (Act/Pred) for max f2 @ threshold = 0.381066670162:
NO YES Error Rate
NO 127.0 8780.0 0.9857 (8780.0/8907.0)
YES 21.0 10740.0 0.002 (21.0/10761.0)
Total 148.0 19520.0 0.4475 (8801.0/19668.0)

Confusion Matrix (Act/Pred) for max f0point5 @ threshold = 0.552458757447:
NO YES Error Rate
NO 5693.0 3214.0 0.3608 (3214.0/8907.0)
YES 3595.0 7166.0 0.3341 (3595.0/10761.0)
Total 9288.0 10380.0 0.3462 (6809.0/19668.0)


In [11]:
# Test dataset
air_test = h2o.import_file(path=_locate("smalldata/airlines/AirlinesTest.csv.zip"))

# Test performance
gbm_perf = gbm.model_performance(air_test)


Parse Progress: [##################################################] 100%
Imported /Users/ece/0xdata/h2o-dev/smalldata/airlines/AirlinesTest.csv.zip. Parsed 2,691 rows and 12 cols

In [12]:
# Show various confusion matrices for test dataset (based on metric(s))
print gbm_perf.confusion_matrix(metrics="f0point5")

print gbm_perf.confusion_matrix(metrics="min_per_class_accuracy")

cms = gbm_perf.confusion_matrix(metrics=["accuracy", "f0point5"])
print cms[0]
print cms[1]


Confusion Matrix (Act/Pred) for max f0point5 @ threshold = 0.552672913274:
NO YES Error Rate
NO 759.0 458.0 0.3763 (458.0/1217.0)
YES 478.0 996.0 0.3243 (478.0/1474.0)
Total 1237.0 1454.0 0.3478 (936.0/2691.0)

Confusion Matrix (Act/Pred) for max min_per_class_accuracy @ threshold = 0.55367524629:
NO YES Error Rate
NO 786.0 431.0 0.3541 (431.0/1217.0)
YES 524.0 950.0 0.3555 (524.0/1474.0)
Total 1310.0 1381.0 0.3549 (955.0/2691.0)

Confusion Matrix (Act/Pred) for max accuracy @ threshold = 0.54604440874:
NO YES Error Rate
NO 738.0 479.0 0.3936 (479.0/1217.0)
YES 454.0 1020.0 0.308 (454.0/1474.0)
Total 1192.0 1499.0 0.3467 (933.0/2691.0)

Confusion Matrix (Act/Pred) for max f0point5 @ threshold = 0.552672913274:
NO YES Error Rate
NO 759.0 458.0 0.3763 (458.0/1217.0)
YES 478.0 996.0 0.3243 (478.0/1474.0)
Total 1237.0 1454.0 0.3478 (936.0/2691.0)


In [13]:
# Show various confusion matrices for test dataset (based on threshold(s))
print gbm_perf.confusion_matrix(thresholds=0.5) 

cms = gbm_perf.confusion_matrix(thresholds=[0.01, 0.75, .88])
print cms[0]
print cms[1]
print cms[2]


Could not find exact threshold 0.5; using closest threshold found 0.500410605232.

Confusion Matrix (Act/Pred) @ threshold = 0.500410605232:
NO YES Error Rate
NO 577.0 640.0 0.5259 (640.0/1217.0)
YES 312.0 1162.0 0.2117 (312.0/1474.0)
Total 889.0 1802.0 0.3538 (952.0/2691.0)
Could not find exact threshold 0.01; using closest threshold found 0.376146323381.
Could not find exact threshold 0.75; using closest threshold found 0.684273290486.
Could not find exact threshold 0.88; using closest threshold found 0.684273290486.

Confusion Matrix (Act/Pred) @ threshold = 0.376146323381:
NO YES Error Rate
NO 0.0 1217.0 1.0 (1217.0/1217.0)
YES 0.0 1474.0 0.0 (0.0/1474.0)
Total 0.0 2691.0 0.4522 (1217.0/2691.0)

Confusion Matrix (Act/Pred) @ threshold = 0.684273290486:
NO YES Error Rate
NO 1215.0 2.0 0.0016 (2.0/1217.0)
YES 1461.0 13.0 0.9912 (1461.0/1474.0)
Total 2676.0 15.0 0.5437 (1463.0/2691.0)

Confusion Matrix (Act/Pred) @ threshold = 0.684273290486:
NO YES Error Rate
NO 1215.0 2.0 0.0016 (2.0/1217.0)
YES 1461.0 13.0 0.9912 (1461.0/1474.0)
Total 2676.0 15.0 0.5437 (1463.0/2691.0)


In [14]:
# Convert a ConfusionMatrix to a python list of lists: [ [tns,fps], [fns,tps] ]
cm = gbm.confusion_matrix()
print cm.to_list()

cm = gbm_perf.confusion_matrix()
print cm.to_list()


[[3220, 5687], [1357, 9404]]
[[394, 823], [175, 1299]]