In [1]:
import h2o
from h2o.estimators.gbm import H2OGradientBoostingEstimator

In [2]:
# Connect to a pre-existing cluster
h2o.init()


H2O cluster uptime: 5 seconds 730 milliseconds
H2O cluster version: 3.7.0.99999
H2O cluster name: spIdea
H2O cluster total nodes: 1
H2O cluster total free memory: 12.44 GB
H2O cluster total cores: 8
H2O cluster allowed cores: 8
H2O cluster healthy: True
H2O Connection ip: 127.0.0.1
H2O Connection port: 54321
H2O Connection proxy: None
Python Version: 3.5.0

In [3]:
from h2o.utils.shared_utils import _locate # private function. used to find files within h2o git project directory.

df = h2o.import_file(path=_locate("smalldata/logreg/prostate.csv"))


Parse Progress: [##################################################] 100%

In [4]:
df.describe()


Rows:380 Cols:9

Chunk compression summary: 
chunk_type chunk_name count count_percentage size size_percentage
CBS Bits 1 11.111112 118 B 2.4210093
C1N 1-Byte Integers (w/o NAs) 5 55.555557 2.2 KB 45.958145
C2 2-Byte Integers 1 11.111112 828 B 16.9881
C2S 2-Byte Fractions 2 22.222223 1.6 KB 34.632744
Frame distribution summary: 
size number_of_rows number_of_chunks_per_column number_of_chunks
172.16.2.84:54321 4.8 KB 380.0 1.0 9.0
mean 4.8 KB 380.0 1.0 9.0
min 4.8 KB 380.0 1.0 9.0
max 4.8 KB 380.0 1.0 9.0
stddev 0 B 0.0 0.0 0.0
total 4.8 KB 380.0 1.0 9.0

ID CAPSULE AGE RACE DPROS DCAPS PSA VOL GLEASON
type int int int int int int real real int
mins 1.0 0.0 43.0 0.0 1.0 1.0 0.3 0.0 0.0
mean 190.5 0.402631578947368466.039473684210491.08684210526315722.27105263157894881.107894736842104815.40863157894737515.8129210526315736.3842105263157904
maxs 380.0 1.0 79.0 2.0 4.0 2.0 139.7000000000000297.60000000000001 9.0
sigma 109.840793879141270.49107433896305526.5270712691733110.30877325802527931.00010761815028610.310656449351493919.99757266856046 18.3476199672711751.0919533744261092
zeros 0 227 0 3 0 0 0 167 2
missing0 0 0 0 0 0 0 0 0
0 1.0 0.0 65.0 1.0 2.0 1.0 1.40000000000000010.0 6.0
1 2.0 0.0 72.0 1.0 3.0 2.0 6.7 0.0 7.0
2 3.0 0.0 70.0 1.0 1.0 2.0 4.9 0.0 6.0
3 4.0 0.0 76.0 2.0 2.0 1.0 51.2 20.0 7.0
4 5.0 0.0 69.0 1.0 1.0 1.0 12.3 55.9 6.0
5 6.0 1.0 71.0 1.0 3.0 2.0 3.30000000000000030.0 8.0
6 7.0 0.0 68.0 2.0 4.0 2.0 31.9000000000000020.0 7.0
7 8.0 0.0 61.0 2.0 4.0 2.0 66.7 27.2 7.0
8 9.0 0.0 69.0 1.0 1.0 1.0 3.9 24.0 7.0
9 10.0 0.0 68.0 2.0 1.0 2.0 13.0 0.0 6.0

In [5]:
# Remove ID from training frame
train = df.drop("ID")

In [6]:
# For VOL & GLEASON, a zero really means "missing"
vol = train['VOL']
vol[vol == 0] = None
gle = train['GLEASON']
gle[gle == 0] = None

In [7]:
# Convert CAPSULE to a logical factor
train['CAPSULE'] = train['CAPSULE'].asfactor()

In [8]:
# See that the data is ready
train.describe()


Rows:380 Cols:8

Chunk compression summary: 
chunk_type chunk_name count count_percentage size size_percentage
CBS Bits 1 12.5 118 B 2.9164608
C1N 1-Byte Integers (w/o NAs) 5 62.5 2.2 KB 55.363323
C2S 2-Byte Fractions 2 25.0 1.6 KB 41.72022
Frame distribution summary: 
size number_of_rows number_of_chunks_per_column number_of_chunks
172.16.2.84:54321 4.0 KB 380.0 1.0 8.0
mean 4.0 KB 380.0 1.0 8.0
min 4.0 KB 380.0 1.0 8.0
max 4.0 KB 380.0 1.0 8.0
stddev 0 B 0.0 0.0 0.0
total 4.0 KB 380.0 1.0 8.0

CAPSULE AGE RACE DPROS DCAPS PSA VOL GLEASON
type enum int int int int real real int
mins 0.0 43.0 0.0 1.0 1.0 0.3 0.0 0.0
mean 0.402631578947368466.039473684210491.08684210526315722.27105263157894881.107894736842104815.40863157894737515.8129210526315736.3842105263157904
maxs 1.0 79.0 2.0 4.0 2.0 139.7000000000000297.60000000000001 9.0
sigma 0.49107433896305526.5270712691733110.30877325802527931.00010761815028610.310656449351493919.99757266856046 18.3476199672711751.0919533744261092
zeros 227 0 3 0 0 0 167 2
missing0 0 0 0 0 0 0 0
0 0 65.0 1.0 2.0 1.0 1.40000000000000010.0 6.0
1 0 72.0 1.0 3.0 2.0 6.7 0.0 7.0
2 0 70.0 1.0 1.0 2.0 4.9 0.0 6.0
3 0 76.0 2.0 2.0 1.0 51.2 20.0 7.0
4 0 69.0 1.0 1.0 1.0 12.3 55.9 6.0
5 1 71.0 1.0 3.0 2.0 3.30000000000000030.0 8.0
6 0 68.0 2.0 4.0 2.0 31.9000000000000020.0 7.0
7 0 61.0 2.0 4.0 2.0 66.7 27.2 7.0
8 0 69.0 1.0 1.0 1.0 3.9 24.0 7.0
9 0 68.0 2.0 1.0 2.0 13.0 0.0 6.0

In [9]:
# Run GBM
my_gbm = H2OGradientBoostingEstimator(distribution = "bernoulli", ntrees=50, learn_rate=0.1)

my_gbm.train(x=list(range(1,train.ncol)), y="CAPSULE", training_frame=train, validation_frame=train)


gbm Model Build Progress: [##################################################] 100%

In [10]:
my_gbm_metrics = my_gbm.model_performance(train)
my_gbm_metrics.show()


ModelMetricsBinomial: gbm
** Reported on test data. **

MSE: 0.07584147467507414
R^2: 0.6846762562816877
LogLoss: 0.2744668128481441
AUC: 0.9780311537243385
Gini: 0.9560623074486769

Confusion Matrix (Act/Pred) for max f1 @ threshold = 0.4549496668047897: 
0 1 Error Rate
0 216.0 11.0 0.0485 (11.0/227.0)
1 14.0 139.0 0.0915 (14.0/153.0)
Total 230.0 150.0 0.0658 (25.0/380.0)
Maximum Metrics: Maximum metrics at their respective thresholds

metric threshold value idx
max f1 0.4549497 0.9174917 149.0
max f2 0.3032010 0.9394314 196.0
max f0point5 0.4728313 0.9244265 146.0
max accuracy 0.4549497 0.9342105 149.0
max precision 0.9747938 1.0 0.0
max absolute_MCC 0.4549497 0.8629130 149.0
max min_per_class_accuracy 0.4373995 0.9215686 156.0
Gains/Lift Table: Avg response rate: 40.26 %

group lower_threshold cumulative_data_fraction response_rate cumulative_response_rate capture_rate cumulative_capture_rate lift cumulative_lift gain cumulative_gain
1 0.9405750 0.05 1.0 1.0 0.1241830 0.1241830 2.4836601 2.4836601 148.3660131 148.3660131
2 0.8921980 0.1 1.0 1.0 0.1241830 0.2483660 2.4836601 2.4836601 148.3660131 148.3660131
3 0.8263695 0.15 1.0 1.0 0.1241830 0.3725490 2.4836601 2.4836601 148.3660131 148.3660131
4 0.7595460 0.2 0.9473684 0.9868421 0.1176471 0.4901961 2.3529412 2.4509804 135.2941176 145.0980392
5 0.7081926 0.25 1.0 0.9894737 0.1241830 0.6143791 2.4836601 2.4575163 148.3660131 145.7516340
6 0.6364312 0.3 0.8947368 0.9736842 0.1111111 0.7254902 2.2222222 2.4183007 122.2222222 141.8300654
7 0.5478651 0.35 0.6842105 0.9323308 0.0849673 0.8104575 1.6993464 2.3155929 69.9346405 131.5592904
8 0.4499827 0.4 0.7894737 0.9144737 0.0980392 0.9084967 1.9607843 2.2712418 96.0784314 127.1241830
9 0.3927870 0.45 0.2105263 0.8362573 0.0261438 0.9346405 0.5228758 2.0769789 -47.7124183 107.6978940
10 0.3207657 0.5 0.3157895 0.7842105 0.0392157 0.9738562 0.7843137 1.9477124 -21.5686275 94.7712418
11 0.2425744 0.55 0.1578947 0.7272727 0.0196078 0.9934641 0.3921569 1.8062983 -60.7843137 80.6298277
12 0.1977616 0.6 0.0 0.6666667 0.0 0.9934641 0.0 1.6557734 -100.0 65.5773420
13 0.1586941 0.65 0.0526316 0.6194332 0.0065359 1.0 0.1307190 1.5384615 -86.9281046 53.8461538
14 0.1353591 0.7 0.0 0.5751880 0.0 1.0 0.0 1.4285714 -100.0 42.8571429
15 0.1094101 0.75 0.0 0.5368421 0.0 1.0 0.0 1.3333333 -100.0 33.3333333
16 0.0923828 0.8 0.0 0.5032895 0.0 1.0 0.0 1.25 -100.0 25.0
17 0.0665933 0.85 0.0 0.4736842 0.0 1.0 0.0 1.1764706 -100.0 17.6470588
18 0.0477968 0.9 0.0 0.4473684 0.0 1.0 0.0 1.1111111 -100.0 11.1111111
19 0.0276973 0.95 0.0 0.4238227 0.0 1.0 0.0 1.0526316 -100.0 5.2631579
20 0.0125566 1.0 0.0 0.4026316 0.0 1.0 0.0 1.0 -100.0 0.0