In [1]:
import h2o

In [2]:
# Connect to a pre-existing cluster
h2o.init()


H2O cluster uptime: 15 minutes 45 seconds 666 milliseconds
H2O cluster version: 3.5.0.99999
H2O cluster name: ece
H2O cluster total nodes: 1
H2O cluster total memory: 10.67 GB
H2O cluster total cores: 8
H2O cluster allowed cores: 8
H2O cluster healthy: True
H2O Connection ip: 127.0.0.1
H2O Connection port: 54321

In [3]:
from h2o.utils.shared_utils import _locate # private function. used to find files within h2o git project directory.

df = h2o.import_file(path=_locate("smalldata/logreg/prostate.csv"))


Parse Progress: [##################################################] 100%
Imported /Users/ece/0xdata/h2o-dev/smalldata/logreg/prostate.csv. Parsed 380 rows and 9 cols

In [4]:
df.describe()


Rows: 380 Cols: 9

Chunk compression summary:
chunk_type chunk_name count count_percentage size size_percentage
CBS Bits 1 11.111112 118 B 2.4210093
C1N 1-Byte Integers (w/o NAs) 5 55.555557 2.2 KB 45.958145
C2 2-Byte Integers 1 11.111112 828 B 16.9881
C2S 2-Byte Fractions 2 22.222223 1.6 KB 34.632744
Frame distribution summary:
size number_of_rows number_of_chunks_per_column number_of_chunks
10.0.0.24:54321 4.8 KB 380.0 1.0 9.0
mean 4.8 KB 380.0 1.0 9.0
min 4.8 KB 380.0 1.0 9.0
max 4.8 KB 380.0 1.0 9.0
stddev 0 B 0.0 0.0 0.0
total 4.8 KB 380.0 1.0 9.0
Column-by-Column Summary:

ID CAPSULE AGE RACE DPROS DCAPS PSA VOL GLEASON
type int int int int int int real real int
mins 1.0 0.0 43.0 0.0 1.0 1.0 0.3 0.0 0.0
maxs 380.0 1.0 79.0 2.0 4.0 2.0 139.7 97.6 9.0
mean 190.5 0.402631578947 66.0394736842 1.08684210526 2.27105263158 1.10789473684 15.4086315789 15.8129210526 6.38421052632
sigma 109.840793879 0.491074338963 6.52707126917 0.308773258025 1.00010761815 0.310656449351 19.9975726686 18.3476199673 1.09195337443
zero_count 0 227 0 3 0 0 0 167 2
missing_count 0 0 0 0 0 0 0 0 0

In [5]:
# Remove ID from training frame
train = df.drop("ID")

In [6]:
# For VOL & GLEASON, a zero really means "missing"
vol = train['VOL']
vol[vol == 0] = None
gle = train['GLEASON']
gle[gle == 0] = None

In [7]:
# Convert CAPSULE to a logical factor
train['CAPSULE'] = train['CAPSULE'].asfactor()

In [8]:
# See that the data is ready
train.describe()


Rows: 380 Cols: 8

Chunk compression summary:
chunk_type chunk_name count count_percentage size size_percentage
CBS Bits 1 12.5 118 B 2.9164608
C1N 1-Byte Integers (w/o NAs) 5 62.5 2.2 KB 55.363323
C2S 2-Byte Fractions 2 25.0 1.6 KB 41.72022
Frame distribution summary:
size number_of_rows number_of_chunks_per_column number_of_chunks
10.0.0.24:54321 4.0 KB 380.0 1.0 8.0
mean 4.0 KB 380.0 1.0 8.0
min 4.0 KB 380.0 1.0 8.0
max 4.0 KB 380.0 1.0 8.0
stddev 0 B 0.0 0.0 0.0
total 4.0 KB 380.0 1.0 8.0
Column-by-Column Summary:

CAPSULE AGE RACE DPROS DCAPS PSA VOL GLEASON
type enum int int int int real real int
mins 0.0 43.0 0.0 1.0 1.0 0.3 0.0 0.0
maxs 1.0 79.0 2.0 4.0 2.0 139.7 97.6 9.0
mean 0.402631578947 66.0394736842 1.08684210526 2.27105263158 1.10789473684 15.4086315789 15.8129210526 6.38421052632
sigma 0.491074338963 6.52707126917 0.308773258025 1.00010761815 0.310656449351 19.9975726686 18.3476199673 1.09195337443
zero_count 227 0 3 0 0 0 167 2
missing_count 0 0 0 0 0 0 0 0

In [9]:
# Run GBM
my_gbm = h2o.gbm(           y=train["CAPSULE"],
                 validation_y=train["CAPSULE"],
                            x=train[1:],
                 validation_x=train[1:],
                 distribution = "bernoulli",
                 ntrees=50,
                 learn_rate=0.1)


gbm Model Build Progress: [##################################################] 100%

In [10]:
my_gbm_metrics = my_gbm.model_performance(train)
my_gbm_metrics.show()


ModelMetricsBinomial: gbm
** Reported on test data. **

MSE: 0.0758414746751
R^2: 0.684676256282
LogLoss: 0.274466812848
AUC: 0.978031153724
Gini: 0.956062307449

Confusion Matrix (Act/Pred) for max f1 @ threshold = 0.454949666805:
0 1 Error Rate
0 216.0 11.0 0.0485 (11.0/227.0)
1 14.0 139.0 0.0915 (14.0/153.0)
Total 230.0 150.0 0.0658 (25.0/380.0)
Maximum Metrics: Maximum metrics at their respective thresholds

metric threshold value idx
max f1 0.454949666805 0.917491749175 149.0
max f2 0.303200968061 0.939431396786 196.0
max f0point5 0.472831330673 0.924426450742 146.0
max accuracy 0.454949666805 0.934210526316 149.0
max precision 0.974793768297 1.0 0.0
max absolute_MCC 0.454949666805 0.862913038286 149.0
max min_per_class_accuracy 0.437399498018 0.921568627451 156.0