In [1]:
import h2o

In [2]:
# Attach to an H2O cluster. In this run the cluster was already up (the
# banner below reports several minutes of uptime) and was reached at
# 127.0.0.1:54321.
# NOTE(review): with no arguments this presumably targets the default local
# address -- confirm against the h2o version in use.
h2o.init()


H2O cluster uptime: 4 minutes 56 seconds 274 milliseconds
H2O cluster version: 3.1.0.99999
H2O cluster name: ece
H2O cluster total nodes: 1
H2O cluster total memory: 4.44 GB
H2O cluster total cores: 8
H2O cluster allowed cores: 8
H2O cluster healthy: True
H2O Connection ip: 127.0.0.1
H2O Connection port: 54321

In [3]:
# Resolve the prostate CSV to a concrete path first (in this run it resolves
# into the local h2o checkout), then have the cluster parse it into a frame.
prostate_csv = h2o.locate("smalldata/logreg/prostate.csv")
df = h2o.import_file(path=prostate_csv)


Parse Progress: [##################################################] 100%
Imported  /Users/ece/0xdata/h2o-dev/smalldata/logreg/prostate.csv . Parsed 380 rows and 9 cols

In [4]:
# Inspect the raw frame: shape, chunk layout and per-column statistics.
# Worth noting in the zero_count row: VOL and GLEASON contain zeros.
df.describe()


Rows: 380 Cols: 9

Chunk compression summary:

chunk_type chunk_name count count_percentage size size_percentage
CBS Bits 1 11.111112 118 B 2.4210093
C1N 1-Byte Integers (w/o NAs) 5 55.555557 2.2 KB 45.958145
C2 2-Byte Integers 1 11.111112 828 B 16.9881
C2S 2-Byte Fractions 2 22.222223 1.6 KB 34.632744
Frame distribution summary:

size number_of_rows number_of_chunks_per_column number_of_chunks
172.16.2.41:54321 4.8 KB 380.0 1.0 9.0
mean 4.8 KB 380.0 1.0 9.0
min 4.8 KB 380.0 1.0 9.0
max 4.8 KB 380.0 1.0 9.0
stddev 0 B 0.0 0.0 0.0
total 4.8 KB 380.0 1.0 9.0
Column-by-Column Summary:

ID CAPSULE AGE RACE DPROS DCAPS PSA VOL GLEASON
type int int int int int int real real int
mins 1.0 0.0 43.0 0.0 1.0 1.0 0.3 0.0 0.0
mean 190.5 0.402631578947 66.0394736842 1.08684210526 2.27105263158 1.10789473684 15.4086315789 15.8129210526 6.38421052632
maxs 380.0 1.0 79.0 2.0 4.0 2.0 139.7 97.6 9.0
sigma 109.840793879 0.491074338963 6.52707126917 0.308773258025 1.00010761815 0.310656449351 19.9975726686 18.3476199673 1.09195337443
zero_count 0 227 0 3 0 0 0 167 2
missing_count 0 0 0 0 0 0 0 0 0

In [5]:
# Remove ID from training frame: it appears to be just a row identifier
# (1..380 in the summary above) and should not be offered to the model as a
# predictor. The reduced frame is bound to `train`.
train = df.drop("ID")

In [6]:
# For VOL & GLEASON, a zero really means "missing", so replace the zeros
# with NA (None) instead of letting them count as real measurements.
# The assignments go through the column handles `vol` / `gle`; in this run
# they are reflected in `train` (the following describe() reports 167 and 2
# missing values for these two columns).
# NOTE(review): this relies on train[...] handing back something that writes
# through to the parent frame -- confirm that still holds on the h2o version
# you run.
vol = train['VOL']
vol[vol == 0] = None
gle = train['GLEASON']
gle[gle == 0] = None

In [7]:
# Convert CAPSULE (values 0/1) to a factor so it is handled as a categorical
# column rather than an integer; describe() reports its type as `enum`
# afterwards.
train['CAPSULE'] = train['CAPSULE'].asfactor()

In [8]:
# See that the data is ready: ID should be gone, CAPSULE should be an enum,
# and the former zeros in VOL / GLEASON should now show up under
# missing_count instead of zero_count.
train.describe()


Rows: 380 Cols: 8

Chunk compression summary:

chunk_type chunk_name count count_percentage size size_percentage
CBS Bits 1 12.5 118 B 2.9164608
C1 1-Byte Integers 1 12.5 448 B 11.072664
C1N 1-Byte Integers (w/o NAs) 4 50.0 1.8 KB 44.290657
C2S 2-Byte Fractions 2 25.0 1.6 KB 41.72022
Frame distribution summary:

size number_of_rows number_of_chunks_per_column number_of_chunks
172.16.2.41:54321 4.0 KB 380.0 1.0 8.0
mean 4.0 KB 380.0 1.0 8.0
min 4.0 KB 380.0 1.0 8.0
max 4.0 KB 380.0 1.0 8.0
stddev 0 B 0.0 0.0 0.0
total 4.0 KB 380.0 1.0 8.0
Column-by-Column Summary:

CAPSULE AGE RACE DPROS DCAPS PSA VOL GLEASON
type enum int int int int real real int
mins 0.0 43.0 0.0 1.0 1.0 0.3 1.92 4.0
mean 0.402631578947 66.0394736842 1.08684210526 2.27105263158 1.10789473684 15.4086315789 28.2108450704 6.41798941799
maxs 1.0 79.0 2.0 4.0 2.0 139.7 97.6 9.0
sigma 0.491074338963 6.52707126917 0.308773258025 1.00010761815 0.310656449351 19.9975726686 15.824317841 0.990616012776
zero_count 227 0 3 0 0 0 0 0
missing_count 0 0 0 0 0 0 167 2

In [9]:
# Run GBM: fit a bernoulli (binary) gradient boosting model for CAPSULE.
# NOTE: the validation arguments are exactly the training columns, so any
# validation metrics reported for this model are really training metrics.
# NOTE(review): train[1:] is assumed to select every column after the first
# (CAPSULE is column 0 in the describe() output) -- confirm.
gbm_settings = {
    "distribution": "bernoulli",
    "ntrees": 50,
    "learn_rate": 0.1,
}
my_gbm = h2o.gbm(
    y=train["CAPSULE"],
    validation_y=train["CAPSULE"],
    x=train[1:],
    validation_x=train[1:],
    **gbm_settings
)


gbm Model Build Progress: [##################################################] 100%

In [10]:
# Score the model on `train` -- the same frame it was fitted on -- so these
# are in-sample metrics and will look optimistic, despite the
# "Reported on test data" banner printed in the output.
my_gbm_metrics = my_gbm.model_performance(train)
my_gbm_metrics.show()


ModelMetricsBinomial: gbm
** Reported on test data. **

MSE: 0.0744435128035
R^2: 0.691303022834
LogLoss: 0.270431302856
AUC: 0.980550516829
Gini: 0.961101033659

Confusion Matrix (Act/Pred) for max f1 @ threshold = 0.452447834853:

0 1 Error Rate
0 214.0 13.0 0.0573 (13.0/227.0)
1 14.0 139.0 0.0915 (14.0/153.0)
Total 228.0 152.0 0.1488 (0.1488/380.0)
Maximum Metrics:

metric threshold value idx
f1 0.452447834853 0.911475409836 151.0
f2 0.316330050777 0.942928039702 192.0
f0point5 0.561811969917 0.926756352765 128.0
accuracy 0.452447834853 0.928947368421 151.0
precision 0.984119707771 1.0 0.0
absolute_MCC 0.452447834853 0.852148594702 151.0
min_per_class_accuracy 0.433436616032 0.921568627451 157.0
tns 0.984119707771 227.0 0.0
fns 0.984119707771 152.0 0.0
fps 0.012063597474 227.0 378.0
tps 0.197563211765 153.0 229.0
tnr 0.984119707771 1.0 0.0
fnr 0.984119707771 0.993464052288 0.0
fpr 0.012063597474 1.0 378.0
tpr 0.197563211765 1.0 229.0