notebook.community

Edit and run



In [1]:

    
import h2o



In [2]:

    
# Connect to a pre-existing cluster
h2o.init()









    




H2O cluster uptime: 
15 minutes 45 seconds 666 milliseconds 
H2O cluster version: 
3.5.0.99999
H2O cluster name: 
ece
H2O cluster total nodes: 
1
H2O cluster total memory: 
10.67 GB
H2O cluster total cores: 
8
H2O cluster allowed cores: 
8
H2O cluster healthy: 
True
H2O Connection ip: 
127.0.0.1
H2O Connection port: 
54321



In [3]:

    
from h2o.utils.shared_utils import _locate # private function. used to find files within h2o git project directory.

df = h2o.import_file(path=_locate("smalldata/logreg/prostate.csv"))









    



Parse Progress: [##################################################] 100%
Imported /Users/ece/0xdata/h2o-dev/smalldata/logreg/prostate.csv. Parsed 380 rows and 9 cols



In [4]:

    
df.describe()









    



Rows: 380 Cols: 9

Chunk compression summary:






    




chunk_type
chunk_name
count
count_percentage
size
size_percentage
CBS
Bits
1
11.111112
    118  B
2.4210093
C1N
1-Byte Integers (w/o NAs)
5
55.555557
    2.2 KB
45.958145
C2
2-Byte Integers
1
11.111112
    828  B
16.9881
C2S
2-Byte Fractions
2
22.222223
    1.6 KB
34.632744






    



Frame distribution summary:






    





size
number_of_rows
number_of_chunks_per_column
number_of_chunks
10.0.0.24:54321
    4.8 KB
380.0
1.0
9.0
mean
    4.8 KB
380.0
1.0
9.0
min
    4.8 KB
380.0
1.0
9.0
max
    4.8 KB
380.0
1.0
9.0
stddev
      0  B
0.0
0.0
0.0
total
    4.8 KB
380.0
1.0
9.0






    



Column-by-Column Summary:







    





ID
CAPSULE
AGE
RACE
DPROS
DCAPS
PSA
VOL
GLEASON
type
int
int
int
int
int
int
real
real
int
mins
1.0
0.0
43.0
0.0
1.0
1.0
0.3
0.0
0.0
maxs
380.0
1.0
79.0
2.0
4.0
2.0
139.7
97.6
9.0
mean
190.5
0.402631578947
66.0394736842
1.08684210526
2.27105263158
1.10789473684
15.4086315789
15.8129210526
6.38421052632
sigma
109.840793879
0.491074338963
6.52707126917
0.308773258025
1.00010761815
0.310656449351
19.9975726686
18.3476199673
1.09195337443
zero_count
0
227
0
3
0
0
0
167
2
missing_count
0
0
0
0
0
0
0
0
0



In [5]:

    
# Remove ID from training frame
train = df.drop("ID")



In [6]:

    
# For VOL & GLEASON, a zero really means "missing"
vol = train['VOL']
vol[vol == 0] = None
gle = train['GLEASON']
gle[gle == 0] = None



In [7]:

    
# Convert CAPSULE to a logical factor
train['CAPSULE'] = train['CAPSULE'].asfactor()



In [8]:

    
# See that the data is ready
train.describe()









    



Rows: 380 Cols: 8

Chunk compression summary:






    




chunk_type
chunk_name
count
count_percentage
size
size_percentage
CBS
Bits
1
12.5
    118  B
2.9164608
C1N
1-Byte Integers (w/o NAs)
5
62.5
    2.2 KB
55.363323
C2S
2-Byte Fractions
2
25.0
    1.6 KB
41.72022






    



Frame distribution summary:






    





size
number_of_rows
number_of_chunks_per_column
number_of_chunks
10.0.0.24:54321
    4.0 KB
380.0
1.0
8.0
mean
    4.0 KB
380.0
1.0
8.0
min
    4.0 KB
380.0
1.0
8.0
max
    4.0 KB
380.0
1.0
8.0
stddev
      0  B
0.0
0.0
0.0
total
    4.0 KB
380.0
1.0
8.0






    



Column-by-Column Summary:







    





CAPSULE
AGE
RACE
DPROS
DCAPS
PSA
VOL
GLEASON
type
enum
int
int
int
int
real
real
int
mins
0.0
43.0
0.0
1.0
1.0
0.3
0.0
0.0
maxs
1.0
79.0
2.0
4.0
2.0
139.7
97.6
9.0
mean
0.402631578947
66.0394736842
1.08684210526
2.27105263158
1.10789473684
15.4086315789
15.8129210526
6.38421052632
sigma
0.491074338963
6.52707126917
0.308773258025
1.00010761815
0.310656449351
19.9975726686
18.3476199673
1.09195337443
zero_count
227
0
3
0
0
0
167
2
missing_count
0
0
0
0
0
0
0
0



In [9]:

    
# Run GBM
my_gbm = h2o.gbm(           y=train["CAPSULE"],
                 validation_y=train["CAPSULE"],
                            x=train[1:],
                 validation_x=train[1:],
                 distribution = "bernoulli",
                 ntrees=50,
                 learn_rate=0.1)









    



gbm Model Build Progress: [##################################################] 100%



In [10]:

    
my_gbm_metrics = my_gbm.model_performance(train)
my_gbm_metrics.show()









    



ModelMetricsBinomial: gbm
** Reported on test data. **

MSE: 0.0758414746751
R^2: 0.684676256282
LogLoss: 0.274466812848
AUC: 0.978031153724
Gini: 0.956062307449

Confusion Matrix (Act/Pred) for max f1 @ threshold = 0.454949666805:






    





0
1
Error
Rate
0
216.0
11.0
0.0485
 (11.0/227.0)
1
14.0
139.0
0.0915
 (14.0/153.0)
Total
230.0
150.0
0.0658
 (25.0/380.0)






    



Maximum Metrics: Maximum metrics at their respective thresholds







    




metric
threshold
value
idx
max f1
0.454949666805
0.917491749175
149.0
max f2
0.303200968061
0.939431396786
196.0
max f0point5
0.472831330673
0.924426450742
146.0
max accuracy
0.454949666805
0.934210526316
149.0
max precision
0.974793768297
1.0
0.0
max absolute_MCC
0.454949666805
0.862913038286
149.0
max min_per_class_accuracy
0.437399498018
0.921568627451
156.0

H2O cluster uptime:	15 minutes 45 seconds 666 milliseconds
H2O cluster version:	3.5.0.99999
H2O cluster name:	ece
H2O cluster total nodes:	1
H2O cluster total memory:	10.67 GB
H2O cluster total cores:	8
H2O cluster allowed cores:	8
H2O cluster healthy:	True
H2O Connection ip:	127.0.0.1
H2O Connection port:	54321

chunk_type	chunk_name	count	count_percentage	size	size_percentage
CBS	Bits	1	11.111112	118 B	2.4210093
C1N	1-Byte Integers (w/o NAs)	5	55.555557	2.2 KB	45.958145
C2	2-Byte Integers	1	11.111112	828 B	16.9881
C2S	2-Byte Fractions	2	22.222223	1.6 KB	34.632744

	size	number_of_rows	number_of_chunks_per_column	number_of_chunks
10.0.0.24:54321	4.8 KB	380.0	1.0	9.0
mean	4.8 KB	380.0	1.0	9.0
min	4.8 KB	380.0	1.0	9.0
max	4.8 KB	380.0	1.0	9.0
stddev	0 B	0.0	0.0	0.0
total	4.8 KB	380.0	1.0	9.0

	ID	CAPSULE	AGE	RACE	DPROS	DCAPS	PSA	VOL	GLEASON
type	int	int	int	int	int	int	real	real	int
mins	1.0	0.0	43.0	0.0	1.0	1.0	0.3	0.0	0.0
maxs	380.0	1.0	79.0	2.0	4.0	2.0	139.7	97.6	9.0
mean	190.5	0.402631578947	66.0394736842	1.08684210526	2.27105263158	1.10789473684	15.4086315789	15.8129210526	6.38421052632
sigma	109.840793879	0.491074338963	6.52707126917	0.308773258025	1.00010761815	0.310656449351	19.9975726686	18.3476199673	1.09195337443
zero_count	0	227	0	3	0	0	0	167	2
missing_count	0	0	0	0	0	0	0	0	0

	0	1	Error	Rate
0	216.0	11.0	0.0485	(11.0/227.0)
1	14.0	139.0	0.0915	(14.0/153.0)
Total	230.0	150.0	0.0658	(25.0/380.0)

metric	threshold	value	idx
max f1	0.454949666805	0.917491749175	149.0
max f2	0.303200968061	0.939431396786	196.0
max f0point5	0.472831330673	0.924426450742	146.0
max accuracy	0.454949666805	0.934210526316	149.0
max precision	0.974793768297	1.0	0.0
max absolute_MCC	0.454949666805	0.862913038286	149.0
max min_per_class_accuracy	0.437399498018	0.921568627451	156.0