notebook.community

Edit and run



In [1]:

    
import h2o
from h2o.estimators.gbm import H2OGradientBoostingEstimator



In [2]:

    
# Connect to a pre-existing cluster
h2o.init()









    




H2O cluster uptime: 
5 seconds 730 milliseconds 
H2O cluster version: 
3.7.0.99999
H2O cluster name: 
spIdea
H2O cluster total nodes: 
1
H2O cluster total free memory: 
12.44 GB
H2O cluster total cores: 
8
H2O cluster allowed cores: 
8
H2O cluster healthy: 
True
H2O Connection ip: 
127.0.0.1
H2O Connection port: 
54321
H2O Connection proxy: 
None
Python Version: 
3.5.0



In [3]:

    
from h2o.utils.shared_utils import _locate # private function. used to find files within h2o git project directory.

df = h2o.import_file(path=_locate("smalldata/logreg/prostate.csv"))









    



Parse Progress: [##################################################] 100%



In [4]:

    
df.describe()









    



Rows:380 Cols:9

Chunk compression summary: 






    




chunk_type
chunk_name
count
count_percentage
size
size_percentage
CBS
Bits
1
11.111112
    118  B
2.4210093
C1N
1-Byte Integers (w/o NAs)
5
55.555557
    2.2 KB
45.958145
C2
2-Byte Integers
1
11.111112
    828  B
16.9881
C2S
2-Byte Fractions
2
22.222223
    1.6 KB
34.632744






    



Frame distribution summary: 






    





size
number_of_rows
number_of_chunks_per_column
number_of_chunks
172.16.2.84:54321
    4.8 KB
380.0
1.0
9.0
mean
    4.8 KB
380.0
1.0
9.0
min
    4.8 KB
380.0
1.0
9.0
max
    4.8 KB
380.0
1.0
9.0
stddev
      0  B
0.0
0.0
0.0
total
    4.8 KB
380.0
1.0
9.0






    










    





       ID                CAPSULE           AGE              RACE              DPROS             DCAPS             PSA               VOL               GLEASON           
type   int               int               int              int               int               int               real              real              int               
mins   1.0               0.0               43.0             0.0               1.0               1.0               0.3               0.0               0.0               
mean   190.5             0.4026315789473684 66.03947368421049 1.0868421052631572 2.2710526315789488 1.1078947368421048 15.408631578947375 15.812921052631573 6.3842105263157904
maxs   380.0             1.0               79.0             2.0               4.0               2.0               139.70000000000002 97.60000000000001 9.0               
sigma  109.84079387914127 0.4910743389630552 6.527071269173311 0.3087732580252793 1.0001076181502861 0.3106564493514939 19.99757266856046 18.347619967271175 1.0919533744261092
zeros  0                 227               0                3                 0                 0                 0                 167               2                 
missing 0                 0                 0                0                 0                 0                 0                 0                 0                 
0      1.0               0.0               65.0             1.0               2.0               1.0               1.4000000000000001 0.0               6.0               
1      2.0               0.0               72.0             1.0               3.0               2.0               6.7               0.0               7.0               
2      3.0               0.0               70.0             1.0               1.0               2.0               4.9               0.0               6.0               
3      4.0               0.0               76.0             2.0               2.0               1.0               51.2              20.0              7.0               
4      5.0               0.0               69.0             1.0               1.0               1.0               12.3              55.9              6.0               
5      6.0               1.0               71.0             1.0               3.0               2.0               3.3000000000000003 0.0               8.0               
6      7.0               0.0               68.0             2.0               4.0               2.0               31.900000000000002 0.0               7.0               
7      8.0               0.0               61.0             2.0               4.0               2.0               66.7              27.2              7.0               
8      9.0               0.0               69.0             1.0               1.0               1.0               3.9               24.0              7.0               
9      10.0              0.0               68.0             2.0               1.0               2.0               13.0              0.0               6.0



In [5]:

    
# Remove ID from training frame
train = df.drop("ID")



In [6]:

    
# For VOL & GLEASON, a zero really means "missing"
vol = train['VOL']
vol[vol == 0] = None
gle = train['GLEASON']
gle[gle == 0] = None



In [7]:

    
# Convert CAPSULE to a logical factor
train['CAPSULE'] = train['CAPSULE'].asfactor()



In [8]:

    
# See that the data is ready
train.describe()









    



Rows:380 Cols:8

Chunk compression summary: 






    




chunk_type
chunk_name
count
count_percentage
size
size_percentage
CBS
Bits
1
12.5
    118  B
2.9164608
C1N
1-Byte Integers (w/o NAs)
5
62.5
    2.2 KB
55.363323
C2S
2-Byte Fractions
2
25.0
    1.6 KB
41.72022






    



Frame distribution summary: 






    





size
number_of_rows
number_of_chunks_per_column
number_of_chunks
172.16.2.84:54321
    4.0 KB
380.0
1.0
8.0
mean
    4.0 KB
380.0
1.0
8.0
min
    4.0 KB
380.0
1.0
8.0
max
    4.0 KB
380.0
1.0
8.0
stddev
      0  B
0.0
0.0
0.0
total
    4.0 KB
380.0
1.0
8.0






    










    





       CAPSULE           AGE              RACE              DPROS             DCAPS             PSA               VOL               GLEASON           
type   enum              int              int               int               int               real              real              int               
mins   0.0               43.0             0.0               1.0               1.0               0.3               0.0               0.0               
mean   0.4026315789473684 66.03947368421049 1.0868421052631572 2.2710526315789488 1.1078947368421048 15.408631578947375 15.812921052631573 6.3842105263157904
maxs   1.0               79.0             2.0               4.0               2.0               139.70000000000002 97.60000000000001 9.0               
sigma  0.4910743389630552 6.527071269173311 0.3087732580252793 1.0001076181502861 0.3106564493514939 19.99757266856046 18.347619967271175 1.0919533744261092
zeros  227               0                3                 0                 0                 0                 167               2                 
missing 0                 0                0                 0                 0                 0                 0                 0                 
0      0                 65.0             1.0               2.0               1.0               1.4000000000000001 0.0               6.0               
1      0                 72.0             1.0               3.0               2.0               6.7               0.0               7.0               
2      0                 70.0             1.0               1.0               2.0               4.9               0.0               6.0               
3      0                 76.0             2.0               2.0               1.0               51.2              20.0              7.0               
4      0                 69.0             1.0               1.0               1.0               12.3              55.9              6.0               
5      1                 71.0             1.0               3.0               2.0               3.3000000000000003 0.0               8.0               
6      0                 68.0             2.0               4.0               2.0               31.900000000000002 0.0               7.0               
7      0                 61.0             2.0               4.0               2.0               66.7              27.2              7.0               
8      0                 69.0             1.0               1.0               1.0               3.9               24.0              7.0               
9      0                 68.0             2.0               1.0               2.0               13.0              0.0               6.0



In [9]:

    
# Run GBM
my_gbm = H2OGradientBoostingEstimator(distribution = "bernoulli", ntrees=50, learn_rate=0.1)

my_gbm.train(x=list(range(1,train.ncol)), y="CAPSULE", training_frame=train, validation_frame=train)









    



gbm Model Build Progress: [##################################################] 100%



In [10]:

    
my_gbm_metrics = my_gbm.model_performance(train)
my_gbm_metrics.show()









    



ModelMetricsBinomial: gbm
** Reported on test data. **

MSE: 0.07584147467507414
R^2: 0.6846762562816877
LogLoss: 0.2744668128481441
AUC: 0.9780311537243385
Gini: 0.9560623074486769

Confusion Matrix (Act/Pred) for max f1 @ threshold = 0.4549496668047897: 






    





0
1
Error
Rate
0
216.0
11.0
0.0485
 (11.0/227.0)
1
14.0
139.0
0.0915
 (14.0/153.0)
Total
230.0
150.0
0.0658
 (25.0/380.0)






    



Maximum Metrics: Maximum metrics at their respective thresholds







    




metric
threshold
value
idx
max f1
0.4549497
0.9174917
149.0
max f2
0.3032010
0.9394314
196.0
max f0point5
0.4728313
0.9244265
146.0
max accuracy
0.4549497
0.9342105
149.0
max precision
0.9747938
1.0
0.0
max absolute_MCC
0.4549497
0.8629130
149.0
max min_per_class_accuracy
0.4373995
0.9215686
156.0






    



Gains/Lift Table: Avg response rate: 40.26 %







    





group
lower_threshold
cumulative_data_fraction
response_rate
cumulative_response_rate
capture_rate
cumulative_capture_rate
lift
cumulative_lift
gain
cumulative_gain

1
0.9405750
0.05
1.0
1.0
0.1241830
0.1241830
2.4836601
2.4836601
148.3660131
148.3660131

2
0.8921980
0.1
1.0
1.0
0.1241830
0.2483660
2.4836601
2.4836601
148.3660131
148.3660131

3
0.8263695
0.15
1.0
1.0
0.1241830
0.3725490
2.4836601
2.4836601
148.3660131
148.3660131

4
0.7595460
0.2
0.9473684
0.9868421
0.1176471
0.4901961
2.3529412
2.4509804
135.2941176
145.0980392

5
0.7081926
0.25
1.0
0.9894737
0.1241830
0.6143791
2.4836601
2.4575163
148.3660131
145.7516340

6
0.6364312
0.3
0.8947368
0.9736842
0.1111111
0.7254902
2.2222222
2.4183007
122.2222222
141.8300654

7
0.5478651
0.35
0.6842105
0.9323308
0.0849673
0.8104575
1.6993464
2.3155929
69.9346405
131.5592904

8
0.4499827
0.4
0.7894737
0.9144737
0.0980392
0.9084967
1.9607843
2.2712418
96.0784314
127.1241830

9
0.3927870
0.45
0.2105263
0.8362573
0.0261438
0.9346405
0.5228758
2.0769789
-47.7124183
107.6978940

10
0.3207657
0.5
0.3157895
0.7842105
0.0392157
0.9738562
0.7843137
1.9477124
-21.5686275
94.7712418

11
0.2425744
0.55
0.1578947
0.7272727
0.0196078
0.9934641
0.3921569
1.8062983
-60.7843137
80.6298277

12
0.1977616
0.6
0.0
0.6666667
0.0
0.9934641
0.0
1.6557734
-100.0
65.5773420

13
0.1586941
0.65
0.0526316
0.6194332
0.0065359
1.0
0.1307190
1.5384615
-86.9281046
53.8461538

14
0.1353591
0.7
0.0
0.5751880
0.0
1.0
0.0
1.4285714
-100.0
42.8571429

15
0.1094101
0.75
0.0
0.5368421
0.0
1.0
0.0
1.3333333
-100.0
33.3333333

16
0.0923828
0.8
0.0
0.5032895
0.0
1.0
0.0
1.25
-100.0
25.0

17
0.0665933
0.85
0.0
0.4736842
0.0
1.0
0.0
1.1764706
-100.0
17.6470588

18
0.0477968
0.9
0.0
0.4473684
0.0
1.0
0.0
1.1111111
-100.0
11.1111111

19
0.0276973
0.95
0.0
0.4238227
0.0
1.0
0.0
1.0526316
-100.0
5.2631579

20
0.0125566
1.0
0.0
0.4026316
0.0
1.0
0.0
1.0
-100.0
0.0

H2O cluster uptime:	5 seconds 730 milliseconds
H2O cluster version:	3.7.0.99999
H2O cluster name:	spIdea
H2O cluster total nodes:	1
H2O cluster total free memory:	12.44 GB
H2O cluster total cores:	8
H2O cluster allowed cores:	8
H2O cluster healthy:	True
H2O Connection ip:	127.0.0.1
H2O Connection port:	54321
H2O Connection proxy:	None
Python Version:	3.5.0

chunk_type	chunk_name	count	count_percentage	size	size_percentage
CBS	Bits	1	11.111112	118 B	2.4210093
C1N	1-Byte Integers (w/o NAs)	5	55.555557	2.2 KB	45.958145
C2	2-Byte Integers	1	11.111112	828 B	16.9881
C2S	2-Byte Fractions	2	22.222223	1.6 KB	34.632744

	size	number_of_rows	number_of_chunks_per_column	number_of_chunks
172.16.2.84:54321	4.8 KB	380.0	1.0	9.0
mean	4.8 KB	380.0	1.0	9.0
min	4.8 KB	380.0	1.0	9.0
max	4.8 KB	380.0	1.0	9.0
stddev	0 B	0.0	0.0	0.0
total	4.8 KB	380.0	1.0	9.0

	ID	CAPSULE	AGE	RACE	DPROS	DCAPS	PSA	VOL	GLEASON
type	int	int	int	int	int	int	real	real	int
mins	1.0	0.0	43.0	0.0	1.0	1.0	0.3	0.0	0.0
mean	190.5	0.4026315789473684	66.03947368421049	1.0868421052631572	2.2710526315789488	1.1078947368421048	15.408631578947375	15.812921052631573	6.3842105263157904
maxs	380.0	1.0	79.0	2.0	4.0	2.0	139.70000000000002	97.60000000000001	9.0
sigma	109.84079387914127	0.4910743389630552	6.527071269173311	0.3087732580252793	1.0001076181502861	0.3106564493514939	19.99757266856046	18.347619967271175	1.0919533744261092
zeros	0	227	0	3	0	0	0	167	2
missing	0	0	0	0	0	0	0	0	0
0	1.0	0.0	65.0	1.0	2.0	1.0	1.4000000000000001	0.0	6.0
1	2.0	0.0	72.0	1.0	3.0	2.0	6.7	0.0	7.0
2	3.0	0.0	70.0	1.0	1.0	2.0	4.9	0.0	6.0
3	4.0	0.0	76.0	2.0	2.0	1.0	51.2	20.0	7.0
4	5.0	0.0	69.0	1.0	1.0	1.0	12.3	55.9	6.0
5	6.0	1.0	71.0	1.0	3.0	2.0	3.3000000000000003	0.0	8.0
6	7.0	0.0	68.0	2.0	4.0	2.0	31.900000000000002	0.0	7.0
7	8.0	0.0	61.0	2.0	4.0	2.0	66.7	27.2	7.0
8	9.0	0.0	69.0	1.0	1.0	1.0	3.9	24.0	7.0
9	10.0	0.0	68.0	2.0	1.0	2.0	13.0	0.0	6.0

	CAPSULE	AGE	RACE	DPROS	DCAPS	PSA	VOL	GLEASON
type	enum	int	int	int	int	real	real	int
mins	0.0	43.0	0.0	1.0	1.0	0.3	0.0	0.0
mean	0.4026315789473684	66.03947368421049	1.0868421052631572	2.2710526315789488	1.1078947368421048	15.408631578947375	15.812921052631573	6.3842105263157904
maxs	1.0	79.0	2.0	4.0	2.0	139.70000000000002	97.60000000000001	9.0
sigma	0.4910743389630552	6.527071269173311	0.3087732580252793	1.0001076181502861	0.3106564493514939	19.99757266856046	18.347619967271175	1.0919533744261092
zeros	227	0	3	0	0	0	167	2
missing	0	0	0	0	0	0	0	0
0	0	65.0	1.0	2.0	1.0	1.4000000000000001	0.0	6.0
1	0	72.0	1.0	3.0	2.0	6.7	0.0	7.0
2	0	70.0	1.0	1.0	2.0	4.9	0.0	6.0
3	0	76.0	2.0	2.0	1.0	51.2	20.0	7.0
4	0	69.0	1.0	1.0	1.0	12.3	55.9	6.0
5	1	71.0	1.0	3.0	2.0	3.3000000000000003	0.0	8.0
6	0	68.0	2.0	4.0	2.0	31.900000000000002	0.0	7.0
7	0	61.0	2.0	4.0	2.0	66.7	27.2	7.0
8	0	69.0	1.0	1.0	1.0	3.9	24.0	7.0
9	0	68.0	2.0	1.0	2.0	13.0	0.0	6.0

	0	1	Error	Rate
0	216.0	11.0	0.0485	(11.0/227.0)
1	14.0	139.0	0.0915	(14.0/153.0)
Total	230.0	150.0	0.0658	(25.0/380.0)

metric	threshold	value	idx
max f1	0.4549497	0.9174917	149.0
max f2	0.3032010	0.9394314	196.0
max f0point5	0.4728313	0.9244265	146.0
max accuracy	0.4549497	0.9342105	149.0
max precision	0.9747938	1.0	0.0
max absolute_MCC	0.4549497	0.8629130	149.0
max min_per_class_accuracy	0.4373995	0.9215686	156.0

group	lower_threshold	cumulative_data_fraction	response_rate	cumulative_response_rate	capture_rate	cumulative_capture_rate	lift	cumulative_lift	gain	cumulative_gain
1	0.9405750	0.05	1.0	1.0	0.1241830	0.1241830	2.4836601	2.4836601	148.3660131	148.3660131
2	0.8921980	0.1	1.0	1.0	0.1241830	0.2483660	2.4836601	2.4836601	148.3660131	148.3660131
3	0.8263695	0.15	1.0	1.0	0.1241830	0.3725490	2.4836601	2.4836601	148.3660131	148.3660131
4	0.7595460	0.2	0.9473684	0.9868421	0.1176471	0.4901961	2.3529412	2.4509804	135.2941176	145.0980392
5	0.7081926	0.25	1.0	0.9894737	0.1241830	0.6143791	2.4836601	2.4575163	148.3660131	145.7516340
6	0.6364312	0.3	0.8947368	0.9736842	0.1111111	0.7254902	2.2222222	2.4183007	122.2222222	141.8300654
7	0.5478651	0.35	0.6842105	0.9323308	0.0849673	0.8104575	1.6993464	2.3155929	69.9346405	131.5592904
8	0.4499827	0.4	0.7894737	0.9144737	0.0980392	0.9084967	1.9607843	2.2712418	96.0784314	127.1241830
9	0.3927870	0.45	0.2105263	0.8362573	0.0261438	0.9346405	0.5228758	2.0769789	-47.7124183	107.6978940
10	0.3207657	0.5	0.3157895	0.7842105	0.0392157	0.9738562	0.7843137	1.9477124	-21.5686275	94.7712418
11	0.2425744	0.55	0.1578947	0.7272727	0.0196078	0.9934641	0.3921569	1.8062983	-60.7843137	80.6298277
12	0.1977616	0.6	0.0	0.6666667	0.0	0.9934641	0.0	1.6557734	-100.0	65.5773420
13	0.1586941	0.65	0.0526316	0.6194332	0.0065359	1.0	0.1307190	1.5384615	-86.9281046	53.8461538
14	0.1353591	0.7	0.0	0.5751880	0.0	1.0	0.0	1.4285714	-100.0	42.8571429
15	0.1094101	0.75	0.0	0.5368421	0.0	1.0	0.0	1.3333333	-100.0	33.3333333
16	0.0923828	0.8	0.0	0.5032895	0.0	1.0	0.0	1.25	-100.0	25.0
17	0.0665933	0.85	0.0	0.4736842	0.0	1.0	0.0	1.1764706	-100.0	17.6470588
18	0.0477968	0.9	0.0	0.4473684	0.0	1.0	0.0	1.1111111	-100.0	11.1111111
19	0.0276973	0.95	0.0	0.4238227	0.0	1.0	0.0	1.0526316	-100.0	5.2631579
20	0.0125566	1.0	0.0	0.4026316	0.0	1.0	0.0	1.0	-100.0	0.0