In [1]:
import h2o
from h2o.estimators.gbm import H2OGradientBoostingEstimator
In [2]:
# Connect to an H2O cluster.
# NOTE(review): per the h2o.init() documentation this attaches to a running local
# instance and starts a new one when none is reachable — confirm for your setup.
h2o.init()
In [3]:
from h2o.utils.shared_utils import _locate # private function. used to find files within h2o git project directory.
# Resolve the bundled prostate CSV inside the h2o project tree, then load it into the cluster
prostate_csv_path = _locate("smalldata/logreg/prostate.csv")
df = h2o.import_file(path=prostate_csv_path)
In [4]:
# Show a summary of the freshly imported frame before any cleaning is applied
df.describe()
In [5]:
# Build the training frame: everything in df except the "ID" column
id_column = "ID"
train = df.drop(id_column)
In [6]:
# For VOL & GLEASON, a zero really means "missing".
# Assign through `train` itself using a (row-mask, column) index. In current h2o-py,
# train['VOL'] yields a separate H2OFrame, so masking that extracted column would only
# change the extracted copy and leave the zeros in `train`.
for zero_is_missing_col in ("VOL", "GLEASON"):
    zero_rows = train[zero_is_missing_col] == 0
    train[zero_rows, zero_is_missing_col] = None
In [7]:
# Recode the CAPSULE column as a factor (categorical) column and write it back into train
capsule_as_factor = train['CAPSULE'].asfactor()
train['CAPSULE'] = capsule_as_factor
In [8]:
# See that the data is ready: summarize the prepared training frame
# (ID dropped, CAPSULE converted to a factor by the preceding cells)
train.describe()
In [9]:
# Run GBM
# Models are trained through estimator objects; the legacy functional form
# h2o.gbm(x=<frame>, y=<frame>, ...) is not available in current h2o releases.
response = "CAPSULE"
# Same selection as the original train[1:]: every column from index 1 onward.
# NOTE(review): this assumes CAPSULE is column 0 of `train` — confirm against train.columns.
predictors = train.columns[1:]

my_gbm = H2OGradientBoostingEstimator(distribution="bernoulli",
                                      ntrees=50,
                                      learn_rate=0.1)
# The validation frame is the training frame itself (as in the original call), so the
# validation metrics are not a held-out estimate.
my_gbm.train(x=predictors,
             y=response,
             training_frame=train,
             validation_frame=train)
In [10]:
# Score the fitted model against the training frame and print the resulting metrics report
my_gbm_metrics = my_gbm.model_performance(test_data=train)
my_gbm_metrics.show()