In [1]:
import h2o
import shap
from h2o.estimators.gbm import H2OGradientBoostingEstimator
from h2o import H2OFrame

# initialize H2O
h2o.init()

# load JS visualization code to notebook
shap.initjs()


Checking whether there is an H2O instance running at http://localhost:54321 . connected.
H2O cluster uptime: 58 mins 24 secs
H2O cluster timezone: America/Los_Angeles
H2O data parsing timezone: UTC
H2O cluster version: 3.27.0.4746
H2O cluster version age: 21 days, 8 hours and 56 minutes
H2O cluster name: H2O_from_python_zhaq_n30gim
H2O cluster total nodes: 1
H2O cluster free memory: 3.261 Gb
H2O cluster total cores: 8
H2O cluster allowed cores: 8
H2O cluster status: locked, healthy
H2O connection url: http://localhost:54321
H2O connection proxy: None
H2O internal security: False
H2O API Extensions: Amazon S3, XGBoost, Algos, AutoML, Core V3, Core V4
Python version: 3.7.3 final

In [2]:
# Import the prostate dataset
h2o_df = h2o.import_file("https://raw.github.com/h2oai/h2o/master/smalldata/logreg/prostate.csv")

# Convert the response column to a factor so CAPSULE is treated as a classification target
h2o_df["CAPSULE"] = h2o_df["CAPSULE"].asfactor()

# Split the data into train/test/validation frames holding 70%, 15%, and 15% of the rows
train, test, valid = h2o_df.split_frame(ratios=[.7, .15])


Parse progress: |█████████████████████████████████████████████████████████| 100%
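
As a quick sanity check (not part of the original walkthrough), the split sizes should roughly match the requested ratios; split_frame samples rows probabilistically, so the counts only approximate 70/15/15 and vary from run to run.

In [ ]:
# Approximate 70/15/15 split (sketch); exact counts differ slightly each run
print(h2o_df.nrow, train.nrow, test.nrow, valid.nrow)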

In [3]:
# Generate a GBM model using the training dataset
model = H2OGradientBoostingEstimator(distribution="bernoulli",
                                     ntrees=100,
                                     max_depth=4,
                                     learn_rate=0.1)
model.train(y="CAPSULE", x=["AGE", "RACE", "PSA", "GLEASON"], training_frame=h2o_df)


gbm Model Build progress: |███████████████████████████████████████████████| 100%
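
Before turning to SHAP, it can help to glance at H2O's built-in variable importances for later comparison with the SHAP summary plots; this extra cell is a sketch, not part of the original example.

In [ ]:
# H2O's native (split-based) variable importances, returned as a pandas DataFrame
model.varimp(use_pandas=True)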

In [4]:
# calculate SHAP values using function predict_contributions
contributions = model.predict_contributions(h2o_df)
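
predict_contributions returns an H2OFrame with one column per predictor plus a trailing BiasTerm column; a quick look at the column names (sketch) confirms the layout assumed in the next cell.

In [ ]:
# One contribution column per predictor, with the bias term (expected value) last
print(contributions.col_names)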

In [5]:
# convert the H2O Frame to a NumPy array for use with shap's visualization functions
contributions_matrix = contributions.as_data_frame().values
# SHAP values are the per-feature contribution columns
shap_values = contributions_matrix[:, 0:4]
# the expected value (bias term) is the last returned column and is constant across rows
expected_value = contributions_matrix[:, 4].min()
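
For a bernoulli GBM, the bias term plus the feature contributions in each row equals the model's raw prediction in log-odds space. The sketch below (assuming numpy is available) converts those row sums back to probabilities and compares them with the model's own p1 column.

In [ ]:
import numpy as np

# bias term + feature contributions = log-odds of CAPSULE = 1
log_odds = contributions_matrix.sum(axis=1)
p1_from_shap = 1.0 / (1.0 + np.exp(-log_odds))

# the model's predicted class-1 probabilities should match closely
p1_from_model = model.predict(h2o_df)["p1"].as_data_frame().values.ravel()
print(np.allclose(p1_from_shap, p1_from_model, atol=1e-6))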



In [6]:
# feature names, in the same order as the SHAP value columns
X = ["AGE", "RACE", "PSA", "GLEASON"]

# visualize the training set predictions
shap.force_plot(expected_value, shap_values, feature_names=X)


Out[6]:
Visualization omitted, Javascript library not loaded!
Have you run `initjs()` in this notebook? If this notebook was from another user you must also trust this notebook (File -> Trust notebook). If you are viewing this notebook on github the Javascript has been stripped for security. If you are using JupyterLab this error is because a JupyterLab extension has not yet been written.
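
The force plot above stacks all rows at once. To inspect a single prediction with its actual feature values labelling the arrows, the sketch below pulls the corresponding row out of a pandas copy of the predictors; features_df is an illustrative name, not from the original example.

In [ ]:
# force plot for the first row (sketch), with its real AGE/RACE/PSA/GLEASON values shown
features_df = h2o_df.as_data_frame()[X]
shap.force_plot(expected_value, shap_values[0, :], features_df.iloc[0, :])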

In [7]:
# summarize the effects of all the features
shap.summary_plot(shap_values, feature_names=X)



In [8]:
shap.summary_plot(shap_values, feature_names=X, plot_type="bar")
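
Beyond the summary plots, shap's dependence plot shows how a single feature's value relates to its SHAP contribution; it needs the actual feature values, so this sketch reuses a pandas copy of the predictors.

In [ ]:
# SHAP value of PSA versus the PSA value itself, colored by the feature
# with the strongest automatically detected interaction
shap.dependence_plot("PSA", shap_values, h2o_df.as_data_frame()[X])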



In [ ]: