In [2]:
import xgboost as xgb
import shap
from sklearn.model_selection import train_test_split
import pandas as pd
In [3]:
X,y = shap.datasets.boston()
X.head()
Out[3]:
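Note: newer SHAP releases removed shap.datasets.boston() after the dataset was deprecated upstream in scikit-learn. If the call above fails, the same data can be rebuilt from the original CMU source; a minimal sketch, assuming the lib.stat.cmu.edu mirror is still reachable:

import numpy as np

# each record in the raw file spans two physical lines: 11 values, then 3
raw = pd.read_csv("http://lib.stat.cmu.edu/datasets/boston",
                  sep=r"\s+", skiprows=22, header=None)
cols = ["CRIM", "ZN", "INDUS", "CHAS", "NOX", "RM", "AGE",
        "DIS", "RAD", "TAX", "PTRATIO", "B", "LSTAT"]
X = pd.DataFrame(np.hstack([raw.values[::2, :], raw.values[1::2, :2]]), columns=cols)
y = raw.values[1::2, 2]  # MEDV, the regression target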
In [4]:
print(y.shape)  # the target is the median house price (MEDV)
y[4:10]
Out[4]:
In [5]:
y = pd.DataFrame(y)
y.head()
Out[5]:
In [6]:
# for regression the target is continuous, so a stratified split is not applicable here
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=410, train_size=0.75, test_size=0.25)
In [7]:
X_train.reset_index(drop=True, inplace=True)
X_test.reset_index(drop=True, inplace=True)
y_train.reset_index(drop=True, inplace=True)
y_test.reset_index(drop=True, inplace=True)
In [8]:
param_dist = {'learning_rate': 0.01}
model = xgb.XGBRegressor(**param_dist)
model.fit(X_train, y_train,
          eval_set=[(X_train, y_train), (X_test, y_test)],
          verbose=False)
Out[8]:
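Since eval_set was passed to fit, the per-round metric on both splits can be pulled back out afterwards; a minimal sketch using the sklearn wrapper's evals_result() (RMSE is the default metric for the squared-error objective, and 'validation_0'/'validation_1' are the default keys for the two eval sets):

results = model.evals_result()
train_rmse = results["validation_0"]["rmse"]  # one entry per boosting round, training split
test_rmse = results["validation_1"]["rmse"]   # held-out split
print(f"final train RMSE: {train_rmse[-1]:.3f}, final test RMSE: {test_rmse[-1]:.3f}")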
In [9]:
explainer = shap.TreeExplainer(model)
shap_values = explainer.shap_values(X)
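A quick sanity check on TreeExplainer output: for a regression model the SHAP values are additive, so expected_value plus a row's SHAP values should reconstruct the model's prediction for that row (up to floating-point noise):

import numpy as np
reconstructed = explainer.expected_value + shap_values.sum(axis=1)
print(np.allclose(model.predict(X), reconstructed, atol=1e-3))  # expect True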
In [10]:
print(shap_values.shape)
shap_values
Out[10]:
In [11]:
# If the JS loads successfully, this will render an interactive visualization
shap.initjs()
check_row = 7  # index of the individual case to inspect
shap.force_plot(explainer.expected_value, shap_values[check_row,:], X.iloc[check_row,:])
Out[11]:
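The same call renders a stacked force plot when given the full SHAP matrix: each row's explanation is rotated 90 degrees and laid side by side, giving an interactive overview of the whole dataset (this can be slow in the browser for large data):

shap.force_plot(explainer.expected_value, shap_values, X)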
In [12]:
shap.summary_plot(shap_values, X)
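Beyond the summary plot, a dependence plot shows how a single feature's SHAP value varies with the feature's raw value; a sketch for LSTAT (SHAP auto-selects a second feature for coloring unless interaction_index is set):

shap.dependence_plot("LSTAT", shap_values, X)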
In [37]:
X.iloc[7:9, :]
Out[37]:
In [40]:
shap_values[7:9, :]
Out[40]:
In [36]:
shap.summary_plot(shap_values[7:9, :], X.iloc[7:9, :])
In [55]:
# comparing with xgboost feature importance (by default it uses gain to rank feature importance)
print(X.columns)
model.feature_importances_
Out[55]:
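To compare the two rankings directly, both can be placed in one table; a minimal sketch (the column names here are my own):

import numpy as np
ranking = pd.DataFrame({
    "xgb_gain": model.feature_importances_,
    "mean_abs_shap": np.abs(shap_values).mean(axis=0),
}, index=X.columns).sort_values("mean_abs_shap", ascending=False)
print(ranking)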
In [57]:
# using the mean absolute SHAP value to rank the features
shap.summary_plot(shap_values, X, plot_type="bar")
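To write this bar chart to disk instead of displaying it, summary_plot accepts show=False so the current matplotlib figure stays open for saving; a sketch (the filename is arbitrary):

import matplotlib.pyplot as plt
shap.summary_plot(shap_values, X, plot_type="bar", show=False)
plt.savefig("shap_feature_importance.png", dpi=150, bbox_inches="tight")
plt.close()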