In [158]:
from __future__ import division
__all__ = ['h', 'h_all_pairs']
import itertools
import math
import numpy as np
# NOTE: the sklearn.ensemble.partial_dependence module is deprecated and has been
# removed in recent scikit-learn releases; this notebook relies on the old API.
import sklearn.ensemble.partial_dependence as partial_dependence
def h(gbm, array_or_frame, indices_or_columns='all'):
    if indices_or_columns == 'all':
        if gbm.max_depth < array_or_frame.shape[1]:
            raise Exception(
                "gbm.max_depth == {} < array_or_frame.shape[1] == {}, so indices_or_columns must not be 'all'."
                .format(gbm.max_depth, array_or_frame.shape[1])
            )
    else:
        if gbm.max_depth < len(indices_or_columns):
            raise Exception(
                "gbm.max_depth == {}, so indices_or_columns must contain at most {} {}."
                .format(gbm.max_depth, gbm.max_depth, "element" if gbm.max_depth == 1 else "elements")
            )
    check_args_contd(array_or_frame, indices_or_columns)
    arr, model_inds = get_arr_and_model_inds(array_or_frame, indices_or_columns)
    width = arr.shape[1]
    f_vals = {}
    for n in range(width, 0, -1):
        for inds in itertools.combinations(range(width), n):
            f_vals[inds] = compute_f_vals(gbm, model_inds, arr, inds)
    return compute_h_val(f_vals, arr, tuple(range(width)))
def h_all_pairs(gbm, array_or_frame, indices_or_columns='all'):
    if gbm.max_depth < 2:
        raise Exception("gbm.max_depth must be at least 2.")
    check_args_contd(array_or_frame, indices_or_columns)
    arr, model_inds = get_arr_and_model_inds(array_or_frame, indices_or_columns)
    width = arr.shape[1]
    f_vals = {}
    for n in [2, 1]:
        for inds in itertools.combinations(range(width), n):
            f_vals[inds] = compute_f_vals(gbm, model_inds, arr, inds)
    h_vals = {}
    for inds in itertools.combinations(range(width), 2):
        h_vals[inds] = compute_h_val(f_vals, arr, inds)
    if indices_or_columns != 'all':
        h_vals = {tuple(model_inds[(inds,)]): h_vals[inds] for inds in h_vals.keys()}
    if not isinstance(array_or_frame, np.ndarray):
        all_cols = array_or_frame.columns.values
        h_vals = {tuple(all_cols[(inds,)]): h_vals[inds] for inds in h_vals.keys()}
    return h_vals
def check_args_contd(array_or_frame, indices_or_columns):
    if indices_or_columns != 'all':
        if len(indices_or_columns) < 2:
            raise Exception("indices_or_columns must be 'all' or contain at least 2 elements.")
        if isinstance(array_or_frame, np.ndarray):
            all_inds = range(array_or_frame.shape[1])
            if not all(ind in all_inds for ind in indices_or_columns):
                raise Exception("indices_or_columns must be 'all' or a subset of {}.".format(list(all_inds)))
        else:
            all_cols = array_or_frame.columns.tolist()
            if not all(col in all_cols for col in indices_or_columns):
                raise Exception("indices_or_columns must be 'all' or a subset of {}.".format(all_cols))
def get_arr_and_model_inds(array_or_frame, indices_or_columns):
    if isinstance(array_or_frame, np.ndarray):
        if indices_or_columns == 'all':
            indices_or_columns = list(range(array_or_frame.shape[1]))
        arr = array_or_frame[:, indices_or_columns]
        model_inds = np.array(indices_or_columns)
    else:
        all_cols = array_or_frame.columns.tolist()
        if indices_or_columns == 'all':
            indices_or_columns = all_cols
        arr = array_or_frame[indices_or_columns].values
        model_inds = np.array([all_cols.index(col) for col in indices_or_columns])
    return arr, model_inds
def compute_f_vals(gbm, model_inds, arr, inds):
    feat_vals, feat_val_counts = unique_rows_with_counts(arr[:, inds])
    # Evaluate the partial dependence at the observed feature-value combinations
    # (not on a synthetic grid), so the dictionary below is keyed by exactly
    # those combinations.
    uncentd_f_vals = partial_dependence.partial_dependence(gbm, model_inds[(inds,)],
                                                           grid=feat_vals)[0][0]
    # Center the partial dependence by its data-weighted mean.
    mean_uncentd_f_val = np.dot(feat_val_counts, uncentd_f_vals) / arr.shape[0]
    f_vals = uncentd_f_vals - mean_uncentd_f_val
    return dict(zip(map(tuple, feat_vals), f_vals))
def compute_h_val(f_vals, arr, inds):
    feat_vals, feat_val_counts = unique_rows_with_counts(arr)
    uniq_height = feat_vals.shape[0]
    numer_els = np.zeros(uniq_height)
    denom_els = np.empty_like(numer_els)
    for i in range(uniq_height):
        feat_vals_i = feat_vals[i]
        # Inclusion-exclusion over all non-empty subsets of inds: the numerator is the
        # part of the joint partial dependence not explained by lower-order terms.
        sign = 1.0
        for n in range(len(inds), 0, -1):
            for subinds in itertools.combinations(inds, n):
                numer_els[i] += sign * f_vals[subinds][tuple(feat_vals_i[(subinds,)])]
            sign *= -1.0
        denom_els[i] = f_vals[inds][tuple(feat_vals_i[(inds,)])]
    numer = np.dot(feat_val_counts, numer_els ** 2)
    denom = np.dot(feat_val_counts, denom_els ** 2)
    return math.sqrt(numer / denom) if numer < denom else np.nan
def unique_rows_with_counts(inp_arr):
    # View each row as one structured element so np.unique can count duplicate rows.
    width = inp_arr.shape[1]
    cont_arr = np.ascontiguousarray(inp_arr)
    tuple_dtype = [(str(i), inp_arr.dtype) for i in range(width)]
    tuple_arr = cont_arr.view(tuple_dtype)
    uniq_arr, counts = np.unique(tuple_arr, return_counts=True)
    outp_arr = uniq_arr.view(inp_arr.dtype).reshape(-1, width)
    return outp_arr, counts
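The functions above implement Friedman and Popescu's H statistic: for a set of features, H is the square root of the share of the variance of the centered joint partial dependence that is not reproduced by summing the lower-order partial dependences (the inclusion-exclusion loop in `compute_h_val`). As a quick sanity check of the row-counting helper, here is a minimal sketch; `demo` is a made-up array, not part of the analysis:
In [ ]:
demo = np.array([[1, 2], [1, 2], [3, 4]])  # made-up data for illustration only
rows, counts = unique_rows_with_counts(demo)
print(rows)    # [[1 2] [3 4]]
print(counts)  # [2 1]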
In [159]:
import time
from sklearn import datasets
from sklearn import ensemble
from sklearn.model_selection import train_test_split
iris_data = datasets.load_iris()
X = iris_data.data
y = iris_data.target
# Hold out 20% of the data for evaluation.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
In [160]:
import pandas as pd
feature_names = [f"f_{i}" for i in range(X_train.shape[1])]
print(feature_names)
X_train_df = pd.DataFrame(X_train, columns=feature_names)
target_names = np.unique(y)
print(target_names)
In [161]:
gbes = ensemble.GradientBoostingClassifier(n_estimators=5,
                                           validation_fraction=0.2,
                                           n_iter_no_change=5, tol=0.01,
                                           random_state=0)
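`h_all_pairs` raises unless `gbm.max_depth >= 2`. `GradientBoostingClassifier` defaults to `max_depth=3`, which a one-line check can confirm before fitting:
In [ ]:
# Guard for h_all_pairs below: trees must be deep enough to encode pairwise interactions.
assert gbes.max_depth >= 2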
In [162]:
start = time.time()
gbes.fit(X_train, y_train)
elapsed = time.time() - start
In [163]:
gbes.feature_importances_
Out[163]:
In [164]:
h_all_pairs(gbes, X_train)
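The pairwise statistics are easier to scan when sorted. A minimal sketch, assuming the result above is rebound to a name (the call as written discards it); pairs whose statistic is undefined (`NaN`, when the numerator exceeds the denominator) are filtered out:
In [ ]:
pairwise_h = h_all_pairs(gbes, X_train)  # keys: column-index pairs; values: H statistics
finite = {pair: v for pair, v in pairwise_h.items() if not math.isnan(v)}
for pair, v in sorted(finite.items(), key=lambda kv: -kv[1]):
    print(pair, v)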
In [92]:
from skater.core.explanations import Interpretation
from skater.model import InMemoryModel
in_obj = InMemoryModel(gbes.predict_proba, examples=X_train, target_names=[0, 1, 2])
interpreter = Interpretation(X_train, feature_names=feature_names)
values = interpreter.partial_dependence.partial_dependence(['f_0'], in_obj,
                                                           grid_resolution=10)
#interpreter.partial_dependence.plot_partial_dependence(['f_2', 'f_3'], in_obj, grid_resolution=10, with_variance=True)
In [96]:
values[[0, 1, 2]].T
Out[96]:
In [88]:
value2 = partial_dependence.partial_dependence(gbes, target_variables=[0], grid_resolution=10, X=X_train)
In [97]:
pd.DataFrame(value2[0])
Out[97]:
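The two estimates can be put side by side. A sketch under the assumption that `values` (skater, one column per class) and `value2` (old sklearn API, a `(pdp, axes)` tuple) are still in scope; only the shapes are compared here:
In [ ]:
skater_pd = values[[0, 1, 2]]           # skater: per-class partial dependence columns
sklearn_pd = pd.DataFrame(value2[0]).T  # sklearn: assumed one row per class, transposed
print(skater_pd.shape, sklearn_pd.shape)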