
In [158]:
from __future__ import division


__all__ = ['h', 'h_all_pairs']


import itertools

import math

import numpy as np

# sklearn < 0.21; this module was later deprecated in favour of sklearn.inspection.
import sklearn.ensemble.partial_dependence as partial_dependence


# Friedman's H statistic for the joint interaction strength of all the
# supplied features (Friedman and Popescu, 2008).
def h(gbm, array_or_frame, indices_or_columns='all'):
    if indices_or_columns == 'all':
        if gbm.max_depth < array_or_frame.shape[1]:
            raise \
                Exception(
                    "gbm.max_depth == {} < array_or_frame.shape[1] == {}, so indices_or_columns must not be 'all'."
                    .format(gbm.max_depth, array_or_frame.shape[1])
                )
    else:
        if gbm.max_depth < len(indices_or_columns):
            raise \
                Exception(
                    "gbm.max_depth == {}, so indices_or_columns must contain at most {} {}."
                    .format(gbm.max_depth, gbm.max_depth, "element" if gbm.max_depth == 1 else "elements")
                )
    check_args_contd(array_or_frame, indices_or_columns)

    arr, model_inds = get_arr_and_model_inds(array_or_frame, indices_or_columns)

    width = arr.shape[1]
    f_vals = {}
    for n in range(width, 0, -1):
        for inds in itertools.combinations(range(width), n):
            f_vals[inds] = compute_f_vals(gbm, model_inds, arr, inds)

    return compute_h_val(f_vals, arr, tuple(range(width)))


# Pairwise H statistics for every two-feature combination of the supplied features.
def h_all_pairs(gbm, array_or_frame, indices_or_columns='all'):
    if gbm.max_depth < 2:
        raise Exception("gbm.max_depth must be at least 2.")
    check_args_contd(array_or_frame, indices_or_columns)

    arr, model_inds = get_arr_and_model_inds(array_or_frame, indices_or_columns)

    width = arr.shape[1]
    f_vals = {}
    for n in [2, 1]:
        for inds in itertools.combinations(range(width), n):
            f_vals[inds] = compute_f_vals(gbm, model_inds, arr, inds)

    h_vals = {}
    for inds in itertools.combinations(range(width), 2):
        h_vals[inds] = compute_h_val(f_vals, arr, inds)
    if indices_or_columns != 'all':
        h_vals = {tuple(model_inds[(inds,)]): h_vals[inds] for inds in h_vals.keys()}
    if not isinstance(array_or_frame, np.ndarray):
        all_cols = array_or_frame.columns.values
        h_vals = {tuple(all_cols[(inds,)]): h_vals[inds] for inds in h_vals.keys()}

    return h_vals


# Shared argument validation for h and h_all_pairs.
def check_args_contd(array_or_frame, indices_or_columns):
    if indices_or_columns != 'all':
        if len(indices_or_columns) < 2:
            raise Exception("indices_or_columns must be 'all' or contain at least 2 elements.")
        if isinstance(array_or_frame, np.ndarray):
            all_inds = range(array_or_frame.shape[1])
            if not all(ind in all_inds for ind in indices_or_columns):
                raise Exception("indices_or_columns must be 'all' or a subset of {}.".format(all_inds))
        else:
            all_cols = array_or_frame.columns.tolist()
            if not all(col in all_cols for col in indices_or_columns):
                raise Exception("indices_or_columns must be 'all' or a subset of {}.".format(all_cols))


# Pull out the selected columns as an ndarray and map them to the model's
# column indices.
def get_arr_and_model_inds(array_or_frame, indices_or_columns):
    if isinstance(array_or_frame, np.ndarray):
        if indices_or_columns == 'all':
            indices_or_columns = range(array_or_frame.shape[1])
        arr = array_or_frame[:, indices_or_columns]
        model_inds = np.array(indices_or_columns)
    else:
        all_cols = array_or_frame.columns.tolist()
        if indices_or_columns == 'all':
            indices_or_columns = all_cols
        arr = array_or_frame[indices_or_columns].values
        model_inds = np.array([all_cols.index(col) for col in indices_or_columns])
    return arr, model_inds


# Centered partial-dependence values for the feature subset `inds`, keyed by
# the unique rows of those feature values.
def compute_f_vals(gbm, model_inds, arr, inds):
    feat_vals, feat_val_counts = unique_rows_with_counts(arr[:, inds])
    print(f"Grid values {feat_vals.shape}, value counts {feat_val_counts.shape}")
    # NOTE: grid_resolution=10 asks sklearn to build its own 10-points-per-feature
    # grid from the global X_train, so uncentd_f_vals holds 10**len(inds) entries
    # rather than one per unique row of feat_vals -- the cause of the shape
    # mismatch in the traceback below.
    uncentd_f_vals = partial_dependence.partial_dependence(gbm, model_inds[(inds,)],
                                                           grid_resolution=10, X=X_train)[0][0]
    print("-------------------")
    print(uncentd_f_vals.shape)
    mean_uncentd_f_val = np.dot(feat_val_counts, uncentd_f_vals) / arr.shape[0]
    f_vals = uncentd_f_vals - mean_uncentd_f_val
    return dict(zip(map(tuple, feat_vals), f_vals))


# H statistic for the feature subset `inds`: sqrt of the share of the variance
# of the joint partial dependence not explained by lower-order terms
# (inclusion-exclusion over all proper subsets).
def compute_h_val(f_vals, arr, inds):
    feat_vals, feat_val_counts = unique_rows_with_counts(arr)
    uniq_height = feat_vals.shape[0]
    numer_els = np.zeros(uniq_height)
    denom_els = np.empty_like(numer_els)
    for i in range(uniq_height):
        feat_vals_i = feat_vals[i]
        sign = 1.0
        for n in range(len(inds), 0, -1):
            for subinds in itertools.combinations(inds, n):
                numer_els[i] += sign*f_vals[subinds][tuple(feat_vals_i[(subinds,)])]
            sign *= -1.0
        denom_els[i] = f_vals[inds][tuple(feat_vals_i[(inds,)])]
    numer = np.dot(feat_val_counts, numer_els**2)
    denom = np.dot(feat_val_counts, denom_els**2)
    return math.sqrt(numer/denom) if numer < denom else np.nan


# Unique rows of a 2-D array together with their occurrence counts. The
# structured-dtype view makes np.unique treat each row as a single element.
def unique_rows_with_counts(inp_arr):
    width = inp_arr.shape[1]
    cont_arr = np.ascontiguousarray(inp_arr)
    tuple_dtype = [(str(i), inp_arr.dtype) for i in range(width)]
    tuple_arr = cont_arr.view(tuple_dtype)
    uniq_arr, counts = np.unique(tuple_arr, return_counts=True)
    outp_arr = uniq_arr.view(inp_arr.dtype).reshape(-1, width)
    return outp_arr, counts
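
The structured-dtype view in unique_rows_with_counts is the classic idiom for row-wise uniqueness; on NumPy >= 1.13 the same result is available directly from np.unique with axis=0. A quick illustrative cell, not part of the original run:

In [ ]:
# Tiny sanity check of unique_rows_with_counts against np.unique(axis=0).
demo = np.array([[1, 2], [1, 2], [3, 4]])
rows, counts = unique_rows_with_counts(demo)
print(rows, counts)                                  # [[1 2] [3 4]] [2 1]
print(np.unique(demo, axis=0, return_counts=True))   # same rows and counts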

In [159]:
import time
from sklearn import datasets
from sklearn import ensemble
from sklearn.model_selection import train_test_split


iris_data = datasets.load_iris()
X = iris_data.data
y = iris_data.target

# train and test
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [160]:
import pandas as pd

feature_names = [f"f_{i}" for i in X_train_df.columns.values]
print(feature_names)
X_train_df = pd.DataFrame(X_train, columns=feature_names)

target_names = np.unique(y)
print(target_names)


['f_0', 'f_1', 'f_2', 'f_3']
[0 1 2]

In [161]:
gbes = ensemble.GradientBoostingClassifier(n_estimators=5,
                                           validation_fraction=0.2,
                                           n_iter_no_change=5, tol=0.01,
                                           random_state=0)

In [162]:
start = time.time()
gbes.fit(X_train, y_train)
end = time.time() - start

In [163]:
gbes.feature_importances_


Out[163]:
array([ 0.00729438,  0.00658948,  0.29969678,  0.68641936])
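
For readability, the importances can be paired with the feature names defined above; f_2 (petal length) and f_3 (petal width) dominate, so any interaction should involve them. An illustrative cell, not part of the original run:

In [ ]:
# Rank features by importance using the names from In [160].
sorted(zip(feature_names, gbes.feature_importances_), key=lambda t: -t[1])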

In [164]:
h_all_pairs(gbes, X_train)


Grid values (102, 2), value counts (102,)
-------------------
(100,)
---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
<ipython-input-164-28209a693a26> in <module>
----> 1 h_all_pairs(gbes, X_train)

<ipython-input-158-a24bece3241c> in h_all_pairs(gbm, array_or_frame, indices_or_columns)
     53     for n in [2, 1]:
     54         for inds in itertools.combinations(range(width), n):
---> 55             f_vals[inds] = compute_f_vals(gbm, model_inds, arr, inds)
     56 
     57     h_vals = {}

<ipython-input-158-a24bece3241c> in compute_f_vals(gbm, model_inds, arr, inds)
    101     print("-------------------")
    102     print(uncentd_f_vals.shape)
--> 103     mean_uncentd_f_val = np.dot(feat_val_counts, uncentd_f_vals)/arr.shape[0]
    104     f_vals = uncentd_f_vals-mean_uncentd_f_val
    105     return dict(zip(map(tuple, feat_vals), f_vals))

ValueError: shapes (102,) and (100,) not aligned: 102 (dim 0) != 100 (dim 0)
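
The traceback confirms the note in compute_f_vals: with grid_resolution=10, sklearn evaluates the partial dependence on its own 10x10 grid (100 points for a feature pair), while the dot product expects one value per unique row of the training data (102 here). A minimal sketch of a fix, assuming the pre-0.21 sklearn signature that accepts an explicit grid (which appears to be what this helper was originally written against), is to evaluate the partial dependence at the unique rows themselves:

In [ ]:
# Hypothetical fix, not part of the original run: evaluate the partial
# dependence at the observed unique rows so the result aligns one-to-one
# with feat_val_counts.
def compute_f_vals_fixed(gbm, model_inds, arr, inds):
    feat_vals, feat_val_counts = unique_rows_with_counts(arr[:, inds])
    # grid=feat_vals -> one PD value per unique row; [0][0] keeps the first
    # output row (class 0 for this multiclass GBM).
    uncentd_f_vals = partial_dependence.partial_dependence(
        gbm, model_inds[(inds,)], grid=feat_vals)[0][0]
    mean_uncentd_f_val = np.dot(feat_val_counts, uncentd_f_vals) / arr.shape[0]
    return dict(zip(map(tuple, feat_vals), uncentd_f_vals - mean_uncentd_f_val))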

In [92]:
from skater.core.explanations import Interpretation
from skater.model import InMemoryModel

in_obj = InMemoryModel(gbes.predict_proba, examples=X_train, target_names=[0, 1, 2])
interpreter = Interpretation(X_train, feature_names=feature_names)
values = interpreter.partial_dependence.partial_dependence(['f_0'], 
                                                           in_obj, grid_resolution=10)
#interpreter.partial_dependence.plot_partial_dependence(['f_2', 'f_3'], in_obj, grid_resolution=10, with_variance=True)


2018-11-30 03:45:23,008 - skater.core.explanations - WARNING - Progress bars slow down runs by 10-20%. For slightly faster runs, do progressbar=False
[10/10] grid cells ████████████████████ Time elapsed: 0 seconds

In [96]:
values[[0, 1, 2]].T


Out[96]:
          0         1         2         3         4         5         6         7         8         9
0  0.343019  0.341402  0.341402  0.341402  0.341402  0.342175  0.342175  0.342175  0.342175  0.351203
1  0.330244  0.330987  0.330987  0.330987  0.330987  0.332724  0.332724  0.332724  0.332724  0.313899
2  0.326737  0.327611  0.327611  0.327611  0.327611  0.325101  0.325101  0.325101  0.325101  0.334898

In [88]:
value2 = partial_dependence.partial_dependence(gbes, target_variables=[0], grid_resolution=10, X=X_train)

In [97]:
pd.DataFrame(value2[0])


Out[97]:
          0         1         2         3         4         5         6         7         8         9
0 -0.021333 -0.021333 -0.027041 -0.027041 -0.027041 -0.027041 -0.027041 -0.027041 -0.027041 -0.027041
1 -0.043498 -0.043498 -0.043498 -0.043498 -0.043498 -0.043498 -0.043498 -0.043498 -0.043498 -0.123083
2 -0.090783 -0.090783 -0.090783 -0.090783 -0.090783 -0.103075 -0.103075 -0.103075 -0.103075 -0.103075
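
The Skater and sklearn tables are on different scales: Skater averages predict_proba, so its values sit near 1/3 for the three iris classes, while sklearn's partial_dependence for a classifier reports per-class decision-function (log-odds) values, one row per class. A rough way to put the sklearn output on the probability scale is a columnwise softmax; the softmax of averaged log-odds is not identical to averaged probabilities, so treat this as an approximate comparison only:

In [ ]:
# Illustrative only: map the per-class log-odds PD curves in value2[0]
# (shape (3, 10)) to probabilities with a columnwise softmax.
logits = value2[0]
probs = np.exp(logits) / np.exp(logits).sum(axis=0)
pd.DataFrame(probs)  # roughly comparable to the Skater table in Out[96]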
