In [47]:
import xgboost
import numpy as np
import shap
import time
from tqdm import tqdm
import matplotlib.pylab as pl
In [21]:
from iml.common import convert_to_instance, convert_to_model, match_instance_to_data, match_model_to_data, convert_to_instance_with_index
from iml.explanations import AdditiveExplanation
from iml.links import convert_to_link, IdentityLink
from iml.datatypes import convert_to_data, DenseData
import logging
log = logging.getLogger('shap')

from shap import KernelExplainer

class IMEExplainer(KernelExplainer):
    """ This is an implementation of the IME explanation method (a.k.a. Shapley sampling values).

    It is implemented here for comparison and evaluation purposes; the KernelExplainer is
    typically more efficient and so is the preferred model-agnostic estimation method in this package.
    IME was proposed in "An Efficient Explanation of Individual Classifications using Game Theory",
    Erik Štrumbelj and Igor Kononenko, JMLR 2010.
    """

    def __init__(self, model, data, **kwargs):
        # silence warning about large datasets
        level = log.level
        log.setLevel(logging.ERROR)
        super(IMEExplainer, self).__init__(model, data, **kwargs)
        log.setLevel(level)

    def explain(self, incoming_instance, **kwargs):
        # convert incoming input to a standardized iml object
        instance = convert_to_instance(incoming_instance)
        match_instance_to_data(instance, self.data)

        # pick a reasonable number of samples if the user didn't specify how many they wanted
        self.nsamples = kwargs.get("nsamples", 0)
        if self.nsamples == 0:
            self.nsamples = 1000 * self.P

        # divide up the samples among the features (an even number per feature)
        self.nsamples_each = np.ones(self.P, dtype=np.int64) * 2 * (self.nsamples // (self.P * 2))
        for i in range((self.nsamples % (self.P * 2)) // 2):
            self.nsamples_each[i] += 2

        model_out = self.model.f(instance.x)

        # explain every feature
        phi = np.zeros(self.P)
        self.X_masked = np.zeros((self.nsamples_each.max(), self.data.data.shape[1]))
        for i in range(self.P):
            phi[i] = self.ime(i, self.model.f, instance.x, self.data.data, nsamples=self.nsamples_each[i])

        return AdditiveExplanation(self.link.f(1), self.link.f(1), phi, np.zeros(len(phi)), instance, self.link,
                                   self.model, self.data)

    def ime(self, j, f, x, X, nsamples=10):
        assert nsamples % 2 == 0, "nsamples must be divisible by 2!"
        X_masked = self.X_masked[:nsamples, :]
        inds = np.arange(X.shape[1])

        for i in range(0, nsamples // 2):
            np.random.shuffle(inds)
            pos = np.where(inds == j)[0][0]
            rind = np.random.randint(X.shape[0])
            # "on" sample: x with the features after j (in this random ordering) drawn from a random background row
            X_masked[i, :] = x
            X_masked[i, inds[pos+1:]] = X[rind, inds[pos+1:]]
            # paired "off" sample: identical except feature j is also drawn from the background row
            X_masked[-(i+1), :] = x
            X_masked[-(i+1), inds[pos:]] = X[rind, inds[pos:]]

        evals = f(X_masked)
        evals_on = evals[:nsamples//2]
        evals_off = evals[nsamples//2:][::-1]
        return np.mean(evals_on - evals_off)
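The ime method above is the per-feature Monte Carlo estimator from Štrumbelj and Kononenko: for each pair of samples it draws a random feature ordering and a random background row, builds an "on"/"off" pair that differ only in whether feature j comes from x or from the background row, and averages the differences in model output. A minimal standalone sketch of that estimator (the function name and the 1-D x are illustrative, not part of the class above):

def sampling_shap_single_feature(f, x, X_background, j, nsamples=1000):
    # Monte Carlo estimate of the Shapley value of feature j for a single 1-D instance x
    diffs = []
    for _ in range(nsamples):
        perm = np.random.permutation(x.shape[0])                    # random feature ordering
        pos = np.where(perm == j)[0][0]
        b = X_background[np.random.randint(X_background.shape[0])]  # random background row
        x_on, x_off = x.copy(), x.copy()
        x_on[perm[pos+1:]] = b[perm[pos+1:]]   # features after j come from the background
        x_off[perm[pos:]] = b[perm[pos:]]      # feature j itself also comes from the background
        diffs.append(f(x_on.reshape(1, -1))[0] - f(x_off.reshape(1, -1))[0])
    return np.mean(diffs)

The class computes the same quantity, but batches all the masked rows into a single call to f, which is why nsamples must be even there.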
In [91]:
N = 1000  # number of rows to explain per setting
tree_shap_times = []
sample_times = []
Ms = [20, 30, 40, 50, 60, 70, 80, 90, 100]
for M in tqdm(Ms):
    X = np.random.randn(N, M)
    y = np.random.randn(N)
    model = xgboost.train({"eta": 1}, xgboost.DMatrix(X, y), 1000)

    # Tree SHAP: record the time per explained sample
    e = shap.TreeExplainer(model)
    s = time.time()
    e.shap_values(X)
    tree_shap_times.append((time.time() - s)/1000)

    # sampling baseline: with the default nsamples = 1000 * M, one model-agnostic
    # explanation needs roughly N * M model evaluations, so predicting on that many
    # stacked rows approximates the per-explanation cost of sampling
    tmp = np.vstack([X for i in range(1 * M)])
    s = time.time()
    model.predict(xgboost.DMatrix(tmp))
    sample_times.append(time.time() - s)
In [88]:
np.array(tree_shap_times)*10000
Out[88]:
In [92]:
pl.plot(Ms[:-1], np.array(tree_shap_times[:-1])*10000 / (60))
pl.plot(Ms[:-1], np.array(sample_times[:-1])*10000 / (60))
pl.ylabel("minutes of runtime")
pl.xlabel("# of features")
pl.show()
In [162]:
np.mean(np.array(tree_shap_times[:-1])*10000)
Out[162]:
In [165]:
4995.5940246/5.27129322
Out[165]:
In [170]:
sample_times[-5]/tree_shap_times[-5]
Out[170]:
In [164]:
np.mean(np.array(sample_times[:-1])*10000)
Out[164]:
In [163]:
f = pl.figure(figsize=(10,3))
pl.subplot(1, 2, 1)
pl.plot(
    Ms[:-1], np.array(sample_times[:-1])*10000 / (60),
    label="Model agnostic sampling",
    color="#7C52FF", linewidth=3
)
pl.plot(
    Ms[:-1], np.array(tree_shap_times[:-1])*10000 / (60),
    label="Tree SHAP", color="#1E88E5", linewidth=3
)
pl.ylabel("minutes of runtime\nexplaining 10k predictions")
pl.xlabel("# of features in the model")
pl.legend()
#pl.savefig("runtime.pdf")
#pl.show()
pl.subplot(1, 2, 2)
pl.plot(
    Ms[:-1], (ime_std[:-1] / ime_m[:-1])*100, "--",
    label="IME", color="#7C52FF", linewidth=3
)
pl.plot(
    Ms[:-1], (kernel_shap_std[:-1] / kernel_shap_m[:-1])*100,
    label="Kernel SHAP",
    color="#7C52FF", linewidth=3
)
pl.plot(
    Ms[:-1], np.zeros(len(Ms)-1),
    label="Tree SHAP",
    color="#1E88E5", linewidth=3
)
pl.ylabel("Std. deviation as % of magnitude")
pl.xlabel("# of features in the model")
pl.legend(loc="upper left")
pl.savefig("perf.pdf")
pl.show()
In [146]:
pl.plot(
    Ms[:-1], (kernel_shap_std[:-1] / kernel_shap_m[:-1])*100,
    label="Kernel SHAP",
    color="#7C52FF", linewidth=3
)
pl.plot(
    Ms[:-1], (ime_std[:-1] / ime_m[:-1])*100, "--",
    label="IME", color="#7C52FF", linewidth=3
)
pl.ylabel("Std. deviation as % of magnitude")
pl.xlabel("# of features")
pl.legend(loc="upper left")
#pl.savefig("std_dev.pdf")
pl.show()
In [105]:
np.mean(kernel_shap_std[:-1] / kernel_shap_m[:-1])
Out[105]:
In [106]:
np.mean(ime_std[:-1] / ime_m[:-1])
Out[106]:
In [146]:
pl.plot(
    Ms[:-1], (ime_std[:-1] / ime_m[:-1])*100,
    label="IME", color="#7C52FF", linewidth=3
)
pl.plot(
    Ms[:-1], (kernel_shap_std[:-1] / kernel_shap_m[:-1])*100,
    label="Kernel SHAP",
    color="#7C52FF", linewidth=3
)
pl.plot(
    Ms[:-1], (ime_std[:-1] / ime_m[:-1])*100, "--",
    label="IME", color="#7C52FF", linewidth=3
)
pl.ylabel("Std. deviation as % of magnitude")
pl.xlabel("# of features")
pl.legend(loc="upper left")
#pl.savefig("std_dev.pdf")
pl.show()
In [97]:
pl.plot(Ms, kernel_shap_std / kernel_shap_m)
pl.plot(Ms, ime_std / ime_m)
pl.ylabel("Std. deviation / magnitude")
pl.xlabel("# of features")
pl.show()
In [104]:
np.array(sample_times) / np.array(tree_shap_times)
Out[104]:
In [61]:
pl.plot(Ms, np.array(tree_shap_times)*10000 / (60*60))
pl.plot(Ms, np.array(sample_times)*10000 / (60*60))
pl.ylabel("hours of runtime")
pl.xlabel("# of features")
pl.show()
In [50]:
pl.semilogy(Ms, tree_shap_times)
pl.semilogy(Ms, sample_times)
pl.show()
In [42]:
2.1067/0.00056
Out[42]:
In [29]:
s = time.time()
model.predict(xgboost.DMatrix(X[:100,:]))
time.time() - s
Out[29]:
In [ ]:
N = 1000
X_full = np.random.randn(N, 100)
y = np.random.randn(N)

tree_shap_times = []
kernel_shap_times = []
ime_times = []
tree_shap_std = []
kernel_shap_std = []
kernel_shap_m = []
ime_std = []
ime_m = []
for M in [20, 30, 40, 50, 60, 70, 80, 90, 100]:
    print("\nM", M)
    X = X_full[:, :M]
    model = xgboost.train({"eta": 1}, xgboost.DMatrix(X, y), 1000)

    def f(x):
        return model.predict(xgboost.DMatrix(x))

    # Tree SHAP: exact values, so its std. dev. across repeated runs is zero
    e = shap.TreeExplainer(model)
    start = time.time()
    e.shap_values(X)
    iter_time = (time.time() - start)/X.shape[0]
    tree_shap_times.append(iter_time)
    tree_shap_std.append(0)

    # Kernel SHAP: explain the same instance 50 times to measure estimate variability
    e = shap.KernelExplainer(f, X.mean(0).reshape(1,M))
    nsamples = 1000 * M
    start = time.time()
    out = np.vstack([e.shap_values(X[:1,:], silent=True, nsamples=nsamples) for i in range(50)])
    std_dev = out.std(0)[:-1].mean()
    mval = np.abs(out.mean(0))[:-1].mean()
    kernel_shap_m.append(mval)
    iter_time = (time.time() - start)/50
    kernel_shap_times.append(iter_time)
    kernel_shap_std.append(std_dev)
    print(std_dev, mval, std_dev / mval)
    print("KernelExplainer", iter_time)

    # IME (Shapley sampling values): same protocol as Kernel SHAP
    e = IMEExplainer(f, X.mean(0).reshape(1,M))
    nsamples = 1000 * M
    start = time.time()
    out = np.vstack([e.shap_values(X[:1,:], silent=True, nsamples=nsamples) for i in range(50)])
    std_dev = out.std(0)[:-1].mean()
    mval = np.abs(out.mean(0))[:-1].mean()
    ime_m.append(mval)
    iter_time = (time.time() - start)/50
    ime_times.append(iter_time)
    ime_std.append(std_dev)
    print(std_dev, mval, std_dev / mval)
    print("IMEExplainer", iter_time)

ime_std = np.array(ime_std)
ime_m = np.array(ime_m)
kernel_shap_std = np.array(kernel_shap_std)
kernel_shap_m = np.array(kernel_shap_m)
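The arrays collected here drive the variability plots in the figure cells above: the plotted quantity is just the standard deviation of the 50 repeated estimates expressed as a percentage of the mean estimate magnitude. For example (the variable names here are only for illustration):

ime_cv = 100 * ime_std / ime_m                       # "Std. deviation as % of magnitude" for IME
kernel_cv = 100 * kernel_shap_std / kernel_shap_m    # same metric for Kernel SHAP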
In [95]:
ime_std
Out[95]:
In [100]:
ime_m
Out[100]:
In [101]:
kernel_shap_std
Out[101]:
In [102]:
kernel_shap_m
Out[102]:
In [94]:
from tqdm import tqdm

tree_shap_times = []
kernel_shap_times = []
ime_times = []
nreps = 10
N = 1000
X_full = np.random.randn(N, 20)
y = np.random.randn(N)
for M in range(4, 8):
    tree_shap_time = 0
    kernel_shap_time = 0
    ime_time = 0
    for k in tqdm(range(nreps)):
        X = X_full[:, :M]
        model = xgboost.train({"eta": 1}, xgboost.DMatrix(X, y), 1000)

        def f(x):
            return model.predict(xgboost.DMatrix(x))

        # Tree SHAP: time explaining all N samples exactly
        start = time.time()
        shap_values = shap.TreeExplainer(model).shap_values(X)
        tree_shap_time += time.time() - start

        # target accuracy: 1/20th of the spread of the exact SHAP values across samples
        shap_stddev = shap_values.std(0)[:-1].mean()

        # Kernel SHAP: grow nsamples until repeated single-instance estimates hit the
        # target accuracy, then charge the time to explain 1000 samples at that setting
        e = shap.KernelExplainer(f, X.mean(0).reshape(1,M))
        nsamples = 200
        for j in range(2000):
            start = time.time()
            std_dev = np.vstack([e.shap_values(X[:1,:], silent=True, nsamples=nsamples) for i in range(50)]).std(0)[:-1].mean()
            iter_time = (time.time() - start)/50
            if std_dev < shap_stddev/20:
                kernel_shap_time += iter_time * 1000
                break
            nsamples += int(nsamples * 0.5)

        # IME: same search over nsamples
        e = IMEExplainer(f, X.mean(0).reshape(1,M))
        nsamples = 200
        for j in range(2000):
            start = time.time()
            std_dev = np.vstack([e.shap_values(X[:1,:], silent=True, nsamples=nsamples) for i in range(50)]).std(0)[:-1].mean()
            iter_time = (time.time() - start)/50
            if std_dev < shap_stddev/20:
                ime_time += iter_time * 1000
                break
            nsamples += int(nsamples * 0.5)

    tree_shap_times.append(tree_shap_time / nreps)
    kernel_shap_times.append(kernel_shap_time / nreps)
    ime_times.append(ime_time / nreps)
    print("TreeExplainer", tree_shap_times[-1])
    print("KernelExplainer", kernel_shap_times[-1])
    print("IMEExplainer", ime_times[-1])
In [96]:
model.predict(xgboost.DMatrix(X)).mean()
Out[96]:
In [97]:
shap.TreeExplainer(model).shap_values(X)
Out[97]:
In [68]:
e = shap.KernelExplainer(f, X.mean(0).reshape(1,M))
np.vstack([e.shap_values(X[:1,:], silent=True, nsamples=100) for i in range(50)]).std(0)[:-1].mean()
Out[68]:
In [ ]:
In [72]:
e = shap.KernelExplainer(f, X.mean(0).reshape(1,M))
nsamples = 200
print(shap_stddev/20)
for j in range(2000):
    print(nsamples)
    start = time.time()
    std_dev = np.vstack([e.shap_values(X[:1,:], silent=True, nsamples=nsamples) for i in range(50)]).std(0)[:-1].mean()
    iter_time = (time.time() - start)/50
    print(std_dev)
    if std_dev < shap_stddev/20:
        print(nsamples)
        break
    nsamples += int(nsamples * 0.2)
In [74]:
e = IMEExplainer(f, X.mean(0).reshape(1,M))
nsamples = 200
print(shap_stddev/20)
for j in range(2000):
    print()
    print(nsamples)
    start = time.time()
    std_dev = np.vstack([e.shap_values(X[:1,:], silent=True, nsamples=nsamples) for i in range(50)]).std(0)[:-1].mean()
    print("time", (time.time() - start)/50)
    print(std_dev)
    if std_dev < shap_stddev/20:
        print(nsamples)
        break
    nsamples += int(nsamples * 0.2)
In [75]:
0.56939 * 1000
Out[75]:
In [36]:
np.std([IMEExplainer(f, X.mean(0).reshape(1,M)).shap_values(X[:1,:], silent=True, nsamples=1000)[0,0] for i in range(10)])
Out[36]:
In [20]:
[shap.KernelExplainer(f, X.mean(0).reshape(1,M)).shap_values(X[:1,:], silent=True, nsamples=1000)[0,0] for i in range(100)]
Out[20]:
In [7]:
def f(x):
    return model.predict(xgboost.DMatrix(x))
start = time.time()
shap_values2 = shap.KernelExplainer(f, X.mean(0).reshape(1,M)).shap_values(X)
print(time.time() - start)
In [22]:
start = time.time()
IMEExplainer(f, X.mean(0).reshape(1,M)).shap_values(X)
print(time.time() - start)
In [ ]: