In [23]:
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.svm import LinearSVC, LinearSVR
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
from sklearn.datasets import load_boston, load_iris, load_diabetes, make_classification, make_regression
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.metrics import r2_score
import numpy as np
import pandas as pd
from scipy.stats import spearmanr, pearsonr
import eli5
from eli5.sklearn import PermutationImportance
In [19]:
def dcg_score(y_true, y_score, k=10):
    """Discounted cumulative gain of the top-``k`` items ranked by ``y_score``.

    Parameters
    ----------
    y_true : array-like of relevance grades (gain is ``2**grade - 1``).
    y_score : array-like of predicted scores used to rank the items.
    k : number of top-ranked items to include.
    """
    order = np.argsort(y_score)[::-1]
    y_true = np.take(y_true, order[:k])
    gains = 2 ** y_true - 1
    # highest rank is 1 so +2 instead of +1
    discounts = np.log2(np.arange(len(y_true)) + 2)
    return np.sum(gains / discounts)


def ndcg_score(y_true, y_score, k=10):
    """Normalized DCG in [0, 1]; 1.0 means the ranking induced by ``y_score``
    is ideal with respect to ``y_true``.

    Returns 0.0 when the ideal DCG is zero (all relevance grades zero)
    instead of dividing by zero.
    """
    best = dcg_score(y_true, y_true, k)
    actual = dcg_score(y_true, y_score, k)
    # Guard against 0/0 when every relevance grade is zero.
    if best == 0:
        return 0.0
    return actual / best
In [46]:
def get_classification_datasets():
    """Return (name, X, y, feature_names) tuples for the classification
    benchmarks; ``feature_names`` is None for synthetic datasets."""
    iris = load_iris()
    # Binarize iris: class 0 vs. the rest.
    datasets = [('iris_binary', iris.data, iris.target != 0, iris.feature_names)]
    synthetic_specs = [
        ('CLF(n_informative=5, n_redundant=0)',
         dict(n_informative=5, n_redundant=0)),
        ('CLF(n_informative=5, n_redundant=4)',
         dict(n_informative=5, n_redundant=4)),
        ('CLF(n_informative=1, n_redundant=4)',
         dict(n_informative=1, n_redundant=4, n_clusters_per_class=1)),
        ('CLF(n_informative=20, n_redundant=0)',
         dict(n_informative=20, n_redundant=0)),
    ]
    for label, kwargs in synthetic_specs:
        X, y = make_classification(**kwargs)
        datasets.append((label, X, y, None))
    return datasets
def get_regression_datasets():
    """Return (name, X, y, feature_names) tuples for the regression
    benchmarks; ``feature_names`` is None for datasets without named columns.
    """
    res = []
    # NOTE(review): load_boston was deprecated in scikit-learn 1.0 and removed
    # in 1.2 -- on newer versions the top-of-notebook import fails; consider
    # fetch_california_housing as a replacement.
    data = load_boston()
    res.append(('boston', data.data, data.target, data.feature_names))
    data = load_diabetes()
    # Label typo fixed: 'diabetese' -> 'diabetes'.
    res.append(('diabetes', data.data, data.target, None))
    # Synthetic regression problems probing different feature structures.
    X, y = make_regression(n_informative=5)
    res.append(('REG(n_informative=5)', X, y, None))
    X, y = make_regression(n_informative=5, effective_rank=2)
    res.append(('REG(n_informative=5, effective_rank=2)', X, y, None))
    X, y = make_regression(n_informative=1)
    res.append(('REG(n_informative=1)', X, y, None))
    X, y = make_regression(n_informative=20)
    res.append(('REG(n_informative=20)', X, y, None))
    return res
In [47]:
def get_classifiers():
    """Classifiers to benchmark, all with default hyper-parameters."""
    classifier_types = (
        LogisticRegression,
        LinearSVC,
        RandomForestClassifier,
        DecisionTreeClassifier,
    )
    return [cls() for cls in classifier_types]
def get_regressors():
    """Regressors to benchmark; linear models are wrapped in a scaling
    pipeline so their coefficients are comparable across features."""
    def _scaled(estimator):
        # Standardize inputs before fitting the linear estimator.
        return make_pipeline(StandardScaler(), estimator)

    return [
        _scaled(LinearRegression()),
        _scaled(LinearSVR()),
        RandomForestRegressor(),
        DecisionTreeRegressor(),
    ]
In [48]:
def get_explanations(est, X, y, feature_names):
    """Build a DataFrame comparing three feature-weight estimates for a
    fitted estimator: eli5 inspection weights, permutation importance on
    the prefit model, and cross-validated permutation importance.

    Returns a frame with columns ['w_inspect', 'w_pi', 'w_picv'], rows
    restricted to features present in all three, each column L1-normalized.
    """
    # Built-in inspection weights (e.g. coefficients / importances) via eli5.
    df_inspect = eli5.explain_weights_df(est, feature_names=feature_names, top=100)
    # Some estimators apparently yield a MultiIndex here (presumably a
    # target level on top of the feature level) -- drop the outer level so
    # all estimators index by feature only. TODO confirm which estimators.
    if isinstance(df_inspect.index, pd.MultiIndex):
        df_inspect.index = df_inspect.index.droplevel()
    df_inspect.index.name = None
    # Permutation importance on the already-fitted estimator.
    pi = PermutationImportance(est, cv='prefit', n_iter=10).fit(X, y)
    df_pi = eli5.explain_weights_df(pi, feature_names=feature_names, top=100)
    # Permutation importance with 5-fold cross-validated refitting.
    pi_cv = PermutationImportance(est, cv=5, n_iter=10).fit(X, y)
    df_picv = eli5.explain_weights_df(pi_cv, feature_names=feature_names, top=100)
    # Align the three weight series by feature.
    df = pd.concat([df_inspect.weight, df_pi.weight, df_picv.weight], axis=1)
    df.columns=['w_inspect', 'w_pi', 'w_picv']
    # Keep only features present in all three sources, then L1-normalize
    # each column so the weight scales are comparable.
    df = df.dropna() / df.abs().sum()
    return df
In [65]:
def get_scores(df):
    """Score how well each permutation-importance column agrees with the
    absolute inspection weights, using rank and value-based metrics.

    Returns {'PI': metrics, 'PICV': metrics} where each metrics dict has
    keys 'SpearmanR', 'NDCG', 'NDCG@5', 'Pearson', 'L2'.
    """
    reference = df.w_inspect.abs().values

    def _metrics(weights):
        values = weights.values
        return {
            'SpearmanR': spearmanr(reference, values).correlation,
            'NDCG': ndcg_score(reference, values, 100000),
            'NDCG@5': ndcg_score(reference, values, 5),
            'Pearson': pearsonr(reference, values)[0],
            'L2': np.linalg.norm(reference - values),
        }

    return {
        'PI': _metrics(df.w_pi),
        'PICV': _metrics(df.w_picv),
    }
In [66]:
def get_name(est):
    """Human-readable class name of an estimator, unwrapping the final
    step when given a Pipeline."""
    estimator = est.steps[-1][1] if isinstance(est, Pipeline) else est
    return type(estimator).__name__
dfs = []          # (estimator_name, dataset_name, weights_df) triples
estimators = {}   # fitted estimators keyed by (estimator_name, dataset_name)
scores = []       # (estimator_name, dataset_name, score_type, metrics_dict)


def _append(X, y, feature_names, dataset_name, est):
    # Fit, explain, and score one estimator on one dataset, accumulating
    # the results into the module-level containers above.
    est.fit(X, y)
    df = get_explanations(est, X, y, feature_names)
    name = get_name(est)
    estimators[name, dataset_name] = est
    dfs.append((name, dataset_name, df))
    # get_scores yields one metrics dict per score type ('PI', 'PICV').
    for k, v in get_scores(df).items():
        scores.append((name, dataset_name, k, v))
    print("done: {} {}".format(name, dataset_name))


# Run every classifier over every classification dataset ...
for (dataset_name, X, y, feature_names) in get_classification_datasets():
    for clf in get_classifiers():
        _append(X, y, feature_names, dataset_name, clf)
# ... and every regressor over every regression dataset.
for (dataset_name, X, y, feature_names) in get_regression_datasets():
    for reg in get_regressors():
        _append(X, y, feature_names, dataset_name, reg)
In [67]:
# One row per (estimator, dataset, score-type): the metric dicts become
# the numeric columns, annotated with their identifying labels.
df = pd.DataFrame([record[3] for record in scores]).assign(
    estimator=[record[0] for record in scores],
    dataset=[record[1] for record in scores],
    type=[record[2] for record in scores],
)
df
Out[67]:
In [68]:
# Keep only rows scored against the plain (prefit) permutation importance.
df_pi = df.loc[df['type'] == "PI"]
df_pi
Out[68]:
In [71]:
# Average each numeric metric per estimator.  numeric_only=True restricts
# the aggregation to the metric columns; without it pandas >= 2.0 raises a
# TypeError when mean() hits the string 'dataset'/'type' columns.
df_pi.groupby('estimator').mean(numeric_only=True)
Out[71]:
In [72]:
# Average each numeric metric per dataset.  numeric_only=True restricts
# the aggregation to the metric columns; without it pandas >= 2.0 raises a
# TypeError when mean() hits the string 'estimator'/'type' columns.
df_pi.groupby('dataset').mean(numeric_only=True)
Out[72]:
In [ ]: