Dataset: https://archive.ics.uci.edu/ml/datasets/adult (the Adult dataset, also known as the "Census Income" dataset). Task: predict whether a person's income exceeds $50K/yr based on census data.
In [2]:
import pandas as pd
from sklearn.datasets import load_boston
from sklearn.model_selection import train_test_split
from xtoy import Toy
# Load the UCI Adult ("Census Income") data. The file has no header row, so
# the column names are supplied explicitly at read time.
df = pd.read_csv(
    "http://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data",
    header=None,
    names=[
        "Age", "WorkClass", "fnlwgt", "Education", "EducationNum",
        "MaritalStatus", "Occupation", "Relationship", "Race", "Gender",
        "CapitalGain", "CapitalLoss", "HoursPerWeek", "NativeCountry", "Income",
    ],
)
# Uncomment to iterate quickly on a reproducible 1% sample:
# df = df.sample(frac=0.01, random_state=1)

# Every column except the last is a feature; the last column is the label.
train_cols = df.columns[0:-1]
label = df.columns[-1]
X = df[train_cols]

# Binary target: 0 for "<=50K", 1 for everything else.
# The raw label is padded (it reads " <=50K"), so strip whitespace before
# comparing; otherwise any change in padding would silently turn every row
# into class 1.
y = (df[label].str.strip() != "<=50K").astype("int64")

# Fail loudly if the encoding collapsed to a single class (e.g. the label
# format changed), instead of fitting a model on a degenerate target.
assert y.nunique() == 2, (
    "expected both income classes in y, got: %r" % sorted(y.unique())
)

toy = Toy()
toy.fit(X, y)
In [4]:
from interpret import show
from interpret.perf import ROC
# ROC performance explanation for the model's predicted probabilities.
# NOTE(review): X, y are the same data `toy` was fitted on, so this shows
# training performance rather than held-out performance.
blackbox_perf = ROC(toy.predict_proba).explain_perf(
    X,
    y,
    name='toy',
)
show(blackbox_perf)
We use MorrisSensitivity to measure how sensitive the model's predictions are to each feature overall. Method reference: https://www.sciencedirect.com/science/article/pii/S0022169412008918
In [5]:
from interpret.blackbox import MorrisSensitivity
# Featurize X and wrap the result in a DataFrame whose columns carry the
# featurizer's output names, so the explainer works with named features.
# (`.A` presumably turns the featurizer's matrix output into a plain ndarray.)
trans_df = pd.DataFrame(
    data=toy.featurizer.transform(X).A,
    columns=toy.feature_names_,
)

# The explainer gets `best_evo.predict_proba` together with the already
# featurized data, rather than `toy.predict_proba` with the raw frame.
sensitivity = MorrisSensitivity(
    predict_fn=toy.best_evo.predict_proba,
    data=trans_df,
)
sensitivity_global = sensitivity.explain_global(name="Global Sensitivity")
show(sensitivity_global)
In [16]:
# Pose the question, then show the single row that the local explanation
# further down refers to (the first row of X).
print("Why Does this person displayed below earn less than 50k dollars?")
display(X.iloc[:1])
print("Let's use shap value to explain our prediction")
In [6]:
from interpret.blackbox import ShapKernel
import numpy as np
# Background data for the SHAP kernel: the per-feature median of the
# featurized dataset, kept as a single row (shape (1, n_features)).
background_val = np.median(
    toy.featurizer.transform(X).A,
    axis=0,
)[np.newaxis, :]

# The explainer works on featurized inputs, so it is given the inner
# model's predict_proba and the featurizer's output names.
shap = ShapKernel(
    predict_fn=toy.best_evo.predict_proba,
    data=background_val,
    feature_names=toy.feature_names_,
)
In [18]:
from ipywidgets import IntProgress
# Explain the model's prediction for the first row only: featurize X, keep
# the first featurized row, and pair it with the first label.
shap_local = shap.explain_local(
    toy.featurizer.transform(X).A[:1],
    y.iloc[:1],
    name='SHAP',
)
show(shap_local)
The analysis above suggests that, although this person's EducationNum is 13, his Education contains the word 'bachelors', and his age is 39, his capital gain is only 2174.
In [19]:
# Income distribution among people whose capital gain is at most 2200
# (just above the 2174 of the person examined above).
df.loc[df['CapitalGain'] <= 2200, 'Income'].value_counts()
Out[19]: