In [1]:
from sklearn import datasets
import pandas as pd
import numpy as np
%matplotlib inline

Build a model


In [2]:
ds = datasets.load_breast_cancer();
from sklearn.tree import DecisionTreeClassifier
clf = DecisionTreeClassifier(min_samples_leaf=30, random_state = 1960)
NC = 12

X = ds.data[:,0:NC]
y = ds.target

from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.5, random_state=1960)

clf.fit(X_train , y_train)


Out[2]:
DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=30, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=1960,
            splitter='best')

Decision Tree Plot


In [3]:
import graphviz 
from sklearn import tree

dot_data = tree.export_graphviz(clf, out_file=None, 
                         feature_names=ds.feature_names[0:NC],  
                         class_names=ds.target_names,  
                         filled=True, rounded=True,  
                         special_characters=True)  
graph = graphviz.Source(dot_data)  
graph


Out[3]:
Tree 0 mean concave points ≤ 0.051 gini = 0.462 samples = 284 value = [103, 181] class = benign 1 mean area ≤ 571.85 gini = 0.076 samples = 177 value = [7, 170] class = benign 0->1 True 6 mean perimeter ≤ 98.32 gini = 0.184 samples = 107 value = [96, 11] class = malignant 0->6 False 2 radius error ≤ 0.354 gini = 0.014 samples = 141 value = [1, 140] class = benign 1->2 5 gini = 0.278 samples = 36 value = [6, 30] class = benign 1->5 3 gini = 0.0 samples = 111 value = [0, 111] class = benign 2->3 4 gini = 0.064 samples = 30 value = [1, 29] class = benign 2->4 7 gini = 0.464 samples = 30 value = [19, 11] class = malignant 6->7 8 gini = 0.0 samples = 77 value = [77, 0] class = malignant 6->8

sklearn_explain approach


In [4]:
# Explain the score = ln(p(1) / (1 - p(1)))


import sklearn_explain.explainer as expl
lExplainer = expl.cModelScoreExplainer(clf)
lExplainer.mSettings.mFeatureNames = ds.feature_names[0:NC]
lExplainer.mSettings.mExplanationOrder = 2
    
lExplainer.fit(X_train)
df_rc = lExplainer.explain(X_test)

# print(df_rc.columns)


USING_LOG_ODDS_AS_SCORE
SCORE_QUANTILES {0: -inf, 1: -11.512915464920228, 2: 1.6094379124341005, 3: 3.3672958299864741, 4: 11.512915464924779}
CONST_SCORE_DETECTIOM -11.5129154649 11.5129154649 False
NON_USED_FEATURES ['mean radius', 'mean texture', 'mean smoothness', 'mean compactness', 'mean concavity', 'mean symmetry', 'mean fractal dimension', 'texture error']
USED_FEATURES ['mean perimeter', 'mean area', 'mean concave points', 'radius error']
FEATURE_QUANTILES mean perimeter {0: -inf, 1: 71.884, 2: 80.623999999999995, 3: 90.566000000000003, 4: 111.44}
FEATURE_QUANTILES mean area {0: -inf, 1: 387.10000000000002, 2: 477.30000000000001, 3: 602.0, 4: 922.44000000000005}
FEATURE_QUANTILES mean concave points {0: -inf, 1: 0.017368000000000001, 2: 0.027373999999999999, 3: 0.047373999999999965, 4: 0.084198000000000009}
FEATURE_QUANTILES radius error {0: -inf, 1: 0.21351999999999999, 2: 0.26870000000000005, 3: 0.36443999999999999, 4: 0.55352000000000001}

In [5]:
df_rc_2 = lExplainer.explain(X_test[0].reshape(1, -1))

In [6]:
X_test[0].reshape(1, -1)


Out[6]:
array([[  1.17600000e+01,   2.16000000e+01,   7.47200000e+01,
          4.27900000e+02,   8.63700000e-02,   4.96600000e-02,
          1.65700000e-02,   1.11500000e-02,   1.49500000e-01,
          5.88800000e-02,   4.06200000e-01,   1.21000000e+00]])

In [7]:
df_rc_2


Out[7]:
mean radius mean texture mean perimeter mean area mean smoothness mean compactness mean concavity mean concave points mean symmetry mean fractal dimension ... reason_1_idx detailed_reason_1 reason_2_idx detailed_reason_2 reason_3_idx detailed_reason_3 reason_4_idx detailed_reason_4 reason_5_idx detailed_reason_5
0 11.76 21.6 74.72 427.9 0.08637 0.04966 0.01657 0.01115 0.1495 0.05888 ... 3 [(387.1 < 'mean area' <= 477.3), (-inf < 'mean... 1 [(71.884 < 'mean perimeter' <= 80.624), (-inf ... 5 [(-inf < 'mean concave points' <= 0.017368), (... 0 [(71.884 < 'mean perimeter' <= 80.624), (387.1... 4 [(387.1 < 'mean area' <= 477.3), (0.36444 < 'r...

1 rows × 51 columns


In [8]:
df_rc_2[[col for col in df_rc_2.columns if col.startswith('detailed')]]


Out[8]:
detailed_reason_1 detailed_reason_2 detailed_reason_3 detailed_reason_4 detailed_reason_5
0 [(387.1 < 'mean area' <= 477.3), (-inf < 'mean... [(71.884 < 'mean perimeter' <= 80.624), (-inf ... [(-inf < 'mean concave points' <= 0.017368), (... [(71.884 < 'mean perimeter' <= 80.624), (387.1... [(387.1 < 'mean area' <= 477.3), (0.36444 < 'r...

Automatic Feature Binning

This binning was built using a equi-frequency approach (5 bins , all bins contain the same number of rows)


In [9]:
lExplainer.mImplementation.mFeatureQuantiles


Out[9]:
{'mean area': {0: -inf,
  1: 387.10000000000002,
  2: 477.30000000000001,
  3: 602.0,
  4: 922.44000000000005},
 'mean concave points': {0: -inf,
  1: 0.017368000000000001,
  2: 0.027373999999999999,
  3: 0.047373999999999965,
  4: 0.084198000000000009},
 'mean perimeter': {0: -inf,
  1: 71.884,
  2: 80.623999999999995,
  3: 90.566000000000003,
  4: 111.44},
 'radius error': {0: -inf,
  1: 0.21351999999999999,
  2: 0.26870000000000005,
  3: 0.36443999999999999,
  4: 0.55352000000000001}}

Customized Feature binning

We use a second explainer and we customize the binning so that the decision tree defines the bin (lower) limits


In [10]:
lFeature_Quantiles = {
'mean area': {0: -np.inf,
  1: 571.85},
'mean concave points': {0: -np.inf,
  1: 0.51},
'mean perimeter': {0: -np.inf,
  1: 98.32},
'radius error': {0: -np.inf,
  1: 0.354}
}

In [11]:
lFeature_Quantiles


Out[11]:
{'mean area': {0: -inf, 1: 571.85},
 'mean concave points': {0: -inf, 1: 0.51},
 'mean perimeter': {0: -inf, 1: 98.32},
 'radius error': {0: -inf, 1: 0.354}}

In [12]:
lExplainer2 = expl.cModelScoreExplainer(clf)
lExplainer2.mSettings.mFeatureNames = ds.feature_names[0:NC]
lExplainer2.mSettings.mCustomFeatureQuantiles = lFeature_Quantiles
lExplainer2.mSettings.mExplanationOrder = 2
    
lExplainer2.fit(X_train)
df_rc2 = lExplainer2.explain(X_test)

# print(df_rc2.columns)

df_rc_2 = lExplainer2.explain(X_test[0].reshape(1, -1))
df_rc_2[[col for col in df_rc_2.columns if col.startswith('detailed')]]


USING_LOG_ODDS_AS_SCORE
SCORE_QUANTILES {0: -inf, 1: -11.512915464920228, 2: 1.6094379124341005, 3: 3.3672958299864741, 4: 11.512915464924779}
CONST_SCORE_DETECTIOM -11.5129154649 11.5129154649 False
NON_USED_FEATURES ['mean radius', 'mean texture', 'mean smoothness', 'mean compactness', 'mean concavity', 'mean symmetry', 'mean fractal dimension', 'texture error']
USED_FEATURES ['mean perimeter', 'mean area', 'mean concave points', 'radius error']
CUSTOM_FEATURE_QUANTILES mean perimeter {0: -inf, 1: 98.32}
FEATURE_QUANTILES mean perimeter {0: -inf, 1: 98.32}
CUSTOM_FEATURE_QUANTILES mean area {0: -inf, 1: 571.85}
FEATURE_QUANTILES mean area {0: -inf, 1: 571.85}
CUSTOM_FEATURE_QUANTILES mean concave points {0: -inf, 1: 0.51}
FEATURE_QUANTILES mean concave points {0: -inf, 1: 0.51}
CUSTOM_FEATURE_QUANTILES radius error {0: -inf, 1: 0.354}
FEATURE_QUANTILES radius error {0: -inf, 1: 0.354}
Out[12]:
detailed_reason_1 detailed_reason_2 detailed_reason_3 detailed_reason_4 detailed_reason_5
0 [(-inf < 'mean perimeter' <= 98.32), (-inf < '... [(-inf < 'mean area' <= 571.85), (-inf < 'mean... [(-inf < 'mean perimeter' <= 98.32), (-inf < '... [(-inf < 'mean area' <= 571.85), (0.354 < 'rad... [(-inf < 'mean perimeter' <= 98.32), (0.354 < ...

Score Customized Binning

Score qbining can also be customized .... (interest ?)


In [13]:
lExplainer.mImplementation.mScoreQuantiles


Out[13]:
{0: -inf,
 1: -11.512915464920228,
 2: 1.6094379124341005,
 3: 3.3672958299864741,
 4: 11.512915464924779}

In [14]:
lScore_Quantiles =  {
    0: -np.inf,
     1: 0.
}

In [15]:
lExplainer3 = expl.cModelScoreExplainer(clf)
lExplainer3.mSettings.mFeatureNames = ds.feature_names[0:NC]
lExplainer3.mSettings.mCustomFeatureQuantiles = lFeature_Quantiles
lExplainer3.mSettings.mCustomScoreQuantiles = lScore_Quantiles
lExplainer3.mSettings.mExplanationOrder = 2
    
lExplainer3.fit(X_train)
df_rc3 = lExplainer2.explain(X_test)

# print(df_rc2.columns)

df_rc_3 = lExplainer3.explain(X_test[0].reshape(1, -1))
df_rc_3[[col for col in df_rc_3.columns if col.startswith('detailed')]]


USING_LOG_ODDS_AS_SCORE
CUSTOM_SCORE_QUANTILES {0: -inf, 1: 0.0}
SCORE_QUANTILES {0: -inf, 1: 0.0}
CONST_SCORE_DETECTIOM -11.5129154649 11.5129154649 False
NON_USED_FEATURES ['mean radius', 'mean texture', 'mean smoothness', 'mean compactness', 'mean concavity', 'mean symmetry', 'mean fractal dimension', 'texture error']
USED_FEATURES ['mean perimeter', 'mean area', 'mean concave points', 'radius error']
CUSTOM_FEATURE_QUANTILES mean perimeter {0: -inf, 1: 98.32}
FEATURE_QUANTILES mean perimeter {0: -inf, 1: 98.32}
CUSTOM_FEATURE_QUANTILES mean area {0: -inf, 1: 571.85}
FEATURE_QUANTILES mean area {0: -inf, 1: 571.85}
CUSTOM_FEATURE_QUANTILES mean concave points {0: -inf, 1: 0.51}
FEATURE_QUANTILES mean concave points {0: -inf, 1: 0.51}
CUSTOM_FEATURE_QUANTILES radius error {0: -inf, 1: 0.354}
FEATURE_QUANTILES radius error {0: -inf, 1: 0.354}
Out[15]:
detailed_reason_1 detailed_reason_2 detailed_reason_3 detailed_reason_4 detailed_reason_5
0 [(-inf < 'mean area' <= 571.85), (0.354 < 'rad... [(-inf < 'mean area' <= 571.85), (-inf < 'mean... [(-inf < 'mean perimeter' <= 98.32), (-inf < '... [(-inf < 'mean perimeter' <= 98.32), (-inf < '... [(-inf < 'mean concave points' <= 0.51), (0.35...

In [ ]: