In [1]:

    
from sklearn import datasets
import pandas as pd
import numpy as np
%matplotlib inline

Build a model



In [2]:

    
ds = datasets.load_breast_cancer();
from sklearn.tree import DecisionTreeClassifier
clf = DecisionTreeClassifier(min_samples_leaf=30, random_state = 1960)
NC = 12

X = ds.data[:,0:NC]
y = ds.target

from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.5, random_state=1960)

clf.fit(X_train , y_train)









    Out[2]:





DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=30, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=1960,
            splitter='best')

Decision Tree Plot



In [3]:

    
import graphviz 
from sklearn import tree

dot_data = tree.export_graphviz(clf, out_file=None, 
                         feature_names=ds.feature_names[0:NC],  
                         class_names=ds.target_names,  
                         filled=True, rounded=True,  
                         special_characters=True)  
graph = graphviz.Source(dot_data)  
graph









    Out[3]:

sklearn_explain approach



In [4]:

    
# Explain the score = ln(p(1) / (1 - p(1)))


import sklearn_explain.explainer as expl
lExplainer = expl.cModelScoreExplainer(clf)
lExplainer.mSettings.mFeatureNames = ds.feature_names[0:NC]
lExplainer.mSettings.mExplanationOrder = 2
    
lExplainer.fit(X_train)
df_rc = lExplainer.explain(X_test)

# print(df_rc.columns)









    



USING_LOG_ODDS_AS_SCORE
SCORE_QUANTILES {0: -inf, 1: -11.512915464920228, 2: 1.6094379124341005, 3: 3.3672958299864741, 4: 11.512915464924779}
CONST_SCORE_DETECTIOM -11.5129154649 11.5129154649 False
NON_USED_FEATURES ['mean radius', 'mean texture', 'mean smoothness', 'mean compactness', 'mean concavity', 'mean symmetry', 'mean fractal dimension', 'texture error']
USED_FEATURES ['mean perimeter', 'mean area', 'mean concave points', 'radius error']
FEATURE_QUANTILES mean perimeter {0: -inf, 1: 71.884, 2: 80.623999999999995, 3: 90.566000000000003, 4: 111.44}
FEATURE_QUANTILES mean area {0: -inf, 1: 387.10000000000002, 2: 477.30000000000001, 3: 602.0, 4: 922.44000000000005}
FEATURE_QUANTILES mean concave points {0: -inf, 1: 0.017368000000000001, 2: 0.027373999999999999, 3: 0.047373999999999965, 4: 0.084198000000000009}
FEATURE_QUANTILES radius error {0: -inf, 1: 0.21351999999999999, 2: 0.26870000000000005, 3: 0.36443999999999999, 4: 0.55352000000000001}



In [5]:

    
df_rc_2 = lExplainer.explain(X_test[0].reshape(1, -1))



In [6]:

    
X_test[0].reshape(1, -1)









    Out[6]:





array([[  1.17600000e+01,   2.16000000e+01,   7.47200000e+01,
          4.27900000e+02,   8.63700000e-02,   4.96600000e-02,
          1.65700000e-02,   1.11500000e-02,   1.49500000e-01,
          5.88800000e-02,   4.06200000e-01,   1.21000000e+00]])



In [7]:

    
df_rc_2









    Out[7]:







  
    
      
      mean radius
      mean texture
      mean perimeter
      mean area
      mean smoothness
      mean compactness
      mean concavity
      mean concave points
      mean symmetry
      mean fractal dimension
      ...
      reason_1_idx
      detailed_reason_1
      reason_2_idx
      detailed_reason_2
      reason_3_idx
      detailed_reason_3
      reason_4_idx
      detailed_reason_4
      reason_5_idx
      detailed_reason_5
    
  
  
    
      0
      11.76
      21.6
      74.72
      427.9
      0.08637
      0.04966
      0.01657
      0.01115
      0.1495
      0.05888
      ...
      3
      [(387.1 < 'mean area' <= 477.3), (-inf < 'mean...
      1
      [(71.884 < 'mean perimeter' <= 80.624), (-inf ...
      5
      [(-inf < 'mean concave points' <= 0.017368), (...
      0
      [(71.884 < 'mean perimeter' <= 80.624), (387.1...
      4
      [(387.1 < 'mean area' <= 477.3), (0.36444 < 'r...
    
  

1 rows × 51 columns



In [8]:

    
df_rc_2[[col for col in df_rc_2.columns if col.startswith('detailed')]]









    Out[8]:







  
    
      
      detailed_reason_1
      detailed_reason_2
      detailed_reason_3
      detailed_reason_4
      detailed_reason_5
    
  
  
    
      0
      [(387.1 < 'mean area' <= 477.3), (-inf < 'mean...
      [(71.884 < 'mean perimeter' <= 80.624), (-inf ...
      [(-inf < 'mean concave points' <= 0.017368), (...
      [(71.884 < 'mean perimeter' <= 80.624), (387.1...
      [(387.1 < 'mean area' <= 477.3), (0.36444 < 'r...

Automatic Feature Binning

This binning was built using a equi-frequency approach (5 bins , all bins contain the same number of rows)



In [9]:

    
lExplainer.mImplementation.mFeatureQuantiles









    Out[9]:





{'mean area': {0: -inf,
  1: 387.10000000000002,
  2: 477.30000000000001,
  3: 602.0,
  4: 922.44000000000005},
 'mean concave points': {0: -inf,
  1: 0.017368000000000001,
  2: 0.027373999999999999,
  3: 0.047373999999999965,
  4: 0.084198000000000009},
 'mean perimeter': {0: -inf,
  1: 71.884,
  2: 80.623999999999995,
  3: 90.566000000000003,
  4: 111.44},
 'radius error': {0: -inf,
  1: 0.21351999999999999,
  2: 0.26870000000000005,
  3: 0.36443999999999999,
  4: 0.55352000000000001}}

Customized Feature binning

We use a second explainer and we customize the binning so that the decision tree defines the bin (lower) limits



In [10]:

    
lFeature_Quantiles = {
'mean area': {0: -np.inf,
  1: 571.85},
'mean concave points': {0: -np.inf,
  1: 0.51},
'mean perimeter': {0: -np.inf,
  1: 98.32},
'radius error': {0: -np.inf,
  1: 0.354}
}



In [11]:

    
lFeature_Quantiles









    Out[11]:





{'mean area': {0: -inf, 1: 571.85},
 'mean concave points': {0: -inf, 1: 0.51},
 'mean perimeter': {0: -inf, 1: 98.32},
 'radius error': {0: -inf, 1: 0.354}}



In [12]:

    
lExplainer2 = expl.cModelScoreExplainer(clf)
lExplainer2.mSettings.mFeatureNames = ds.feature_names[0:NC]
lExplainer2.mSettings.mCustomFeatureQuantiles = lFeature_Quantiles
lExplainer2.mSettings.mExplanationOrder = 2
    
lExplainer2.fit(X_train)
df_rc2 = lExplainer2.explain(X_test)

# print(df_rc2.columns)

df_rc_2 = lExplainer2.explain(X_test[0].reshape(1, -1))
df_rc_2[[col for col in df_rc_2.columns if col.startswith('detailed')]]









    



USING_LOG_ODDS_AS_SCORE
SCORE_QUANTILES {0: -inf, 1: -11.512915464920228, 2: 1.6094379124341005, 3: 3.3672958299864741, 4: 11.512915464924779}
CONST_SCORE_DETECTIOM -11.5129154649 11.5129154649 False
NON_USED_FEATURES ['mean radius', 'mean texture', 'mean smoothness', 'mean compactness', 'mean concavity', 'mean symmetry', 'mean fractal dimension', 'texture error']
USED_FEATURES ['mean perimeter', 'mean area', 'mean concave points', 'radius error']
CUSTOM_FEATURE_QUANTILES mean perimeter {0: -inf, 1: 98.32}
FEATURE_QUANTILES mean perimeter {0: -inf, 1: 98.32}
CUSTOM_FEATURE_QUANTILES mean area {0: -inf, 1: 571.85}
FEATURE_QUANTILES mean area {0: -inf, 1: 571.85}
CUSTOM_FEATURE_QUANTILES mean concave points {0: -inf, 1: 0.51}
FEATURE_QUANTILES mean concave points {0: -inf, 1: 0.51}
CUSTOM_FEATURE_QUANTILES radius error {0: -inf, 1: 0.354}
FEATURE_QUANTILES radius error {0: -inf, 1: 0.354}






    Out[12]:







  
    
      
      detailed_reason_1
      detailed_reason_2
      detailed_reason_3
      detailed_reason_4
      detailed_reason_5
    
  
  
    
      0
      [(-inf < 'mean perimeter' <= 98.32), (-inf < '...
      [(-inf < 'mean area' <= 571.85), (-inf < 'mean...
      [(-inf < 'mean perimeter' <= 98.32), (-inf < '...
      [(-inf < 'mean area' <= 571.85), (0.354 < 'rad...
      [(-inf < 'mean perimeter' <= 98.32), (0.354 < ...

Score Customized Binning

Score qbining can also be customized .... (interest ?)



In [13]:

    
lExplainer.mImplementation.mScoreQuantiles









    Out[13]:





{0: -inf,
 1: -11.512915464920228,
 2: 1.6094379124341005,
 3: 3.3672958299864741,
 4: 11.512915464924779}



In [14]:

    
lScore_Quantiles =  {
    0: -np.inf,
     1: 0.
}



In [15]:

    
lExplainer3 = expl.cModelScoreExplainer(clf)
lExplainer3.mSettings.mFeatureNames = ds.feature_names[0:NC]
lExplainer3.mSettings.mCustomFeatureQuantiles = lFeature_Quantiles
lExplainer3.mSettings.mCustomScoreQuantiles = lScore_Quantiles
lExplainer3.mSettings.mExplanationOrder = 2
    
lExplainer3.fit(X_train)
df_rc3 = lExplainer2.explain(X_test)

# print(df_rc2.columns)

df_rc_3 = lExplainer3.explain(X_test[0].reshape(1, -1))
df_rc_3[[col for col in df_rc_3.columns if col.startswith('detailed')]]









    



USING_LOG_ODDS_AS_SCORE
CUSTOM_SCORE_QUANTILES {0: -inf, 1: 0.0}
SCORE_QUANTILES {0: -inf, 1: 0.0}
CONST_SCORE_DETECTIOM -11.5129154649 11.5129154649 False
NON_USED_FEATURES ['mean radius', 'mean texture', 'mean smoothness', 'mean compactness', 'mean concavity', 'mean symmetry', 'mean fractal dimension', 'texture error']
USED_FEATURES ['mean perimeter', 'mean area', 'mean concave points', 'radius error']
CUSTOM_FEATURE_QUANTILES mean perimeter {0: -inf, 1: 98.32}
FEATURE_QUANTILES mean perimeter {0: -inf, 1: 98.32}
CUSTOM_FEATURE_QUANTILES mean area {0: -inf, 1: 571.85}
FEATURE_QUANTILES mean area {0: -inf, 1: 571.85}
CUSTOM_FEATURE_QUANTILES mean concave points {0: -inf, 1: 0.51}
FEATURE_QUANTILES mean concave points {0: -inf, 1: 0.51}
CUSTOM_FEATURE_QUANTILES radius error {0: -inf, 1: 0.354}
FEATURE_QUANTILES radius error {0: -inf, 1: 0.354}






    Out[15]:







  
    
      
      detailed_reason_1
      detailed_reason_2
      detailed_reason_3
      detailed_reason_4
      detailed_reason_5
    
  
  
    
      0
      [(-inf < 'mean area' <= 571.85), (0.354 < 'rad...
      [(-inf < 'mean area' <= 571.85), (-inf < 'mean...
      [(-inf < 'mean perimeter' <= 98.32), (-inf < '...
      [(-inf < 'mean perimeter' <= 98.32), (-inf < '...
      [(-inf < 'mean concave points' <= 0.51), (0.35...



In [ ]: