In [1]:
from sklearn import datasets
import pandas as pd

%matplotlib inline

Build a model


In [2]:
# Imports needed by this cell only.
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier

# Load the breast-cancer dataset and keep only its first NC feature columns.
ds = datasets.load_breast_cancer()
NC = 12
X = ds.data[:, :NC]
y = ds.target

# Hold out half of the samples for testing; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.5, random_state=1960
)

# min_samples_leaf=30 forbids leaves holding fewer than 30 training samples.
clf = DecisionTreeClassifier(min_samples_leaf=30, random_state=1960)

# Last expression of the cell: fit() returns the estimator, which is shown as the output.
clf.fit(X_train, y_train)


Out[2]:
DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=30, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=1960,
            splitter='best')

Decision Tree Plot


In [3]:
import graphviz
from sklearn import tree

# Export the fitted tree as DOT source text (out_file=None returns the text
# instead of writing a file), labelled with the NC selected feature names.
dot_data = tree.export_graphviz(
    clf,
    out_file=None,
    feature_names=ds.feature_names[:NC],
    class_names=ds.target_names,
    filled=True,
    rounded=True,
    special_characters=True,
)

# Wrap the DOT text in a graphviz Source; the bare name renders it in the notebook.
graph = graphviz.Source(dot_data)
graph


Out[3]:
Tree 0 mean concave points ≤ 0.051 gini = 0.462 samples = 284 value = [103, 181] class = benign 1 mean area ≤ 571.85 gini = 0.076 samples = 177 value = [7, 170] class = benign 0->1 True 6 mean perimeter ≤ 98.32 gini = 0.184 samples = 107 value = [96, 11] class = malignant 0->6 False 2 radius error ≤ 0.354 gini = 0.014 samples = 141 value = [1, 140] class = benign 1->2 5 gini = 0.278 samples = 36 value = [6, 30] class = benign 1->5 3 gini = 0.0 samples = 111 value = [0, 111] class = benign 2->3 4 gini = 0.064 samples = 30 value = [1, 29] class = benign 2->4 7 gini = 0.464 samples = 30 value = [19, 11] class = malignant 6->7 8 gini = 0.0 samples = 77 value = [77, 0] class = malignant 6->8

sklearn_explain approach


In [4]:
import sklearn_explain.explainer as expl

# The quantity being explained is the log-odds score: ln(p(1) / (1 - p(1))).
lExplainer = expl.cModelScoreExplainer(clf)

# Settings are assigned before the explainer is fitted.
lExplainer.mSettings.mFeatureNames = ds.feature_names[:NC]
# NOTE(review): order 1 presumably means one feature per reason - confirm against sklearn_explain.
lExplainer.mSettings.mExplanationOrder = 1

# Fit the explainer on the training rows, then explain every test row.
lExplainer.fit(X_train)
df_rc = lExplainer.explain(X_test)

# Print the column labels of the explanation result.
print(df_rc.columns)


USING_LOG_ODDS_AS_SCORE
SCORE_QUANTILES {0: -inf, 1: -11.512915464920228, 2: 1.6094379124341005, 3: 3.3672958299864741, 4: 11.512915464924779}
CONST_SCORE_DETECTIOM -11.5129154649 11.5129154649 False
NON_USED_FEATURES ['mean radius', 'mean texture', 'mean smoothness', 'mean compactness', 'mean concavity', 'mean symmetry', 'mean fractal dimension', 'texture error']
USED_FEATURES ['mean perimeter', 'mean area', 'mean concave points', 'radius error']
FEATURE_QUANTILES mean perimeter {0: -inf, 1: 71.884, 2: 80.623999999999995, 3: 90.566000000000003, 4: 111.44}
FEATURE_QUANTILES mean area {0: -inf, 1: 387.10000000000002, 2: 477.30000000000001, 3: 602.0, 4: 922.44000000000005}
FEATURE_QUANTILES mean concave points {0: -inf, 1: 0.017368000000000001, 2: 0.027373999999999999, 3: 0.047373999999999965, 4: 0.084198000000000009}
FEATURE_QUANTILES radius error {0: -inf, 1: 0.21351999999999999, 2: 0.26870000000000005, 3: 0.36443999999999999, 4: 0.55352000000000001}
Index(['mean radius', 'mean texture', 'mean perimeter', 'mean area',
       'mean smoothness', 'mean compactness', 'mean concavity',
       'mean concave points', 'mean symmetry', 'mean fractal dimension',
       'radius error', 'texture error', 'Score', 'BinnedScore',
       'mean perimeter_bin', 'mean area_bin', 'mean concave points_bin',
       'radius error_bin', 'mean perimeter_encoded', 'mean area_encoded',
       'mean concave points_encoded', 'radius error_encoded',
       'mean perimeter_Effect', 'mean area_Effect',
       'mean concave points_Effect', 'radius error_Effect', 'reason_1',
       'reason_2', 'reason_3', 'reason_4', 'reason_1_idx', 'detailed_reason_1',
       'reason_2_idx', 'detailed_reason_2', 'reason_3_idx',
       'detailed_reason_3', 'reason_4_idx', 'detailed_reason_4'],
      dtype='object')

In [5]:
# Explain only the first test observation; reshape(1, -1) turns it into a one-row 2-D array.
first_test_row = X_test[0].reshape(1, -1)
df_rc_2 = lExplainer.explain(first_test_row)

In [6]:
# Show the raw feature values of the first test observation as a one-row 2-D array.
X_test[0].reshape(1, -1)


Out[6]:
array([[  1.17600000e+01,   2.16000000e+01,   7.47200000e+01,
          4.27900000e+02,   8.63700000e-02,   4.96600000e-02,
          1.65700000e-02,   1.11500000e-02,   1.49500000e-01,
          5.88800000e-02,   4.06200000e-01,   1.21000000e+00]])

In [7]:
# Display the explanation result held in df_rc_2.
df_rc_2


Out[7]:
mean radius mean texture mean perimeter mean area mean smoothness mean compactness mean concavity mean concave points mean symmetry mean fractal dimension ... reason_3 reason_4 reason_1_idx detailed_reason_1 reason_2_idx detailed_reason_2 reason_3_idx detailed_reason_3 reason_4_idx detailed_reason_4
0 11.76 21.6 1 1 0.08637 0.04966 0.01657 0 0.1495 0.05888 ... mean area mean perimeter 2 [(-inf < 'mean concave points' <= 0.017368)] 3 [(0.36444 < 'radius error' <= 0.55352)] 1 [(387.1 < 'mean area' <= 477.3)] 0 [(71.884 < 'mean perimeter' <= 80.624)]

1 rows × 38 columns


In [8]:
# Keep only the columns whose label starts with 'detailed'.
is_detailed = df_rc_2.columns.str.startswith('detailed')
df_rc_2.loc[:, is_detailed]


Out[8]:
detailed_reason_1 detailed_reason_2 detailed_reason_3 detailed_reason_4
0 [(-inf < 'mean concave points' <= 0.017368)] [(0.36444 < 'radius error' <= 0.55352)] [(387.1 < 'mean area' <= 477.3)] [(71.884 < 'mean perimeter' <= 80.624)]

Scorecard


In [11]:
# Build the local score card for the first test observation, passed as a one-row 2-D array.
scorecard_input = X_test[0].reshape(1, -1)
scorecard_df = lExplainer.get_local_score_card(scorecard_input)


GET_LOCAL_SCORE_CARD [[  1.17600000e+01   2.16000000e+01   7.47200000e+01   4.27900000e+02
    8.63700000e-02   4.96600000e-02   1.65700000e-02   1.11500000e-02
    1.49500000e-01   5.88800000e-02   4.06200000e-01   1.21000000e+00]] [ 3.36729583]

In [12]:
# head(n) with n equal to the row count returns every row of the score card.
scorecard_df.head(len(scorecard_df))


Out[12]:
feature_1 feature_min_1 feature_max_1 points
0 mean perimeter -inf 71.884000 1.682561
1 mean perimeter 71.884000 80.624000 1.682561
2 mean perimeter 80.624000 90.566000 1.682561
3 mean perimeter 90.566000 111.440000 -0.079875
4 mean perimeter 111.440000 inf -1.682561
5 mean area -inf 387.100000 1.682561
6 mean area 387.100000 477.300000 1.682561
7 mean area 477.300000 602.000000 1.682561
8 mean area 602.000000 922.440000 -0.079875
9 mean area 922.440000 inf -1.682561
10 mean concave points -inf 0.017368 0.002357
11 mean concave points 0.017368 0.027374 0.002357
12 mean concave points 0.027374 0.047374 0.000689
13 mean concave points 0.047374 0.084198 -0.000112
14 mean concave points 0.084198 inf -0.002357
15 radius error -inf 0.213520 0.000347
16 radius error 0.213520 0.268700 0.000347
17 radius error 0.268700 0.364440 0.000101
18 radius error 0.364440 0.553520 0.000048
19 radius error 0.553520 inf -0.000347

In [13]:
# List the score card's column labels.
scorecard_df.columns


Out[13]:
Index(['feature_1', 'feature_min_1', 'feature_max_1', 'points'], dtype='object')