In [1]:
%config InlineBackend.figure_format='retina'
%matplotlib inline

# Silence warnings
import warnings
warnings.simplefilter(action="ignore", category=FutureWarning)
warnings.simplefilter(action="ignore", category=UserWarning)
warnings.simplefilter(action="ignore", category=RuntimeWarning)

import numpy as np
np.random.seed(123)
import matplotlib.pyplot as plt
plt.rcParams["figure.figsize"] = (8, 8)
plt.rcParams["font.size"] = 14

from collections import Counter

Splitting criteria for tree growing

Why not use accuracy as splitting criteria for classification trees.

In this problem it is possible to separate class 0 and class 1 perfectly.


In [2]:
X = np.array([[0,0,0], [0,0,1], [0,1,0], [0,1,1], [0,1,1], [1,0,0],
              [1,0,0], [1,0,0], [1,0,0], [1,1,1]])
# two class problem with classes 0 and 1
y = [0,0,1,1,1,1,1,1,1,1]

In [3]:
def accuracy(a, b):
    N = a + b
    return 1 - max(a/N, b/N)


def entropy(a, b):
    p = b/(a + b) # fraction or probability for class 1
    ent = 0
    if p > 0:
        ent += p * np.log2(p)
    if (1-p) > 0:
        ent += (1-p) * np.log2(1-p)
    return -ent


def gini(a, b):
    p = b/(a+b)
    return 2* p * (1-p)


def delta_i(impurity, t, tl, tr):
    N = sum(t)
    print('i(t) =', round(impurity(*t), 5),
          ' Nl/(Nl+Nr) * i(tL) =', round((sum(tl)/N)*impurity(*tl), 5),
          ' Nr/(Nl+Nr) * i(tR) =', round((sum(tr)/N)*impurity(*tr), 5))
    return impurity(*t) - (sum(tl)/N)*impurity(*tl) - (sum(tr)/N)*impurity(*tr)

In [4]:
def split(impurity):
    # for simplicity we iterate over all three variables
    for idx in (0, 1, 2):
        t = Counter(y)
        print('Splitting t on X%i<0.5' % (idx))
        tl = [0, 0]
        tr = [0, 0]
        for x,y_ in zip(X[:, idx], y):
            if x < 0.5:
                if y_ < 0.5:
                    tl[0] += 1
                else:
                    tl[1] += 1
            else:
                if y_ < 0.5:
                    tr[0] += 1
                else:
                    tr[1] += 1

        print('resulting leaf tL:', tl, 'and leaf tR:', tr)
        dI = delta_i(impurity, (t[0], t[1]), tl, tr)
        print('di(t) = i(t) - Nl/(Nl+Nr)*i(tL) - Nr/(Nl+Nr)*i(tR) =', round(dI, 5))
        print('----------')
        
print('With "accuracy" as splitting criteria:')
split(accuracy)
print()


With "accuracy" as splitting criteria:
Splitting t on X0<0.5
resulting leaf tL: [2, 3] and leaf tR: [0, 5]
i(t) = 0.2  Nl/(Nl+Nr) * i(tL) = 0.2  Nr/(Nl+Nr) * i(tR) = 0.0
di(t) = i(t) - Nl/(Nl+Nr)*i(tL) - Nr/(Nl+Nr)*i(tR) = -0.0
----------
Splitting t on X1<0.5
resulting leaf tL: [2, 4] and leaf tR: [0, 4]
i(t) = 0.2  Nl/(Nl+Nr) * i(tL) = 0.2  Nr/(Nl+Nr) * i(tR) = 0.0
di(t) = i(t) - Nl/(Nl+Nr)*i(tL) - Nr/(Nl+Nr)*i(tR) = -0.0
----------
Splitting t on X2<0.5
resulting leaf tL: [1, 5] and leaf tR: [1, 3]
i(t) = 0.2  Nl/(Nl+Nr) * i(tL) = 0.1  Nr/(Nl+Nr) * i(tR) = 0.1
di(t) = i(t) - Nl/(Nl+Nr)*i(tL) - Nr/(Nl+Nr)*i(tR) = -0.0
----------

When using accuracy as splitting criteria we can not find a split to make for the root node as none of the splits would lead to an increase in the impurity. Splits where the majority class remains the same in the child nodes are considered not worth making.

The tree growing procedure proceeds one step at a time, this makes it shortsighted. We can help the procedure out by using a impurity/splitting criteria that can measure the "potential" for a future split. The entropy does this as it looks at the class probabilities. It consideres a split worth making if the split alters the class distributions of the child nodes. Even if the predicted classes for each remain the same as for the parent.


In [5]:
# repeat using entropy as splitting criteria
print('With "entropy" as splitting criteria:')
split(entropy)


With "entropy" as splitting criteria:
Splitting t on X0<0.5
resulting leaf tL: [2, 3] and leaf tR: [0, 5]
i(t) = 0.72193  Nl/(Nl+Nr) * i(tL) = 0.48548  Nr/(Nl+Nr) * i(tR) = -0.0
di(t) = i(t) - Nl/(Nl+Nr)*i(tL) - Nr/(Nl+Nr)*i(tR) = 0.23645
----------
Splitting t on X1<0.5
resulting leaf tL: [2, 4] and leaf tR: [0, 4]
i(t) = 0.72193  Nl/(Nl+Nr) * i(tL) = 0.55098  Nr/(Nl+Nr) * i(tR) = -0.0
di(t) = i(t) - Nl/(Nl+Nr)*i(tL) - Nr/(Nl+Nr)*i(tR) = 0.17095
----------
Splitting t on X2<0.5
resulting leaf tL: [1, 5] and leaf tR: [1, 3]
i(t) = 0.72193  Nl/(Nl+Nr) * i(tL) = 0.39001  Nr/(Nl+Nr) * i(tR) = 0.32451
di(t) = i(t) - Nl/(Nl+Nr)*i(tL) - Nr/(Nl+Nr)*i(tR) = 0.0074
----------

In [6]:
from utils import draw_tree
from sklearn.tree import DecisionTreeClassifier

# check using a DecisionTreeClassifier from sklearn
# note the tree stops growing after depth 3, even though
# we set no limit on max_depth.
# The terminal leaves are pure.
clf = DecisionTreeClassifier(criterion='entropy', max_depth=None)
clf.fit(X, y)
draw_tree(clf, ['X1', 'X2', 'X3'], filled=True)


Out[6]:
Tree 0 X1 <= 0.5 entropy = 0.7219 samples = 10 value = [2, 8] 1 X2 <= 0.5 entropy = 0.971 samples = 5 value = [2, 3] 0->1 True 4 entropy = 0.0 samples = 5 value = [0, 5] 0->4 False 2 entropy = 0.0 samples = 2 value = [2, 0] 1->2 3 entropy = 0.0 samples = 3 value = [0, 3] 1->3

In [7]:
# similar for the gini index
split(gini)


Splitting t on X0<0.5
resulting leaf tL: [2, 3] and leaf tR: [0, 5]
i(t) = 0.32  Nl/(Nl+Nr) * i(tL) = 0.24  Nr/(Nl+Nr) * i(tR) = 0.0
di(t) = i(t) - Nl/(Nl+Nr)*i(tL) - Nr/(Nl+Nr)*i(tR) = 0.08
----------
Splitting t on X1<0.5
resulting leaf tL: [2, 4] and leaf tR: [0, 4]
i(t) = 0.32  Nl/(Nl+Nr) * i(tL) = 0.26667  Nr/(Nl+Nr) * i(tR) = 0.0
di(t) = i(t) - Nl/(Nl+Nr)*i(tL) - Nr/(Nl+Nr)*i(tR) = 0.05333
----------
Splitting t on X2<0.5
resulting leaf tL: [1, 5] and leaf tR: [1, 3]
i(t) = 0.32  Nl/(Nl+Nr) * i(tL) = 0.16667  Nr/(Nl+Nr) * i(tR) = 0.15
di(t) = i(t) - Nl/(Nl+Nr)*i(tL) - Nr/(Nl+Nr)*i(tR) = 0.00333
----------