In [1]:
%config InlineBackend.figure_format='retina'
%matplotlib inline
# Silence warnings
import warnings
warnings.simplefilter(action="ignore", category=FutureWarning)
warnings.simplefilter(action="ignore", category=UserWarning)
warnings.simplefilter(action="ignore", category=RuntimeWarning)
import numpy as np
np.random.seed(123)
import matplotlib.pyplot as plt
plt.rcParams["figure.figsize"] = (8, 8)
plt.rcParams["font.size"] = 14
from collections import Counter
In [2]:
X = np.array([[0,0,0], [0,0,1], [0,1,0], [0,1,1], [0,1,1], [1,0,0],
              [1,0,0], [1,0,0], [1,0,0], [1,1,1]])
# two-class problem with classes 0 and 1
y = [0,0,1,1,1,1,1,1,1,1]
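As a quick sanity check (an extra snippet, not an original cell), we can look at the class balance of the toy data, i.e. the class counts at the root node that later cells denote t = (2, 8):

print(Counter(y))  # Counter({1: 8, 0: 2}): 2 samples of class 0, 8 of class 1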
In [3]:
def accuracy(a, b):
    N = a + b
    return 1 - max(a/N, b/N)

def entropy(a, b):
    p = b/(a + b)  # fraction or probability for class 1
    ent = 0
    if p > 0:
        ent += p * np.log2(p)
    if (1-p) > 0:
        ent += (1-p) * np.log2(1-p)
    return -ent

def gini(a, b):
    p = b/(a + b)
    return 2 * p * (1-p)

def delta_i(impurity, t, tl, tr):
    N = sum(t)
    print('i(t) =', round(impurity(*t), 5),
          ' Nl/(Nl+Nr) * i(tL) =', round((sum(tl)/N)*impurity(*tl), 5),
          ' Nr/(Nl+Nr) * i(tR) =', round((sum(tr)/N)*impurity(*tr), 5))
    return impurity(*t) - (sum(tl)/N)*impurity(*tl) - (sum(tr)/N)*impurity(*tr)
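As a quick check of the three impurity functions (a hypothetical cell, not part of the original): each should be maximal at a 50/50 class mix and vanish for a pure node.

print(accuracy(5, 5), entropy(5, 5), gini(5, 5))  # 0.5 1.0 0.5: maximal impurity
print(accuracy(5, 0), entropy(5, 0), gini(5, 0))  # all ~0: a pure node has no impurity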
In [4]:
def split(impurity):
    # for simplicity we iterate over all three variables
    for idx in (0, 1, 2):
        t = Counter(y)
        print('Splitting t on X%i<0.5' % idx)
        tl = [0, 0]
        tr = [0, 0]
        for x, y_ in zip(X[:, idx], y):
            if x < 0.5:
                if y_ < 0.5:
                    tl[0] += 1
                else:
                    tl[1] += 1
            else:
                if y_ < 0.5:
                    tr[0] += 1
                else:
                    tr[1] += 1
        print('resulting leaf tL:', tl, 'and leaf tR:', tr)
        dI = delta_i(impurity, (t[0], t[1]), tl, tr)
        print('di(t) = i(t) - Nl/(Nl+Nr)*i(tL) - Nr/(Nl+Nr)*i(tR) =', round(dI, 5))
        print('----------')
print('With "accuracy" as splitting criteria:')
split(accuracy)
print()
When using accuracy as the splitting criterion we cannot find a split to make at the root node, as none of the candidate splits would decrease the impurity. Splits where the majority class remains the same in both child nodes are considered not worth making.
The tree-growing procedure proceeds one step at a time, which makes it short-sighted. We can help it out by using an impurity/splitting criterion that measures the "potential" for future splits. Entropy does this because it looks at the class probabilities: it considers a split worth making whenever it alters the class distributions in the child nodes, even if the predicted class of each child remains the same as that of the parent.
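To make this concrete, take the split on the first variable (printed as X0<0.5 above), which yields tL = (2, 3) and tR = (0, 5). The following hypothetical cell, reusing delta_i from above, shows that the misclassification error registers no gain while entropy does:

# the same split evaluated under both criteria
print(round(delta_i(accuracy, (2, 8), (2, 3), (0, 5)), 5))  # ~0: 0.2 - 0.5*0.4 - 0.5*0.0
print(round(delta_i(entropy, (2, 8), (2, 3), (0, 5)), 5))   # ~0.23645: 0.72193 - 0.5*0.97095 - 0.0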
In [5]:
# repeat using entropy as the splitting criterion
print('With "entropy" as splitting criterion:')
split(entropy)
In [6]:
from utils import draw_tree
from sklearn.tree import DecisionTreeClassifier
# check using a DecisionTreeClassifier from sklearn
# note that the tree stops growing after depth 3, even though
# we set no limit on max_depth: the terminal leaves are pure,
# so there is nothing left to split.
clf = DecisionTreeClassifier(criterion='entropy', max_depth=None)
clf.fit(X, y)
draw_tree(clf, ['X1', 'X2', 'X3'], filled=True)
Out[6]: [decision tree diagram rendered by draw_tree]
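If the draw_tree helper from the local utils module is unavailable, sklearn's built-in export_text gives a plain-text rendering of the same fitted tree (an alternative, hypothetical cell):

from sklearn.tree import export_text
print(export_text(clf, feature_names=['X1', 'X2', 'X3']))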
In [7]:
# repeat using the gini index as the splitting criterion
split(gini)
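For completeness, one could also refit the sklearn tree with criterion='gini' and inspect which feature it chooses at the root, to compare against the hand-computed gini gains above. A hypothetical snippet (clf_gini is a new name; tree_.feature holds the splitting feature index per node, with -2 marking leaves):

clf_gini = DecisionTreeClassifier(criterion='gini', max_depth=None)
clf_gini.fit(X, y)
print(clf_gini.tree_.feature[0])  # feature index chosen at the root node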