``````

In [1]:

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from scipy.ndimage import generic_filter as gf
from sklearn import model_selection, datasets, metrics, tree

``````
``````

In [2]:

class DecisionTree():
def __init__(self, maxdepth=6, maxelemsleaf=1):
self.maxdepth = maxdepth
self.maxelemsleaf = maxelemsleaf
self.tree = []

def fit(self, xdf, y=None, curr_depth=0, node=None, isBegin=True):

if xdf.shape[0] == 0:
print('Error: xdf.shape[0] == 0')
return -1
if isBegin:
node = self.tree
xdf = pd.DataFrame(xdf).copy()
xdf['target'] = y
self.dimx = xdf.shape[-1] - 1
N = xdf.shape[0]
if N <= self.maxelemsleaf:
node.append(np.mean(xdf.target.values))
return 0

if curr_depth < self.maxdepth or self.maxdepth == -1:
err = -1
porogs = gf(np.sort((xdf.iloc[:, :-1]).values, axis=0), np.mean, size=(3, 1)).T
im, pm = 0, (porogs[0, 0] if porogs[0].size > 0 else 0)
# im - индекс, pm - порог
# находим разбиение с минимальной ошибкой stdn
for i in range(self.dimx):
for p in np.unique(porogs[i]):
# вычисляем ошибку
L = xdf[xdf[i] <= p]
R = xdf[xdf[i] > p]
tl = L.iloc[:, -1:].values
tr = R.iloc[:, -1:].values
stdn = tl.size*np.std(tl) + tr.size*np.std(tr)

if err == -1:
err = stdn
im, pm = i, p
if err > stdn:
err = stdn
im, pm = i, p
L = xdf[xdf[im] <= pm]
R = xdf[xdf[im] > pm]
if L.size == 0 or R.size == 0:
node.append(np.mean(xdf.target.values))
else:
node += [im, pm, [], []]
child_node_l = node[2]
child_node_r = node[3]
# реккурсивный вызов fit для ветвей дерева
self.fit(L, curr_depth=curr_depth + 1, node=child_node_l, isBegin=False)
self.fit(R, curr_depth=curr_depth + 1, node=child_node_r, isBegin=False)
elif curr_depth == self.maxdepth:
node.append(np.mean(xdf.target.values))
return 1

def predict(self, x, isBegin=True):
if isBegin:
nd = self.tree
def f(x, node=nd):
while len(node) == 4:
node = node[2] if x[node[0]] <= node[1] else node[3]
return node[0]
y = np.apply_along_axis(f, -1, x)
return y

``````
``````

In [3]:

boston_data = pd.DataFrame(boston.data)
boston_data['target'] = boston.target

``````
``````

Out[3]:

0
1
2
3
4
5
6
7
8
9
10
11
12
target

0
0.00632
18.0
2.31
0.0
0.538
6.575
65.2
4.0900
1.0
296.0
15.3
396.90
4.98
24.0

1
0.02731
0.0
7.07
0.0
0.469
6.421
78.9
4.9671
2.0
242.0
17.8
396.90
9.14
21.6

2
0.02729
0.0
7.07
0.0
0.469
7.185
61.1
4.9671
2.0
242.0
17.8
392.83
4.03
34.7

3
0.03237
0.0
2.18
0.0
0.458
6.998
45.8
6.0622
3.0
222.0
18.7
394.63
2.94
33.4

4
0.06905
0.0
2.18
0.0
0.458
7.147
54.2
6.0622
3.0
222.0
18.7
396.90
5.33
36.2

``````

Разделим датасет на тренировку и тест

``````

In [4]:

X, y = boston_data.iloc[:, :-1], boston_data.iloc[:, -1]
train_len = int(0.75 * len(X))
X_train, X_test, y_train, y_test = X.iloc[:train_len], X.iloc[train_len:], y.iloc[:train_len], y.iloc[train_len:]
# print(list(map(lambda x: x.shape, (X_train, X_test, y_train, y_test))))

``````

С ограничением глубины дерева

``````

In [5]:

model = DecisionTree(maxdepth=3)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
print('maxdepth = 3, Error =', metrics.mean_squared_error(y_test, y_pred))

``````
``````

/home/daniel/anaconda3/lib/python3.5/site-packages/numpy/core/_methods.py:135: RuntimeWarning: Degrees of freedom <= 0 for slice
keepdims=keepdims)
/home/daniel/anaconda3/lib/python3.5/site-packages/numpy/core/_methods.py:105: RuntimeWarning: invalid value encountered in true_divide
arrmean, rcount, out=arrmean, casting='unsafe', subok=False)
/home/daniel/anaconda3/lib/python3.5/site-packages/numpy/core/_methods.py:127: RuntimeWarning: invalid value encountered in double_scalars
ret = ret.dtype.type(ret / rcount)

maxdepth = 3, Error = 52.4316621478

``````

Без ограничения глубины. Возникает переобучение.

``````

In [6]:

model = DecisionTree()
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
print('Error =', metrics.mean_squared_error(y_test, y_pred))

``````
``````

/home/daniel/anaconda3/lib/python3.5/site-packages/numpy/core/_methods.py:135: RuntimeWarning: Degrees of freedom <= 0 for slice
keepdims=keepdims)
/home/daniel/anaconda3/lib/python3.5/site-packages/numpy/core/_methods.py:105: RuntimeWarning: invalid value encountered in true_divide
arrmean, rcount, out=arrmean, casting='unsafe', subok=False)
/home/daniel/anaconda3/lib/python3.5/site-packages/numpy/core/_methods.py:127: RuntimeWarning: invalid value encountered in double_scalars
ret = ret.dtype.type(ret / rcount)

Error = 107.703720618

``````

Сравнение с решающим деревом из библиотеки sklearn

``````

In [7]:

model = tree.DecisionTreeRegressor(max_depth=3)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
print('max_depth = 3, Error =', metrics.mean_squared_error(y_test, y_pred))

``````
``````

max_depth = 3, Error = 31.9963723586

``````

Проверка адекватности работы алгоритма на примере регрессии функции одного переменного

``````

In [8]:

args = (-20, 20)
x_tr = np.arange(*args, 2)
y_tr = 1/(1 + np.exp(-10*x_tr)) + 0.1*np.random.rand(x_tr.size)
plt.plot(x_tr, y_tr, 'ro')
DT = DecisionTree(maxdepth=5, maxelemsleaf=3)
DT.fit(x_tr.reshape((len(x_tr), 1)), y_tr)
x_test = np.arange(*args)
xt = x_test.reshape((len(x_test), 1))
y_pr = DT.predict(xt)
plt.step(x_test, y_pr, 'b')
plt.show()

``````
``````

/home/daniel/anaconda3/lib/python3.5/site-packages/numpy/core/_methods.py:135: RuntimeWarning: Degrees of freedom <= 0 for slice
keepdims=keepdims)
/home/daniel/anaconda3/lib/python3.5/site-packages/numpy/core/_methods.py:105: RuntimeWarning: invalid value encountered in true_divide
arrmean, rcount, out=arrmean, casting='unsafe', subok=False)
/home/daniel/anaconda3/lib/python3.5/site-packages/numpy/core/_methods.py:127: RuntimeWarning: invalid value encountered in double_scalars
ret = ret.dtype.type(ret / rcount)

``````

Возьмем рандомизированное разбиение на обучение и тест.

``````

In [9]:

X_train, X_test, y_train, y_test = model_selection.train_test_split(X, y, test_size=0.25)

``````

Снова проверим результат работы алгоритмов с ограничением и без

``````

In [10]:

model = DecisionTree(maxdepth=3)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
print('maxdepth = 3, Error =', metrics.mean_squared_error(y_test, y_pred))

``````
``````

/home/daniel/anaconda3/lib/python3.5/site-packages/numpy/core/_methods.py:135: RuntimeWarning: Degrees of freedom <= 0 for slice
keepdims=keepdims)
/home/daniel/anaconda3/lib/python3.5/site-packages/numpy/core/_methods.py:105: RuntimeWarning: invalid value encountered in true_divide
arrmean, rcount, out=arrmean, casting='unsafe', subok=False)
/home/daniel/anaconda3/lib/python3.5/site-packages/numpy/core/_methods.py:127: RuntimeWarning: invalid value encountered in double_scalars
ret = ret.dtype.type(ret / rcount)

maxdepth = 3, Error = 19.7202996317

``````
``````

In [11]:

model = DecisionTree()
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
print('Error =', metrics.mean_squared_error(y_test, y_pred))

``````
``````

/home/daniel/anaconda3/lib/python3.5/site-packages/numpy/core/_methods.py:135: RuntimeWarning: Degrees of freedom <= 0 for slice
keepdims=keepdims)
/home/daniel/anaconda3/lib/python3.5/site-packages/numpy/core/_methods.py:105: RuntimeWarning: invalid value encountered in true_divide
arrmean, rcount, out=arrmean, casting='unsafe', subok=False)
/home/daniel/anaconda3/lib/python3.5/site-packages/numpy/core/_methods.py:127: RuntimeWarning: invalid value encountered in double_scalars
ret = ret.dtype.type(ret / rcount)

Error = 14.058014477

``````
``````

In [12]:

model = tree.DecisionTreeRegressor(max_depth=3)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
print('max_depth = 3, Error =', metrics.mean_squared_error(y_test, y_pred))

``````
``````

max_depth = 3, Error = 39.4665486481

``````

Вывод: рандомизация помогает бороться с переобучением

``````

In [ ]:

``````