In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from scipy.ndimage import generic_filter as gf
from sklearn import model_selection, datasets, metrics, tree
In [2]:
class DecisionTree():
    def __init__(self, maxdepth=6, maxelemsleaf=1):
        self.maxdepth = maxdepth          # maximum tree depth; -1 disables the limit
        self.maxelemsleaf = maxelemsleaf  # stop splitting once a node has this many samples
        self.tree = []                    # nested lists: [feature, threshold, left, right] or [leaf_value]

    def fit(self, xdf, y=None, curr_depth=0, node=None, isBegin=True):
        if xdf.shape[0] == 0:
            print('Error: xdf.shape[0] == 0')
            return -1
        if isBegin:
            node = self.tree
            xdf = pd.DataFrame(xdf).copy()
            xdf['target'] = y
            self.dimx = xdf.shape[-1] - 1
        N = xdf.shape[0]
        if N <= self.maxelemsleaf:
            node.append(np.mean(xdf.target.values))
            return 0
        if curr_depth < self.maxdepth or self.maxdepth == -1:
            err = -1
            # candidate thresholds: moving average (window of 3) over each feature's sorted values
            porogs = gf(np.sort(xdf.iloc[:, :-1].values, axis=0), np.mean, size=(3, 1)).T
            im, pm = 0, (porogs[0, 0] if porogs[0].size > 0 else 0)
            # im - index of the best feature, pm - the best threshold
            # find the split minimizing the error stdn
            for i in range(self.dimx):
                for p in np.unique(porogs[i]):
                    # compute the error: size-weighted std of the targets in each half
                    L = xdf[xdf[i] <= p]
                    R = xdf[xdf[i] > p]
                    tl = L.iloc[:, -1].values
                    tr = R.iloc[:, -1].values
                    if tl.size == 0 or tr.size == 0:
                        continue  # skip splits that leave one side empty (np.std would give nan)
                    stdn = tl.size*np.std(tl) + tr.size*np.std(tr)
                    if err == -1 or err > stdn:
                        err = stdn
                        im, pm = i, p
            L = xdf[xdf[im] <= pm]
            R = xdf[xdf[im] > pm]
            if L.size == 0 or R.size == 0:
                # no usable split found: turn this node into a leaf
                node.append(np.mean(xdf.target.values))
            else:
                node += [im, pm, [], []]
                child_node_l = node[2]
                child_node_r = node[3]
                # recursive fit calls for the branches of the tree
                self.fit(L, curr_depth=curr_depth + 1, node=child_node_l, isBegin=False)
                self.fit(R, curr_depth=curr_depth + 1, node=child_node_r, isBegin=False)
        elif curr_depth == self.maxdepth:
            node.append(np.mean(xdf.target.values))
        return 1

    def predict(self, x, isBegin=True):
        if isBegin:
            nd = self.tree

        def f(x, node=nd):
            # descend until a leaf: internal nodes have 4 fields, leaves have 1
            while len(node) == 4:
                node = node[2] if x[node[0]] <= node[1] else node[3]
            return node[0]

        y = np.apply_along_axis(f, -1, x)
        return y
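The fitted tree is stored as nested Python lists: an internal node is [feature_index, threshold, left_subtree, right_subtree] and a leaf is a one-element list holding the mean target of its samples. A minimal sketch of how predict walks this structure (the toy_tree values below are made up for illustration):

# a hand-built toy tree: split feature 0 at 2.5; leaves predict 10.0 and 20.0
toy_tree = [0, 2.5, [10.0], [20.0]]

def walk(x, node):
    # same traversal as DecisionTree.predict: 4 fields -> internal node, 1 -> leaf
    while len(node) == 4:
        node = node[2] if x[node[0]] <= node[1] else node[3]
    return node[0]

print(walk([1.0], toy_tree))  # 10.0: goes left, since 1.0 <= 2.5
print(walk([3.0], toy_tree))  # 20.0: goes right, since 3.0 > 2.5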
In [3]:
from sklearn.datasets import load_boston
boston = load_boston()
boston_data = pd.DataFrame(boston.data)
boston_data['target'] = boston.target
boston_data.head()
Out[3]:
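Note: load_boston was removed in scikit-learn 1.2, so the cell above only runs on older versions. A sketch of loading the same data directly from its original source; the URL and the two-lines-per-record layout follow scikit-learn's own deprecation notice, so verify them before relying on this:

data_url = "http://lib.stat.cmu.edu/datasets/boston"
raw_df = pd.read_csv(data_url, sep=r"\s+", skiprows=22, header=None)
data = np.hstack([raw_df.values[::2, :], raw_df.values[1::2, :2]])  # 13 features
target = raw_df.values[1::2, 2]                                     # median house price
boston_data = pd.DataFrame(data)
boston_data['target'] = target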
Split the dataset into train and test. Note that this first split is sequential: the first 75% of the rows go to training and the last 25% to testing, without shuffling.
In [4]:
X, y = boston_data.iloc[:, :-1], boston_data.iloc[:, -1]
train_len = int(0.75 * len(X))
X_train, X_test, y_train, y_test = X.iloc[:train_len], X.iloc[train_len:], y.iloc[:train_len], y.iloc[train_len:]
# print(list(map(lambda x: x.shape, (X_train, X_test, y_train, y_test))))
With a limit on the tree depth
In [5]:
model = DecisionTree(maxdepth=3)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
print('maxdepth = 3, Error =', metrics.mean_squared_error(y_test, y_pred))
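To see how the choice of maxdepth affects the test error, one can sweep it; a quick sketch (the range 1..8 is an arbitrary choice):

for depth in range(1, 9):
    m = DecisionTree(maxdepth=depth)
    m.fit(X_train, y_train)
    print(depth, metrics.mean_squared_error(y_test, m.predict(X_test)))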
Without a depth limit, the tree overfits.
In [6]:
model = DecisionTree()
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
print('Error =', metrics.mean_squared_error(y_test, y_pred))
Comparison with the decision tree from the sklearn library
In [7]:
model = tree.DecisionTreeRegressor(max_depth=3)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
print('max_depth = 3, Error =', metrics.mean_squared_error(y_test, y_pred))
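A single fixed split can be misleading; as a sketch, cross-validation gives a less split-dependent estimate for the sklearn tree (5 folds is an arbitrary choice):

scores = model_selection.cross_val_score(
    tree.DecisionTreeRegressor(max_depth=3), X, y,
    scoring='neg_mean_squared_error', cv=5)
print('CV MSE:', -scores.mean())  # cross_val_score returns negated MSE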
Sanity check of the algorithm on the regression of a function of one variable
In [8]:
args = (-20, 20)
x_tr = np.arange(*args, 2)
# a steep sigmoid plus small uniform noise
y_tr = 1/(1 + np.exp(-10*x_tr)) + 0.1*np.random.rand(x_tr.size)
plt.plot(x_tr, y_tr, 'ro')
DT = DecisionTree(maxdepth=5, maxelemsleaf=3)
DT.fit(x_tr.reshape((len(x_tr), 1)), y_tr)
x_test = np.arange(*args)
xt = x_test.reshape((len(x_test), 1))
y_pr = DT.predict(xt)
plt.step(x_test, y_pr, 'b')
plt.show()
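For a visual comparison against the library implementation, the same 1D data can be fit with DecisionTreeRegressor; min_samples_leaf plays the role of maxelemsleaf here (a sketch):

sk = tree.DecisionTreeRegressor(max_depth=5, min_samples_leaf=3)
sk.fit(x_tr.reshape(-1, 1), y_tr)
plt.plot(x_tr, y_tr, 'ro')
plt.step(x_test, sk.predict(xt), 'g')
plt.show()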
Now take a randomized train/test split (pass random_state to train_test_split for a reproducible split).
In [9]:
X_train, X_test, y_train, y_test = model_selection.train_test_split(X, y, test_size=0.25)
Check the results of the algorithms with and without the depth limit again
In [10]:
model = DecisionTree(maxdepth=3)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
print('maxdepth = 3, Error =', metrics.mean_squared_error(y_test, y_pred))
In [11]:
model = DecisionTree()
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
print('Error =', metrics.mean_squared_error(y_test, y_pred))
In [12]:
model = tree.DecisionTreeRegressor(max_depth=3)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
print('max_depth = 3, Error =', metrics.mean_squared_error(y_test, y_pred))
Conclusion: a randomized train/test split helps combat overfitting; with shuffling, the training and test rows come from the same distribution, and the test error drops noticeably.
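To make this conclusion less dependent on one particular split, one can average the test error over several random splits; a sketch (10 trials is an arbitrary choice):

errs = []
for seed in range(10):
    Xtr, Xte, ytr, yte = model_selection.train_test_split(
        X, y, test_size=0.25, random_state=seed)
    m = DecisionTree(maxdepth=3)
    m.fit(Xtr, ytr)
    errs.append(metrics.mean_squared_error(yte, m.predict(Xte)))
print('mean MSE over 10 random splits:', np.mean(errs))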
In [ ]: