In [ ]:
# set up our standard computation environment
import numpy as np, pandas as pd, matplotlib.pyplot as plt, seaborn as sns
%matplotlib inline
sns.set_style('darkgrid')
sns.set_context('poster')
In [ ]:
# set random seed for reproducibility
np.random.seed(12345)
In [ ]:
# load the decision tree module of sklearn
import sklearn.tree
In [ ]:
# simulate some training data: a cosine signal with Laplace noise
x_true = np.linspace(0,15,1000)
y_true = np.cos(x_true)
sigma_true = .3
x_train = np.random.choice(x_true, size=100)
y_train = np.random.laplace(np.cos(x_train), sigma_true)
In [ ]:
plt.plot(x_true, y_true, '-', label='Truth')
plt.plot(x_train, y_train, 's', label='Train')
plt.legend()
In [ ]:
# make a DecisionTreeRegressor
dt = ________________________
In [ ]:
# fit it to the simulated training data
X_train = x_train[:,None]
dt.fit(________________________, ________________________)
In [ ]:
# predict for a range of x values
X_true = x_true[:,None]
y_pred = dt.predict(________________________)
In [ ]:
# have a look
plt.plot(x_true, y_true, '-', label='Truth')
plt.plot(x_train, y_train, 's', label='Train')
plt.plot(x_true, y_pred, '-', label='Predicted')
plt.legend()
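Before digging into the tree internals, it can be instructive to check how well this unconstrained tree fits its own training data (a quick sketch, assuming dt has been fit as above; with no depth or leaf-size limits a regression tree typically memorizes the training points almost exactly).
In [ ]:
# a quick sketch: training RMSE of the unconstrained tree
# with no complexity limits this is usually (near) zero
np.sqrt(np.mean((dt.predict(X_train) - y_train)**2))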
In [ ]:
# today we are going to look in-depth at the decision tree itself
# can you find it?
dt.________________________
In [ ]:
# what do the sklearn docs have to say about this?
help(dt.tree_)
Let's take a look at each of these things... guess what each will return before you execute the cell.
In [ ]:
dt.tree_.node_count
In [ ]:
dt.tree_.capacity
In [ ]:
dt.tree_.max_depth
In [ ]:
dt.tree_.children_left
In [ ]:
dt.tree_.children_right
In [ ]:
dt.tree_.feature
In [ ]:
dt.tree_.threshold
In [ ]:
dt.tree_.value
In [ ]:
dt.tree_.impurity
In [ ]:
dt.tree_.n_node_samples
In [ ]:
dt.tree_.weighted_n_node_samples
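If you would rather see the fitted tree as a picture than as raw arrays, sklearn can also export it for Graphviz; a minimal sketch (the output file name is arbitrary, and rendering the .dot file assumes Graphviz is installed separately).
In [ ]:
# a minimal sketch: write the fitted tree as Graphviz dot source
# render it outside the notebook, e.g. `dot -Tpng tree.dot -o tree.png`
sklearn.tree.export_graphviz(dt, out_file='tree.dot')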
In [ ]:
# how can min_samples_leaf help?
dt = sklearn.tree.DecisionTreeRegressor(min_samples_leaf=10)
dt.fit(X_train, y_train)
y_pred = dt.predict(X_true)
In [ ]:
plt.plot(x_true, y_true, '-', label='Truth')
plt.plot(x_train, y_train, 's', label='Train')
plt.plot(x_true, y_pred, '-', label='Predicted')
plt.legend()
In [ ]:
# the next line is a tricky little trick to get some tab-completion help
t = dt.tree_
In [ ]:
def print_tree(t, root=0, depth=0):
    indent = ' '*depth
    left_child = t.children_left[root]
    right_child = t.children_right[root]
    if left_child == sklearn.tree._tree.TREE_LEAF:
        print indent + 'return %.2f # (node %d)' % (t.value[root], root)
    else:
        print indent + 'if X_i[%d] < %.2f: # (node %d)' % (t.feature[root], t.threshold[root], root)
        print_tree(t, root=________________________, depth=________________________)
        print indent + 'else:'
        print_tree(t, root=________________________, depth=________________________)
print_tree(dt.tree_)
In [ ]:
# make a copy of the decision tree regressor dt (called pt, "p" for pruned)
pt = sklearn.tree.DecisionTreeRegressor(min_samples_leaf=10)
pt.fit(X_train, y_train)
In [ ]:
# have a look at node 12
pt.tree_.value[12]
In [ ]:
# need to set value at soon-to-be leaf node
pt.tree_.value[12] = 1 # NOTE: this is not the right value
In [ ]:
# find the left and right children of node 12
left_child = pt.tree_.______________________[12]
right_child = pt.tree_.______________________[12]
# find the weight of these nodes in the training dataset
wt_left = pt.tree_.______________________[left_child]
wt_right = pt.tree_.______________________[right_child]
# find the value of these nodes in the training dataset
val_left = pt.tree_.______________________[left_child]
val_right = pt.tree_.______________________[right_child]
# calculate the value of node 12 after pruning
pt.tree_.value[12] = (wt_left*val_left + wt_right*val_right) / (wt_left + wt_right)
pt.tree_.children_left[12] = sklearn.tree._tree.TREE_LEAF
pt.tree_.children_right[12] = sklearn.tree._tree.TREE_LEAF
In [ ]:
# have a look at the original tree compared to the pruned version
y_pred = dt.predict(X_true)
plt.plot(x_true, y_pred, '-', label='Original Pred')
y_pred = pt.predict(X_true)
plt.plot(x_true, y_pred, '-', label='Pruned Pred')
#plt.plot(x_train, y_train, 's', label='Train')
plt.legend()
In [ ]:
d_vals = [1,2,4,8,16]
# initialize rmse dict
rmse = {}
for d in d_vals:
    rmse[d] = []
# 10 repetitions of 10-fold cross-validation
import sklearn.cross_validation  # not imported above; needed for KFold
cv = sklearn.cross_validation.KFold(_____________________, _____________________)
for rep in range(10):
    for train, validate in cv:
        for d in d_vals:
            dt = sklearn.tree.DecisionTreeRegressor(max_depth=d)
            dt.fit(X_train[train], y_train[train])
            y_pred = dt.predict(X_train[validate])
            rmse[d].append(np.sqrt(np.mean((y_pred - y_train[validate])**2)))
In [ ]:
pd.DataFrame(rmse)
In [ ]:
pd.DataFrame(rmse).mean().plot(marker='s')
In [ ]:
dt = sklearn.tree.DecisionTreeRegressor(max_depth=4)
dt.fit(X_train, y_train)
print_tree(dt.tree_)
In [ ]:
dt = sklearn.tree.DecisionTreeRegressor(max_depth=2)
dt.fit(X_train, y_train)
print_tree(dt.tree_)
In [ ]:
# bagging by hand: refit a depth-4 tree on 500 bootstrap resamples of the
# training data (sampling indices with replacement) and store each prediction
dt = sklearn.tree.DecisionTreeRegressor(max_depth=4)
y_pred = {}
for rep in range(500):
    train = np.random.choice(range(len(y_train)), size=len(y_train))
    dt.fit(X_train[train], y_train[train])
    y_pred[rep] = dt.predict(X_true)
In [ ]:
y_pred = pd.DataFrame(y_pred)
y_pred = y_pred.mean(axis=1)
In [ ]:
plt.plot(x_true, y_true, '-', label='Truth')
plt.plot(x_train, y_train, 's', label='Train')
plt.plot(x_true, y_pred, label='Mean of Bootstrapped Prediction')
plt.legend()
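The same bootstrap replicates can also give a rough picture of the prediction's variability, not just its mean. A sketch (the name y_boot is introduced here just for this example) that keeps the full matrix of replicate predictions and shades a band between its 2.5th and 97.5th percentiles:
In [ ]:
# a sketch: rebuild the matrix of bootstrap predictions under a new name
# and shade a rough 95% band across the 500 replicates
y_boot = {}
for rep in range(500):
    train = np.random.choice(range(len(y_train)), size=len(y_train))
    dt.fit(X_train[train], y_train[train])
    y_boot[rep] = dt.predict(X_true)
y_boot = pd.DataFrame(y_boot)
plt.fill_between(x_true, y_boot.quantile(.025, axis=1), y_boot.quantile(.975, axis=1),
                 alpha=.3, label='Bootstrap 95% band')
plt.plot(x_true, y_true, '-', label='Truth')
plt.legend()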
In [ ]:
# here are the criteria for regression tree quality that sklearn knows
sklearn.tree.tree.CRITERIA_REG
In [ ]:
# here is a super-tricky way to modify the print_tree function
# so that it includes the impurity
# can you understand how it works?
old_print_tree = print_tree
def print_tree(t, root=0, depth=0):
    indent = ' '*depth
    print indent + '# node %d: impurity = %.2f' % (root, t.impurity[root])
    old_print_tree(t, root, depth)
print_tree(dt.tree_)
In [ ]:
# here is a less-tricky way to do the same thing
# still tricky, since it uses recursion
def print_tree(t, root=0, depth=0):
    indent = ' '*depth
    print indent + '# node %d: impurity = %.2f' % (root, t.impurity[root])
    left_child = t.children_left[root]
    right_child = t.children_right[root]
    if left_child == sklearn.tree._tree.TREE_LEAF:
        print indent + 'return %.2f # (node %d)' % (t.value[root], root)
    else:
        print indent + 'if X_i[%d] < %.2f: # (node %d)' % (t.feature[root], t.threshold[root], root)
        print_tree(t, root=left_child, depth=depth+1)
        print indent + 'else:'
        print_tree(t, root=right_child, depth=depth+1)
print_tree(dt.tree_)
The mean squared error (MSE) criterion is relatively straightforward:
In [ ]:
dt.criterion
In [ ]:
y_train.mean()
In [ ]:
np.mean((y_train - y_train.mean())**2)
In [ ]:
np.var(y_train)
Alternatives: mean absolute deviation, median absolute deviation, or perhaps something specific to your regression situation.
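For intuition, here is a sketch of the root-node version of those alternatives on the training labels (just the dispersion measures themselves; whether your sklearn version accepts them as splitting criteria is a separate question).
In [ ]:
# a sketch: root-node analogues of the alternative criteria listed above
print(np.mean(np.abs(y_train - y_train.mean())))        # mean absolute deviation
print(np.median(np.abs(y_train - np.median(y_train))))  # median absolute deviation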
In [ ]:
# here are the criteria that sklearn knows for classification problems
sklearn.tree.tree.CRITERIA_CLF
Which did you use in Homework 4? Does using the other one change anything?
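If you want a quick check outside the homework, here is a sketch that fits both classification criteria on an illustrative thresholded labeling of the training data (the labels y_class are made up here, not the Homework 4 data) and compares the first few split thresholds.
In [ ]:
# a sketch with made-up labels: compare the splits chosen by 'gini' vs 'entropy'
y_class = (y_train > 0).astype(int)  # illustrative binary labels
for criterion in ['gini', 'entropy']:
    clf = sklearn.tree.DecisionTreeClassifier(criterion=criterion, max_depth=2)
    clf.fit(X_train, y_class)
    print(criterion)
    print(clf.tree_.threshold[:3])  # first few split thresholds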
One proposed approach to limiting the complexity of decision trees (cost-complexity pruning) is to prune so as to minimize the sum $$ \sum_{m=1}^{|T|} \sum_{i: x_i \in R_m} (y_i - \hat{y}_{R_m})^2 + \alpha |T|. $$ Here $|T|$ is the number of terminal nodes (leaves) of the tree $T$, $R_m$ is the region of input space assigned to the $m$th leaf, $\hat{y}_{R_m}$ is the prediction (the mean training response) in that region, and $\alpha \ge 0$ trades off fit against tree size.
This can be accomplished recursively: for a tree consisting of a root and two leaves, collapse the split if the sum of squared errors at the root is less than the sum of squared errors of the two leaves plus $\alpha$. For a more complicated tree, i.e., a root with two non-leaf subtrees, prune each subtree separately first, and then check whether you have landed back in the root-and-two-leaves case.
Super-hard extra bonus homework: implement this, and use cross-validation to see what it changes in the Exercise 4 example.
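If you want a starting point, here is one rough sketch of that recursion against the tree_ arrays (the helper name prune_ is made up here; node SSE is recovered as impurity * weighted_n_node_samples, which assumes the MSE criterion, and the children arrays are overwritten in place exactly as in the manual pruning above, which may or may not be allowed by your sklearn version).
In [ ]:
# a rough sketch, not a full solution: bottom-up cost-complexity pruning
TREE_LEAF = sklearn.tree._tree.TREE_LEAF

def prune_(t, alpha, node=0):
    # return the penalized cost (SSE + alpha * number of leaves) of the subtree
    # rooted at `node`, collapsing it to a leaf whenever that lowers the cost
    left, right = t.children_left[node], t.children_right[node]
    node_sse = t.impurity[node] * t.weighted_n_node_samples[node]
    if left == TREE_LEAF:
        return node_sse + alpha
    # prune both subtrees first, then consider collapsing this node
    subtree_cost = prune_(t, alpha, left) + prune_(t, alpha, right)
    if node_sse + alpha <= subtree_cost:
        t.children_left[node] = TREE_LEAF
        t.children_right[node] = TREE_LEAF
        return node_sse + alpha
    return subtree_cost

pt = sklearn.tree.DecisionTreeRegressor(min_samples_leaf=10)
pt.fit(X_train, y_train)
prune_(pt.tree_, alpha=1.0)  # alpha chosen arbitrarily here; cross-validation should pick it
print_tree(pt.tree_)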
In [ ]: