In [ ]:
# set up our standard computation environment
import numpy as np, pandas as pd, matplotlib.pyplot as plt, seaborn as sns
%matplotlib inline
sns.set_style('darkgrid')
sns.set_context('poster')
In [ ]:
# set random seed for reproducibility
np.random.seed(12345)
In [ ]:
# load the decision tree module of sklearn
import sklearn.tree
In [ ]:
# simulate some training data: a cosine signal with Laplace noise
x_true = np.linspace(0,15,1000)
y_true = np.cos(x_true)
sigma_true = .3
x_train = np.random.choice(x_true, size=100)
y_train = np.random.laplace(np.cos(x_train), sigma_true)
In [ ]:
plt.plot(x_true, y_true, '-', label='Truth')
plt.plot(x_train, y_train, 's', label='Train')
plt.legend()
In [ ]:
# make a DecisionTreeRegressor
dt = ________________________
In [ ]:
# fit it to the simulated training data
X_train = x_train[:,None]
dt.fit(________________________, ________________________)
In [ ]:
# predict for a range of x values
X_true = x_true[:,None]
y_pred = dt.predict(________________________)
In [ ]:
# have a look
plt.plot(x_true, y_true, '-', label='Truth')
plt.plot(x_train, y_train, 's', label='Train')
plt.plot(x_true, y_pred, '-', label='Predicted')
plt.legend()
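Before digging into the tree internals, it can be instructive to check how well this unconstrained tree fits its own training data (a quick sketch, assuming dt has been fit as above; with no depth or leaf-size limits a regression tree typically memorizes the training points almost exactly).
In [ ]:
# a quick sketch: training RMSE of the unconstrained tree
# with no complexity limits this is usually (near) zero
np.sqrt(np.mean((dt.predict(X_train) - y_train)**2))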
In [ ]:
# today we are going to look in-depth at the decision tree itself
# can you find it?
dt.________________________
In [ ]:
# what do the sklearn docs have to say about this?
help(dt.tree_)
Let's take a look at each of these things... guess what each will return before you execute the cell.
In [ ]:
dt.tree_.node_count
In [ ]:
dt.tree_.capacity
In [ ]:
dt.tree_.max_depth
In [ ]:
dt.tree_.children_left
In [ ]:
dt.tree_.children_right
In [ ]:
dt.tree_.feature
In [ ]:
dt.tree_.threshold
In [ ]:
dt.tree_.value
In [ ]:
dt.tree_.impurity
In [ ]:
dt.tree_.n_node_samples
In [ ]:
dt.tree_.weighted_n_node_samples
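If you would rather see the fitted tree as a picture than as raw arrays, sklearn can also export it for Graphviz; a minimal sketch (the output file name is arbitrary, and rendering the .dot file assumes Graphviz is installed separately).
In [ ]:
# a minimal sketch: write the fitted tree as Graphviz dot source
# render it outside the notebook, e.g. `dot -Tpng tree.dot -o tree.png`
sklearn.tree.export_graphviz(dt, out_file='tree.dot')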
In [ ]:
# how can min_samples_leaf help?
dt = sklearn.tree.DecisionTreeRegressor(min_samples_leaf=10)
dt.fit(X_train, y_train)
y_pred = dt.predict(X_true)
In [ ]:
plt.plot(x_true, y_true, '-', label='Truth')
plt.plot(x_train, y_train, 's', label='Train')
plt.plot(x_true, y_pred, '-', label='Predicted')
plt.legend()
In [ ]:
# the next line is a tricky little trick to get some tab-completion help
t = dt.tree_
In [ ]:
def print_tree(t, root=0, depth=0):
    indent = ' '*depth
    left_child = t.children_left[root]
    right_child = t.children_right[root]
    if left_child == sklearn.tree._tree.TREE_LEAF:
        print indent + 'return %.2f # (node %d)' % (t.value[root], root)
    else:
        print indent + 'if X_i[%d] < %.2f: # (node %d)' % (t.feature[root], t.threshold[root], root)
        print_tree(t, root=________________________, depth=________________________)
        print indent + 'else:'
        print_tree(t, root=________________________, depth=________________________)
print_tree(dt.tree_)
In [ ]:
# make a copy of the decision tree regressor dt (called pt, "p" for pruned)
pt = sklearn.tree.DecisionTreeRegressor(min_samples_leaf=10)
pt.fit(X_train, y_train)
In [ ]:
# have a look at node 12
pt.tree_.value[12]
In [ ]:
# need to set value at soon-to-be leaf node
pt.tree_.value[12] = 1 # NOTE: this is not the right value
In [ ]:
# find the left and right children of node 12
left_child = pt.tree_.______________________[12]
right_child = pt.tree_.______________________[12]
# find the weight of these nodes in the training dataset
wt_left = pt.tree_.______________________[left_child]
wt_right = pt.tree_.______________________[right_child]
# find the value of these nodes in the training dataset
val_left = pt.tree_.______________________[left_child]
val_right = pt.tree_.______________________[right_child]
# calculate the value of node 12 after pruning
pt.tree_.value[12] = (wt_left*val_left + wt_right*val_right) / (wt_left + wt_right)
pt.tree_.children_left[12] = sklearn.tree._tree.TREE_LEAF
pt.tree_.children_right[12] = sklearn.tree._tree.TREE_LEAF
In [ ]:
# have a look at the original tree compared to the pruned version
y_pred = dt.predict(X_true)
plt.plot(x_true, y_pred, '-', label='Original Pred')
y_pred = pt.predict(X_true)
plt.plot(x_true, y_pred, '-', label='Pruned Pred')
#plt.plot(x_train, y_train, 's', label='Train')
plt.legend()
In [ ]:
d_vals = [1,2,4,8,16]
# initialize rmse dict
rmse = {}
for d in d_vals:
    rmse[d] = []
# 10 repetitions of 10-fold cross-validation
import sklearn.cross_validation  # not imported above; needed for KFold
cv = sklearn.cross_validation.KFold(_____________________, _____________________)
for rep in range(10):
    for train, validate in cv:
        for d in d_vals:
            dt = sklearn.tree.DecisionTreeRegressor(max_depth=d)
            dt.fit(X_train[train], y_train[train])
            y_pred = dt.predict(X_train[validate])
            rmse[d].append(np.sqrt(np.mean((y_pred - y_train[validate])**2)))
In [ ]:
pd.DataFrame(rmse)
In [ ]:
pd.DataFrame(rmse).mean().plot(marker='s')
In [ ]:
dt = sklearn.tree.DecisionTreeRegressor(max_depth=4)
dt.fit(X_train, y_train)
print_tree(dt.tree_)
In [ ]:
dt = sklearn.tree.DecisionTreeRegressor(max_depth=2)
dt.fit(X_train, y_train)
print_tree(dt.tree_)
In [ ]:
# bagging by hand: refit a depth-4 tree on 500 bootstrap resamples of the
# training data (sampling indices with replacement) and store each prediction
dt = sklearn.tree.DecisionTreeRegressor(max_depth=4)
y_pred = {}
for rep in range(500):
    train = np.random.choice(range(len(y_train)), size=len(y_train))
    dt.fit(X_train[train], y_train[train])
    y_pred[rep] = dt.predict(X_true)
In [ ]:
y_pred = pd.DataFrame(y_pred)
y_pred = y_pred.mean(axis=1)
In [ ]:
plt.plot(x_true, y_true, '-', label='Truth')
plt.plot(x_train, y_train, 's', label='Train')
plt.plot(x_true, y_pred, label='Mean of Bootstrapped Prediction')
plt.legend()
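The same bootstrap replicates can also give a rough picture of the prediction's variability, not just its mean. A sketch (the name y_boot is introduced here just for this example) that keeps the full matrix of replicate predictions and shades a band between its 2.5th and 97.5th percentiles:
In [ ]:
# a sketch: rebuild the matrix of bootstrap predictions under a new name
# and shade a rough 95% band across the 500 replicates
y_boot = {}
for rep in range(500):
    train = np.random.choice(range(len(y_train)), size=len(y_train))
    dt.fit(X_train[train], y_train[train])
    y_boot[rep] = dt.predict(X_true)
y_boot = pd.DataFrame(y_boot)
plt.fill_between(x_true, y_boot.quantile(.025, axis=1), y_boot.quantile(.975, axis=1),
                 alpha=.3, label='Bootstrap 95% band')
plt.plot(x_true, y_true, '-', label='Truth')
plt.legend()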
In [ ]:
# here are the criteria for regression tree quality that sklearn knows
sklearn.tree.tree.CRITERIA_REG
In [ ]:
# here is a super-tricky way to modify the print_tree function
# so that it includes the impurity
# can you understand how it works?
old_print_tree = print_tree
def print_tree(t, root=0, depth=0):
    indent = ' '*depth
    print indent + '# node %d: impurity = %.2f' % (root, t.impurity[root])
    old_print_tree(t, root, depth)
print_tree(dt.tree_)
In [ ]:
# here is a less-tricky way to do the same thing
# still tricky, since it uses recursion
def print_tree(t, root=0, depth=0):
    indent = ' '*depth
    print indent + '# node %d: impurity = %.2f' % (root, t.impurity[root])
    left_child = t.children_left[root]
    right_child = t.children_right[root]
    if left_child == sklearn.tree._tree.TREE_LEAF:
        print indent + 'return %.2f # (node %d)' % (t.value[root], root)
    else:
        print indent + 'if X_i[%d] < %.2f: # (node %d)' % (t.feature[root], t.threshold[root], root)
        print_tree(t, root=left_child, depth=depth+1)
        print indent + 'else:'
        print_tree(t, root=right_child, depth=depth+1)
print_tree(dt.tree_)
The mean squared error (MSE) criterion is relatively straightforward:
In [ ]:
dt.criterion
In [ ]:
y_train.mean()
In [ ]:
np.mean((y_train - y_train.mean())**2)
In [ ]:
np.var(y_train)
Alternatives: mean absolute deviation, median absolute deviation, or perhaps something specific to your regression situation.
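For intuition, here is a sketch of the root-node version of those alternatives on the training labels (just the dispersion measures themselves; whether your sklearn version accepts them as splitting criteria is a separate question).
In [ ]:
# a sketch: root-node analogues of the alternative criteria listed above
print(np.mean(np.abs(y_train - y_train.mean())))        # mean absolute deviation
print(np.median(np.abs(y_train - np.median(y_train))))  # median absolute deviation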
In [ ]:
# here are the criteria that sklearn knows for classification problems
sklearn.tree.tree.CRITERIA_CLF
Which did you use in Homework 4? Does using the other one change anything?
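If you want a quick check outside the homework, here is a sketch that fits both classification criteria on an illustrative thresholded labeling of the training data (the labels y_class are made up here, not the Homework 4 data) and compares the first few split thresholds.
In [ ]:
# a sketch with made-up labels: compare the splits chosen by 'gini' vs 'entropy'
y_class = (y_train > 0).astype(int)  # illustrative binary labels
for criterion in ['gini', 'entropy']:
    clf = sklearn.tree.DecisionTreeClassifier(criterion=criterion, max_depth=2)
    clf.fit(X_train, y_class)
    print(criterion)
    print(clf.tree_.threshold[:3])  # first few split thresholds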
One proposed approach to limiting the complexity of decision trees (cost-complexity pruning) is to prune so as to minimize the sum $$ \sum_{m=1}^{|T|} \sum_{i: x_i \in R_m} (y_i - \hat{y}_{R_m})^2 + \alpha |T|. $$ Here $|T|$ is the number of terminal nodes (leaves) of the tree $T$, $R_m$ is the region of input space assigned to the $m$th leaf, $\hat{y}_{R_m}$ is the prediction (the mean training response) in that region, and $\alpha \ge 0$ trades off fit against tree size.
This can be accomplished recursively: for a tree consisting of a root and two leaves, collapse the split if the sum of squared errors at the root is less than the sum of squared errors of the two leaves plus $\alpha$. For a more complicated tree, i.e., a root with two non-leaf subtrees, prune each subtree separately first, and then check whether you have landed back in the root-and-two-leaves case.
Super-hard extra bonus homework: implement this, and use cross-validation to see what it changes in the Exercise 4 example.
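If you want a starting point, here is one rough sketch of that recursion against the tree_ arrays (the helper name prune_ is made up here; node SSE is recovered as impurity * weighted_n_node_samples, which assumes the MSE criterion, and the children arrays are overwritten in place exactly as in the manual pruning above, which may or may not be allowed by your sklearn version).
In [ ]:
# a rough sketch, not a full solution: bottom-up cost-complexity pruning
TREE_LEAF = sklearn.tree._tree.TREE_LEAF

def prune_(t, alpha, node=0):
    # return the penalized cost (SSE + alpha * number of leaves) of the subtree
    # rooted at `node`, collapsing it to a leaf whenever that lowers the cost
    left, right = t.children_left[node], t.children_right[node]
    node_sse = t.impurity[node] * t.weighted_n_node_samples[node]
    if left == TREE_LEAF:
        return node_sse + alpha
    # prune both subtrees first, then consider collapsing this node
    subtree_cost = prune_(t, alpha, left) + prune_(t, alpha, right)
    if node_sse + alpha <= subtree_cost:
        t.children_left[node] = TREE_LEAF
        t.children_right[node] = TREE_LEAF
        return node_sse + alpha
    return subtree_cost

pt = sklearn.tree.DecisionTreeRegressor(min_samples_leaf=10)
pt.fit(X_train, y_train)
prune_(pt.tree_, alpha=1.0)  # alpha chosen arbitrarily here; cross-validation should pick it
print_tree(pt.tree_)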
In [ ]: