In [1]:
# setup our standard computation environment
import numpy as np, pandas as pd, matplotlib.pyplot as plt, seaborn as sns
%matplotlib inline
sns.set_style('darkgrid')
sns.set_context('poster')
In [2]:
# set random seed for reproducibility
np.random.seed(12345)
In [3]:
# load the decision tree module of sklearn
import sklearn.tree
In [4]:
# simulate some data from a familiar distribution
x_true = np.linspace(0,15,1000)
y_true = np.cos(x_true)
sigma_true = .3
x_train = np.random.choice(x_true, size=100)
y_train = np.random.laplace(np.cos(x_train), sigma_true)
In [5]:
plt.plot(x_true, y_true, '-', label='Truth')
plt.plot(x_train, y_train, 's', label='Train')
plt.legend()
Out[5]:
In [6]:
# make a DecisionTreeRegressor
dt = sklearn.tree.DecisionTreeRegressor()
In [7]:
x_train[:5,None]
Out[7]:
In [8]:
# fit it to the simulated training data
X_train = x_train[:,None]
dt.fit(X_train, y_train)
Out[8]:
In [9]:
# predict for a range of x values
X_true = x_true[:,None] # horrible, but remember it!
y_pred = dt.predict(X_true)
In [10]:
# have a look
plt.plot(x_true, y_true, '-', label='Truth')
plt.plot(x_train, y_train, 's', label='Train')
plt.plot(x_true, y_pred, '-', label='Predicted')
plt.legend()
Out[10]:
In [11]:
# today we are going to look in-depth at the decision tree itself
# can you find it?
dt.tree_
Out[11]:
An aside on the strange use of underscores in Python, and just how strange it could get, if you let it:
In [12]:
_ = 10
__ = _ + 20
In [13]:
#dt.tree_ = 5
In [14]:
# what do the sklearn docs have to say about this?
help(dt.tree_)
Let's take a look at each of these things... guess what each will return before you execute the cell.
In [15]:
dt.tree_.node_count
Out[15]:
In [16]:
dt.tree_.capacity
Out[16]:
In [17]:
dt.tree_.max_depth
Out[17]:
In [18]:
dt.tree_.children_left
Out[18]:
In [19]:
dt.tree_.children_right
Out[19]:
In [20]:
dt.tree_.feature
Out[20]:
In [21]:
dt.tree_.threshold
Out[21]:
In [22]:
dt.tree_.value
Out[22]:
In [23]:
np.round(dt.tree_.impurity, 2)
Out[23]:
In [24]:
dt.tree_.n_node_samples
Out[24]:
In [25]:
dt.tree_.weighted_n_node_samples
Out[25]:
In [26]:
# how can min samples help?
dt = sklearn.tree.DecisionTreeRegressor(max_depth=4, splitter='best')
dt.fit(X_train[::-1], -y_train[::-1])
y_pred = dt.predict(X_true)
In [27]:
plt.plot(x_true, y_true, '-', label='Truth')
plt.plot(x_train, y_train, 's', label='Train')
plt.plot(x_true, -y_pred, '-', label='Predicted')
plt.legend()
Out[27]:
In [28]:
def experiment_w_dt_options(max_depth=None, min_samples_leaf=1):
    """Fit a DecisionTreeRegressor with the given complexity controls and
    overlay truth, training data, and the tree's prediction on the current axes.

    Parameters
    ----------
    max_depth : int or None, optional - passed through to DecisionTreeRegressor
    min_samples_leaf : int, optional - passed through to DecisionTreeRegressor

    Notes
    -----
    Relies on the notebook globals X_train, y_train, X_true, x_true, y_true,
    and x_train.  The tree is fit on row-reversed data with negated targets
    and the prediction is negated back before plotting, so the displayed
    curve matches an ordinary fit (up to split tie-breaking).
    """
    model = sklearn.tree.DecisionTreeRegressor(max_depth=max_depth,
                                               min_samples_leaf=min_samples_leaf)
    model.fit(X_train[::-1], -y_train[::-1])
    prediction = model.predict(X_true)
    plt.plot(x_true, y_true, '-', label='Truth')
    plt.plot(x_train, y_train, 's', label='Train', ms=4)
    plt.plot(x_true, -prediction, '-', label='Predicted')
In [ ]:
# sweep a 3x3 grid of (max_depth, min_samples_leaf) settings and plot each
i = 0
for max_depth in [8, 4, 2]:
    for min_samples_leaf in [1, 10, 25]:
        i += 1
        plt.subplot(3, 3, i)
        experiment_w_dt_options(max_depth, min_samples_leaf)
        # label each panel with the settings used
        # (fixed typo: was "min_samples_left", the parameter is min_samples_leaf)
        plt.text(0, 2, 'max_depth: %d, min_samples_leaf: %d'%(max_depth, min_samples_leaf), va='top')
plt.legend(loc=(1, .1))
Out[ ]:
In [ ]:
# the next line is a tricky little trick to get some tab-completion help
t = dt.tree_
In [ ]:
# here is a tricky python thing about strings:
'abie'*0, 'abie'*10
Out[ ]:
In [30]:
def print_tree(t, root=0, depth=0):
    """ print the contents of a decision tree
    as a python function
    parameters
    t : sklearn.tree.tree_.Tree (any object with children_left,
        children_right, feature, threshold, and value arrays works)
    root : int, optional - where to start tree
    depth : int, optional - how deep we are in orig tree (controls indent)
    """
    TREE_LEAF = -1  # same magic number as sklearn.tree._tree.TREE_LEAF
    indent = ' '*depth
    left_child = t.children_left[root]
    right_child = t.children_right[root]
    if left_child == TREE_LEAF:
        # value[root] is a (1, 1) array for a single-output regressor;
        # .item() extracts the scalar -- %-formatting a bare array raises
        # a TypeError under modern numpy
        print(indent + 'return %.2f # (node %d)' % (t.value[root].item(), root))
    else:
        print(indent + 'if X_i[%d] < %.2f: # (node %d)' % (t.feature[root], t.threshold[root], root))
        print_tree(t, root=left_child, depth=depth+1)
        print(indent + 'else:')
        print_tree(t, root=right_child, depth=depth+1)
print_tree(dt.tree_)
In [29]:
dt = sklearn.tree.DecisionTreeRegressor(min_samples_leaf=10)
dt.fit(X_train, y_train)
# make a copy of the decision tree regressor dt (called pt, "p" for pruned)
pt = sklearn.tree.DecisionTreeRegressor(min_samples_leaf=10)
pt.fit(X_train, y_train)
Out[29]:
In [35]:
print_tree(pt.tree_)
In [32]:
# have a look at node 12
pt.tree_.value[12]
Out[32]:
In [33]:
# need to set value at soon-to-be leaf node
pt.tree_.value[12] = 1 # NOTE: this is not the right value
In [34]:
# find the left and right children of node 12
left_child = pt.tree_.children_left[12]
right_child = pt.tree_.children_right[12]
# find the weight of these nodes in the training dataset
wt_left = pt.tree_.weighted_n_node_samples[left_child]
wt_right = pt.tree_.weighted_n_node_samples[right_child]
# find the value of these nodes in the training dataset
val_left = pt.tree_.value[left_child]
val_right = pt.tree_.value[right_child]
# calculate the value of node 12 after pruning
pt.tree_.value[12] = (wt_left*val_left + wt_right*val_right) / (wt_left + wt_right)
pt.tree_.children_left[12] = sklearn.tree._tree.TREE_LEAF
pt.tree_.children_right[12] = sklearn.tree._tree.TREE_LEAF
In [36]:
# have a look at the original tree compared to the pruned version
y_pred = dt.predict(X_true)
plt.plot(x_true, y_pred, '-', label='Original Pred')
y_pred = pt.predict(X_true)
plt.plot(x_true, y_pred, '-', label='Pruned Pred')
#plt.plot(x_train, y_train, 's', label='Train')
plt.legend()
Out[36]:
In [ ]:
d_vals = [1,2,4,8,16]
# initialize rmse dict: one list of fold-RMSEs per candidate max_depth
rmse = {}
for d in d_vals:
    rmse[d] = []
# 10 repetitions of 10-fold cross-validation
# NOTE(review): sklearn.cross_validation was deprecated in sklearn 0.18 and
# removed in 0.20; on a modern install use
# sklearn.model_selection.KFold(n_splits=10, shuffle=True).split(X_train)
# instead.  Also, no `import sklearn.cross_validation` appears in the
# visible cells -- confirm it is imported elsewhere.
for rep in range(10):
    cv = sklearn.cross_validation.KFold(len(y_train), n_folds=10, shuffle=True)
    for train, validate in cv:
        for d in d_vals:
            dt = sklearn.tree.DecisionTreeRegressor(max_depth=d)
            dt.fit(X_train[train], y_train[train])
            y_pred = dt.predict(X_train[validate])
            # root-mean-squared error on the held-out fold
            rmse[d].append(np.sqrt(np.mean((y_pred - y_train[validate])**2)))
In [ ]:
pd.DataFrame(rmse)
In [ ]:
pd.DataFrame(rmse).mean().plot(marker='s')
Out[ ]:
In [ ]:
dt = sklearn.tree.DecisionTreeRegressor(max_depth=4)
dt.fit(X_train, y_train)
print_tree(dt.tree_)
In [ ]:
dt = sklearn.tree.DecisionTreeRegressor(max_depth=2)
dt.fit(X_train, y_train)
print_tree(dt.tree_)
In [ ]:
dt = sklearn.tree.DecisionTreeRegressor(max_depth=4)
y_pred = {}
for rep in range(500):
train = np.random.choice(range(len(y_train)), size=len(y_train))
dt.fit(X_train[train], y_train[train])
y_pred[rep] = dt.predict(X_true)
In [ ]:
y_pred = pd.DataFrame(y_pred)
y_pred = y_pred.mean(axis=1)
In [ ]:
plt.plot(x_true, y_true, '-', label='Truth')
plt.plot(x_train, y_train, 's', label='Train')
plt.plot(x_true, y_pred, label='Mean of Bootstrapped Prediction')
plt.legend()
Out[ ]:
In [ ]:
# here are the criteria for regression tree quality that sklearn knows
sklearn.tree.tree.CRITERIA_REG
Out[ ]:
In [ ]:
# here is a super-tricky way to modify the print_tree function
# so that is includes the impurity
# can you understand how it works?
old_print_tree = print_tree  # keep a handle on the previous definition
def print_tree(t, root=0, depth=0):
    # Wrapper that prints each node's impurity, then delegates to the old
    # implementation.  The trick: the old function recurses via the *global*
    # name `print_tree`, which now resolves to this wrapper -- so the
    # impurity comment is printed at every node, not just at `root`.
    indent = ' '*depth
    print indent + '# node %d: impurity = %.2f' % (root, t.impurity[root])
    old_print_tree(t, root, depth)
print_tree(dt.tree_)
In [46]:
# here is a less-tricky way to do the same thing
# still tricky, since it uses recursion
def print_tree(t, root=0, depth=0):
    """ print the contents of a decision tree as a python function,
    prefixing every node with a comment line showing its impurity
    parameters
    t : sklearn.tree.tree_.Tree (needs impurity in addition to the
        children/feature/threshold/value arrays)
    root : int, optional - node to treat as the root (for recursion)
    depth : int, optional - recursion depth, controls indentation
    """
    TREE_LEAF = -1  # same magic number as sklearn.tree._tree.TREE_LEAF
    indent = ' '*depth
    print(indent + '# node %s: impurity = %.2f' % (str(root), t.impurity[root]))
    left_child = t.children_left[root]
    right_child = t.children_right[root]
    if left_child == TREE_LEAF:
        # str() instead of %.2f so this also works when value[root] is a
        # (1, n_classes) array from a classifier
        print(indent + 'return %s # (node %d)' % (str(t.value[root]), root))
    else:
        print(indent + 'if X_i[%d] < %.2f: # (node %d)' % (t.feature[root], t.threshold[root], root))
        print_tree(t, root=left_child, depth=depth+1)
        print(indent + 'else:')
        print_tree(t, root=right_child, depth=depth+1)
print_tree(dt.tree_)
The mean squared error (MSE) criterion is relatively straightforward:
In [ ]:
dt.criterion
Out[ ]:
In [ ]:
y_train.mean()
Out[ ]:
In [ ]:
np.mean((y_train - y_train.mean())**2)
Out[ ]:
In [ ]:
np.var(y_train)
Out[ ]:
Alternatives: mean absolute deviation, median absolute deviation, (something specific to your regression situation?)
In [ ]:
# here are the criteria that sklearn knows for classification problems
sklearn.tree.tree.CRITERIA_CLF
Out[ ]:
Which did you use in Homework 4? Does using the other one change anything?
We used gini; let's check if entropy is any different!
In [41]:
# Load and prep data from Week 4:
df = pd.read_csv('../Week_4/IHME_PHMRC_VA_DATA_ADULT_Y2013M09D11_0.csv', low_memory=False)
X = np.array(df.filter(like='word_'))
df['Cause'] = df.gs_text34.map({'Stroke':'Stroke', 'Diabetes':'Diabetes'}).fillna('Other')
y = np.array(df.Cause)
# 10-rep 10-fold cross-validate decision tree with class weights
import sklearn.tree
weights = 1000. / df.Cause.value_counts()
sample_weight = np.array(weights[y])
In [ ]:
%%time
clf = sklearn.tree.DecisionTreeClassifier(max_depth=4, criterion='gini')
scores = []
for rep in range(10):
print rep,
cv = sklearn.cross_validation.StratifiedKFold(y, n_folds=10, shuffle=True, random_state=123456+rep)
for train, test in cv:
clf.fit(X[train], y[train], sample_weight=sample_weight[train])
y_pred = clf.predict(X[test])
scores.append(sklearn.metrics.accuracy_score(y[test], y_pred, sample_weight=sample_weight[test]))
print
print np.mean(scores)
In [ ]:
%%time
clf = sklearn.tree.DecisionTreeClassifier(max_depth=4, criterion='entropy')
scores = []
for rep in range(10):
print rep,
cv = sklearn.cross_validation.StratifiedKFold(y, n_folds=10, shuffle=True, random_state=123456+rep)
for train, test in cv:
clf.fit(X[train], y[train], sample_weight=sample_weight[train])
y_pred = clf.predict(X[test])
scores.append(sklearn.metrics.accuracy_score(y[test], y_pred, sample_weight=sample_weight[test]))
print
print np.mean(scores)
In [44]:
# we will test this out with simulated data
np.random.seed(123456)
# do you understand what this is?
X = np.random.normal(size=(100,10))
beta = np.random.normal(size=10)
# if so, how about this?
y_numeric = np.dot(X, beta)
y_categorical = (y_numeric > 0)
# if you really get it, what would be a better way to learn from (X, y_categorical)?
In [47]:
# case one: categorical labels
# with original data:
clf = sklearn.tree.DecisionTreeClassifier(max_depth=2)
clf.fit(X, y_categorical)
print_tree(clf.tree_)
In [48]:
# with transformed data:
clf = sklearn.tree.DecisionTreeClassifier(max_depth=2)
clf.fit(np.exp(X), y_categorical)
print_tree(clf.tree_)
In [49]:
# same tree, right?
In [50]:
# case two: with numeric labels
# with original data:
clf = sklearn.tree.DecisionTreeRegressor(max_depth=2)
clf.fit(X, y_numeric)
print_tree(clf.tree_)
In [51]:
# with transformed feature vectors:
clf = sklearn.tree.DecisionTreeRegressor(max_depth=2)
clf.fit(np.exp(X), y_numeric)
print_tree(clf.tree_)
In [52]:
# same again, except the thresholds are in the transformed space
In [53]:
# case three: transformed numeric labels
# with original data
clf = sklearn.tree.DecisionTreeRegressor(max_depth=2)
clf.fit(X, y_numeric)
print_tree(clf.tree_)
In [54]:
clf = sklearn.tree.DecisionTreeRegressor(max_depth=2)
clf.fit(X, np.exp(y_numeric))
print_tree(clf.tree_)
In [55]:
# got that?
One proposed approach to limiting the complexity of decision trees is through pruning to minimize the sum $$ \sum_{m=1}^{|T|} \sum_{i: x_i \in R_m} (y_i - \hat{y}_{R_m})^2 + \alpha |T|. $$
This can be accomplished recursively: for a tree with a root and two leaves, you must determine if the MSE for the root is less than the MSE for the leaves + $\alpha$. For a more complicated tree, i.e. a root with two non-leaf subtrees, do the pruning separately for each subtree, and then see if you end up in the root-and-two-leaves case.
Super-hard extra bonus homework: implement this, and use cross-validation to see what it changes in the Exercise 4 example.
In [56]:
# refactor pruning code from above into a function
def prune(tree, node):
    """ prune decision tree so that specified node becomes a leaf
    Parameters
    ----------
    tree : sklearn.tree.tree._tree.Tree
    node : int
    Results
    -------
    changes internal arrays of `tree` so that node `node` is a leaf;
    the former descendants are left in the arrays but become unreachable
    """
    # find the left and right children of node
    left_child = tree.children_left[node]
    right_child = tree.children_right[node]
    # find the weight of these nodes in the training dataset
    wt_left = tree.weighted_n_node_samples[left_child]
    wt_right = tree.weighted_n_node_samples[right_child]
    # find the value of these nodes in the training dataset
    val_left = tree.value[left_child]
    val_right = tree.value[right_child]
    # calculate the value of `node` after pruning: the new leaf predicts
    # the weighted average of its former children
    # (comment fixed: previously said "node 12", a leftover from the
    # notebook cell this function was refactored from)
    tree.value[node] = (wt_left*val_left + wt_right*val_right) / (wt_left + wt_right)
    # remove children of node
    tree.children_left[node] = sklearn.tree._tree.TREE_LEAF
    tree.children_right[node] = sklearn.tree._tree.TREE_LEAF
In [57]:
# test that:
clf.fit(X, np.exp(y_numeric))
print_tree(clf.tree_)
In [58]:
prune(clf.tree_, 4)
print_tree(clf.tree_)
In [59]:
tree = clf.tree_
In [60]:
def recursive_prune(tree, alpha, node=0):
    """ Prune tree (in place) to approximately minimize
    (weighted sum of leaf impurities) + alpha * |leaves|
    (the docstring previously referenced `tree.score[node]`, which is not
    an attribute of sklearn's Tree)
    Parameters
    ----------
    tree : sklearn.tree.tree._tree.Tree
    alpha : float, pruning parameter
    node : int, optional; node to consider root of tree, for recursion
    Results
    -------
    Prunes tree to maximize objective, returns contribution of impurity to min sum
    and number of leaves in tree that attains it
    """
    # find the left and right children of node
    left_child = tree.children_left[node]
    right_child = tree.children_right[node]
    wt = tree.weighted_n_node_samples[node]  # NOTE(review): unused below
    if left_child == sklearn.tree._tree.TREE_LEAF:
        # a leaf contributes its own impurity and counts as one leaf
        assert right_child == sklearn.tree._tree.TREE_LEAF, 'Expected binary tree'
        return (tree.impurity[node],1)
    else:
        # find the weight of these nodes in the training dataset
        wt_left = tree.weighted_n_node_samples[left_child]
        wt_right = tree.weighted_n_node_samples[right_child]
        # calculate contribution of children to objective function;
        # the recursion prunes each subtree bottom-up before we decide here
        left_impurity_sum, left_leaves = recursive_prune(tree, alpha, left_child)
        right_impurity_sum, right_leaves = recursive_prune(tree, alpha, right_child)
        current_obj_contrib = (wt_left*left_impurity_sum + wt_right*right_impurity_sum) \
            / (wt_left + wt_right)
        # compare to contribution to objective function if tree is pruned so that current node
        # is a leaf
        pruned_obj_contrib = tree.impurity[node]
        # keep the subtree only if its (impurity + per-leaf penalty) beats
        # collapsing this node into a single leaf
        if pruned_obj_contrib + alpha <= current_obj_contrib + (left_leaves + right_leaves)*alpha:
            prune(tree, node)
            return (tree.impurity[node],1)
        else:
            return current_obj_contrib, left_leaves + right_leaves
In [61]:
# test that:
clf.fit(X, np.exp(y_numeric))
print_tree(clf.tree_)
In [62]:
clf.fit(X, np.exp(y_numeric))
recursive_prune(clf.tree_, alpha=0) # should not change tree
print_tree(clf.tree_)
In [63]:
clf.fit(X, np.exp(y_numeric))
recursive_prune(clf.tree_, alpha=np.inf) # should prune to the root
print_tree(clf.tree_)
In [64]:
clf.fit(X, np.exp(y_numeric))
recursive_prune(clf.tree_, alpha=500) # should prune node 1, not node 4
print_tree(clf.tree_)
In [ ]: