In [3]:
# Setup
%matplotlib inline
import matplotlib.pyplot as plt
from sklearn.datasets import load_iris
from sklearn.cross_validation import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix
from sklearn.datasets import load_iris
from sklearn import tree
import numpy as np
# Set seed for reproducibility
np.random.seed(1015)
In [4]:
def draw_tree(inp_tree
              , out_file = None
              , filled=True
              , rounded=True
              , special_characters=True):
    """Render a decision tree inline as a PNG image.

    Args
    ----
    inp_tree -- fitted decision tree, passed to `tree.export_graphviz`.
    out_file -- optional file handle or file name; when given, the DOT
                source is additionally written there (default: None).
    filled -- forwarded to `tree.export_graphviz` (default: True).
    rounded -- forwarded to `tree.export_graphviz` (default: True).
    special_characters -- forwarded to `tree.export_graphviz`
                          (default: True).

    Notes
    -----
    `export_graphviz` only returns the DOT source when `out_file` is None,
    so the source used for rendering is always requested with
    `out_file=None`; a caller-supplied `out_file` is honoured by a second
    export instead of breaking the rendering step.
    """
    # Imported here so the helper works no matter which cell ran first.
    from IPython.display import Image, display
    import pydotplus

    export_options = dict(filled = filled
                          , rounded = rounded
                          , special_characters = special_characters)

    dot_data = tree.export_graphviz(inp_tree
                                    , out_file = None
                                    , **export_options)
    if out_file is not None:
        tree.export_graphviz(inp_tree
                             , out_file = out_file
                             , **export_options)

    graph = pydotplus.graph_from_dot_data(dot_data)
    img = Image(graph.create_png())
    display(img)
In [5]:
# Load the iris data
iris = load_iris()

# Create the train-test datasets
# (no random_state is passed, so reproducibility relies on the NumPy seed)
X_train, X_test, y_train, y_test = train_test_split(iris.data, iris.target)

# Re-seed right before fitting the forest
np.random.seed(1039)

# Just fit a simple random forest classifier with 2 decision trees
rf = RandomForestClassifier(n_estimators = 2)
rf.fit(X = X_train, y = y_train)

# Rendering dependencies needed to draw the decision trees in IPython
# Adapted from: http://scikit-learn.org/stable/modules/tree.html
from IPython.display import display, Image
import pydotplus

# Now plot the trees individually. draw_tree already exports, renders and
# displays a tree, so each estimator is drawn exactly once.
for dtree in rf.estimators_:
    draw_tree(inp_tree = dtree)
In [84]:
# Distinct class labels found in the training targets, in ascending order
tval = [label for label in np.sort(np.unique(y_train))]
print(tval)
In [91]:
# Placeholder feature names X0, X1, ... (one per column of the training matrix)
feature_names = ["X{}".format(col) for col in range(X_train.shape[1])]

# Placeholder class names y<label>, one per distinct training label
target_vals = [label for label in np.sort(np.unique(y_train))]
target_names = ["y{}".format(label) for label in target_vals]

print(feature_names)
print(target_names)
In [94]:
from sklearn import tree
def get_code(tree, feature_names, target_names,
             spacer_base=" "):
    """Print pseudo-code for a decision tree.

    Args
    ----
    tree -- fitted scikit-learn decision tree; only the arrays of its
            `tree_` attribute (children_left, children_right, threshold,
            feature, value) are read.
    feature_names -- list of feature names, indexed by feature id.
    target_names -- list of target (class) names, indexed by class id.
    spacer_base -- used for spacing code (default: " ").

    Notes
    -----
    based on http://stackoverflow.com/a/30104792.

    A node is treated as a leaf when it has no children, not when its
    threshold equals the -2 placeholder, so a genuine split at -2.0 is
    still printed as a split. Feature names are looked up for split nodes
    only, so the placeholder stored in `tree_.feature` for leaves never
    indexes `feature_names`.
    """
    TREE_LEAF = -1  # child id stored when a node has no child on that side

    left = tree.tree_.children_left
    right = tree.tree_.children_right
    threshold = tree.tree_.threshold
    feature = tree.tree_.feature
    value = tree.tree_.value

    def recurse(node, depth):
        spacer = spacer_base * depth

        if left[node] == TREE_LEAF and right[node] == TREE_LEAF:
            # Leaf: report every class with a non-zero entry in `value`.
            target = value[node]
            nonzero = np.nonzero(target)
            for class_idx, count in zip(nonzero[1], target[nonzero]):
                print(spacer + "return " + str(target_names[class_idx]) +
                      " ( " + str(int(count)) + " examples )")
            return

        # Split node: only here is `feature[node]` a real feature id.
        print(spacer + "if ( " + feature_names[feature[node]] + " <= " +
              str(threshold[node]) + " ) {")
        if left[node] != TREE_LEAF:
            recurse(left[node], depth + 1)
        print(spacer + "}\n" + spacer + "else {")
        if right[node] != TREE_LEAF:
            recurse(right[node], depth + 1)
        print(spacer + "}")

    recurse(0, 0)
In [95]:
# Print the pseudo-code of every tree in the forest, one after the other
for idx, dtree in enumerate(rf.estimators_):
    get_code(
        tree = dtree,
        feature_names = feature_names,
        target_names = target_names,
    )
In [133]:
from sklearn.tree import _tree
def _get_tree_paths(tree, node_id = 0, depth = 0):
"""
Returns all paths through the tree as list of node_ids
"""
if node_id == _tree.TREE_LEAF:
raise ValueError("Invalid node_id %s" % _tree.TREE_LEAF)
left_child = tree.children_left[node_id]
right_child = tree.children_right[node_id]
if left_child != _tree.TREE_LEAF:
left_paths = _get_tree_paths(tree, left_child, depth = depth + 1)
left_features = np.array([tree.feature[i] for i in left_paths]).tolist()
print(left_features)
right_paths = _get_tree_paths(tree, right_child, depth = depth + 1)
right_features = np.array([tree.feature[i] for i in right_paths]).tolist()
print(right_features)
# organize paths
for path in left_paths:
path.append(node_id)
for path in right_paths:
path.append(node_id)
paths = left_paths + right_paths
features = left_features + right_features
print(features)
# organize features
#for feature in left_features:
# feature.append(feature(node_id))
#for feature in right_features:
# feature.append(feature(node_id))
else:
paths = [[node_id]]
return paths
In [134]:
# Per-tree bookkeeping, keyed by the estimator's position in the forest
leaf_node_paths = dict()  # tree index -> list of paths
leaf_to_path = dict()     # tree index -> {leaf node id -> path}
for idx, dtree in enumerate(rf.estimators_):
    node_paths = _get_tree_paths(tree = dtree.tree_, node_id = 0, depth = 0)
    leaf_node_paths[idx] = node_paths
    # Map leaves to paths. _get_tree_paths lists node ids leaf-first, so the
    # leaf is path[0] (path[-1] is always the root). Every tree starts at
    # node 0, so node ids repeat across trees and each tree gets its own
    # mapping.
    leaf_to_path[idx] = {path[0]: path for path in node_paths}
In [106]:
# Show the paths collected for each tree, keyed by estimator index
leaf_node_paths
Out[106]:
In [65]:
# Show the leaf_to_path lookup filled in by the path-collection loop
leaf_to_path
Out[65]:
In [135]:
# Number of fitted trees held by the forest
len(rf.estimators_)
Out[135]:
In [136]:
# Echo the fitted random forest object
rf
Out[136]:
In [155]:
# Low-level tree_ structure of the first estimator in the forest
first_tree = rf.estimators_[0].tree_
In [156]:
# max_depth attribute of the first tree's structure
first_tree.max_depth
Out[156]:
In [172]:
estimator = rf.estimators_[1]

# The decision estimator has an attribute called tree_ which stores the entire
# tree structure and allows access to low level attributes. The binary tree
# tree_ is represented as a number of parallel arrays. The i-th element of each
# array holds information about the node `i`. Node 0 is the tree's root. NOTE:
# Some of the arrays only apply to either leaves or split nodes, resp. In this
# case the values of nodes of the other type are arbitrary!
#
# Among those arrays, we have:
# - children_left, id of the left child of the node
# - children_right, id of the right child of the node
# - feature, feature used for splitting the node
# - threshold, threshold value at the node
#
# Using those arrays, we can parse the tree structure:
n_nodes = estimator.tree_.node_count
children_left = estimator.tree_.children_left
children_right = estimator.tree_.children_right
feature = estimator.tree_.feature
threshold = estimator.tree_.threshold

# The tree structure can be traversed to compute various properties such
# as the depth of each node and whether or not it is a leaf.
node_depth = np.zeros(shape=n_nodes, dtype=np.int64)  # depths are whole numbers
is_leaves = np.zeros(shape=n_nodes, dtype=bool)
nodes = []  # node ids in the order they are popped from the stack
stack = [(0, -1)]  # seed is the root node id and its parent depth
while stack:
    node_id, parent_depth = stack.pop()
    node_depth[node_id] = parent_depth + 1
    nodes.append(node_id)

    # If we have a test node
    if (children_left[node_id] != children_right[node_id]):
        stack.append((children_left[node_id], parent_depth + 1))
        stack.append((children_right[node_id], parent_depth + 1))
    else:
        is_leaves[node_id] = True
In [173]:
# Depth of every node, indexed by node id (the root gets depth 0)
node_depth
Out[173]:
In [174]:
# Boolean mask indexed by node id: True where left and right child ids coincide
is_leaves
Out[174]:
In [176]:
# Node ids in the order the stack-based traversal visited them
nodes
Out[176]:
In [177]:
# Number of visited nodes; expected to equal n_nodes if every node was reached
len(nodes)
Out[177]:
In [179]:
# node_depth was allocated with one entry per node, so this is (n_nodes,)
node_depth.shape
Out[179]:
In [185]:
# `nodes` is in visit order while `is_leaves` is indexed by node id, so the
# mask has to be looked up with the node id itself, not with the position of
# the node inside `nodes`.
leaf_nodes = [nid for nid in nodes if is_leaves[nid]]
leaf_nodes
Out[185]:
In [ ]: