Inputs:
In [1]:
# Setup
%matplotlib inline
import matplotlib.pyplot as plt
from sklearn.datasets import load_iris
from sklearn.cross_validation import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix
from sklearn.datasets import load_iris
from sklearn import tree
import numpy as np
# Define a function to draw the decision trees in IPython
# Adapted from: http://scikit-learn.org/stable/modules/tree.html
from IPython.display import display, Image
import pydotplus
# Custom util functions
from utils import utils
# Set seed for reproducibility
np.random.seed(1015)
In [2]:
# Load the iris data
iris = load_iris()
# Create the train-test datasets
X_train, X_test, y_train, y_test = train_test_split(iris.data, iris.target)
np.random.seed(1039)
# Just fit a simple random forest classifier with 2 decision trees
rf = RandomForestClassifier(n_estimators = 2)
rf.fit(X = X_train, y = y_train)
# Now plot the trees individually
for dtree in rf.estimators_:
dot_data = tree.export_graphviz(dtree
, out_file = None
, filled = True
, rounded = True
, special_characters = True)
graph = pydotplus.graph_from_dot_data(dot_data)
img = Image(graph.create_png())
display(img)
utils.draw_tree(inp_tree = dtree)
In [3]:
importances = rf.feature_importances_
std = np.std([dtree.feature_importances_ for dtree in rf.estimators_]
, axis=0)
indices = np.argsort(importances)[::-1]
# Check that the feature importances are standardized to 1
print(sum(importances))
In [4]:
# Print the feature ranking
print("Feature ranking:")
for f in range(X_train.shape[1]):
print("%d. feature %d (%f)" % (f + 1, indices[f], importances[indices[f]]))
# Plot the feature importances of the forest
plt.figure()
plt.title("Feature importances")
plt.bar(range(X_train.shape[1]), importances[indices],
color="r", yerr=std[indices], align="center")
plt.xticks(range(X_train.shape[1]), indices)
plt.xlim([-1, X_train.shape[1]])
plt.show()
In [5]:
feature_names = ["X" + str(i) for i in range(X_train.shape[1])]
target_vals = list(np.sort(np.unique(y_train)))
target_names = ["y" + str(i) for i in target_vals]
print(feature_names)
print(target_names)
In [23]:
# Get the second tree - just for testing
estimator = rf.estimators_[1]
# Using those arrays, we can parse the tree structure:
n_nodes = estimator.tree_.node_count
children_left = estimator.tree_.children_left
children_right = estimator.tree_.children_right
feature = estimator.tree_.feature
#print(feature)
features_all = [feature_names[i] for i in feature]
#print("feature_names", feature_names, sep = ":\n")
threshold = estimator.tree_.threshold
# The tree structure can be traversed to compute various properties such
# as the depth of each node and whether or not it is a leaf.
node_depth = np.zeros(shape=n_nodes, dtype = "int64")
is_leaves = np.zeros(shape=n_nodes, dtype=bool)
#nodes = np.empty(shape=n_nodes, dtype = "int64")
nodes = []
used_feature_names = []
stack = [(0, -1)] # seed is the root node id and its parent depth
while len(stack) > 0:
node_id, parent_depth = stack.pop()
node_depth[node_id] = parent_depth + 1
#np.append(arr=nodes, values=node_id)
nodes.append(node_id)
used_feature_names.append(features_all[node_id])
# print(feature[node_id])
# If we have a test node
if (children_left[node_id] != children_right[node_id]):
stack.append((children_left[node_id], parent_depth + 1))
stack.append((children_right[node_id], parent_depth + 1))
else:
is_leaves[node_id] = True
In [24]:
print("nodes", np.asarray(a = nodes, dtype = "int64"), sep = ":\n")
print("node_depth", node_depth, sep = ":\n")
print("leaf_node", is_leaves, sep = ":\n")
print("feature_names", used_feature_names, sep = ":\n")
print("feature", feature, sep = ":\n")
In [27]:
def _get_tree_paths(tree, node_id = 0, depth = 0):
"""
Returns all paths through the tree as list of node_ids
"""
if node_id == _tree.TREE_LEAF:
raise ValueError("Invalid node_id %s" % _tree.TREE_LEAF)
left_child = tree.children_left[node_id]
right_child = tree.children_right[node_id]
if left_child != _tree.TREE_LEAF:
left_paths = _get_tree_paths(tree, left_child, depth=depth + 1)
right_paths = _get_tree_paths(tree, right_child, depth=depth + 1)
for path in left_paths:
path.append(node_id)
for path in right_paths:
path.append(node_id)
paths = left_paths + right_paths
else:
paths = [[node_id]]
return paths
In [28]:
leaf_node_paths = dict()
leaf_to_path = dict()
for idx, dtree in enumerate(rf.estimators_):
# leaf_to_path = {}
node_paths = _get_tree_paths(tree = dtree.tree_, node_id = 0, depth = 0)
leaf_node_paths[idx] = node_paths
#map leaves to paths
for path in node_paths:
leaf_to_path[path[-1]] = path
In [29]:
leaf_node_paths
Out[29]:
In [ ]: