In [1]:
# Setup
%matplotlib inline
import matplotlib.pyplot as plt
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix
from sklearn import tree
import numpy as np

# Define a function to draw the decision trees in IPython
# Adapted from: http://scikit-learn.org/stable/modules/tree.html
from IPython.display import display, Image
import pydotplus

# Custom util functions
from utils import utils

# Set seed for reproducibility
np.random.seed(1015)
In [2]:
# Load the iris data
iris = load_iris()

# Create the train-test datasets
X_train, X_test, y_train, y_test = train_test_split(iris.data, iris.target)

np.random.seed(1039)

# Just fit a simple random forest classifier with 2 decision trees
rf = RandomForestClassifier(n_estimators = 2)
rf.fit(X = X_train, y = y_train)

# Now plot the trees individually
for idx, dtree in enumerate(rf.estimators_):
    print(idx)
    utils.draw_tree(inp_tree = dtree)
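Note: utils.draw_tree is a project-specific helper whose source is not shown in this notebook. A minimal sketch of what such a helper could look like, built on sklearn.tree.export_graphviz and pydotplus (the function name and inp_tree parameter match the calls above; the body is an assumption, not the actual utils code):
In [ ]:
def draw_tree(inp_tree, feature_names=None, class_names=None):
    # Export the fitted tree to Graphviz DOT format, then render it inline.
    # NOTE: this is a hypothetical reconstruction of the utils helper.
    dot_data = tree.export_graphviz(inp_tree, out_file=None,
                                    feature_names=feature_names,
                                    class_names=class_names,
                                    filled=True, rounded=True)
    graph = pydotplus.graph_from_dot_data(dot_data)
    display(Image(graph.create_png()))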
In [3]:
importances = rf.feature_importances_
std = np.std([dtree.feature_importances_ for dtree in rf.estimators_], axis=0)
indices = np.argsort(importances)[::-1]
# Check that the feature importances are normalized (they sum to 1)
print(sum(importances))
In [4]:
# Print the feature ranking
print("Feature ranking:")
for f in range(X_train.shape[1]):
    print("%d. feature %d (%f)" % (f + 1, indices[f], importances[indices[f]]))

# Plot the feature importances of the forest
plt.figure()
plt.title("Feature importances")
plt.bar(range(X_train.shape[1]), importances[indices],
        color="r", yerr=std[indices], align="center")
plt.xticks(range(X_train.shape[1]), indices)
plt.xlim([-1, X_train.shape[1]])
plt.show()
In [5]:
feature_names = ["X" + str(i) for i in range(X_train.shape[1])]
target_vals = list(np.sort(np.unique(y_train)))
target_names = ["y" + str(i) for i in target_vals]
print(feature_names)
print(target_names)
In [51]:
# Work with the second tree in the forest
estimator = rf.estimators_[1]
In [7]:
from sklearn.tree import _tree

# Inspect the low-level tree structure; note that in a notebook
# only the value of the last expression in the cell is displayed
estimator.tree_.node_count        # total number of nodes in the tree
estimator.tree_.children_left[0]  # left child of the root node
estimator.tree_.children_right[0] # right child of the root node
_tree.TREE_LEAF                   # sentinel value (-1) marking a leaf
Out[7]:
In [8]:
# Now plot the trees individually
utils.draw_tree(inp_tree = estimator)
In [9]:
def binaryTreePaths(dtree, root_node_id = 0):
    """Return all root-to-leaf paths in a fitted decision tree as strings."""
    # Use these arrays to parse the tree structure
    children_left = dtree.tree_.children_left
    children_right = dtree.tree_.children_right

    if root_node_id is None:
        return []
    if root_node_id == _tree.TREE_LEAF:
        raise ValueError("Invalid node_id %s" % _tree.TREE_LEAF)

    # A node is internal if its left child is not the TREE_LEAF sentinel (-1)
    if children_left[root_node_id] != _tree.TREE_LEAF:
        paths = [str(root_node_id) + '->' + str(l)
                 for l in binaryTreePaths(dtree, children_left[root_node_id]) +
                          binaryTreePaths(dtree, children_right[root_node_id])]
    else:
        paths = [root_node_id]
    return paths
In [10]:
x1 = binaryTreePaths(rf.estimators_[1], root_node_id = 0)
x1
Out[10]:
In [27]:
def binaryTreePaths2(dtree, root_node_id = 0):
    """Return all root-to-leaf paths in a fitted decision tree as arrays of node ids."""
    # Use these arrays to parse the tree structure
    children_left = dtree.tree_.children_left
    children_right = dtree.tree_.children_right

    if root_node_id is None:
        return []
    if root_node_id == _tree.TREE_LEAF:
        raise ValueError("Invalid node_id %s" % _tree.TREE_LEAF)

    # A node is internal if its left child is not the TREE_LEAF sentinel (-1)
    if children_left[root_node_id] != _tree.TREE_LEAF:
        paths = [np.append(root_node_id, l)
                 for l in binaryTreePaths2(dtree, children_left[root_node_id]) +
                          binaryTreePaths2(dtree, children_right[root_node_id])]
    else:
        paths = [root_node_id]
    return paths
In [28]:
x = binaryTreePaths2(rf.estimators_[1], root_node_id = 0)
x
Out[28]:
In [30]:
# The last node id on each root-to-leaf path is a leaf node
leaf_nodes = [y[-1] for y in x]
leaf_nodes
Out[30]:
In [60]:
n_node_samples = estimator.tree_.n_node_samples
num_samples = [n_node_samples[y].astype(int) for y in leaf_nodes]
print(n_node_samples)
print(len(n_node_samples))
print(num_samples)
# The per-leaf sample counts should sum to the training set size for this tree
print(sum(num_samples))
print(sum(n_node_samples))
In [46]:
X_train.shape
Out[46]:
In [59]:
value = estimator.tree_.value
values = [value[y].astype(int) for y in leaf_nodes]
print(values)
# This should match the number of rows in the training feature set
print(sum(values).sum())
values
Out[59]:
In [ ]:
feature_names = ["X" + str(i) for i in range(X_train.shape[1])]
np.asarray(feature_names)
print(type(feature_names))
print(feature_names[0])
print(feature_names[-2])
feature = estimator.tree_.feature
z = [feature[y].astype(int) for y in x]
z
#[feature_names[i] for i in z]
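The commented-out list comprehension above would fail as written: z is a list of arrays, and leaf nodes hold the _tree.TREE_UNDEFINED sentinel (-2), which would silently index the wrong name. A guarded variant (a sketch; z_names is a name introduced here, not from the original):
In [ ]:
# Map feature ids on each path to names, skipping the leaf sentinel (-2)
z_names = [[feature_names[i] for i in path if i != _tree.TREE_UNDEFINED]
           for path in z]
z_names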
In [ ]:
# Maximum depth of this tree
max_dpth = estimator.tree_.max_depth
max_dpth
In [ ]:
# Maximum number of classes the tree was fit on
max_n_class = estimator.tree_.max_n_classes
max_n_class
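The final cell prints nodes, node_depth, is_leaves, and used_feature_names, which are not defined anywhere in this section. A minimal sketch of one way to derive them, adapted from the iterative traversal in scikit-learn's "Understanding the decision tree structure" example (the variable names follow the print statements below; the "leaf" placeholder is an assumption):
In [ ]:
# Traverse the tree iteratively to recover per-node metadata
n_nodes = estimator.tree_.node_count
children_left = estimator.tree_.children_left
children_right = estimator.tree_.children_right
feature = estimator.tree_.feature

nodes = np.arange(n_nodes)                            # node ids 0..n_nodes-1
node_depth = np.zeros(shape=n_nodes, dtype=np.int64)  # depth of each node
is_leaves = np.zeros(shape=n_nodes, dtype=bool)       # leaf indicator

stack = [(0, 0)]  # (node id, depth), starting at the root
while len(stack) > 0:
    node_id, depth = stack.pop()
    node_depth[node_id] = depth
    if children_left[node_id] != children_right[node_id]:
        # Internal node: visit both children at depth + 1
        stack.append((children_left[node_id], depth + 1))
        stack.append((children_right[node_id], depth + 1))
    else:
        is_leaves[node_id] = True

# Map split features to names; leaves carry the TREE_UNDEFINED sentinel (-2)
used_feature_names = [feature_names[i] if i != _tree.TREE_UNDEFINED else "leaf"
                      for i in feature]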
In [ ]:
print("nodes", np.asarray(a = nodes, dtype = "int64"), sep = ":\n")
print("node_depth", node_depth, sep = ":\n")
print("leaf_node", is_leaves, sep = ":\n")
print("feature_names", used_feature_names, sep = ":\n")
print("feature", feature, sep = ":\n")