In [ ]:
# Setup
%matplotlib inline
import matplotlib.pyplot as plt
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix
from sklearn import tree
import numpy as np
# Define a function to draw the decision trees in IPython
# Adapted from: http://scikit-learn.org/stable/modules/tree.html
from IPython.display import display, Image
import pydotplus
# Custom util functions
from utils import utils
RANDOM_STATE_SPLIT = 1001
RANDOM_STATE_CLASSIFIER = 1039
In [2]:
# Load the iris data
iris = load_iris()
# Create the train-test datasets
X_train, X_test, y_train, y_test = train_test_split(iris.data, iris.target, random_state = RANDOM_STATE_SPLIT)
# Just fit a simple random forest classifier with 2 decision trees
rf = RandomForestClassifier(n_estimators = 2, random_state = RANDOM_STATE_CLASSIFIER)
rf.fit(X = X_train, y = y_train)
# Now plot the trees individually
# for idx, dtree in enumerate(rf.estimators_):
#     print(idx)
#     utils.draw_tree(inp_tree = dtree)
In [3]:
a = 1
test = {} # create the dictionary to store the objects
test['first'] = a
test['rf_obj'] = rf
In [4]:
print(test['first'])
print(test['rf_obj'].feature_importances_)
Yay - seems to work just fine!
In [5]:
importances = rf.feature_importances_
std = np.std([dtree.feature_importances_ for dtree in rf.estimators_],
             axis=0)
indices = np.argsort(importances)[::-1]
# Check that the feature importances are normalized (they should sum to 1)
print(sum(importances))
In [6]:
# Print the feature ranking
print("Feature ranking:")
for f in range(X_train.shape[1]):
    print("%d. feature %d (%f)" % (f + 1, indices[f], importances[indices[f]]))
# Plot the feature importances of the forest
plt.figure()
plt.title("Feature importances")
plt.bar(range(X_train.shape[1]), importances[indices],
        color="r", yerr=std[indices], align="center")
plt.xticks(range(X_train.shape[1]), indices)
plt.xlim([-1, X_train.shape[1]])
plt.show()
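For readability, the integer tick labels can be swapped for the actual iris feature names via the `feature_names` attribute of the `iris` bunch; a minimal re-plot sketch:
In [ ]:
# Re-plot with human-readable feature names on the x-axis
named_features = [iris.feature_names[i] for i in indices]
plt.figure()
plt.title("Feature importances")
plt.bar(range(X_train.shape[1]), importances[indices],
        color="r", yerr=std[indices], align="center")
plt.xticks(range(X_train.shape[1]), named_features, rotation=45, ha="right")
plt.xlim([-1, X_train.shape[1]])
plt.tight_layout()
plt.show()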
In [7]:
feature_names = ["X" + str(i) for i in range(X_train.shape[1])]
target_vals = list(np.sort(np.unique(y_train)))
target_names = ["y" + str(i) for i in target_vals]
print(feature_names)
print(target_names)
In [8]:
estimator = rf.estimators_[1]
In [9]:
from sklearn.tree import _tree
# Left and right children of the root node (node id 0)
print(estimator.tree_.children_left[0])
print(estimator.tree_.children_right[0])
In [10]:
# Now plot the trees individually
utils.draw_tree(inp_tree = estimator)
We have the following inputs: the fitted decision tree (here the second estimator in the forest) and its low-level `tree_` object.
We have the following outputs: per-node arrays describing the tree structure, which we unpack in the cells below.
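For reference, the fitted `tree_` object exposes several node-level arrays, each indexed by node id; this cell just prints the attributes documented in sklearn's tree module for the chosen estimator:
In [ ]:
# Node-level arrays on sklearn's low-level tree_ object,
# each indexed by node id from 0 to node_count - 1
print("children_left", estimator.tree_.children_left, sep = ":\n")
print("children_right", estimator.tree_.children_right, sep = ":\n")
print("feature", estimator.tree_.feature, sep = ":\n")
print("threshold", estimator.tree_.threshold, sep = ":\n")
print("n_node_samples", estimator.tree_.n_node_samples, sep = ":\n")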
In [11]:
# Setup the key variables
threshold = estimator.tree_.threshold
In [12]:
max_node_depth = estimator.tree_.max_depth
print("Max node depth in tree", max_node_depth, sep = ":\n")
In [13]:
n_nodes = estimator.tree_.node_count
print("number of nodes in tree", n_nodes, sep = ":\n")
In [41]:
# Define the number of features
num_features = X_train.shape[1]
# Get the node features from the decision tree classifier attribute
# Note: leaves are marked with a negative sentinel value in this array,
# so it mixes genuine feature indices with placeholders - we want only
# non-negative indices for the corresponding feature columns
node_features = estimator.tree_.feature
# Get indices for all the features used - 0 indexed and ranging
# to the total number of possible features in the training data
all_features_idx = np.array(range(num_features))
node_features_idx = np.array(range(num_features))[node_features]
# Count the unique number of features used
num_features_used = (np.unique(node_features_idx)).shape[0]
print("number of node features", num_features_used, sep = ":\n")
print("all features indices", all_features_idx, sep = ":\n")
print("node features", node_features, sep = ":\n")
print("node feature indices", node_features_idx, sep = ":\n")
In [15]:
def allTreePaths(dtree, root_node_id = 0):
    """Get all the individual tree paths from root node
       to the leaves
    """
    # Use these lists to parse the tree structure
    children_left = dtree.tree_.children_left
    children_right = dtree.tree_.children_right

    if root_node_id is None:
        paths = []

    if root_node_id == _tree.TREE_LEAF:
        raise ValueError("Invalid node_id %s" % _tree.TREE_LEAF)

    # Recurse into both children of a split node; a node whose left child
    # is _tree.TREE_LEAF is itself a leaf and terminates the path
    if children_left[root_node_id] != _tree.TREE_LEAF:
        paths = [np.append(root_node_id, l)
                 for l in allTreePaths(dtree, children_left[root_node_id]) +
                          allTreePaths(dtree, children_right[root_node_id])]
    else:
        paths = [root_node_id]
    return paths
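As a quick sanity check, the recursion should return exactly one root-to-leaf path per leaf; sklearn marks missing children with `_tree.TREE_LEAF` (-1), so the leaf count can be read off the `children_left` array directly:
In [ ]:
# One path per leaf: compare against sklearn's own leaf count
n_leaves = np.sum(estimator.tree_.children_left == _tree.TREE_LEAF)
assert len(allTreePaths(estimator, root_node_id = 0)) == n_leaves
print(n_leaves)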
In [16]:
all_leaf_node_paths = allTreePaths(rf.estimators_[1], root_node_id = 0)
all_leaf_node_paths
In [17]:
leaf_nodes = [path[-1] for path in all_leaf_node_paths]
leaf_nodes
In [ ]:
features_used = []
In [18]:
leaf_nodes_depths = [np.size(y) - 1 for y in all_leaf_node_paths]
leaf_nodes_depths
In [19]:
n_node_samples = estimator.tree_.n_node_samples
num_samples = [n_node_samples[y].astype(int) for y in leaf_nodes]
print(n_node_samples)
print(len(n_node_samples))
print(num_samples)
print(sum(num_samples))
print(sum(n_node_samples))
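Since every sample ends up in exactly one leaf, these per-leaf counts should partition the data this tree was grown on; with the default bootstrap settings that resample is the same size as `X_train`, so a small hedged check:
In [ ]:
# Leaf sample counts should partition the tree's training sample
assert sum(num_samples) == X_train.shape[0]
print(sum(num_samples), X_train.shape[0])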
In [20]:
X_train.shape
In [21]:
value = estimator.tree_.value
values = [value[node_id].astype(int) for node_id in leaf_nodes]
print(values)
# This should match the number of rows in the training feature set
print(sum(values).sum())
In [23]:
feature_names = ["X" + str(i) for i in range(X_train.shape[1])]
print(type(feature_names))
print(feature_names[0])
print(feature_names[-2])
#feature = estimator.tree_.feature
#z = [feature[y].astype(int) for y in x]
#z
#[feature_names[i] for i in z]
In [24]:
max_dpth = estimator.tree_.max_depth
max_dpth
In [25]:
max_n_class = estimator.tree_.max_n_classes
max_n_class
In [26]:
predict = estimator.tree_.predict
all_leaf_nodes = [path[-1] for path in all_leaf_node_paths]
#[predict(node_id) for node_id in np.asarray(all_leaf_nodes)]
print(all_leaf_nodes)
print(all_leaf_nodes[0])
print(value[all_leaf_nodes[0]])
print(all_features_idx[np.argmax(value[all_leaf_nodes[0]])])
print(node_features_idx)
#predict(class_names[np.argmax(value[all_leaf_nodes[0]])])
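The commented-out call above would fail because `tree_.predict` expects a 2-D array of samples rather than node ids; the majority class at each leaf can instead be read straight off the `value` array. A minimal sketch:
In [ ]:
# Majority class per leaf, taken directly from the per-class count array
leaf_classes = [np.argmax(value[node_id]) for node_id in all_leaf_nodes]
print(leaf_classes)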
In [29]:
#print("nodes", np.asarray(a = nodes, dtype = "int64"), sep = ":\n")
# print("node_depth", node_depth, sep = ":\n")
# print("leaf_node", is_leaves, sep = ":\n")
# print("feature_names", used_feature_names, sep = ":\n")
# print("feature", feature, sep = ":\n")
In [60]:
def getTreeData(dtree, root_node_id = 0):
    """This returns all of the required summary results from an
       individual decision tree
    """
    max_node_depth = dtree.tree_.max_depth
    n_nodes = dtree.tree_.node_count
    value = dtree.tree_.value
    predict = dtree.tree_.predict

    # Get the total number of features in the training data
    tot_num_features = X_train.shape[1]

    # Get indices for all the features used - 0 indexed and ranging
    # to the total number of possible features in the training data
    all_features_idx = np.array(range(tot_num_features), dtype = 'int64')

    # Get the raw node feature indices from the decision tree classifier attribute
    # Note: leaves carry a negative sentinel value, so the raw array mixes
    # genuine feature indices with placeholders - we want only non-negative
    # indices for the corresponding feature columns for consistency in reference
    node_features_raw_idx = dtree.tree_.feature

    # Get the refined non-negative feature indices for each node
    # Start with a range over the total number of features and
    # subset the relevant indices from the raw indices array
    node_features_idx = np.array(range(tot_num_features))[node_features_raw_idx]

    # Count the unique number of features used
    num_features_used = (np.unique(node_features_idx)).shape[0]

    # Get all of the paths used in the tree
    all_leaf_node_paths = allTreePaths(dtree = dtree, root_node_id = root_node_id)

    # Get list of leaf nodes
    # In all paths it is the final node value
    all_leaf_nodes = [path[-1] for path in all_leaf_node_paths]

    # Final number of training samples predicted in each class at each leaf node
    all_leaf_node_values = [value[node_id].astype(int) for node_id in all_leaf_nodes]

    # Total number of training samples predicted in each class at each leaf node
    tot_leaf_node_values = [np.sum(leaf_node_values) for leaf_node_values in all_leaf_node_values]

    # All leaf node depths
    # The depth is 0 indexed i.e. root node has depth 0
    leaf_nodes_depths = [np.size(path) - 1 for path in all_leaf_node_paths]

    # Predicted Classes
    # CHECK: that we correctly account for ties in determining the class here
    all_leaf_node_classes = [all_features_idx[np.argmax(value)] for value in all_leaf_node_values]

    # Get all of the features used along the leaf node paths i.e. features used to split a node
    # CHECK: Why does the leaf node have a feature associated with it? Investigate further
    # Removed the final leaf node value so that this feature does not get included currently
    all_leaf_paths_features = [node_features_idx[path[:-1]] for path in all_leaf_node_paths]

    # Get the unique list of features along a path
    # NOTE: This removes the original ordering of the features along the path
    # The original ordering could be preserved using a special function but will increase runtime
    all_uniq_leaf_paths_features = [np.unique(feature_path) for feature_path in all_leaf_paths_features]

    print("number of node features", num_features_used, sep = ":\n")
    print("node feature indices", node_features_idx, sep = ":\n")
    print("Max node depth in tree", max_node_depth, sep = ":\n")
    print("number of nodes in tree", n_nodes, sep = ":\n")
    print("node features", node_features_raw_idx, sep = ":\n")
    print("all leaf node paths", all_leaf_node_paths, sep = ":\n")
    print("all leaf node indices", all_leaf_nodes, sep = ":\n")
    print("all leaf node depths", leaf_nodes_depths, sep = ":\n")
    print("all leaf node predicted values", all_leaf_node_values, sep = ":\n")
    print("total leaf node predicted values", tot_leaf_node_values, sep = ":\n")
    print("all leaf node predicted classes", all_leaf_node_classes, sep = ":\n")
    print("all features in leaf node paths", all_leaf_paths_features, sep = ":\n")
    print("all unique features in leaf node paths", all_uniq_leaf_paths_features, sep = ":\n")

    tree_data = {"num_features_used" : num_features_used,
                 "node_features_idx" : node_features_idx,
                 "max_node_depth" : max_node_depth,
                 "n_nodes" : n_nodes,
                 "all_leaf_node_paths" : all_leaf_node_paths,
                 "all_leaf_nodes" : all_leaf_nodes,
                 "leaf_nodes_depths" : leaf_nodes_depths,
                 "all_leaf_node_values" : all_leaf_node_values,
                 "tot_leaf_node_values" : tot_leaf_node_values,
                 "all_leaf_node_classes" : all_leaf_node_classes,
                 "all_leaf_paths_features" : all_leaf_paths_features,
                 "all_uniq_leaf_paths_features" : all_uniq_leaf_paths_features}
    return tree_data
In [62]:
tree_dat1 = getTreeData(dtree = estimator, root_node_id = 0)
# Total leaf values should match the number of training rows seen by this tree
print(sum(tree_dat1['tot_leaf_node_values']))
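The same per-tree extraction extends across the whole forest; a minimal sketch that simply collects one summary dict per estimator, keyed by its index in `rf.estimators_`:
In [ ]:
# Summarize every tree in the forest
all_tree_data = {idx: getTreeData(dtree = dtree, root_node_id = 0)
                 for idx, dtree in enumerate(rf.estimators_)}
print(len(all_tree_data))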