Get the Decision Path for the Leaf Nodes - Random Forest (scikit-learn)


In [3]:
# Setup
%matplotlib inline
import matplotlib.pyplot as plt
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix
from sklearn import tree
import numpy as np

# Set seed for reproducibility
np.random.seed(1015)



Run the previous iris example

  • Produce the decision tree diagrams as a sanity check

In [4]:
# Define a function to draw decision trees in IPython
# Adapted from: http://scikit-learn.org/stable/modules/tree.html
from IPython.display import display, Image
import pydotplus

def draw_tree(inp_tree, out_file=None, filled=True,
              rounded=True, special_characters=True):
    """Render a fitted decision tree inline via graphviz."""
    dot_data = tree.export_graphviz(inp_tree,
                                    out_file=out_file,
                                    filled=filled,
                                    rounded=rounded,
                                    special_characters=special_characters)
    graph = pydotplus.graph_from_dot_data(dot_data)
    display(Image(graph.create_png()))

In [5]:
# Load the iris data
iris = load_iris()

# Create the train-test datasets
X_train, X_test, y_train, y_test = train_test_split(iris.data, iris.target)

np.random.seed(1039)

# Just fit a simple random forest classifier with 2 decision trees
rf = RandomForestClassifier(n_estimators = 2)
rf.fit(X = X_train, y = y_train)

# Now plot the trees individually
for dtree in rf.estimators_:
    draw_tree(inp_tree=dtree)



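If you also want the diagrams saved to disk, the same pydotplus graph object can write PNG files directly. A minimal sketch (the filenames are hypothetical):

In [ ]:
# Save each tree's diagram to a PNG file (hypothetical filenames)
for i, dtree in enumerate(rf.estimators_):
    dot_data = tree.export_graphviz(dtree, out_file=None, filled=True,
                                    rounded=True, special_characters=True)
    pydotplus.graph_from_dot_data(dot_data).write_png("rf_tree_%d.png" % i)
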
In [84]:
tval = list(np.sort(np.unique(y_train)))
print(tval)


[0, 1, 2]

In [91]:
feature_names = ["X" + str(i) for i in range(X_train.shape[1])]
target_vals = list(np.sort(np.unique(y_train)))
target_names = ["y" + str(i) for i in target_vals]
print(feature_names)
print(target_names)


['X0', 'X1', 'X2', 'X3']
['y0', 'y1', 'y2']

In [94]:
def get_code(dtree, feature_names, target_names,
             spacer_base="    "):
    """Produce pseudo-code for a decision tree.

    Args
    ----
    dtree -- fitted scikit-learn DecisionTree estimator.
    feature_names -- list of feature names.
    target_names -- list of target (class) names.
    spacer_base -- used for spacing code (default: "    ").

    Notes
    -----
    Based on http://stackoverflow.com/a/30104792.
    """
    left      = dtree.tree_.children_left
    right     = dtree.tree_.children_right
    threshold = dtree.tree_.threshold
    features  = [feature_names[i] for i in dtree.tree_.feature]
    value     = dtree.tree_.value

    def recurse(left, right, threshold, features, node, depth):
        spacer = spacer_base * depth
        if (threshold[node] != -2):  # -2 (_tree.TREE_UNDEFINED) marks a leaf
            print(spacer + "if ( " + features[node] + " <= " +
                  str(threshold[node]) + " ) {")
            if left[node] != -1:
                recurse(left, right, threshold, features,
                        left[node], depth+1)
            print(spacer + "}\n" + spacer + "else {")
            if right[node] != -1:
                recurse(left, right, threshold, features,
                        right[node], depth+1)
            print(spacer + "}")
        else:
            target = value[node]
            for i, v in zip(np.nonzero(target)[1],
                            target[np.nonzero(target)]):
                target_name = target_names[i]
                target_count = int(v)
                print(spacer + "return " + str(target_name) + \
                      " ( " + str(target_count) + " examples )")

    recurse(left, right, threshold, features, 0, 0)

In [95]:
for dtree in rf.estimators_:
    get_code(dtree=dtree, feature_names=feature_names, target_names=target_names)


if ( X3 <= 0.800000011921 ) {
    return y0 ( 39 examples )
}
else {
    if ( X3 <= 1.70000004768 ) {
        if ( X3 <= 1.34999990463 ) {
            return y1 ( 18 examples )
        }
        else {
            if ( X1 <= 2.65000009537 ) {
                if ( X2 <= 4.94999980927 ) {
                    return y1 ( 1 examples )
                }
                else {
                    return y2 ( 4 examples )
                }
            }
            else {
                if ( X2 <= 5.0 ) {
                    return y1 ( 16 examples )
                }
                else {
                    if ( X3 <= 1.54999995232 ) {
                        return y2 ( 1 examples )
                    }
                    else {
                        return y1 ( 3 examples )
                    }
                }
            }
        }
    }
    else {
        return y2 ( 30 examples )
    }
}
if ( X2 <= 2.34999990463 ) {
    return y0 ( 39 examples )
}
else {
    if ( X3 <= 1.70000004768 ) {
        if ( X0 <= 7.09999990463 ) {
            if ( X2 <= 4.94999980927 ) {
                return y1 ( 31 examples )
            }
            else {
                return y2 ( 4 examples )
            }
        }
        else {
            return y2 ( 1 examples )
        }
    }
    else {
        if ( X0 <= 6.05000019073 ) {
            if ( X1 <= 3.09999990463 ) {
                return y2 ( 6 examples )
            }
            else {
                return y1 ( 1 examples )
            }
        }
        else {
            return y2 ( 30 examples )
        }
    }
}
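
For reference, newer scikit-learn releases ship a built-in textual renderer, sklearn.tree.export_text, which produces output similar to get_code above. A minimal sketch, assuming scikit-learn >= 0.21 (newer than the version this notebook was originally run with):

In [ ]:
# Requires scikit-learn >= 0.21
from sklearn.tree import export_text
print(export_text(rf.estimators_[0], feature_names=feature_names))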

In [133]:
from sklearn.tree import _tree
def _get_tree_paths(tree, node_id=0, depth=0):
    """
    Return all paths through the tree as lists of node_ids,
    each ordered leaf -> root.
    """
    if node_id == _tree.TREE_LEAF:
        raise ValueError("Invalid node_id %s" % _tree.TREE_LEAF)

    left_child = tree.children_left[node_id]
    right_child = tree.children_right[node_id]

    if left_child != _tree.TREE_LEAF:  # internal (split) node
        left_paths = _get_tree_paths(tree, left_child, depth=depth + 1)
        left_features = np.array([tree.feature[i] for i in left_paths]).tolist()
        print(left_features)
        right_paths = _get_tree_paths(tree, right_child, depth=depth + 1)
        right_features = np.array([tree.feature[i] for i in right_paths]).tolist()
        print(right_features)

        # organize paths: append the current node to every child path
        for path in left_paths:
            path.append(node_id)
        for path in right_paths:
            path.append(node_id)
        paths = left_paths + right_paths

        features = left_features + right_features
        print(features)
    else:
        paths = [[node_id]]

    return paths
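
As a cross-check on the recursion above, scikit-learn 0.18+ exposes a decision_path method on fitted tree estimators. A minimal sketch recovering the node ids (ordered root -> leaf) visited by one test sample:

In [ ]:
# decision_path returns a sparse CSR indicator matrix: row i is non-zero at
# exactly the node ids that sample i passes through, ordered root -> leaf.
indicator = rf.estimators_[0].decision_path(X_test)
sample_id = 0
node_ids = indicator.indices[indicator.indptr[sample_id]:indicator.indptr[sample_id + 1]]
print(node_ids)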

In [134]:
leaf_node_paths = dict()
leaf_to_path = dict()
for idx, dtree in enumerate(rf.estimators_):
    node_paths = _get_tree_paths(tree=dtree.tree_, node_id=0, depth=0)
    leaf_node_paths[idx] = node_paths
    # Map each leaf to its path. Paths run leaf -> root, so the leaf id is
    # path[0] (path[-1] is always the root, node 0). NOTE: node ids are only
    # unique within a single tree, so later trees overwrite colliding keys.
    for path in node_paths:
        leaf_to_path[path[0]] = path


[[-2]]
[[-2]]
[[-2]]
[[-2]]
[[-2], [-2]]
[[-2, 2], [-2, 2]]
[[-2]]
[[-2]]
[[-2]]
[[-2], [-2]]
[[-2, 3], [-2, 3]]
[[-2], [-2, 3], [-2, 3]]
[array([-2,  2]), array([-2,  3,  2]), array([-2,  3,  2])]
[[-2, 2], [-2, 2], array([-2,  2]), array([-2,  3,  2]), array([-2,  3,  2])]
[array([-2,  2,  1]), array([-2,  2,  1]), array([-2,  2,  1]), array([-2,  3,  2,  1]), array([-2,  3,  2,  1])]
[[-2], array([-2,  2,  1]), array([-2,  2,  1]), array([-2,  2,  1]), array([-2,  3,  2,  1]), array([-2,  3,  2,  1])]
[array([-2,  3]), array([-2,  2,  1,  3]), array([-2,  2,  1,  3]), array([-2,  2,  1,  3]), array([-2,  3,  2,  1,  3]), array([-2,  3,  2,  1,  3])]
[[-2]]
[array([-2,  3]), array([-2,  2,  1,  3]), array([-2,  2,  1,  3]), array([-2,  2,  1,  3]), array([-2,  3,  2,  1,  3]), array([-2,  3,  2,  1,  3]), [-2]]
[array([-2,  3,  3]), array([-2,  2,  1,  3,  3]), array([-2,  2,  1,  3,  3]), array([-2,  2,  1,  3,  3]), array([-2,  3,  2,  1,  3,  3]), array([-2,  3,  2,  1,  3,  3]), array([-2,  3])]
[[-2], array([-2,  3,  3]), array([-2,  2,  1,  3,  3]), array([-2,  2,  1,  3,  3]), array([-2,  2,  1,  3,  3]), array([-2,  3,  2,  1,  3,  3]), array([-2,  3,  2,  1,  3,  3]), array([-2,  3])]
[[-2]]
[[-2]]
[[-2]]
[[-2], [-2]]
[[-2, 2], [-2, 2]]
[[-2]]
[[-2, 2], [-2, 2], [-2]]
[array([-2,  2,  0]), array([-2,  2,  0]), array([-2,  0])]
[[-2]]
[[-2]]
[[-2], [-2]]
[[-2, 1], [-2, 1]]
[[-2]]
[[-2, 1], [-2, 1], [-2]]
[array([-2,  1,  0]), array([-2,  1,  0]), array([-2,  0])]
[array([-2,  2,  0]), array([-2,  2,  0]), array([-2,  0]), array([-2,  1,  0]), array([-2,  1,  0]), array([-2,  0])]
[array([-2,  2,  0,  3]), array([-2,  2,  0,  3]), array([-2,  0,  3]), array([-2,  1,  0,  3]), array([-2,  1,  0,  3]), array([-2,  0,  3])]
[[-2], array([-2,  2,  0,  3]), array([-2,  2,  0,  3]), array([-2,  0,  3]), array([-2,  1,  0,  3]), array([-2,  1,  0,  3]), array([-2,  0,  3])]

In [106]:
leaf_node_paths


Out[106]:
{0: [[1, 0],
  [4, 3, 2, 0],
  [7, 6, 5, 3, 2, 0],
  [8, 6, 5, 3, 2, 0],
  [10, 9, 5, 3, 2, 0],
  [12, 11, 9, 5, 3, 2, 0],
  [13, 11, 9, 5, 3, 2, 0],
  [14, 2, 0]],
 1: [[1, 0],
  [5, 4, 3, 2, 0],
  [6, 4, 3, 2, 0],
  [7, 3, 2, 0],
  [10, 9, 8, 2, 0],
  [11, 9, 8, 2, 0],
  [12, 8, 2, 0]]}
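
The recursion appends ancestors after the leaf, so each stored path reads leaf -> root (path[-1] is always the root, node 0). A quick sketch flipping tree 0's paths into the usual root -> leaf order:

In [ ]:
# Reverse each path of tree 0 so it reads root -> leaf
root_to_leaf = [list(reversed(p)) for p in leaf_node_paths[0]]
print(root_to_leaf[0])  # [0, 1]: from the root straight to the first leaf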

In [65]:
leaf_to_path


Out[65]:
{1: [1, 0],
 4: [4, 3, 2, 0],
 5: [5, 4, 3, 2, 0],
 6: [6, 4, 3, 2, 0],
 7: [7, 3, 2, 0],
 8: [8, 6, 5, 3, 2, 0],
 10: [10, 9, 8, 2, 0],
 11: [11, 9, 8, 2, 0],
 12: [12, 8, 2, 0],
 13: [13, 11, 9, 5, 3, 2, 0],
 14: [14, 2, 0]}

In [135]:
len(rf.estimators_)


Out[135]:
2

In [136]:
rf


Out[136]:
RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators=2, n_jobs=1, oob_score=False, random_state=None,
            verbose=0, warm_start=False)
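
Note random_state=None in the repr above: reproducibility of this forest relies on the global np.random.seed(1039) call made before fitting. Passing random_state to the estimator itself is the more robust pattern; a minimal sketch:

In [ ]:
# Seed the forest directly rather than relying on the global NumPy RNG
rf_seeded = RandomForestClassifier(n_estimators=2, random_state=1039)
rf_seeded.fit(X_train, y_train)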

In [155]:
first_tree = rf.estimators_[0].tree_
#dir(first_tree)

In [156]:
first_tree.max_depth


Out[156]:
6

In [172]:
estimator = rf.estimators_[1]
# The decision estimator has an attribute called tree_  which stores the entire
# tree structure and allows access to low level attributes. The binary tree
# tree_ is represented as a number of parallel arrays. The i-th element of each
# array holds information about the node `i`. Node 0 is the tree's root. NOTE:
# Some of the arrays only apply to either leaves or split nodes, resp. In this
# case the values of nodes of the other type are arbitrary!
#
# Among those arrays, we have:
#   - left_child, id of the left child of the node
#   - right_child, id of the right child of the node
#   - feature, feature used for splitting the node
#   - threshold, threshold value at the node
#

# Using those arrays, we can parse the tree structure:

n_nodes = estimator.tree_.node_count
children_left = estimator.tree_.children_left
children_right = estimator.tree_.children_right
feature = estimator.tree_.feature
threshold = estimator.tree_.threshold


# The tree structure can be traversed to compute various properties such
# as the depth of each node and whether or not it is a leaf.
node_depth = np.zeros(shape=n_nodes)
is_leaves = np.zeros(shape=n_nodes, dtype=bool)
nodes = []

stack = [(0, -1)]  # seed is the root node id and its parent depth
while len(stack) > 0:
    node_id, parent_depth = stack.pop()
    node_depth[node_id] = parent_depth + 1
    
    nodes.append(node_id)  # record the visit order (right child is popped first)

    # If we have a test node
    if (children_left[node_id] != children_right[node_id]):
        stack.append((children_left[node_id], parent_depth + 1))
        stack.append((children_right[node_id], parent_depth + 1))
    else:
        is_leaves[node_id] = True
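
Following the same scikit-learn documentation example this traversal is based on, the parallel arrays can be printed as an indented outline of the tree (a sketch, not executed here):

In [ ]:
# Print each node at its depth, distinguishing leaves from test (split) nodes
for i in range(n_nodes):
    indent = int(node_depth[i]) * "\t"
    if is_leaves[i]:
        print("%snode=%d leaf node" % (indent, i))
    else:
        print("%snode=%d test node: go left if X[:, %d] <= %.2f else right"
              % (indent, i, feature[i], threshold[i]))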

In [173]:
node_depth


Out[173]:
array([ 0.,  1.,  1.,  2.,  3.,  4.,  4.,  3.,  2.,  3.,  4.,  4.,  3.])

In [174]:
is_leaves


Out[174]:
array([False,  True, False, False, False,  True,  True,  True, False,
       False,  True,  True,  True], dtype=bool)

In [176]:
nodes


Out[176]:
[0, 2, 8, 12, 9, 11, 10, 3, 7, 4, 6, 5, 1]

In [177]:
len(nodes)


Out[177]:
13

In [179]:
node_depth.shape


Out[179]:
(13,)

In [185]:
leaf_nodes = [node for node in nodes if is_leaves[node]]
leaf_nodes


Out[185]:
[12, 11, 10, 7, 6, 5, 1]
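
Finally, to tie this back to the section's goal: rf.apply maps every sample to the leaf it lands in for each tree, and those leaf ids can be looked up in the per-tree path lists built earlier. A minimal sketch:

In [ ]:
# Leaf ids reached by each test sample, one column per tree
leaves = rf.apply(X_test)            # shape: (n_samples, n_estimators)
tree_idx, sample_idx = 0, 0
leaf_id = leaves[sample_idx, tree_idx]
# Stored paths run leaf -> root, so match on path[0]
path = next(p for p in leaf_node_paths[tree_idx] if p[0] == leaf_id)
print(leaf_id, list(reversed(path)))  # root -> leaf decision path for this sample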

In [ ]: