Get the Decision Paths for the Leaf Nodes - Random Forest (scikit-learn)


In [157]:
%matplotlib inline
import matplotlib.pyplot as plt

# Load the library with the iris dataset
from sklearn.datasets import load_iris

# Load scikit's train-test split function
from sklearn.model_selection import train_test_split   # cross_validation was removed in sklearn 0.20

# Random Forest Classifier - not the 'Extra Trees Classifier'
from sklearn.ensemble import RandomForestClassifier

# Confusion Matrix
from sklearn.metrics import confusion_matrix

# Import the tree module
from sklearn.tree import _tree

import numpy as np

# Set seed for reproducibility
np.random.seed(1015)

Load the iris data

  • Load the data - check the dimensions
  • Create the train-test datasets and check the dimensions

In [158]:
iris = load_iris()

print(iris.keys())
print("Iris Feature dimensions", iris.data.shape, sep = ":\n")
print("Iris Data output", iris.target, sep = ":\n")
print("Iris Output dimensions", iris.target.shape, sep = ":\n")


dict_keys(['data', 'target', 'target_names', 'DESCR', 'feature_names'])
Iris Feature dimensions:
(150, 4)
Iris Data output:
[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 2 2 2 2 2 2 2 2 2 2
 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
 2 2]
Iris Output dimensions:
(150,)

In [159]:
# Create the train-test datasets
X_train, X_test, y_train, y_test = train_test_split(iris.data, iris.target)

In [160]:
print("Training feature data dimensions", X_train.shape, sep = ":\n")
print("Training target data dimensions", y_train.shape, sep = ":\n")
print("Test feature data dimensions", X_test.shape, sep = ":\n")
print("Test target data dimensions", y_test.shape, sep = ":\n")


Training feature data dimensions:
(112, 4)
Training target data dimensions:
(112,)
Test feature data dimensions:
(38, 4)
Test target data dimensions:
(38,)

Everything looks to be in order - let's keep going.

Fit the Random Forest classifier - minimal number of trees

  • Note that we are not going to fit the ExtraTreesClassifier at this stage, but we will come back to it
  • We use just 2 trees to fit the random forest so that we can display them later and manually check the decision paths against them

In [161]:
np.random.seed(1039)

rf = RandomForestClassifier(n_estimators = 2)
rf.fit(X = X_train, y = y_train)


Out[161]:
RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators=2, n_jobs=1, oob_score=False, random_state=None,
            verbose=0, warm_start=False)

In [162]:
predicted = rf.predict(X = X_test)
predicted


Out[162]:
array([2, 0, 2, 0, 1, 0, 0, 1, 2, 1, 0, 1, 2, 0, 2, 1, 2, 0, 2, 1, 2, 0, 0,
       2, 1, 0, 1, 1, 1, 0, 2, 1, 0, 2, 0, 1, 0, 2])

In [163]:
rf.score(X = X_test, y = y_test)


Out[163]:
0.97368421052631582

In [164]:
confusion_matrix(y_true = y_test, y_pred = predicted)


Out[164]:
array([[14,  0,  0],
       [ 0, 11,  0],
       [ 0,  1, 12]])

Some Comments

  • The accuracy is 0.97 using just 2 trees - looks good
  • However, with so few trees the score is very sensitive to the random seed value - not good! (see the sketch below)
  • With more trees, we expect the sensitivity to the random seed value to decrease (the ensemble should not depend on the seed as a source of randomness)
  • We are still free to continue with this example
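
As a quick check of the sensitivity claim, here is a minimal sketch (an addition, not part of the original run) that refits the forest under a range of seeds; exact numbers will vary with the train-test split:

# Sketch: compare the spread of test scores across 10 seeds for a 2-tree
# forest versus a 100-tree one - the larger ensemble should be far more stable
for n in (2, 100):
    scores = []
    for seed in range(10):
        clf = RandomForestClassifier(n_estimators = n, random_state = seed)
        clf.fit(X_train, y_train)
        scores.append(clf.score(X_test, y_test))
    print(n, "trees - score range:", min(scores), "to", max(scores))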

Get the Feature Importances

  • This can be readily derived from the fitted random forest classifier

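As an aside (an added check, relying on the assumption that scikit-learn averages the per-tree importances), the forest-level scores can be verified against the individual trees:

# Sketch: rf.feature_importances_ should equal the mean of the per-tree scores
per_tree = np.array([est.feature_importances_ for est in rf.estimators_])
print(np.allclose(rf.feature_importances_, per_tree.mean(axis = 0)))   # expect True
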
In [165]:
importances = rf.feature_importances_
std         = np.std([tree.feature_importances_ for tree in rf.estimators_]
                      , axis = 0)

# Get the indices of the features with highest importances in descending order
indices     = np.argsort(importances)[::-1]

In [166]:
print("Indices in Descending order of feature importance", indices, sep = ":\n")
print([iris.feature_names[idx] for idx in indices])


Indices in Descending order of feature importance:
[3 2 1 0]
['petal width (cm)', 'petal length (cm)', 'sepal width (cm)', 'sepal length (cm)']

In [167]:
# Print the feature ranking
print("Feature ranking:")

for f in range(X_train.shape[1]):
    print("%d. feature %d (%f)" % (f + 1, indices[f], importances[indices[f]]))

# Plot the feature importances of the forest
plt.figure()
plt.title("Feature importances")
plt.bar(range(X_train.shape[1]), importances[indices],
       color="r", yerr=std[indices], align="center")
plt.xticks(range(X_train.shape[1]), indices)
plt.xlim([-1, X_train.shape[1]])
plt.show()


Feature ranking:
1. feature 3 (0.627351)
2. feature 2 (0.319113)
3. feature 1 (0.041706)
4. feature 0 (0.011829)

In [168]:
print("Display feature importance scores", importances, sep = ":\n")
print("Check that the importance scores are standardized to 100%", sum(importances), sep = ":\n")


Display feature importance scores:
[ 0.01182919  0.04170643  0.31911347  0.62735091]
Check that the importance scores are standardized to 100%:
1.0

In [169]:
# Exploratory: inspect the raw attributes of each fitted tree (left commented
# out; the loop variable is renamed from 'tree' to avoid shadowing the sklearn
# module imported later)
from inspect import getmembers
import pprint
# pp = pprint.PrettyPrinter(indent = 4)
# for dtree in rf.estimators_:
#     print(dtree.tree_.feature)
#     pp.pprint(getmembers(dtree.tree_))

In [170]:
# Alternative: write each tree to disk (note that export_graphviz emits DOT
# text, so the files below would be Graphviz source despite the .png suffix)
# from sklearn import tree
# i_tree = 0
# for tree_in_forest in rf.estimators_:
#     with open('figures/03_Decision_Paths_Leaf_Nodes/tree_' + str(i_tree) + '.png', 'w') as my_file:
#         my_file = tree.export_graphviz(tree_in_forest, out_file = my_file)
#     i_tree = i_tree + 1

In [171]:
from IPython.display import display, Image
import pydotplus
from sklearn import tree   # provides export_graphviz (this import was missing)

def draw_tree(inp_tree
              , out_file = None
              , filled = True
              , rounded = True
              , special_characters = True):
    # Render a fitted decision tree inline via Graphviz/pydotplus
    dot_data = tree.export_graphviz(inp_tree
                                    , out_file = out_file
                                    , filled   = filled
                                    , rounded  = rounded
                                    , special_characters = special_characters)
    graph = pydotplus.graph_from_dot_data(dot_data)
    img = Image(graph.create_png())
    display(img)

In [172]:
for dtree in rf.estimators_:
    draw_tree(inp_tree = dtree)
    # Feature index split on at each node; -2 marks a leaf (decoded below)
    print(dtree.tree_.feature)


[ 3 -2  3  3 -2  1  2 -2 -2  2 -2  3 -2 -2 -2]
[ 2 -2  3  0  2 -2 -2 -2  0  1 -2 -2 -2]

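In these arrays, each entry is the index of the feature split on at that node, and -2 (scikit-learn's _tree.TREE_UNDEFINED sentinel) marks a leaf. A small decoding sketch, reusing the names imported earlier:

# Sketch: translate the raw feature indices into readable split labels
for dtree in rf.estimators_:
    print([iris.feature_names[f] if f != _tree.TREE_UNDEFINED else "leaf"
           for f in dtree.tree_.feature])
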
In [173]:
# Setup - a single consolidated cell that reproduces everything above
%matplotlib inline
import matplotlib.pyplot as plt
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix
from sklearn import tree
import numpy as np

# Set seed for reproducibility
np.random.seed(1015)

# Load the iris data
iris = load_iris()

# Create the train-test datasets
X_train, X_test, y_train, y_test = train_test_split(iris.data, iris.target)

np.random.seed(1039)

# Just fit a simple random forest classifier with 2 decision trees
rf = RandomForestClassifier(n_estimators = 2)
rf.fit(X = X_train, y = y_train)

# Define a function to draw the decision trees in IPython
# Adapted from: http://scikit-learn.org/stable/modules/tree.html
from IPython.display import display, Image
import pydotplus

def draw_tree(inp_tree
              , out_file = None
              , filled = True
              , rounded = True
              , special_characters = True):
    dot_data = tree.export_graphviz(inp_tree
                                    , out_file = out_file
                                    , filled   = filled
                                    , rounded  = rounded
                                    , special_characters = special_characters)
    graph = pydotplus.graph_from_dot_data(dot_data)
    display(Image(graph.create_png()))

# Now plot the trees individually
for dtree in rf.estimators_:
    draw_tree(inp_tree = dtree)
    #print(dtree.tree_.feature)



In [174]:
def _get_tree_paths(tree, node_id = 0, depth = 0):
    """
    Return all root-to-leaf paths through the tree as lists of node ids.
    Paths are built leaf-first, so each list runs from leaf back to root;
    the caller reverses them.
    """
    if node_id == _tree.TREE_LEAF:
        raise ValueError("Invalid node_id %s" % _tree.TREE_LEAF)

    left_child = tree.children_left[node_id]
    right_child = tree.children_right[node_id]

    if left_child != _tree.TREE_LEAF:
        # Internal node: recurse into both subtrees, then append this node
        # to every path found beneath it
        left_paths = _get_tree_paths(tree, left_child, depth = depth + 1)
        right_paths = _get_tree_paths(tree, right_child, depth = depth + 1)

        for path in left_paths:
            path.append(node_id)
        for path in right_paths:
            path.append(node_id)
        paths = left_paths + right_paths
    else:
        # Leaf node: start a new path
        paths = [[node_id]]
    return paths
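
As a cross-check on this helper (an addition, assuming scikit-learn >= 0.18), each fitted tree also exposes a built-in decision_path method that returns a sparse node-indicator matrix:

# Sketch: the non-zero column indices for a row are exactly the node ids
# visited by that sample on its way down to a leaf
sample = X_test[:1]
indicator = rf.estimators_[0].decision_path(sample)
print("Nodes visited by the first test sample:", indicator.indices)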

In [175]:
leaf_node_paths = dict()   # tree index -> list of root-to-leaf paths
leaf_to_path = dict()      # leaf node id -> its path (shared across trees)
for idx, dtree in enumerate(rf.estimators_):
    node_paths = _get_tree_paths(tree = dtree.tree_, node_id = 0, depth = 0)
    # _get_tree_paths returns leaf-to-root order, so flip each path
    for node_path in node_paths:
        node_path.reverse()
    leaf_node_paths[idx] = node_paths
    # Map leaves to paths. Note that node ids are only unique within a tree,
    # so leaves sharing an id across the two trees overwrite one another here -
    # key by (idx, leaf_id) instead if per-tree lookups are needed.
    for path in node_paths:
        leaf_to_path[path[-1]] = path

In [176]:
leaf_node_paths


Out[176]:
{0: [[0, 1],
  [0, 2, 3, 4],
  [0, 2, 3, 5, 6, 7],
  [0, 2, 3, 5, 6, 8],
  [0, 2, 3, 5, 9, 10],
  [0, 2, 3, 5, 9, 11, 12],
  [0, 2, 3, 5, 9, 11, 13],
  [0, 2, 14]],
 1: [[0, 1],
  [0, 2, 3, 4, 5],
  [0, 2, 3, 4, 6],
  [0, 2, 3, 7],
  [0, 2, 8, 9, 10],
  [0, 2, 8, 9, 11],
  [0, 2, 8, 12]]}

In [177]:
leaf_to_path


Out[177]:
{1: [0, 1],
 4: [0, 2, 3, 4],
 5: [0, 2, 3, 4, 5],
 6: [0, 2, 3, 4, 6],
 7: [0, 2, 3, 7],
 8: [0, 2, 3, 5, 6, 8],
 10: [0, 2, 8, 9, 10],
 11: [0, 2, 8, 9, 11],
 12: [0, 2, 8, 12],
 13: [0, 2, 3, 5, 9, 11, 13],
 14: [0, 2, 14]}

In [178]:
len(rf.estimators_)


Out[178]:
2
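
Finally, to tie the paths back to actual predictions (an added sketch, not in the original flow): rf.apply reports the leaf each sample lands in per tree, and that leaf id can be matched against the per-tree paths in leaf_node_paths, which sidesteps the cross-tree id collisions noted above:

# Sketch: recover the full decision path taken by the first test sample
# in each of the 2 trees
leaves = rf.apply(X_test)   # shape: (n_samples, n_trees)
for tree_idx in range(leaves.shape[1]):
    leaf_id = leaves[0, tree_idx]
    path = next(p for p in leaf_node_paths[tree_idx] if p[-1] == leaf_id)
    print("Tree %d: sample 0 follows path %s" % (tree_idx, path))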