Get the Decision Paths for the Leaf Nodes - Random Forest (scikit-learn)


In [157]:
%matplotlib inline
import matplotlib.pyplot as plt

# Load the library with the iris dataset
from sklearn.datasets import load_iris

# Load scikit's train-test split function
from sklearn.model_selection import train_test_split   # cross_validation was removed in sklearn 0.20

# Random Forest Classifier - not the 'Extra Trees Classifier'
from sklearn.ensemble import RandomForestClassifier

# Confusion Matrix
from sklearn.metrics import confusion_matrix

# Import the tree module
from sklearn.tree import _tree

import numpy as np

# Set seed for reproducibility
np.random.seed(1015)

Load the iris data

  • Load the data - check the dimensions
  • Create the train-test datasets and check the dimensions

In [158]:
iris = load_iris()

print(iris.keys())
print("Iris Feature dimensions", iris.data.shape, sep = ":\n")
print("Iris Data output", iris.target, sep = ":\n")
print("Iris Output dimensions", iris.target.shape, sep = ":\n")


dict_keys(['data', 'target', 'target_names', 'DESCR', 'feature_names'])
Iris Feature dimensions:
(150, 4)
Iris Data output:
[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 2 2 2 2 2 2 2 2 2 2
 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
 2 2]
Iris Output dimensions:
(150,)

In [159]:
# Create the train-test datasets
X_train, X_test, y_train, y_test = train_test_split(iris.data, iris.target)

In [160]:
print("Training feature data dimensions", X_train.shape, sep = ":\n")
print("Training target data dimensions", y_train.shape, sep = ":\n")
print("Test feature data dimensions", X_test.shape, sep = ":\n")
print("Test target data dimensions", y_test.shape, sep = ":\n")


Training feature data dimensions:
(112, 4)
Training target data dimensions:
(112,)
Test feature data dimensions:
(38, 4)
Test target data dimensions:
(38,)

Everything looks to be in order - let's keep going.

Fit the Random Forest classifier - minimal number of trees

  • Note that we are not going to fit the ExtraTreesClassifier at this stage, but we will come back to it
  • We use just 2 trees to fit the random forest so that we can display them later and manually check the decision paths against them

In [161]:
np.random.seed(1039)

rf = RandomForestClassifier(n_estimators = 2)
rf.fit(X = X_train, y = y_train)


Out[161]:
RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators=2, n_jobs=1, oob_score=False, random_state=None,
            verbose=0, warm_start=False)

In [162]:
predicted = rf.predict(X = X_test)
predicted


Out[162]:
array([2, 0, 2, 0, 1, 0, 0, 1, 2, 1, 0, 1, 2, 0, 2, 1, 2, 0, 2, 1, 2, 0, 0,
       2, 1, 0, 1, 1, 1, 0, 2, 1, 0, 2, 0, 1, 0, 2])

In [163]:
rf.score(X = X_test, y = y_test)


Out[163]:
0.97368421052631582

In [164]:
confusion_matrix(y_true = y_test, y_pred = predicted)


Out[164]:
array([[14,  0,  0],
       [ 0, 11,  0],
       [ 0,  1, 12]])

Some Comments

  • The accuracy is 0.97 using just 2 trees - looks good
  • However, with so few trees the score is very sensitive to the random seed value - not good! (see the sketch below)
  • With more trees, we expect the sensitivity to the random seed value to decrease (the ensemble should not depend on the seed as a source of randomness)
  • We are still free to continue with this example
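
As a quick check of the sensitivity claim, here is a minimal sketch (an addition, not part of the original run) that refits the forest under a range of seeds; exact numbers will vary with the train-test split:

# Sketch: compare the spread of test scores across 10 seeds for a 2-tree
# forest versus a 100-tree one - the larger ensemble should be far more stable
for n in (2, 100):
    scores = []
    for seed in range(10):
        clf = RandomForestClassifier(n_estimators = n, random_state = seed)
        clf.fit(X_train, y_train)
        scores.append(clf.score(X_test, y_test))
    print(n, "trees - score range:", min(scores), "to", max(scores))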

Get the Feature Importances

  • This can be readily derived from the fitted random forest classifier

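As an aside (an added check, relying on the assumption that scikit-learn averages the per-tree importances), the forest-level scores can be verified against the individual trees:

# Sketch: rf.feature_importances_ should equal the mean of the per-tree scores
per_tree = np.array([est.feature_importances_ for est in rf.estimators_])
print(np.allclose(rf.feature_importances_, per_tree.mean(axis = 0)))   # expect True
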
In [165]:
importances = rf.feature_importances_
std         = np.std([tree.feature_importances_ for tree in rf.estimators_]
                      , axis = 0)

# Get the indices of the features with highest importances in descending order
indices     = np.argsort(importances)[::-1]

In [166]:
print("Indices in Descending order of feature importance", indices, sep = ":\n")
print([iris.feature_names[idx] for idx in indices])


Indices in Descending order of feature importance:
[3 2 1 0]
['petal width (cm)', 'petal length (cm)', 'sepal width (cm)', 'sepal length (cm)']

In [167]:
# Print the feature ranking
print("Feature ranking:")

for f in range(X_train.shape[1]):
    print("%d. feature %d (%f)" % (f + 1, indices[f], importances[indices[f]]))

# Plot the feature importances of the forest
plt.figure()
plt.title("Feature importances")
plt.bar(range(X_train.shape[1]), importances[indices],
       color="r", yerr=std[indices], align="center")
plt.xticks(range(X_train.shape[1]), indices)
plt.xlim([-1, X_train.shape[1]])
plt.show()


Feature ranking:
1. feature 3 (0.627351)
2. feature 2 (0.319113)
3. feature 1 (0.041706)
4. feature 0 (0.011829)

In [168]:
print("Display feature importance scores", importances, sep = ":\n")
print("Check that the importance scores are standardized to 100%", sum(importances), sep = ":\n")


Display feature importance scores:
[ 0.01182919  0.04170643  0.31911347  0.62735091]
Check that the importance scores are standardized to 100%:
1.0

In [169]:
# Exploratory: inspect the raw attributes of each fitted tree (left commented
# out; the loop variable is renamed from 'tree' to avoid shadowing the sklearn
# module imported later)
from inspect import getmembers
import pprint
# pp = pprint.PrettyPrinter(indent = 4)
# for dtree in rf.estimators_:
#     print(dtree.tree_.feature)
#     pp.pprint(getmembers(dtree.tree_))

In [170]:
# Alternative: write each tree to disk (note that export_graphviz emits DOT
# text, so the files below would be Graphviz source despite the .png suffix)
# from sklearn import tree
# i_tree = 0
# for tree_in_forest in rf.estimators_:
#     with open('figures/03_Decision_Paths_Leaf_Nodes/tree_' + str(i_tree) + '.png', 'w') as my_file:
#         my_file = tree.export_graphviz(tree_in_forest, out_file = my_file)
#     i_tree = i_tree + 1

In [171]:
from IPython.display import display, Image
import pydotplus
from sklearn import tree   # provides export_graphviz (this import was missing)

def draw_tree(inp_tree
              , out_file = None
              , filled = True
              , rounded = True
              , special_characters = True):
    # Render a fitted decision tree inline via Graphviz/pydotplus
    dot_data = tree.export_graphviz(inp_tree
                                    , out_file = out_file
                                    , filled   = filled
                                    , rounded  = rounded
                                    , special_characters = special_characters)
    graph = pydotplus.graph_from_dot_data(dot_data)
    img = Image(graph.create_png())
    display(img)

In [172]:
for dtree in rf.estimators_:
    draw_tree(inp_tree = dtree)
    # Feature index split on at each node; -2 marks a leaf (decoded below)
    print(dtree.tree_.feature)


[ 3 -2  3  3 -2  1  2 -2 -2  2 -2  3 -2 -2 -2]
[ 2 -2  3  0  2 -2 -2 -2  0  1 -2 -2 -2]

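In these arrays, each entry is the index of the feature split on at that node, and -2 (scikit-learn's _tree.TREE_UNDEFINED sentinel) marks a leaf. A small decoding sketch, reusing the names imported earlier:

# Sketch: translate the raw feature indices into readable split labels
for dtree in rf.estimators_:
    print([iris.feature_names[f] if f != _tree.TREE_UNDEFINED else "leaf"
           for f in dtree.tree_.feature])
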
In [173]:
# Setup - a single consolidated cell that reproduces everything above
%matplotlib inline
import matplotlib.pyplot as plt
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix
from sklearn import tree
import numpy as np

# Set seed for reproducibility
np.random.seed(1015)

# Load the iris data
iris = load_iris()

# Create the train-test datasets
X_train, X_test, y_train, y_test = train_test_split(iris.data, iris.target)

np.random.seed(1039)

# Just fit a simple random forest classifier with 2 decision trees
rf = RandomForestClassifier(n_estimators = 2)
rf.fit(X = X_train, y = y_train)

# Define a function to draw the decision trees in IPython
# Adapted from: http://scikit-learn.org/stable/modules/tree.html
from IPython.display import display, Image
import pydotplus

def draw_tree(inp_tree
              , out_file = None
              , filled = True
              , rounded = True
              , special_characters = True):
    dot_data = tree.export_graphviz(inp_tree
                                    , out_file = out_file
                                    , filled   = filled
                                    , rounded  = rounded
                                    , special_characters = special_characters)
    graph = pydotplus.graph_from_dot_data(dot_data)
    display(Image(graph.create_png()))

# Now plot the trees individually
for dtree in rf.estimators_:
    draw_tree(inp_tree = dtree)
    #print(dtree.tree_.feature)



In [174]:
def _get_tree_paths(tree, node_id = 0, depth = 0):
    """
    Return all root-to-leaf paths through the tree as lists of node ids.
    Paths are built leaf-first, so each list runs from leaf back to root;
    the caller reverses them.
    """
    if node_id == _tree.TREE_LEAF:
        raise ValueError("Invalid node_id %s" % _tree.TREE_LEAF)

    left_child = tree.children_left[node_id]
    right_child = tree.children_right[node_id]

    if left_child != _tree.TREE_LEAF:
        # Internal node: recurse into both subtrees, then append this node
        # to every path found beneath it
        left_paths = _get_tree_paths(tree, left_child, depth = depth + 1)
        right_paths = _get_tree_paths(tree, right_child, depth = depth + 1)

        for path in left_paths:
            path.append(node_id)
        for path in right_paths:
            path.append(node_id)
        paths = left_paths + right_paths
    else:
        # Leaf node: start a new path
        paths = [[node_id]]
    return paths
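
As a cross-check on this helper (an addition, assuming scikit-learn >= 0.18), each fitted tree also exposes a built-in decision_path method that returns a sparse node-indicator matrix:

# Sketch: the non-zero column indices for a row are exactly the node ids
# visited by that sample on its way down to a leaf
sample = X_test[:1]
indicator = rf.estimators_[0].decision_path(sample)
print("Nodes visited by the first test sample:", indicator.indices)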

In [175]:
leaf_node_paths = dict()   # tree index -> list of root-to-leaf paths
leaf_to_path = dict()      # leaf node id -> its path (shared across trees)
for idx, dtree in enumerate(rf.estimators_):
    node_paths = _get_tree_paths(tree = dtree.tree_, node_id = 0, depth = 0)
    # _get_tree_paths returns leaf-to-root order, so flip each path
    for node_path in node_paths:
        node_path.reverse()
    leaf_node_paths[idx] = node_paths
    # Map leaves to paths. Note that node ids are only unique within a tree,
    # so leaves sharing an id across the two trees overwrite one another here -
    # key by (idx, leaf_id) instead if per-tree lookups are needed.
    for path in node_paths:
        leaf_to_path[path[-1]] = path

In [176]:
leaf_node_paths


Out[176]:
{0: [[0, 1],
  [0, 2, 3, 4],
  [0, 2, 3, 5, 6, 7],
  [0, 2, 3, 5, 6, 8],
  [0, 2, 3, 5, 9, 10],
  [0, 2, 3, 5, 9, 11, 12],
  [0, 2, 3, 5, 9, 11, 13],
  [0, 2, 14]],
 1: [[0, 1],
  [0, 2, 3, 4, 5],
  [0, 2, 3, 4, 6],
  [0, 2, 3, 7],
  [0, 2, 8, 9, 10],
  [0, 2, 8, 9, 11],
  [0, 2, 8, 12]]}

In [177]:
leaf_to_path


Out[177]:
{1: [0, 1],
 4: [0, 2, 3, 4],
 5: [0, 2, 3, 4, 5],
 6: [0, 2, 3, 4, 6],
 7: [0, 2, 3, 7],
 8: [0, 2, 3, 5, 6, 8],
 10: [0, 2, 8, 9, 10],
 11: [0, 2, 8, 9, 11],
 12: [0, 2, 8, 12],
 13: [0, 2, 3, 5, 9, 11, 13],
 14: [0, 2, 14]}

In [178]:
len(rf.estimators_)


Out[178]:
2
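
Finally, to tie the paths back to actual predictions (an added sketch, not in the original flow): rf.apply reports the leaf each sample lands in per tree, and that leaf id can be matched against the per-tree paths in leaf_node_paths, which sidesteps the cross-tree id collisions noted above:

# Sketch: recover the full decision path taken by the first test sample
# in each of the 2 trees
leaves = rf.apply(X_test)   # shape: (n_samples, n_trees)
for tree_idx in range(leaves.shape[1]):
    leaf_id = leaves[0, tree_idx]
    path = next(p for p in leaf_node_paths[tree_idx] if p[-1] == leaf_id)
    print("Tree %d: sample 0 follows path %s" % (tree_idx, path))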