In [157]:
%matplotlib inline
import matplotlib.pyplot as plt
# Load the library with the iris dataset
from sklearn.datasets import load_iris
# Load scikit's train-test split function
from sklearn.cross_validation import train_test_split
# Random Forest Classifier - not the 'Extra Trees Classifier'
from sklearn.ensemble import RandomForestClassifier
# Confusion Matrix
from sklearn.metrics import confusion_matrix
# Import the tree module
from sklearn.tree import _tree
import numpy as np
# Set seed for reproducibility
np.random.seed(1015)
In [158]:
# Load the iris bunch and take a quick look at what it contains
iris = load_iris()
print(iris.keys())
for label, value in (
    ("Iris Feature dimensions", iris.data.shape),
    ("Iris Data output", iris.target),
    ("Iris Output dimensions", iris.target.shape),
):
    print(label, value, sep=":\n")
In [159]:
# Split the iris features/targets into train and test partitions
splits = train_test_split(iris.data, iris.target)
X_train, X_test, y_train, y_test = splits
In [160]:
# Confirm the split shapes are what we expect before modelling
for label, value in (
    ("Training feature data dimensions", X_train.shape),
    ("Training target data dimensions", y_train.shape),
    ("Test feature data dimensions", X_test.shape),
    ("Test target data dimensions", y_test.shape),
):
    print(label, value, sep=":\n")
Everything looks to be in order — let's keep going.
In [161]:
# Re-seed so the forest's internal bootstrap sampling is reproducible
np.random.seed(1039)
# A deliberately tiny forest: just two decision trees
rf = RandomForestClassifier(n_estimators=2)
rf.fit(X=X_train, y=y_train)
Out[161]:
In [162]:
# Predict test-set labels; the bare name on the last line displays them
predicted = rf.predict(X_test)
predicted
Out[162]:
In [163]:
# Mean accuracy of the fitted forest on the held-out test split
rf.score(X = X_test, y = y_test)
Out[163]:
In [164]:
# Rows are true classes, columns are predicted classes
confusion_matrix(y_true = y_test, y_pred = predicted)
Out[164]:
In [165]:
# Aggregate feature importances across the forest, plus the per-tree spread
importances = rf.feature_importances_
per_tree = [est.feature_importances_ for est in rf.estimators_]
std = np.std(per_tree, axis=0)
# Feature indices ordered from most to least important
indices = np.argsort(importances)[::-1]
In [166]:
# Show the ranking both as raw indices and as human-readable feature names
print("Indices in Descending order of feature importance", indices, sep=":\n")
ranked_names = []
for idx in indices:
    ranked_names.append(iris.feature_names[idx])
print(ranked_names)
In [167]:
# Print the feature ranking
print("Feature ranking:")
for f in range(X_train.shape[1]):
    print("%d. feature %d (%f)" % (f + 1, indices[f], importances[indices[f]]))

# Plot the feature importances of the forest.
# Use the explicit Axes API (rather than the pyplot state machine) and label
# the bars with the actual feature names so the figure stands alone.
fig, ax = plt.subplots()
ax.set_title("Feature importances")
ax.bar(range(X_train.shape[1]), importances[indices],
       color="r", yerr=std[indices], align="center")
ax.set_xticks(range(X_train.shape[1]))
ax.set_xticklabels([iris.feature_names[i] for i in indices],
                   rotation=45, ha="right")
ax.set_xlim([-1, X_train.shape[1]])
ax.set_ylabel("Importance")
fig.tight_layout()
plt.show()
In [168]:
print("Display feature importance scores", importances, sep=":\n")
# FIX: feature importances are normalized to sum to 1.0, not 100 — the
# original message ("standardized to 100%") misstated the check being made.
print("Check that the importance scores sum to 1.0", sum(importances), sep=":\n")
In [169]:
# NOTE(review): this cell is dead exploratory code — `getmembers`/`pprint`
# are imported but everything that used them is commented out. Consider
# deleting the cell from the final notebook.
from inspect import getmembers
import pprint
#pp = pprint.PrettyPrinter(indent=4)
#for tree in rf.estimators_:
#print(tree.tree_.feature)
#pp.pprint(getmembers(tree.tree_))
In [170]:
# from sklearn import tree
# i_tree = 0
# for tree_in_forest in rf.estimators_:
# with open('figures/03_Decision_Paths_Leaf_Nodes/tree_' + str(i_tree) + '.png', 'w') as my_file:
# my_file = tree.export_graphviz(tree_in_forest, out_file = my_file)
# i_tree = i_tree + 1
In [171]:
from IPython.display import display, Image
import pydotplus
# FIX: the original body called `tree.export_graphviz`, but the notebook never
# imports `tree` (only `_tree`, and `from sklearn import tree` is commented
# out elsewhere) — so calling draw_tree raised NameError. Import the function
# directly instead.
from sklearn.tree import export_graphviz


def draw_tree(inp_tree,
              out_file=None,
              filled=True,
              rounded=True,
              special_characters=True):
    """Render a fitted decision tree inline as a PNG image.

    Parameters
    ----------
    inp_tree : fitted decision-tree estimator
        The tree to visualize (e.g. one element of ``rf.estimators_``).
    out_file, filled, rounded, special_characters :
        Passed straight through to ``sklearn.tree.export_graphviz``.
    """
    dot_data = export_graphviz(inp_tree,
                               out_file=out_file,
                               filled=filled,
                               rounded=rounded,
                               special_characters=special_characters)
    # Convert the DOT source to a PNG via pydotplus and display it inline
    graph = pydotplus.graph_from_dot_data(dot_data)
    img = Image(graph.create_png())
    display(img)
In [172]:
# Render each estimator's decision tree inline, then show the per-node
# array of split-feature indices for that tree.
for dtree in rf.estimators_:
    draw_tree(inp_tree=dtree)
    print(dtree.tree_.feature)
In [173]:
# Setup
%matplotlib inline
import matplotlib.pyplot as plt
from sklearn.datasets import load_iris
from sklearn.cross_validation import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix
import numpy as np
# Set seed for reproducibility
np.random.seed(1015)
# Load the iris data
iris = load_iris()
# Create the train-test datasets
X_train, X_test, y_train, y_test = train_test_split(iris.data, iris.target)
np.random.seed(1039)
# Just fit a simple random forest classifier with 2 decision trees
rf = RandomForestClassifier(n_estimators = 2)
rf.fit(X = X_train, y = y_train)
# Define a function to draw the decision trees in IPython
# Adapted from: http://scikit-learn.org/stable/modules/tree.html
from IPython.display import display, Image
import pydotplus
# Now plot the trees individually
for dtree in rf.estimators_:
dot_data = tree.export_graphviz(dtree
, out_file = None
, filled = True
, rounded = True
, special_characters = True)
graph = pydotplus.graph_from_dot_data(dot_data)
img = Image(graph.create_png())
display(img)
draw_tree(inp_tree = dtree)
#print(dtree.tree_.feature)
In [174]:
def _get_tree_paths(tree, node_id=0, depth=0):
    """
    Returns all paths through the tree as list of node_ids
    (each path is ordered leaf-first; callers reverse them as needed).
    """
    # Guard: a leaf sentinel is never a valid starting node
    if node_id == _tree.TREE_LEAF:
        raise ValueError("Invalid node_id %s" % _tree.TREE_LEAF)
    left_id = tree.children_left[node_id]
    right_id = tree.children_right[node_id]
    # Leaf node: the path so far consists of just this node
    if left_id == _tree.TREE_LEAF:
        return [[node_id]]
    # Internal node: gather paths from both subtrees, then append this
    # node to every one of them (building each path bottom-up)
    paths = (_get_tree_paths(tree, left_id, depth=depth + 1)
             + _get_tree_paths(tree, right_id, depth=depth + 1))
    for path in paths:
        path.append(node_id)
    return paths
In [175]:
# Collect, for every tree in the forest, all root->leaf node-id paths.
leaf_node_paths = dict()  # tree index -> list of root->leaf paths
leaf_to_path = dict()     # leaf node id -> its root->leaf path
for idx, dtree in enumerate(rf.estimators_):
    # leaf_to_path = {}
    node_paths = _get_tree_paths(tree = dtree.tree_, node_id = 0, depth = 0)
    for node_path in node_paths:
        # paths come back leaf-first; flip to root->leaf order
        node_path.reverse()
    leaf_node_paths[idx] = node_paths
    #map leaves to paths
    # NOTE(review): node ids are numbered per tree, so leaves from different
    # trees can share an id and later trees overwrite earlier entries here.
    # If a per-tree mapping is intended, key by (idx, leaf id) — confirm.
    for path in node_paths:
        leaf_to_path[path[-1]] = path
In [176]:
# Inspect the per-tree root->leaf paths
leaf_node_paths
Out[176]:
In [177]:
# Inspect the leaf-id -> path mapping (see collision note where it is built)
leaf_to_path
Out[177]:
In [178]:
# Sanity check: should equal n_estimators (2)
len(rf.estimators_)
Out[178]: