In [1]:
%matplotlib inline
import matplotlib.pyplot as plt
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn import tree
import numpy as np
# pandas is imported only to display dataframes nicely in Jupyter;
# it should not become a dependency of our utils
# or any other functions we develop
import pandas as pd
# Import our custom utilities
from importlib import reload
from utils import utils
reload(utils)
RANDOM_STATE_SPLIT = 1001
RANDOM_STATE_CLASSIFIER = 1039
In [2]:
# Load the iris data
iris = load_iris()
# Create the train-test datasets
X_train, X_test, y_train, y_test = train_test_split(iris.data
, iris.target
, random_state = RANDOM_STATE_SPLIT)
# Just fit a simple random forest classifier with 2 decision trees
rf = RandomForestClassifier(n_estimators = 2
, random_state = RANDOM_STATE_CLASSIFIER)
# fit the classifier
rf.fit(X = X_train, y = y_train)
Out[2]:
In [3]:
print("Training feature dimensions", X_train.shape, sep = ":\n")
print("\n")
print("Training outcome dimensions", y_train.shape, sep = ":\n")
print("\n")
print("Test feature dimensions", X_test.shape, sep = ":\n")
print("\n")
print("Test outcome dimensions", y_test.shape, sep = ":\n")
print("\n")
print("first 10 rows of the training set features", X_train[:10], sep = ":\n")
print("\n")
print("first 10 rows of the training set outcomes", y_train[:10], sep = ":\n")
So we have 112 training observations and 38 test observations, each described by 4 features, with a single class label per observation.
In [4]:
# Import our custom utilities
from importlib import reload
from utils import utils
reload(utils)
Out[4]:
In [5]:
estimator0 = rf.estimators_[0] # First tree
estimator1 = rf.estimators_[1] # Second tree
In [6]:
tree_dat0 = utils.getTreeData(X_train = X_train, dtree = estimator0, root_node_id = 0)
tree_dat1 = utils.getTreeData(X_train = X_train, dtree = estimator1, root_node_id = 0)
In [7]:
# utils.prettyPrintDict(inp_dict = tree_dat0)
In [8]:
# Now plot the trees individually
# utils.draw_tree(inp_tree = estimator0)
In [9]:
# Now plot the trees individually
utils.draw_tree(decision_tree = estimator1)
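As a quick cross-check on the custom utils.draw_tree output, scikit-learn's built-in tree.plot_tree (available from version 0.21 onward) can render the same estimator; the feature and class names come from the iris Bunch object.
In [ ]:
# Optional cross-check of utils.draw_tree using scikit-learn's built-in plotter
plt.figure(figsize = (12, 6))
tree.plot_tree(estimator1
               , feature_names = iris.feature_names
               , class_names = list(iris.target_names)
               , filled = True)
plt.show()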
In [10]:
utils.prettyPrintDict(inp_dict = tree_dat1)
In [11]:
# Count the number of samples passing through the leaf nodes
sum(tree_dat1['tot_leaf_node_values'])
Out[11]:
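If tot_leaf_node_values holds the per-leaf training sample counts (an assumption about getTreeData's output), the sum above should match the bootstrap sample size, which equals the number of training rows.
In [ ]:
# Sanity check (assumes 'tot_leaf_node_values' stores per-leaf sample counts):
# with bootstrap resampling each tree sees exactly X_train.shape[0] samples
# (some repeated), so the leaf counts should sum to that number
print(sum(tree_dat1['tot_leaf_node_values']), X_train.shape[0])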
In [12]:
feature_importances = rf.feature_importances_
std = np.std([dtree.feature_importances_ for dtree in rf.estimators_]
, axis=0)
feature_importances_rank_idx = np.argsort(feature_importances)[::-1]
# Check that the feature importances sum to 1
print(sum(feature_importances))
In [13]:
# Print the feature ranking
print("Feature ranking:")
for f in range(X_train.shape[1]):
print("%d. feature %d (%f)" % (f + 1
, feature_importances_rank_idx[f]
, feature_importances[feature_importances_rank_idx[f]]))
# Plot the feature importances of the forest
plt.figure()
plt.title("Feature importances")
plt.bar(range(X_train.shape[1])
, feature_importances[feature_importances_rank_idx]
, color="r"
, yerr = std[feature_importances_rank_idx], align="center")
plt.xticks(range(X_train.shape[1]), feature_importances_rank_idx)
plt.xlim([-1, X_train.shape[1]])
plt.show()
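The ranking above reports bare column indices; mapping them through iris.feature_names makes the output easier to read.
In [ ]:
# Map the ranked feature indices back to their names for readability
for rank, idx in enumerate(feature_importances_rank_idx, start = 1):
    print("%d. %s (%f)" % (rank
                           , iris.feature_names[idx]
                           , feature_importances[idx]))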
In [14]:
# CHECK: if the random forest object turns out to be very large,
# we could omit it here and return only our custom summary outputs
rf_metrics = utils.getValidationMetrics(rf, y_true = y_test, X_test = X_test)
all_rf_outputs = {"rf_obj" : rf,
"feature_importances" : feature_importances,
"feature_importances_rank_idx" : feature_importances_rank_idx,
"rf_metrics" : rf_metrics}
In [15]:
# CHECK: the following loop should be parallelized!
# CHECK: confirm that X_train is passed through correctly as required
for idx, dtree in enumerate(rf.estimators_):
dtree_out = utils.getTreeData(X_train = X_train, dtree = dtree, root_node_id = 0)
# Append output to dictionary
all_rf_outputs["dtree" + str(idx)] = dtree_out
In [17]:
utils.prettyPrintDict(inp_dict = all_rf_outputs)