In [1]:
%matplotlib inline
import matplotlib.pyplot as plt
from sklearn.datasets import load_breast_cancer
import numpy as np
from functools import reduce
# Needed for the scikit-learn wrapper function
from sklearn.utils import resample
from sklearn.ensemble import RandomForestClassifier
from math import ceil
# Import our custom utilities and force a reload so that edits made to the
# utility modules on disk are picked up without restarting the kernel.
# `importlib.reload` replaces the `imp.reload` alias: the `imp` module is
# deprecated and was removed in Python 3.12.
from importlib import reload
from utils import irf_jupyter_utils
from utils import irf_utils
reload(irf_jupyter_utils)
reload(irf_utils)
Out[1]:
In [2]:
# Load the breast cancer dataset under its own name. Binding the result back
# to `load_breast_cancer` would shadow the imported loader function, so
# re-running this cell would fail because the returned object is not callable.
breast_cancer_data = load_breast_cancer()
In [3]:
# Ask the project helper for the example data and a 20-estimator forest
# (no feature weights), then unpack the five returned pieces.
rf_example = irf_jupyter_utils.generate_rf_example(
    n_estimators=20,
    feature_weight=None,
)
X_train, X_test, y_train, y_test, rf = rf_example
In [4]:
# Label/value pairs to report, in display order: the shapes of the four
# arrays followed by a two-row preview of the training features and outcomes.
data_summary = [
    ("Training feature dimensions", X_train.shape),
    ("Training outcome dimensions", y_train.shape),
    ("Test feature dimensions", X_test.shape),
    ("Test outcome dimensions", y_test.shape),
    ("first 2 rows of the training set features", X_train[:2]),
    ("first 2 rows of the training set outcomes", y_train[:2]),
]

for position, (label, value) in enumerate(data_summary):
    if position > 0:
        # Blank-line separator between consecutive entries (none after the last)
        print("\n")
    print(label, value, sep=":\n")
In [5]:
# Gather the forest-level and per-tree data for the forest built above.
# The returned mapping is indexed later by keys such as
# 'feature_importances' and 'dtree0'.
all_rf_tree_data = irf_utils.get_rf_tree_data(
    rf=rf,
    X_train=X_train,
    X_test=X_test,
    y_test=y_test,
)
In [6]:
# Seed NumPy's global RNG right before the call
# (presumably get_rit_tree_data draws from it -- TODO confirm).
np.random.seed(12)

rit_settings = dict(
    bin_class_type=1,
    M=100,
    max_depth=2,
    noisy_split=False,
    num_splits=2,
)
all_rit_tree_data = irf_utils.get_rit_tree_data(
    all_rf_tree_data=all_rf_tree_data,
    **rit_settings
)
In [7]:
# Print one line per feature: 1-based rank position, the feature index stored
# at that position of the rank array, and that feature's importance.
print("Feature ranking:")
feature_importances_rank_idx = all_rf_tree_data['feature_importances_rank_idx']
feature_importances = all_rf_tree_data['feature_importances']

n_features = X_train.shape[1]
for rank in range(n_features):
    feature_idx = feature_importances_rank_idx[rank]
    importance = feature_importances[feature_idx]
    print("%d. feature %d (%f)" % (rank + 1, feature_idx, importance))
In [8]:
# Bar chart of the forest's feature importances, reordered by
# `feature_importances_rank_idx`, with the stored std values as error bars.
feature_importances_std = all_rf_tree_data['feature_importances_std']
n_features = X_train.shape[1]
bar_positions = range(n_features)

plt.figure()
plt.title("Feature importances")
plt.bar(
    bar_positions,
    feature_importances[feature_importances_rank_idx],
    color="r",
    yerr=feature_importances_std[feature_importances_rank_idx],
    align="center",
)
# Label each bar with the index of the feature it represents
plt.xticks(bar_positions, feature_importances_rank_idx)
plt.xlim([-1, n_features])
plt.show()
In [9]:
# Now plot the trees individually
#irf_jupyter_utils.draw_tree(decision_tree = all_rf_tree_data['rf_obj'].estimators_[0])
In [10]:
#irf_jupyter_utils.pretty_print_dict(inp_dict = all_rf_tree_data['dtree0'])
In [11]:
# Total of the per-leaf sample counts recorded for tree 'dtree0'
dtree0_leaf_counts = all_rf_tree_data['dtree0']['tot_leaf_node_values']
sum(dtree0_leaf_counts)
Out[11]:
In [12]:
#irf_jupyter_utils.pretty_print_dict(inp_dict = all_rf_tree_data['dtree0']['all_leaf_paths_features'])
We will run iRF with the following parameters.
Listing them explicitly here will be useful when we build the interface later.
In [13]:
# Run iRF and unpack its five outputs. The keyword values below are the
# tunable parameters of the run.
(all_rf_weights,
 all_K_iter_rf_data,
 all_rf_bootstrap_output,
 all_rit_bootstrap_output,
 stability_score) = irf_utils.run_iRF(
    X_train=X_train,
    X_test=X_test,
    y_train=y_train,
    y_test=y_test,
    K=5,
    n_estimators=20,
    B=30,
    random_state_classifier=2018,
    propn_n_samples=.2,
    bin_class_type=1,
    M=20,
    max_depth=5,
    noisy_split=False,
    num_splits=2,
    n_estimators_bootstrap=5,
)
In [14]:
# Pass the stability scores from run_iRF to the project's histogram helper,
# with sorting enabled (presumably renders a sorted histogram -- TODO confirm).
irf_utils._get_histogram(stability_score, sort=True)
That's interesting - features 22, 27, 20 and 23 keep popping up!
We should probably look at the feature importances to see whether they agree with these stability scores.
In [15]:
# Plot the ranked feature importances for each iRF iteration.
# Per-iteration values are bound to iteration-specific names so this loop does
# not overwrite the notebook-level `feature_importances`,
# `feature_importances_rank_idx` and `feature_importances_std` variables that
# the earlier single-forest cells create and read.
n_features = X_train.shape[1]
for k in range(5):  # keep in sync with K=5 passed to irf_utils.run_iRF
    iter_rf_data = all_K_iter_rf_data["rf_iter{}".format(k)]
    iter_importances = iter_rf_data['feature_importances']
    iter_importances_std = iter_rf_data['feature_importances_std']
    iter_rank_idx = iter_rf_data['feature_importances_rank_idx']

    plt.figure(figsize=(8, 6))
    plt.title("Feature importances; iteration = {}".format(k))
    plt.bar(range(n_features),
            iter_importances[iter_rank_idx],
            color="r",
            yerr=iter_importances_std[iter_rank_idx],
            align="center")
    # Vertical tick labels keep the feature indices readable
    plt.xticks(range(n_features), iter_rank_idx, rotation='vertical')
    plt.xlim([-1, n_features])
    plt.show()
Let's look at the key validation metrics of the final-iteration RF.
In [16]:
# Validation metrics stored for iteration 'rf_iter4'
final_iter_metrics = all_K_iter_rf_data['rf_iter4']['rf_validation_metrics']
irf_jupyter_utils.pretty_print_dict(final_iter_metrics)
In [17]:
# Draw the first estimator (index 0) of the forest stored for 'rf_iter4'
final_iter_forest = all_K_iter_rf_data['rf_iter4']['rf_obj']
irf_jupyter_utils.draw_tree(decision_tree=final_iter_forest.estimators_[0])
In [18]:
# Features recorded along each leaf path of tree 'dtree0' in iteration 'rf_iter4'
dtree0_leaf_path_features = (
    all_K_iter_rf_data['rf_iter4']['dtree0']['all_leaf_paths_features']
)
irf_jupyter_utils.pretty_print_dict(dtree0_leaf_path_features)
This checks nicely against the plotted diagram above.
In fact - we can go further and plot some interesting data from the Decision Trees
In [19]:
# Leaf node values recorded for tree 'dtree0' in iteration 'rf_iter4'
dtree0_leaf_node_values = (
    all_K_iter_rf_data['rf_iter4']['dtree0']['all_leaf_node_values']
)
irf_jupyter_utils.pretty_print_dict(dtree0_leaf_node_values)
In [21]:
# Feature-frequency histogram for the data stored under 'rf_iter4';
# n_estimators matches the forest size used in the run_iRF call.
irf_utils._hist_features(
    all_K_iter_rf_data['rf_iter4'],
    n_estimators=20,
    title='Frequency of features along decision paths : iteration = 4',
)
The most common features that appeared were 27, 22, 23 and 7. This matches well with the feature importance plot above.
In [17]:
# Print the feature importances stored for the first iRF iteration
# (key 'rf_iter0'). The former bare `all_K_iter_rf_data.keys()` statement was
# dropped: it was not the last expression of the cell, so its value was
# discarded and never displayed.
print(all_K_iter_rf_data['rf_iter0']['feature_importances'])
Compare to the original single fitted random forest
In [18]:
# Fit a fresh forest with the same number of estimators (20) and the same
# seed (2018) that were passed to run_iRF, then print its feature importances.
# It is bound to its own name so the `rf` object that the earlier cells were
# built from is not overwritten.
rf_single = RandomForestClassifier(n_estimators=20, random_state=2018)
rf_single.fit(X=X_train, y=y_train)
print(rf_single.feature_importances_)
And they match perfectly as expected.