In [1]:
%matplotlib inline
import matplotlib.pyplot as plt
from sklearn.datasets import load_iris
from sklearn.datasets import load_breast_cancer
import numpy as np
from functools import reduce
# Import our custom utilities
from importlib import reload
from utils import irf_jupyter_utils
from utils import irf_utils
reload(irf_jupyter_utils)
reload(irf_utils)
Out[1]:
In [2]:
%time X_train, X_test, y_train, y_test, rf = irf_jupyter_utils.generate_rf_example(sklearn_ds = load_breast_cancer())
In [3]:
print("Training feature dimensions", X_train.shape, sep = ":\n")
print("\n")
print("Training outcome dimensions", y_train.shape, sep = ":\n")
print("\n")
print("Test feature dimensions", X_test.shape, sep = ":\n")
print("\n")
print("Test outcome dimensions", y_test.shape, sep = ":\n")
print("\n")
print("first 5 rows of the training set features", X_train[:5], sep = ":\n")
print("\n")
print("first 5 rows of the training set outcomes", y_train[:5], sep = ":\n")
In [4]:
# Number of training samples vs. total number of samples in the full dataset
X_train.shape[0]
breast_cancer = load_breast_cancer()
breast_cancer.data.shape[0]
Out[4]:
In [5]:
# Check the number of trees (estimators) in the fitted random forest
rf.n_estimators
Out[5]:
In [6]:
estimator0 = rf.estimators_[0] # First tree
estimator1 = rf.estimators_[1] # Second tree
estimator2 = rf.estimators_[2] # Third tree
In [7]:
tree_dat0 = irf_utils.get_tree_data(X_train = X_train, dtree = estimator0, root_node_id = 0)
tree_dat1 = irf_utils.get_tree_data(X_train = X_train, dtree = estimator1, root_node_id = 0)
tree_dat2 = irf_utils.get_tree_data(X_train = X_train, dtree = estimator2, root_node_id = 0)
In [8]:
# Now plot the trees individually
irf_jupyter_utils.draw_tree(decision_tree = estimator0)
In [9]:
irf_jupyter_utils.pretty_print_dict(inp_dict = tree_dat0)
In [10]:
# Count the number of samples passing through the leaf nodes
sum(tree_dat0['tot_leaf_node_values'])
Out[10]:
In [11]:
feature_importances = rf.feature_importances_
std = np.std([dtree.feature_importances_ for dtree in rf.estimators_]
, axis=0)
feature_importances_rank_idx = np.argsort(feature_importances)[::-1]
# Check that the feature importances are normalized (sum to 1)
print(sum(feature_importances))
In [12]:
# Print the feature ranking
print("Feature ranking:")
for f in range(X_train.shape[1]):
    print("%d. feature %d (%f)" % (f + 1
                                   , feature_importances_rank_idx[f]
                                   , feature_importances[feature_importances_rank_idx[f]]))
# Plot the feature importances of the forest
plt.figure()
plt.title("Feature importances")
plt.bar(range(X_train.shape[1])
, feature_importances[feature_importances_rank_idx]
, color="r"
, yerr = std[feature_importances_rank_idx], align="center")
plt.xticks(range(X_train.shape[1]), feature_importances_rank_idx)
plt.xlim([-1, X_train.shape[1]])
plt.show()
In [13]:
# Import our custom utilities
from importlib import reload
from utils import irf_jupyter_utils
from utils import irf_utils
reload(irf_jupyter_utils)
reload(irf_utils)
Out[13]:
In [14]:
rf.n_classes_
Out[14]:
In [15]:
estimator0.n_classes_
Out[15]:
In [16]:
type(rf).__name__
Out[16]:
In [17]:
rf_metrics = irf_utils.get_validation_metrics(inp_class_reg_obj = rf, y_true = y_test, X_test = X_test)
rf_metrics['confusion_matrix']
Out[17]:
In [139]:
dtree1 = rf.estimators_[1]
dtree_metrics = irf_utils.get_validation_metrics(inp_class_reg_obj = dtree1, y_true = y_test, X_test = X_test)
dtree_metrics['confusion_matrix']
Out[139]:
In [152]:
# CHECK: If the fitted random forest object turns out to be very large,
# we could omit it and only return our custom summary outputs
rf_metrics = irf_utils.get_validation_metrics(inp_class_reg_obj = rf, y_true = y_test, X_test = X_test)
all_rf_outputs = {"rf_obj" : rf,
"feature_importances" : feature_importances,
"feature_importances_rank_idx" : feature_importances_rank_idx,
"rf_metrics" : rf_metrics}
In [157]:
# CHECK: The following should be parallelized!
# CHECK: Whether we can maintain X_train correctly as required
for idx, dtree in enumerate(rf.estimators_):
    dtree_out = irf_utils.get_tree_data(X_train=X_train,
                                        X_test=X_test,
                                        y_test=y_test,
                                        dtree=dtree, root_node_id = 0)
    # Append output to dictionary
    all_rf_outputs["dtree{}".format(idx)] = dtree_out
In [158]:
estimator0_out = irf_utils.get_tree_data(X_train=X_train,
X_test=X_test,
y_test=y_test,
dtree=estimator0,
root_node_id=0)
In [159]:
print(estimator0_out['all_leaf_nodes'])
In [192]:
print(estimator0_out['all_leaf_nodes'])
print(sum(estimator0_out['tot_leaf_node_values']))
print(estimator0_out['tot_leaf_node_values'])
print(estimator0_out['all_leaf_node_samples'])
print(estimator0.tree_.n_node_samples[0])
print([round(i, 1) for i in estimator0_out['all_leaf_node_samples_percent']])
print(sum(estimator0_out['all_leaf_node_samples_percent']))
In [193]:
irf_jupyter_utils.pretty_print_dict(inp_dict = all_rf_outputs)
At its core, the RIT comprises 3 main modules.
For now we will just work with the outputs of a single decision tree.
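As a reminder of the core operation used below, the RIT repeatedly intersects the sets of feature indices that appear on root-to-leaf paths. A toy sketch (the feature indices here are made up purely for illustration):
In [ ]:
# Toy illustration of the RIT's basic step: intersect two hypothetical feature paths
path_a = np.array([0, 3, 7, 20])
path_b = np.array([3, 7, 22])
print(np.intersect1d(path_a, path_b))  # features shared by both paths -> [3 7]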
In [194]:
irf_jupyter_utils.pretty_print_dict(inp_dict = all_rf_outputs['rf_metrics'])
In [195]:
all_rf_outputs['dtree0']
Out[195]:
In [199]:
estimator0.tree_.value[:]
Out[199]:
In [204]:
X_train_n_samples = X_train.shape[0]
all_leaf_vals = all_rf_outputs['dtree0']['all_leaf_node_values']
scaled_values = [i/X_train_n_samples for i in all_leaf_vals]
scaled_values
Out[204]:
In [205]:
[(i) for i in all_leaf_vals]
Out[205]:
In [164]:
uniq_feature_paths = all_rf_outputs['dtree0']['all_uniq_leaf_paths_features']
leaf_node_classes = all_rf_outputs['dtree0']['all_leaf_node_classes']
ones_only = [i for i, j in zip(uniq_feature_paths, leaf_node_classes)
if j == 1]
ones_only
Out[164]:
In [165]:
print("Number of leaf nodes", len(all_rf_outputs['dtree0']['all_uniq_leaf_paths_features']), sep = ":\n")
print("Number of leaf nodes with 1 class", len(ones_only), sep = ":\n")
In [166]:
# Just pick the last seven cases; we are going to manually construct a
# binary RIT of depth 3, i.e. at most 2**3 - 1 = 7 intersecting nodes
ones_only_seven = ones_only[-7:]
ones_only_seven
Out[166]:
In [167]:
# Construct a binary version of the RIT manually!
# This should come in useful for unit tests!
node0 = ones_only_seven[-1]
node1 = np.intersect1d(node0, ones_only_seven[-2])
node2 = np.intersect1d(node1, ones_only_seven[-3])
node3 = np.intersect1d(node1, ones_only_seven[-4])
node4 = np.intersect1d(node0, ones_only_seven[-5])
node5 = np.intersect1d(node4, ones_only_seven[-6])
node6 = np.intersect1d(node4, ones_only_seven[-7])
intersected_nodes_seven = [node0, node1, node2, node3, node4, node5, node6]
for idx, node in enumerate(intersected_nodes_seven):
    print("node" + str(idx), node)
In [168]:
rit_output = reduce(np.union1d, (node2, node3, node5, node6))
rit_output
Out[168]:
In [169]:
# Import our custom utilities
from importlib import reload
from utils import irf_jupyter_utils
from utils import irf_utils
reload(irf_jupyter_utils)
reload(irf_utils)
Out[169]:
In [170]:
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
raw_data = load_breast_cancer()
X_train, X_test, y_train, y_test = train_test_split(
raw_data.data, raw_data.target, train_size=0.9,
random_state=2017)
rf = RandomForestClassifier(
n_estimators=3, random_state=2018)
rf.fit(X=X_train, y=y_train)
estimator0 = rf.estimators_[0]
estimator0_out = irf_utils.get_tree_data(X_train=X_train,
X_test=X_test,
y_test=y_test,
dtree=estimator0,
root_node_id=0)
print(estimator0_out['all_leaf_nodes'])
In [171]:
estimator0_out['validation_metrics']
Out[171]:
In [172]:
np.random.seed(12)
tree = irf_utils.build_tree(feature_paths=irf_utils.select_random_path(),
max_depth=3,
noisy_split=False,
num_splits=5)
In [173]:
print("Root:\n", tree._val)
#print("Some child:\n", tree.children[0].children[1]._val)
In [174]:
# If noisy_split is False, this should pass
assert(len(tree) == 1 + 5 + 5**2)
In [175]:
list(tree.traverse_depth_first())
Out[175]:
In [176]:
estimator0_out_fltr = irf_utils.filter_leaves_classifier(dtree_data=estimator0_out,bin_class_type=1)
estimator0_out_fltr
Out[176]:
In [177]:
print("Total Number of classes", len(estimator0_out['all_leaf_node_classes']), sep=":\n")
print("Total Number of 1-value classes", sum(estimator0_out['all_leaf_node_classes']), sep=":\n")
In [180]:
print("Total Number of 1-value classes", len(estimator0_out_fltr['leaf_nodes_depths']), sep=":\n")
In [181]:
all_rf_outputs['dtree0']['all_uniq_leaf_paths_features']
Out[181]:
In [41]:
irf_utils.filter_leaves_classifier(dtree_data=all_rf_outputs['dtree0'],bin_class_type=1)
Out[41]:
In [182]:
all_rf_outputs['dtree0']
Out[182]:
In [183]:
# Import our custom utilities
from importlib import reload
from utils import irf_jupyter_utils
from utils import irf_utils
reload(irf_jupyter_utils)
reload(irf_utils)
Out[183]:
In [184]:
filtered = irf_utils.filter_leaves_classifier(dtree_data=all_rf_outputs['dtree0'],bin_class_type=1)
In [185]:
filtered
Out[185]:
In [186]:
filtered['validation_metrics']['accuracy_score']
Out[186]:
In [127]:
filtered['uniq_feature_paths']
Out[127]:
In [50]:
import random
filtered
Out[50]:
In [48]:
random.choice([1,5,9,10,12])
Out[48]:
In [130]:
random.choice(filtered['uniq_feature_paths'])
Out[130]:
In [55]:
from scipy import stats
In [107]:
def weighted_choice(values, weights):
    """Discrete distribution, drawing values with the frequency specified in weights.
    Weights do not need to be normalized.
    """
    if len(weights) != len(values):
        raise ValueError('Equal number of values and weights expected')
    weights = np.array(weights)
    weights = weights / weights.sum()
    dist = stats.rv_discrete(values=(range(len(weights)), weights))
    while True:
        yield values[dist.rvs()]
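As a design note, the same sampler could be written with np.random.choice instead of scipy.stats.rv_discrete. A minimal alternative sketch (the name weighted_choice_np is just illustrative):
In [ ]:
def weighted_choice_np(values, weights):
    """Yield values with probability proportional to weights (NumPy-based sketch)."""
    p = np.asarray(weights, dtype=float)
    p = p / p.sum()
    while True:
        # np.random.choice draws an index according to the probability vector p
        yield values[np.random.choice(len(values), p=p)]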
In [108]:
g = weighted_choice(filtered['uniq_feature_paths'], filtered['tot_leaf_node_values'])
In [109]:
for i in range(100):
    print(next(g))
In [110]:
filtered0 = irf_utils.filter_leaves_classifier(dtree_data=all_rf_outputs['dtree0'],bin_class_type=1)
filtered1 = irf_utils.filter_leaves_classifier(dtree_data=all_rf_outputs['dtree1'],bin_class_type=1)
In [129]:
all_weights = []
all_paths = []
for tree in range(2):
    filtered = irf_utils.filter_leaves_classifier(dtree_data=all_rf_outputs['dtree{}'.format(tree)],
                                                  bin_class_type=1)
    all_weights.extend(filtered['tot_leaf_node_values'])
    all_paths.extend(filtered['uniq_feature_paths'])
g = weighted_choice(all_paths, all_weights)
In [113]:
all_weights
Out[113]:
In [114]:
all_paths
Out[114]:
In [112]:
for i in range(50):
    print(next(g))
In [ ]: