In [86]:
%matplotlib inline
import matplotlib.pyplot as plt
from sklearn.datasets import load_breast_cancer
import numpy as np
from functools import reduce
# Import our custom utilities.
# `imp` has been deprecated since Python 3.4 and was removed in 3.12;
# `importlib.reload` is the supported replacement with the same call shape.
from importlib import reload
from utils import irf_jupyter_utils
from utils import irf_utils
# Re-import the helper modules so edits made to them on disk are picked up
# without restarting the kernel.
reload(irf_jupyter_utils)
reload(irf_utils)
Out[86]:
In [87]:
# Load the dataset object. The name of the imported loader is deliberately
# re-used for its result (later cells may rely on that binding), but the call
# is guarded: on a second run of this cell the name no longer refers to the
# loader function, and calling it unconditionally raised a TypeError.
if callable(load_breast_cancer):
    load_breast_cancer = load_breast_cancer()
In [88]:
# Build the worked example with a 10-estimator forest. The helper's return value
# is unpacked into the four data splits and `rf` (presumably the fitted random
# forest — confirm against irf_jupyter_utils.generate_rf_example).
(X_train, X_test, y_train, y_test, rf) = irf_jupyter_utils.generate_rf_example(
    n_estimators=10)
In [89]:
# Report the shape of every split, then preview the head of the training data.
# Each label and its value are separated by ":\n"; print("\n") leaves a gap
# between consecutive reports.
print("Training feature dimensions", X_train.shape, sep=":\n")
print("\n")
print("Training outcome dimensions", y_train.shape, sep=":\n")
print("\n")
print("Test feature dimensions", X_test.shape, sep=":\n")
print("\n")
print("Test outcome dimensions", y_test.shape, sep=":\n")
print("\n")
# The labels promise five rows; the slices previously took only two.
print("first 5 rows of the training set features", X_train[:5], sep=":\n")
print("\n")
print("first 5 rows of the training set outcomes", y_train[:5], sep=":\n")
In [90]:
# Collect the forest-level and per-tree data for the example forest.
# Later cells index the result by string keys such as 'dtree0' and
# 'feature_importances'.
all_rf_tree_data = irf_utils.get_rf_tree_data(
    rf=rf,
    X_train=X_train,
    y_train=y_train,
    X_test=X_test,
    y_test=y_test,
)
# Uncomment to inspect the whole structure:
# all_rf_tree_data
In [91]:
# Fix NumPy's global seed before creating the path sampler
# (presumably the sampler draws from NumPy's global RNG — confirm in irf_utils).
np.random.seed(12)
gen_random_leaf_paths = irf_utils.generate_rit_samples(
    all_rf_tree_data=all_rf_tree_data,
    bin_class_type=1,
)
In [92]:
#for i in range(7):
# print(next(gen_random_leaf_paths))
In [93]:
# Build one intersection tree from the sampled paths: binary splits,
# max_depth of 3, no noisy splitting.
rit0 = irf_utils.build_tree(
    feature_paths=gen_random_leaf_paths,
    max_depth=3,
    noisy_split=False,
    num_splits=2,
)
In [94]:
# Visit every node depth-first. Each yielded item is indexable: position 0 is
# printed as-is (presumably a node index — confirm) and position 1 is the node
# object whose `_val` holds its value.
for node in rit0.traverse_depth_first():
    print(node[0], node[1]._val)
In [95]:
# Number the leaf nodes from zero and show the value stored on each one.
for idx, node in enumerate(rit0.leaf_nodes()):
    print(idx, node[1]._val)
In [96]:
# Show the value stored on the root node of rit0.
print("Root:\n", rit0._val)
# A nested child can be inspected the same way, e.g.:
#print("Some child:\n", rit0.children[0].children[1]._val)
In [97]:
# Size of the tree as reported by its __len__ (presumably the total node count,
# which is what the assertion in the next cell compares against — confirm).
len(rit0)
Out[97]:
In [98]:
# rit0 was built with num_splits=2 and max_depth=3, so it can hold at most
# 1 + 2 + 2**2 = 7 nodes (root, two children, four grandchildren). The previous
# bound of 1 + 5 + 5**2 belonged to a five-way split and could never fail here.
# Kept as "<=" rather than "==": TODO confirm whether build_tree may stop
# expanding a branch early, which would yield fewer nodes.
assert len(rit0) <= 1 + 2 + 2**2
In [99]:
# Step-by-step construction of a single RIT and its summary outputs.
M = 10  # not used in this cell
np.random.seed(12)

# Generator of weighted, randomly sampled paths
gen_random_leaf_paths = irf_utils.generate_rit_samples(
    all_rf_tree_data=all_rf_tree_data,
    bin_class_type=1,
)

# The RIT object itself
rit = irf_utils.build_tree(
    feature_paths=gen_random_leaf_paths,
    max_depth=3,
    noisy_split=False,
    num_splits=2,
)

# Value of every node, in depth-first order
rit_intersected_values = [item[1]._val for item in rit.traverse_depth_first()]

# Values on the leaves only, i.e. the final intersected features,
# and their combined union
rit_leaf_node_values = [item[1]._val for item in rit.leaf_nodes()]
rit_leaf_node_union_value = reduce(np.union1d, rit_leaf_node_values)

rit_outputs = dict(
    rit=rit,
    rit_intersected_values=rit_intersected_values,
    rit_leaf_node_values=rit_leaf_node_values,
    rit_leaf_node_union_value=rit_leaf_node_union_value,
)
rit_outputs['rit_intersected_values']
Out[99]:
In [100]:
def get_rit_tree_data(all_rf_tree_data,
                      bin_class_type=1,
                      random_state=12,
                      #M=10, # number of trees (RIT) to build
                      feature_paths=None,
                      max_depth=3,
                      noisy_split=False,
                      num_splits=2):
    """
    A wrapper for the Random Intersection Trees (RIT) algorithm

    Builds a single RIT and collects its node values.

    Parameters
    ----------
    all_rf_tree_data
        Forwarded to `irf_utils.generate_rit_samples` to create the path
        generator. Ignored when `feature_paths` is supplied.
    bin_class_type
        Forwarded to `irf_utils.generate_rit_samples`.
    random_state
        Seed passed to `np.random.seed` before any path is sampled.
        It was previously ignored in favour of a hard-coded 12.
    feature_paths
        Optional ready-made iterator of paths. When None (the default) the
        paths are generated from `all_rf_tree_data`. The former default bound
        a notebook-global generator at definition time and was never used.
    max_depth, noisy_split, num_splits
        Forwarded unchanged to `irf_utils.build_tree`.

    Returns
    -------
    dict with the keys
        "rit"                       : the tree returned by `build_tree`
        "rit_intersected_values"    : `_val` of every node, depth-first
        "rit_leaf_node_values"      : `_val` of every leaf node
        "rit_leaf_node_union_value" : `np.union1d` folded over the leaf values
                                      (a single leaf is returned unchanged)
    """
    # Set the random seed for reproducibility
    np.random.seed(random_state)

    # Create the weighted randomly sampled paths as a generator,
    # unless the caller supplied its own paths
    if feature_paths is None:
        feature_paths = irf_utils.generate_rit_samples(
            all_rf_tree_data=all_rf_tree_data,
            bin_class_type=bin_class_type)

    # Create the RIT object
    rit = irf_utils.build_tree(feature_paths=feature_paths,
                               max_depth=max_depth,
                               noisy_split=noisy_split,
                               num_splits=num_splits)

    # Get the intersected node values
    # CHECK remove this for the final value
    rit_intersected_values = [
        node[1]._val for node in rit.traverse_depth_first()]

    # Leaf node values i.e. final intersected features
    rit_leaf_node_values = [node[1]._val for node in rit.leaf_nodes()]
    rit_leaf_node_union_value = reduce(np.union1d, rit_leaf_node_values)

    rit_outputs = {"rit": rit,
                   "rit_intersected_values": rit_intersected_values,
                   "rit_leaf_node_values": rit_leaf_node_values,
                   "rit_leaf_node_union_value": rit_leaf_node_union_value}
    return rit_outputs
In [101]:
# Exercise the notebook-local wrapper defined in the previous cell.
a = get_rit_tree_data(
    all_rf_tree_data=all_rf_tree_data,
    bin_class_type=1,
    random_state=12,
    # M=10,  # number of trees (RIT) to build
    max_depth=3,
    noisy_split=False,
    num_splits=2,
)
In [102]:
# Node values of the tree built by the notebook-local wrapper, depth-first.
a['rit_intersected_values']
Out[102]:
In [103]:
# Same settings through the packaged implementation in irf_utils, with M=1.
# The next cell reads the result under the key 'rit0'.
b = irf_utils.get_rit_tree_data(
    all_rf_tree_data=all_rf_tree_data,
    bin_class_type=1,
    random_state=12,
    M=1,
    max_depth=3,
    noisy_split=False,
    num_splits=2,
)
In [106]:
# Node values of the 'rit0' entry from the library call, for comparison with
# the notebook-local wrapper's output shown above.
b['rit0']['rit_intersected_values']
Out[106]:
In [107]:
# Library call again, this time with M=10.
c = irf_utils.get_rit_tree_data(
    all_rf_tree_data=all_rf_tree_data,
    bin_class_type=1,
    random_state=12,
    M=10,
    max_depth=3,
    noisy_split=False,
    num_splits=2,
)
In [108]:
# Inspect the entry stored under 'rit1' in the M=10 result.
c['rit1']
Out[108]:
In [16]:
# List the features in ranked order: rank, feature index, importance.
print("Feature ranking:")
feature_importances_rank_idx = all_rf_tree_data['feature_importances_rank_idx']
feature_importances = all_rf_tree_data['feature_importances']
for f in range(X_train.shape[1]):
    # feature_importances_rank_idx[f] is the feature index sitting at rank f
    print("%d. feature %d (%f)" % (
        f + 1,
        feature_importances_rank_idx[f],
        feature_importances[feature_importances_rank_idx[f]],
    ))
In [17]:
# Bar chart of the forest's feature importances in ranked order, with the
# 'feature_importances_std' values drawn as error bars.
feature_importances_std = all_rf_tree_data['feature_importances_std']
plt.figure()
plt.title("Feature importances")
plt.bar(
    range(X_train.shape[1]),
    feature_importances[feature_importances_rank_idx],
    color="r",
    yerr=feature_importances_std[feature_importances_rank_idx],
    align="center",
)
# Label each bar with the index of the feature it represents
plt.xticks(range(X_train.shape[1]), feature_importances_rank_idx)
plt.xlim([-1, X_train.shape[1]])
plt.show()
In [18]:
# From the first tree ('dtree0'), keep only the feature paths whose leaf node
# class equals 1.
uniq_feature_paths = all_rf_tree_data['dtree0']['all_uniq_leaf_paths_features']
leaf_node_classes = all_rf_tree_data['dtree0']['all_leaf_node_classes']
ones_only = [
    path
    for path, leaf_class in zip(uniq_feature_paths, leaf_node_classes)
    if leaf_class == 1
]
ones_only
Out[18]:
In [19]:
# Take the last seven class-1 paths (fewer if ones_only is shorter). Seven is
# the number of nodes in the hand-built binary tree a few cells below.
ones_only_seven = ones_only[-7:]
ones_only_seven
Out[19]:
In [20]:
# Hand-built binary RIT, useful as a reference for unit tests.
# Every node is the intersection of its parent with the next path:
#
#            node0
#          /       \
#      node1       node4
#      /   \       /   \
#  node2  node3 node5  node6

# Root: the first path as-is
node0 = ones_only_seven[0]

# First subtree
node1 = np.intersect1d(node0, ones_only_seven[1])
node2 = np.intersect1d(node1, ones_only_seven[2])
node3 = np.intersect1d(node1, ones_only_seven[3])

# Second subtree
node4 = np.intersect1d(node0, ones_only_seven[4])
node5 = np.intersect1d(node4, ones_only_seven[5])
node6 = np.intersect1d(node4, ones_only_seven[6])

intersected_nodes_seven = [node0, node1, node2, node3, node4, node5, node6]
for idx, node in enumerate(intersected_nodes_seven):
    print("node" + str(idx), node)
In [21]:
# Union of the four leaves of the hand-built tree (node2, node3, node5, node6).
rit_output = reduce(
    np.union1d,
    (node2, node3, node5, node6),
)
rit_output
Out[21]:
In [22]:
# One-shot generator over the seven paths; it is handed to build_tree below.
# Peeking at it here with next(ones_only_seven_gen) would consume paths, so
# re-create the generator before building the tree if you do.
ones_only_seven_gen = (leaf_path for leaf_path in ones_only_seven)
In [23]:
# Let build_tree consume the same seven paths so that its result can be
# compared with the hand-built nodes.
rit_man0 = irf_utils.build_tree(feature_paths=ones_only_seven_gen,
                                max_depth=3,
                                noisy_split=False,
                                num_splits=2)
In [24]:
# Value on the root of the library-built tree. The following cell repeats this
# line before printing the remaining nodes.
print("Root:\n", rit_man0._val)
In [25]:
# Print every node of the library-built tree under the label of the hand-built
# node it is expected to match.
print("Root:\n", rit_man0._val)

# Depth 1
print("node1:\n", rit_man0.children[0]._val)
print("node4:\n", rit_man0.children[1]._val)

# Depth 2, under the first child
print("node2:\n", rit_man0.children[0].children[0]._val)
print("node3:\n", rit_man0.children[0].children[1]._val)

# Depth 2, under the second child
print("node5:\n", rit_man0.children[1].children[0]._val)
print("node6:\n", rit_man0.children[1].children[1]._val)
In [26]:
# All node values of the library-built tree, in depth-first order.
for node in rit_man0.traverse_depth_first():
    print(node[1]._val)
In [27]:
# Leaf values only; their union should correspond to the hand-computed rit_output.
for node in rit_man0.leaf_nodes():
    print(node[1]._val)
In [28]:
# Draw the first estimator of the forest object stored under 'rf_obj'.
irf_jupyter_utils.draw_tree(
    decision_tree=all_rf_tree_data['rf_obj'].estimators_[0],
)
In [29]:
# Pretty-print everything stored for the first tree ('dtree0').
irf_jupyter_utils.pretty_print_dict(inp_dict = all_rf_tree_data['dtree0'])
In [30]:
# Count the number of samples passing through the leaf nodes of the first tree
# by summing its 'tot_leaf_node_values' entries.
sum(all_rf_tree_data['dtree0']['tot_leaf_node_values'])
Out[30]:
In [31]:
# Pretty-print the 'all_leaf_paths_features' entry of the first tree.
irf_jupyter_utils.pretty_print_dict(inp_dict = all_rf_tree_data['dtree0']['all_leaf_paths_features'])