RIT parameters:

M  Number of trees to build
D  Max tree depth
p  Children sample probability threshold (p = 0 means no noisy splits; otherwise a uniform(0, 1) RNG draw is compared against the threshold)
n  Min number of children to sample at each node (if p != 0, then at each node: if the uniform draw is <= p, sample n children at that node, else sample n + 1 children)

For example, for a plain binary RIT (always 2 children sampled at each node), set p = 0 and n = 2.
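Before the full pseudocode below, the p/n rule can be made concrete with a minimal sketch. The helper name rit_num_children and the use of numpy's default RNG are illustrative choices, not part of irf_utils:

import numpy as np

def rit_num_children(p, n, rng=None):
    # illustrative helper, not part of irf_utils: decide how many
    # children to sample at a single RIT node
    rng = rng or np.random.default_rng()
    if p == 0:
        # no noisy splits: always exactly n children
        # (p = 0, n = 2 gives a binary RIT)
        return n
    # noisy split: n children if the uniform draw falls at or below the
    # threshold p, otherwise n + 1
    return n if rng.uniform(0, 1) <= p else n + 1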
def random_intersection_tree(rf_B,               # number of decision trees to fit for each random forest
                             rf_weighted=True,
                             K=4,
                             # RIT params
                             M_trees=20,
                             max_depth=5,
                             n_splits=2,
                             noisy_splits=False,
                             **rf_params):        # remaining RF keyword arguments
    for k in range(K):
        if k == 0 and rf_weighted:
            # set the weights uniformly for the first iteration;
            # p is the number of features in the training set
            rf_weights = np.ones(p) / p
        # run the random forest with bootstrap samples on the training set
        rf = RandomForestClassifier(**rf_params,
                                    n_estimators=rf_B,
                                    rf_weights=rf_weights)
        # reset the weight parameters for the next random forest iteration
        rf_weights = rf.feature_importances_
    # get the random forest metrics, i.e. validation scores on the test data
    rf_metrics = irf_utils.get_validation_metrics(inp_class_reg_obj=rf,
                                                  y_true=y_test,
                                                  X_test=X_test)
    # rank the final feature importances for the output dictionary
    feature_importances = rf.feature_importances_
    feature_importances_rank_idx = np.argsort(feature_importances)[::-1]
    # load into a dictionary for easy re-use later
    all_rf_outputs = {"rf_obj": rf,
                      "feature_importances": feature_importances,
                      "feature_importances_rank_idx": feature_importances_rank_idx,
                      "rf_metrics": rf_metrics}
    # get the individual decision tree output for the random forest
    # CHECK: the following could be parallelized, e.g. with joblib!
    for idx, dtree in enumerate(rf.estimators_):
        dtree_out = irf_utils.get_tree_data(X_train=X_train,
                                            dtree=dtree,
                                            root_node_id=0)
        # append the output to the dictionary
        all_rf_outputs["dtree" + str(idx)] = dtree_out
    # run the RIT using the decision tree outputs;
    # the result should be a dictionary with a similar structure
    all_rit_outputs = random_intersection_trees(all_rf_outputs,
                                                M_trees=M_trees,
                                                max_depth=max_depth,
                                                n_splits=n_splits,
                                                noisy_splits=noisy_splits)
    # should be able to access the rit_output
    stability_score = ...
    # append the stability score to the RIT outputs
    all_rit_outputs['stability_score'] = stability_score
    # return the dictionaries
    return all_rf_outputs, all_rit_outputs
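For reference, the intersection step that a call like random_intersection_trees would perform can be sketched as follows. This is a minimal illustration, not the irf_utils implementation: the function name, the 50/50 noisy-split rule, and the assumption that feature_paths yields Python sets are all assumptions.

import numpy as np

def rit_node_intersections(feature_paths, max_depth=3, num_splits=2,
                           noisy_split=False, rng=None):
    # feature_paths: iterator yielding sets of feature indices, one
    # randomly drawn leaf-node path per call (cf. generate_rit_samples)
    rng = rng or np.random.default_rng()

    def recurse(node_set, depth):
        yield node_set
        if depth >= max_depth or not node_set:
            return
        # with noisy splits, occasionally sample one extra child per node
        n_children = num_splits + (1 if noisy_split and rng.uniform() <= 0.5 else 0)
        for _ in range(n_children):
            # each child holds the intersection of its parent's feature
            # set with a freshly drawn leaf path
            yield from recurse(node_set & next(feature_paths), depth + 1)

    yield from recurse(next(feature_paths), depth=1)

Candidate interactions are then the feature sets that survive intersection down to the deeper nodes across the M trees.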
In [1]:
%matplotlib inline
import matplotlib.pyplot as plt
from sklearn.datasets import load_breast_cancer
import numpy as np
from functools import reduce
# Import our custom utilities
from imp import reload
from utils import irf_jupyter_utils
from utils import irf_utils
reload(irf_jupyter_utils)
reload(irf_utils)
In [4]:
X_train, X_test, y_train, y_test, rf = irf_jupyter_utils.generate_rf_example(
    sklearn_ds=load_breast_cancer(), n_estimators=10)
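For readers without the utils package, generate_rf_example can plausibly be approximated with a standard split-and-fit. This is a sketch under that assumption; the helper's actual defaults (split size, seeding) may differ:

from sklearn.datasets import load_breast_cancer
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split

def generate_rf_example_sketch(sklearn_ds, n_estimators=10, random_state=2018):
    # split the sklearn dataset bunch into train/test folds
    X_train, X_test, y_train, y_test = train_test_split(
        sklearn_ds.data, sklearn_ds.target, random_state=random_state)
    # fit a plain random forest on the training fold
    rf = RandomForestClassifier(n_estimators=n_estimators,
                                random_state=random_state)
    rf.fit(X_train, y_train)
    return X_train, X_test, y_train, y_test, rf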
In [5]:
all_rf_tree_data = irf_utils.get_rf_tree_data(rf=rf,
                                              X_train=X_train, y_train=y_train,
                                              X_test=X_test, y_test=y_test)
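If get_rf_tree_data mirrors the dictionary layout sketched in the pseudocode above (an assumption; check the irf_utils source), the returned structure can be inspected directly:

# expected keys, per the all_rf_outputs layout above: 'rf_obj',
# 'feature_importances', 'feature_importances_rank_idx', 'rf_metrics',
# plus one 'dtree<idx>' entry per fitted decision tree
print(sorted(all_rf_tree_data.keys()))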
In [6]:
rf_weights = all_rf_tree_data['feature_importances']
In [8]:
gen_random_leaf_paths = irf_utils.generate_rit_samples(
    all_rf_tree_data=all_rf_tree_data, bin_class_type=1)

rit0 = irf_utils.build_tree(feature_paths=gen_random_leaf_paths,
                            max_depth=3,
                            noisy_split=False,
                            num_splits=5)