Key Requirements for the iRF scikit-learn implementation

  • This notebook documents the main requirements for the iRF implementation

Typical Setup


In [9]:
%matplotlib inline
import matplotlib.pyplot as plt

from sklearn.datasets import load_iris
from sklearn.datasets import load_breast_cancer
import numpy as np
from functools import reduce

# Import our custom utilities.
# `importlib.reload` replaces `imp.reload`: the `imp` module is deprecated
# since Python 3.4 and was removed in Python 3.12, so the old import fails on
# current interpreters.
from importlib import reload
from utils import irf_jupyter_utils
from utils import irf_utils

# Re-import both modules so that edits made to their source files are picked
# up without restarting the kernel.
reload(irf_jupyter_utils)
reload(irf_utils)


Out[9]:
<module 'utils.irf_utils' from '/Users/shamindras/PERSONAL/LEARNING/REPOS/scikit-learn-sandbox/jupyter/utils/irf_utils.py'>

In [10]:
# Seed NumPy's global RNG first so the cell gives the same tree on every run
# (presumably select_random_path / build_tree draw from it - TODO confirm).
np.random.seed(12)

# Obtain the feature paths, then grow the tree from them.
rit_feature_paths = irf_utils.select_random_path()
tree = irf_utils.build_tree(
    feature_paths=rit_feature_paths,
    max_depth=3,
    noisy_split=False,
    num_splits=5,
)

In [11]:
# Show the value stored on the root of the tree. `_val` is a private
# attribute (leading underscore) and is read here only for inspection.
print("Root:\n", tree._val)
# Optional: inspect a node two levels down (left disabled in the original).
#print("Some child:\n", tree.children[0].children[1]._val)


Root:
 [ 0  1  4  5  6  8 12 13 14 16 17 19 23 24 25 30 31 33 34 35 37 39 40 41 42
 43 44 45 46 49 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 71 72 73
 74 76 78 79 80 83 84 85 86 87 89 90 92 93 96 97 99]

In [12]:
# Sanity check on the tree built with num_splits=5, max_depth=3 and
# noisy_split=False: one root, 5 nodes below it and 5**2 below those,
# i.e. 5**0 + 5**1 + 5**2 = 31 nodes in total.
expected_node_count = sum(5 ** level for level in range(3))
assert len(tree) == expected_node_count
# Alternative check kept from the original, presumably for a different
# configuration - TODO confirm or delete:
#assert(len(tree) == 6)

In [13]:
# Materialise the depth-first traversal so the notebook displays it. The
# recorded output shows (index, node) pairs: index 0 is the RITTree root and
# the remaining entries are RITNode objects.
list(tree.traverse_depth_first())


Out[13]:
[(0, <utils.irf_utils.RITTree at 0x11b554400>),
 (1, <utils.irf_utils.RITNode at 0x11b554358>),
 (2, <utils.irf_utils.RITNode at 0x11b5545c0>),
 (3, <utils.irf_utils.RITNode at 0x11b12ab00>),
 (4, <utils.irf_utils.RITNode at 0x11b5545f8>),
 (5, <utils.irf_utils.RITNode at 0x110b29048>),
 (6, <utils.irf_utils.RITNode at 0x11b554908>),
 (7, <utils.irf_utils.RITNode at 0x11b554978>),
 (8, <utils.irf_utils.RITNode at 0x11b554588>),
 (9, <utils.irf_utils.RITNode at 0x11b5540b8>),
 (10, <utils.irf_utils.RITNode at 0x11b5549b0>),
 (11, <utils.irf_utils.RITNode at 0x11b5547f0>),
 (12, <utils.irf_utils.RITNode at 0x11b554860>),
 (13, <utils.irf_utils.RITNode at 0x11b554630>),
 (14, <utils.irf_utils.RITNode at 0x11b5549e8>),
 (15, <utils.irf_utils.RITNode at 0x11b554b00>),
 (16, <utils.irf_utils.RITNode at 0x11b554ac8>),
 (17, <utils.irf_utils.RITNode at 0x11b554f28>),
 (18, <utils.irf_utils.RITNode at 0x11b554e10>),
 (19, <utils.irf_utils.RITNode at 0x11b554e80>),
 (20, <utils.irf_utils.RITNode at 0x11b554ef0>),
 (21, <utils.irf_utils.RITNode at 0x11b554eb8>),
 (22, <utils.irf_utils.RITNode at 0x11b554e48>),
 (23, <utils.irf_utils.RITNode at 0x11b554dd8>),
 (24, <utils.irf_utils.RITNode at 0x11b554da0>),
 (25, <utils.irf_utils.RITNode at 0x11b554b70>),
 (26, <utils.irf_utils.RITNode at 0x11b554c18>),
 (27, <utils.irf_utils.RITNode at 0x11b554be0>),
 (28, <utils.irf_utils.RITNode at 0x11b554a58>),
 (29, <utils.irf_utils.RITNode at 0x118da3550>),
 (30, <utils.irf_utils.RITNode at 0x11b5a4198>)]

In [49]:
# scikit-learn pieces used by this cell (imported here, as in the original).
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier

# Load the breast-cancer data set and put 90% of the rows in the training
# split; the fixed random_state keeps the split reproducible.
raw_data = load_breast_cancer()
X_train, X_test, y_train, y_test = train_test_split(
    raw_data.data,
    raw_data.target,
    train_size=0.9,
    random_state=2017,
)

# Fit a small, seeded random forest. fit() returns the estimator itself, so
# construction and fitting can be chained.
rf = RandomForestClassifier(n_estimators=10, random_state=2018).fit(
    X_train, y_train
)

# Take the first fitted tree of the forest and collect its tree data,
# starting from node id 0.
estimator0 = rf.estimators_[0]
estimator0_out = irf_utils.get_tree_data(
    X_train=X_train,
    dtree=estimator0,
    root_node_id=0,
)
print(estimator0_out['all_leaf_nodes'])


[6, 8, 9, 10, 12, 14, 15, 19, 22, 23, 24, 25, 26, 29, 30, 32, 34, 36, 37, 40, 41, 42]

In [50]:
# Import our custom utilities.
# `importlib.reload` replaces `imp.reload`: the `imp` module is deprecated
# since Python 3.4 and was removed in Python 3.12, so the old import fails on
# current interpreters.
from importlib import reload
from utils import irf_jupyter_utils
from utils import irf_utils

# Re-import both modules so that edits made to their source files are picked
# up without restarting the kernel.
reload(irf_jupyter_utils)
reload(irf_utils)


Out[50]:
<module 'utils.irf_utils' from '/Users/shamindras/PERSONAL/LEARNING/REPOS/scikit-learn-sandbox/jupyter/utils/irf_utils.py'>

In [51]:
# Filter the tree data of the first estimator for binary class 1
# (presumably this keeps only the leaves labelled 1 - the counts printed in
# the following cells agree with that reading).
estimator0_out_fltr = irf_utils.filter_leaves_classifier(
    dtree_data=estimator0_out,
    bin_class_type=1,
)

In [52]:
# Display the filtered result. The recorded output is a dict with the keys
# 'f_leaf_nodes_depths', 'f_tot_leaf_node_values' and 'f_uniq_feature_paths',
# each holding one entry per retained leaf.
estimator0_out_fltr


Out[52]:
{'f_leaf_nodes_depths': [6, 7, 5, 6, 6, 8, 7, 4, 4, 5, 6, 3],
 'f_tot_leaf_node_values': [239, 8, 8, 2, 27, 1, 10, 7, 2, 2, 1, 6],
 'f_uniq_feature_paths': [array([ 1,  5,  6, 13, 23, 26]),
  array([ 1,  5,  6,  9, 13, 23, 26]),
  array([ 1,  6, 22, 23, 26]),
  array([ 1,  6, 13, 22, 23, 26]),
  array([ 1,  3, 23, 26, 27]),
  array([ 1,  3, 17, 19, 23, 26, 27]),
  array([ 1,  3, 19, 23, 26, 27]),
  array([ 3, 18, 23, 26]),
  array([ 3,  8, 23, 26]),
  array([ 3,  8, 22, 23, 26]),
  array([ 3,  4,  8, 22, 23, 26]),
  array([22, 23, 26])]}

In [53]:
# One class label is stored per leaf node. The length gives the number of
# labels; summing them counts the leaves labelled 1 (assuming the labels are
# 0/1).
leaf_node_classes = estimator0_out['all_leaf_node_classes']
print("Total Number of classes", len(leaf_node_classes), sep=":\n")
print("Total Number of 1-value classes", sum(leaf_node_classes), sep=":\n")


Total Number of classes:
22
Total Number of 1-value classes:
12

In [55]:
# Cross-check: the filtered result holds one depth entry per retained leaf, so
# this count is expected to equal the number of 1-valued leaf classes in the
# unfiltered tree data.
filtered_leaf_depths = estimator0_out_fltr['f_leaf_nodes_depths']
print("Total Number of 1-value classes", len(filtered_leaf_depths), sep=":\n")


Total Number of 1-value classes:
12

In [ ]: