In [122]:
%matplotlib inline
import matplotlib.pyplot as plt
from sklearn.datasets import load_breast_cancer
import numpy as np
from functools import reduce
# Needed for the scikit-learn wrapper function
from sklearn.utils import resample
from sklearn.ensemble import RandomForestClassifier
from math import ceil
# Import our custom utilities
from importlib import reload
from utils import irf_jupyter_utils
from utils import irf_utils
reload(irf_jupyter_utils)
reload(irf_utils)
In [123]:
# Load the breast cancer dataset
# (don't shadow the load_breast_cancer function itself)
breast_cancer = load_breast_cancer()
In [128]:
X_train, X_test, y_train, y_test, rf = irf_jupyter_utils.generate_rf_example(
    n_estimators=10, feature_weight=None)
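For orientation, here is a minimal sketch of what a helper like generate_rf_example plausibly does under the hood: split the breast cancer data and fit a forest. This is an assumption about its internals, not the actual implementation in irf_jupyter_utils, and it relies on the patched RandomForestClassifier whose fit() accepts a feature_weight argument.
In [ ]:
# Hypothetical sketch of a generate_rf_example-style helper;
# the real implementation lives in utils/irf_jupyter_utils.py
from sklearn.model_selection import train_test_split

def generate_rf_example_sketch(n_estimators=10, feature_weight=None,
                               train_split_propn=0.9, random_state=2017):
    raw_data = load_breast_cancer()
    X_tr, X_te, y_tr, y_te = train_test_split(
        raw_data.data, raw_data.target,
        train_size=train_split_propn, random_state=random_state)
    rf = RandomForestClassifier(n_estimators=n_estimators,
                                random_state=random_state)
    # feature_weight is supported by the patched scikit-learn
    # fork used in this project, not by stock scikit-learn
    rf.fit(X=X_tr, y=y_tr, feature_weight=feature_weight)
    return X_tr, X_te, y_tr, y_te, rf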
In [129]:
print("Training feature dimensions", X_train.shape, sep = ":\n")
print("\n")
print("Training outcome dimensions", y_train.shape, sep = ":\n")
print("\n")
print("Test feature dimensions", X_test.shape, sep = ":\n")
print("\n")
print("Test outcome dimensions", y_test.shape, sep = ":\n")
print("\n")
print("first 5 rows of the training set features", X_train[:2], sep = ":\n")
print("\n")
print("first 5 rows of the training set outcomes", y_train[:2], sep = ":\n")
In [130]:
all_rf_tree_data = irf_utils.get_rf_tree_data(rf=rf,
                                              X_train=X_train, y_train=y_train,
                                              X_test=X_test, y_test=y_test)
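get_rf_tree_data returns a dictionary bundling the fitted forest with per-tree summaries and forest-level importance statistics; the keys used later in this notebook (rf_obj, dtree0, feature_importances, and friends) can be listed directly:
In [ ]:
# Inspect the structure returned by get_rf_tree_data
sorted(all_rf_tree_data.keys())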
In [131]:
#all_rf_tree_data
rf.feature_importances_
In [132]:
all_rit_tree_data = irf_utils.get_rit_tree_data(
    all_rf_tree_data=all_rf_tree_data,
    bin_class_type=1,
    random_state=12,
    M=100,
    max_depth=2,
    noisy_split=False,
    num_splits=2)
In [133]:
#for i in range(100):
# print(all_rit_tree_data['rit{}'.format(i)]['rit_leaf_node_union_value'])
In [134]:
# Print the feature ranking
print("Feature ranking:")
feature_importances_rank_idx = all_rf_tree_data['feature_importances_rank_idx']
feature_importances = all_rf_tree_data['feature_importances']
for f in range(X_train.shape[1]):
    print("%d. feature %d (%f)" % (f + 1,
                                   feature_importances_rank_idx[f],
                                   feature_importances[feature_importances_rank_idx[f]]))
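The ranking above reports bare column indices. Since the breast cancer Bunch was loaded earlier, and assuming generate_rf_example draws from the same dataset, the indices can be mapped back to named features:
In [ ]:
# Map ranked column indices to the dataset's feature names
for f in range(X_train.shape[1]):
    idx = feature_importances_rank_idx[f]
    print("%d. %s (%f)" % (f + 1,
                           breast_cancer.feature_names[idx],
                           feature_importances[idx]))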
In [135]:
# Plot the feature importances of the forest
feature_importances_std = all_rf_tree_data['feature_importances_std']
plt.figure()
plt.title("Feature importances")
plt.bar(range(X_train.shape[1]),
        feature_importances[feature_importances_rank_idx],
        color="r",
        yerr=feature_importances_std[feature_importances_rank_idx],
        align="center")
plt.xticks(range(X_train.shape[1]), feature_importances_rank_idx)
plt.xlim([-1, X_train.shape[1]])
plt.show()
In [136]:
# Now plot the trees individually
#irf_jupyter_utils.draw_tree(decision_tree = all_rf_tree_data['rf_obj'].estimators_[0])
In [137]:
#irf_jupyter_utils.pretty_print_dict(inp_dict = all_rf_tree_data['dtree0'])
In [138]:
# Count the number of samples passing through the leaf nodes
sum(all_rf_tree_data['dtree0']['tot_leaf_node_values'])
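As a rough sanity check, this total can be compared with the number of training rows; whether the two match depends on whether irf_utils tallies leaf counts over the full training set or over each tree's bootstrap sample, so treat any mismatch as informative rather than as an error:
In [ ]:
# Compare the leaf-node total against the training set size
print(sum(all_rf_tree_data['dtree0']['tot_leaf_node_values']),
      X_train.shape[0])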
In [139]:
#irf_jupyter_utils.pretty_print_dict(inp_dict = all_rf_tree_data['dtree0']['all_leaf_paths_features'])
In [142]:
def run_RIT(X_train,
            X_test,
            y_train,
            y_test,
            K,
            n_estimators,
            B,
            random_state_classifier=2018,
            propn_n_samples=0.2,
            bin_class_type=1,
            random_state=12,
            M=4,
            max_depth=2,
            noisy_split=False,
            num_splits=2):
    """Run K iterations of feature-weighted random forests,
    then fit RITs on B bootstrap samples using the final weights.
    """
    # Set the random state for reproducibility
    np.random.seed(random_state_classifier)

    # Convert the bootstrap resampling proportion to the number
    # of rows to resample from the training data
    n_samples = ceil(propn_n_samples * X_train.shape[0])

    # Initialize dictionary of rf weights
    # CHECK: change this name to be `all_rf_weights_output`
    all_rf_weights = {}

    # Initialize dictionaries of bootstrap rf and RIT output
    all_rf_bootstrap_output = {}
    all_rit_bootstrap_output = {}

    for k in range(K):
        if k == 0:
            # Initially the feature weights are None
            all_rf_weights["rf_weight{}".format(k)] = None

        # Fit the random forest using the feature weights
        # from the previous iteration
        rf = RandomForestClassifier(n_estimators=n_estimators)
        rf.fit(
            X=X_train,
            y=y_train,
            feature_weight=all_rf_weights["rf_weight{}".format(k)])

        # Update the feature weights with the new feature
        # importance scores and load them for the next iteration
        all_rf_weights["rf_weight{}".format(k + 1)] = rf.feature_importances_

    # Run the RITs
    for b in range(B):

        # Take a bootstrap sample from the training data
        # based on the specified user proportion
        X_train_rsmpl, y_rsmpl = resample(
            X_train, y_train, n_samples=n_samples)

        # Set up the weighted random forest using the weights
        # from the final iteration i.e. RF(w(K))
        # CHECK: consider allowing a different number of trees
        # for the bootstrap samples
        rf_bootstrap = RandomForestClassifier(n_estimators=n_estimators)

        # Fit RF(w(K)) on the bootstrapped dataset
        rf_bootstrap.fit(
            X=X_train_rsmpl,
            y=y_rsmpl,
            feature_weight=all_rf_weights["rf_weight{}".format(K)])

        # All RF tree data
        # CHECK: why do we need y_train here?
        all_rf_tree_data = irf_utils.get_rf_tree_data(
            rf=rf_bootstrap,
            X_train=X_train_rsmpl,
            y_train=y_rsmpl,
            X_test=X_test,
            y_test=y_test)

        # Update the rf bootstrap output dictionary
        all_rf_bootstrap_output['rf_bootstrap{}'.format(b)] = all_rf_tree_data

        # Run RIT on the interaction rule set, passing through
        # the RIT parameters given to run_RIT
        all_rit_tree_data = irf_utils.get_rit_tree_data(
            all_rf_tree_data=all_rf_tree_data,
            bin_class_type=bin_class_type,
            random_state=random_state,
            M=M,
            max_depth=max_depth,
            noisy_split=noisy_split,
            num_splits=num_splits)

        # Update the RIT bootstrap output dictionary, keyed by
        # the same bootstrap id as the rf bootstrap output data
        all_rit_bootstrap_output['rf_bootstrap{}'.format(b)] = all_rit_tree_data

    return all_rf_weights, all_rf_bootstrap_output, all_rit_bootstrap_output
In [145]:
all_rf_weights, all_rf_bootstrap_output, all_rit_bootstrap_output = \
    run_RIT(X_train=X_train,
            X_test=X_test,
            y_train=y_train,
            y_test=y_test,
            K=5,
            n_estimators=20,
            B=3,
            random_state_classifier=2018,
            propn_n_samples=0.2,
            bin_class_type=1,
            random_state=12,
            M=4,
            max_depth=2,
            noisy_split=False,
            num_splits=2)
In [146]:
all_rf_weights
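The weights dictionary holds one vector per iteration: rf_weight0 is None by construction, then rf_weight1 through rf_weight5 are the successive feature importances. A quick summary of how the nonzero support evolves across iterations:
In [ ]:
# Count features retaining nonzero weight at each iteration
for k in range(1, 6):
    w = all_rf_weights['rf_weight{}'.format(k)]
    print('rf_weight{}: {} nonzero of {}'.format(
        k, int(np.count_nonzero(w)), w.shape[0]))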
In [147]:
all_rf_weights, all_rf_bootstrap_output, all_rit_bootstrap_output = run_RIT(
    X_train=X_train,
    X_test=X_test,
    y_train=y_train,
    y_test=y_test,
    K=5,
    n_estimators=20,
    B=3,
    random_state_classifier=2018,
    propn_n_samples=0.2)
In [148]:
all_rit_bootstrap_output
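One natural summary of all_rit_bootstrap_output is a stability score: the proportion of the B bootstrap replicates in which each leaf-node feature union recurs. The sketch below assumes each 'rit{i}' entry's 'rit_leaf_node_union_value' (the key used earlier in this notebook) is an iterable of feature-index arrays; the function name rit_interaction_stability is illustrative and not part of irf_utils.
In [ ]:
from collections import Counter

def rit_interaction_stability(all_rit_bootstrap_output, B, M):
    """Proportion of bootstraps recovering each feature union
    (a sketch under the structural assumptions noted above)."""
    counts = Counter()
    for b in range(B):
        rit_data = all_rit_bootstrap_output['rf_bootstrap{}'.format(b)]
        seen = set()
        for i in range(M):
            unions = rit_data['rit{}'.format(i)]['rit_leaf_node_union_value']
            for interaction in unions:
                # Canonicalize so a feature set counts once per bootstrap
                seen.add(tuple(sorted(np.atleast_1d(interaction).tolist())))
        counts.update(seen)
    return {interaction: n / B for interaction, n in counts.items()}

# Uncomment once the assumed structure is confirmed:
# rit_interaction_stability(all_rit_bootstrap_output, B=3, M=4)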
In [105]:
all_rf_weights, all_rf_bootstrap_output, all_rit_bootstrap_output = run_RIT(
    X_train=X_train,
    X_test=X_test,
    y_train=y_train,
    y_test=y_test,
    K=1,
    n_estimators=1000,
    B=3,
    random_state_classifier=2018,
    propn_n_samples=0.2)
In [110]:
print(all_rf_weights['rf_weight1'].tolist())
In [117]:
rf.feature_importances_
In [85]:
# Note: with K=1 above, the final weights are rf_weight1
rf_weight1 = all_rf_weights['rf_weight1'].tolist()
rf_weight1
In [48]:
# Indices of the features with non-zero weight
sorted([i for i, e in enumerate(rf_weight1) if e != 0])