In [21]:
%matplotlib inline
import matplotlib.pyplot as plt
from sklearn.datasets import load_breast_cancer
import numpy as np
from functools import reduce
# Needed for the scikit-learn wrapper function
from sklearn.utils import resample
from sklearn.ensemble import RandomForestClassifier
from math import ceil
# Import our custom utilities
from importlib import reload
from utils import irf_jupyter_utils
from utils import irf_utils
reload(irf_jupyter_utils)
reload(irf_utils)
Out[21]:
In [22]:
breast_cancer = load_breast_cancer()
In [23]:
X_train, X_test, y_train, y_test, rf = irf_jupyter_utils.generate_rf_example(n_estimators=1000,
feature_weight=None)
In [24]:
print("Training feature dimensions", X_train.shape, sep = ":\n")
print("\n")
print("Training outcome dimensions", y_train.shape, sep = ":\n")
print("\n")
print("Test feature dimensions", X_test.shape, sep = ":\n")
print("\n")
print("Test outcome dimensions", y_test.shape, sep = ":\n")
print("\n")
print("first 5 rows of the training set features", X_train[:2], sep = ":\n")
print("\n")
print("first 5 rows of the training set outcomes", y_train[:2], sep = ":\n")
In [25]:
all_rf_tree_data = irf_utils.get_rf_tree_data(rf=rf,
X_train=X_train, y_train=y_train,
X_test=X_test, y_test=y_test)
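The returned dictionary bundles forest-level summaries with one entry per decision tree. A quick way to list the forest-level keys (the per-tree entries follow the 'dtree{i}' naming used later in this notebook):
In [ ]:
# Forest-level keys; per-tree data lives under 'dtree0', 'dtree1', ...
[k for k in all_rf_tree_data.keys() if not k.startswith('dtree')]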
In [26]:
all_rit_tree_data = irf_utils.get_rit_tree_data(
all_rf_tree_data=all_rf_tree_data,
bin_class_type=1,
random_state=12,
M=100,
max_depth=2,
noisy_split=False,
num_splits=2)
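For intuition on the RIT settings: with noisy_split=False every node gets num_splits children down to max_depth, so each RIT here holds at most 1 + 2 + 4 = 7 intersection nodes, and M=100 such trees are built. A one-line sanity check of that arithmetic:
In [ ]:
# Maximum node count of a single RIT: sum of num_splits**d over depths 0..max_depth
sum(2**d for d in range(2 + 1))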
In [27]:
#for i in range(100):
# print(all_rit_tree_data['rit{}'.format(i)]['rit_leaf_node_union_value'])
In [28]:
# Print the feature ranking
print("Feature ranking:")
feature_importances_rank_idx = all_rf_tree_data['feature_importances_rank_idx']
feature_importances = all_rf_tree_data['feature_importances']
for f in range(X_train.shape[1]):
print("%d. feature %d (%f)" % (f + 1
, feature_importances_rank_idx[f]
, feature_importances[feature_importances_rank_idx[f]]))
In [29]:
# Plot the feature importances of the forest
feature_importances_std = all_rf_tree_data['feature_importances_std']
plt.figure()
plt.title("Feature importances")
plt.bar(range(X_train.shape[1]),
        feature_importances[feature_importances_rank_idx],
        color="r",
        yerr=feature_importances_std[feature_importances_rank_idx],
        align="center")
plt.xticks(range(X_train.shape[1]), feature_importances_rank_idx)
plt.xlim([-1, X_train.shape[1]])
plt.show()
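To make the ranking easier to read, the ranked indices can be mapped back to the dataset's feature names. This sketch assumes generate_rf_example fit the forest on the breast cancer data loaded above:
In [ ]:
# Label the top-ranked features by name (assumes the example forest
# was trained on the breast cancer dataset loaded earlier)
for f in range(5):
    idx = feature_importances_rank_idx[f]
    print("{0}. {1} ({2:.4f})".format(
        f + 1, breast_cancer.feature_names[idx], feature_importances[idx]))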
In [30]:
# Now plot the trees individually
#irf_jupyter_utils.draw_tree(decision_tree = all_rf_tree_data['rf_obj'].estimators_[0])
In [31]:
#irf_jupyter_utils.pretty_print_dict(inp_dict = all_rf_tree_data['dtree0'])
In [32]:
# Count the number of samples passing through the leaf nodes
sum(all_rf_tree_data['dtree0']['tot_leaf_node_values'])
Out[32]:
In [33]:
#irf_jupyter_utils.pretty_print_dict(inp_dict = all_rf_tree_data['dtree0']['all_leaf_paths_features'])
In [34]:
def get_stability_score(all_rit_bootstrap_output):
"""
Get the stabilty score from B bootstrap Random Forest
Fits with RITs
"""
# Initialize values
bootstrap_interact = []
B = len(all_rit_bootstrap_output)
for b in range(B):
rit_counts = irf_utils.rit_interactions(
all_rit_bootstrap_output['rf_bootstrap{}'.format(b)])
rit_counts = list(rit_counts.keys())
bootstrap_interact.append(rit_counts)
flatten = lambda l: [item for sublist in l for item in sublist]
all_rit_interactions = flatten(bootstrap_interact)
stability = {m:all_rit_interactions.count(m)/B for m in all_rit_interactions}
return stability
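As a toy illustration of the stability computation (hypothetical interaction labels, not taken from this dataset): an interaction's score is the fraction of the B bootstrap fits whose RITs recover it at least once.
In [ ]:
# Toy example: 3 'bootstrap samples' with pre-extracted interaction labels
toy_interact = [['1_2', '3'], ['1_2'], ['3', '4']]
flat = [m for sublist in toy_interact for m in sublist]
{m: flat.count(m) / len(toy_interact) for m in set(flat)}
# -> {'1_2': 0.67, '3': 0.67, '4': 0.33} (key order may vary)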
In [35]:
from datetime import datetime
print(datetime.now())
for b in range(3):
    # Take a small bootstrap sample from the training data;
    # vary the random state so each iteration draws a different sample
    X_train_rsmpl, y_rsmpl = resample(
        X_train, y_train, n_samples=3, random_state=b)
    print("iteration {}".format(b), X_train_rsmpl.shape[0],
          y_rsmpl.shape[0])
    print(np.sum(X_train_rsmpl))
    print(np.sum(y_rsmpl))
In [38]:
def run_rit(X_train,
X_test,
y_train,
y_test,
K=7,
n_estimators=20,
B=10,
random_state_classifier=2018,
propn_n_samples=0.2,
bin_class_type=1,
random_state=12,
M=4,
max_depth=2,
noisy_split=False,
num_splits=2,
n_estimators_bootstrap=5):
"""
Runs the iRF algorithm in full.
Parameters
--------
    X_train : array-like or sparse matrix, shape = [n_samples, n_features]
        Training vector, where n_samples is the number of samples and
        n_features is the number of features.
    X_test : array-like or sparse matrix, shape = [n_samples, n_features]
        Test vector, where n_samples is the number of samples and
        n_features is the number of features.
y_train : 1d array-like, or label indicator array / sparse matrix
Ground truth (correct) target values for training.
y_test : 1d array-like, or label indicator array / sparse matrix
Ground truth (correct) target values for testing.
K : int, optional (default = 7)
The number of iterations in iRF.
n_estimators : int, optional (default = 20)
The number of trees in the random forest when computing weights.
B : int, optional (default = 10)
The number of bootstrap samples
random_state_classifier : int, optional (default = 2018)
The random seed for reproducibility.
propn_n_samples : float, optional (default = 0.2)
The proportion of samples drawn for bootstrap.
    bin_class_type : int, optional (default = 1)
        The binary class label whose leaf-node paths are used
        to build the RITs.
    random_state : int, optional (default = 12)
        The random seed for RIT construction. Note that it is not
        currently passed through to `get_rit_tree_data` (see the
        CHECK comment in the code below).
    M : int, optional (default = 4)
        The number of RITs to build per bootstrap sample.
max_depth : int, optional (default = 2)
The built tree will never be deeper than `max_depth`.
num_splits : int, optional (default = 2)
At each node, the maximum number of children to be added.
noisy_split: bool, optional (default = False)
At each node if True, then number of children to
split will be (`num_splits`, `num_splits + 1`)
based on the outcome of a bernoulli(0.5)
random variable
n_estimators_bootstrap : int, optional (default = 5)
The number of trees in the random forest when fitting to bootstrap samples
Returns
--------
all_rf_weights: dict
stores feature weights across all iterations
all_rf_bootstrap_output: dict
stores rf information across all bootstrap samples
all_rit_bootstrap_output: dict
stores rit information across all bootstrap samples
    stability_score: dict
        stores interactions as its keys and stability scores as the values
"""
# Set the random state for reproducibility
np.random.seed(random_state_classifier)
# Convert the bootstrap resampling proportion to the number
# of rows to resample from the training data
n_samples = ceil(propn_n_samples * X_train.shape[0])
# Initialize dictionary of rf weights
# CHECK: change this name to be `all_rf_weights_output`
all_rf_weights = {}
# Initialize dictionary of bootstrap rf output
all_rf_bootstrap_output = {}
# Initialize dictionary of bootstrap RIT output
all_rit_bootstrap_output = {}
    for k in range(K):
        if k == 0:
            # Initially the feature weights are None (unweighted RF)
            all_rf_weights["rf_weight{}".format(k)] = None
        # Fit the RF using the feature weights from the previous iteration
        rf = RandomForestClassifier(n_estimators=n_estimators)
        rf.fit(
            X=X_train,
            y=y_train,
            feature_weight=all_rf_weights["rf_weight{}".format(k)])
        # Load the updated feature importance scores as the
        # weights for the next iteration
        all_rf_weights["rf_weight{}".format(k + 1)] = rf.feature_importances_
# Run the RITs
for b in range(B):
# Take a bootstrap sample from the training data
# based on the specified user proportion
X_train_rsmpl, y_rsmpl = resample(
X_train, y_train, n_samples=n_samples)
# Set up the weighted random forest
# Using the weight from the (K-1)th iteration i.e. RF(w(K))
rf_bootstrap = RandomForestClassifier(
#CHECK: different number of trees to fit for bootstrap samples
n_estimators=n_estimators_bootstrap)
# Fit RF(w(K)) on the bootstrapped dataset
rf_bootstrap.fit(
X=X_train_rsmpl,
y=y_rsmpl,
feature_weight=all_rf_weights["rf_weight{}".format(K)])
# All RF tree data
# CHECK: why do we need y_train here?
all_rf_tree_data = irf_utils.get_rf_tree_data(
rf=rf_bootstrap,
X_train=X_train_rsmpl,
y_train=y_rsmpl,
X_test=X_test,
y_test=y_test)
# Update the rf bootstrap output dictionary
all_rf_bootstrap_output['rf_bootstrap{}'.format(b)] = all_rf_tree_data
# Run RIT on the interaction rule set
# CHECK - each of these variables needs to be passed into
# the main run_rit function
all_rit_tree_data = irf_utils.get_rit_tree_data(
all_rf_tree_data=all_rf_tree_data,
bin_class_type=bin_class_type,
M=M,
max_depth=max_depth,
noisy_split=noisy_split,
num_splits=num_splits)
# Update the rf bootstrap output dictionary
# We will reference the RIT for a particular rf bootstrap
# using the specific bootstrap id - consistent with the
# rf bootstrap output data
all_rit_bootstrap_output['rf_bootstrap{}'.format(b)] = all_rit_tree_data
    stability_score = get_stability_score(
        all_rit_bootstrap_output=all_rit_bootstrap_output)
return all_rf_weights, all_rf_bootstrap_output, all_rit_bootstrap_output, stability_score
In [20]:
all_rf_weights, all_rf_bootstrap_output, all_rit_bootstrap_output, stability_score =\
run_rit(X_train=X_train,
X_test=X_test,
y_train=y_train,
y_test=y_test,
K=5,
n_estimators=20,
B=10,
random_state_classifier=2018,
propn_n_samples=.2,
bin_class_type=1,
random_state=12,
M=4,
max_depth=2,
noisy_split=False,
num_splits=2,
n_estimators_bootstrap=5)
print(all_rf_weights)
In [100]:
stability_score
Out[100]:
In [101]:
B = 10
for i in range(B):
rit_output = all_rit_bootstrap_output['rf_bootstrap{}'.format(i)]
interactions = irf_utils.rit_interactions(rit_output)
print('rf_bootstrap{}'.format(i), interactions)
In [97]:
all_rit_bootstrap_output['rf_bootstrap1']['rit2']
Out[97]:
In [56]:
all_rit_counts = []
for i in range(10):
    rit_counts = irf_utils.rit_interactions(
        all_rit_bootstrap_output['rf_bootstrap{}'.format(i)])
    # Keep only the presence/absence of each interaction per bootstrap
    rit_counts = dict.fromkeys(rit_counts, 1)
    all_rit_counts.append(rit_counts)
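Since each dictionary above only flags the presence of an interaction in one bootstrap sample, summing the flags and dividing by the number of bootstrap samples should reproduce the stability_score computed earlier:
In [ ]:
from collections import Counter
# Sum the per-bootstrap indicator dicts, then normalize by B = 10;
# this should agree with stability_score from get_stability_score
indicator_totals = Counter()
for d in all_rit_counts:
    indicator_totals.update(d)
{k: v / len(all_rit_counts) for k, v in indicator_totals.items()}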
In [49]:
all_rf_weights_1iter, all_rf_bootstrap_output_1iter, \
    all_rit_bootstrap_output_1iter, stability_score_1iter = \
    run_rit(X_train=X_train,
X_test=X_test,
y_train=y_train,
y_test=y_test,
K=1,
n_estimators=1000,
B=10,
random_state_classifier=2018,
propn_n_samples=0.2,
bin_class_type=1,
random_state=12,
M=4,
max_depth=2,
noisy_split=False,
num_splits=2)
In [55]:
print(all_rf_weights_1iter['rf_weight1'])
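As a quick check on the first-iteration weights: impurity-based importances in standard scikit-learn are normalized, so (assuming this fork keeps that normalization) the weights should sum to 1.
In [ ]:
# Importances are normalized in standard scikit-learn; expect ~1.0
np.sum(all_rf_weights_1iter['rf_weight1'])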
In [60]:
rf.feature_importances_
Out[60]:
In [61]:
# Convert the final (iteration K = 5) feature weights to a plain list
rf_weight5 = all_rf_weights['rf_weight5'].tolist()
rf_weight5
Out[61]:
In [48]:
sorted([i for i, e in enumerate(rf_weight5) if e != 0])
Out[48]: