In [3]:
%matplotlib inline
import matplotlib.pyplot as plt
from sklearn.datasets import load_breast_cancer
import numpy as np
from functools import reduce
# Import our custom utilities
from imp import reload
from utils import irf_jupyter_utils
from utils import irf_utils
reload(irf_jupyter_utils)
reload(irf_utils)
# Import RF related functions
from sklearn.ensemble import RandomForestClassifier
Out[3]:
In [4]:
# Weight setting for the first run: passed as `feature_weight` to
# generate_rf_example below. None presumably means "no feature weighting"
# -- TODO confirm against irf_jupyter_utils.generate_rf_example.
feature_weight0 = None
In [5]:
# Run 1: obtain the train/test split and the forest `rf` from the example
# helper, using 1000 estimators and `feature_weight0` as the feature weights.
(X_train, X_test, y_train, y_test, rf) = irf_jupyter_utils.generate_rf_example(
    n_estimators=1000,
    feature_weight=feature_weight0,
)
In [8]:
# Collect the forest/tree data for run 1. The result is indexed by string
# keys such as 'feature_importances' in the cells that follow.
all_rf_tree_data = irf_utils.get_rf_tree_data(
    rf=rf,
    X_train=X_train,
    X_test=X_test,
    y_test=y_test,
)
# (Evaluate `all_rf_tree_data` on its own to inspect the full result.)
In [9]:
# Keep run 1's feature importances and their rank order under run-specific
# names (all_rf_tree_data is overwritten by later runs), then print the
# importances.
feature_importances0 = all_rf_tree_data['feature_importances']
feature_importances_rank_idx0 = all_rf_tree_data['feature_importances_rank_idx']
print(feature_importances0)
In [10]:
# Reference feature importances: 30 values, one per feature, in feature-index
# order. Reused further down as the feature weights of the third run.
correct_feature_importance = [
    0.04153319, 0.0136872, 0.05287382, 0.05537257, 0.00571718,   # features 0-4
    0.01101297, 0.04525511, 0.08925701, 0.00407582, 0.00337926,  # features 5-9
    0.01301454, 0.00396505, 0.01022279, 0.03255195, 0.00498767,  # features 10-14
    0.00438016, 0.00771317, 0.00459407, 0.0037973, 0.00448982,   # features 15-19
    0.10938616, 0.01690837, 0.14415417, 0.1204331, 0.01276175,   # features 20-24
    0.01472586, 0.03019196, 0.12449026, 0.00858072, 0.00648698,  # features 25-29
]
In [11]:
# Weight setting for the second run: the same weight (1) for each of 30 features.
feature_weight1 = [1 for _ in range(30)]
In [12]:
# Run 2: same helper and forest size as before, now with the all-ones
# weights in `feature_weight1`. This rebinds X_train, X_test, y_train,
# y_test and rf.
(X_train, X_test, y_train, y_test, rf) = irf_jupyter_utils.generate_rf_example(
    n_estimators=1000,
    feature_weight=feature_weight1,
)
In [14]:
# Collect the forest/tree data for the current `rf`; this overwrites the
# previous contents of all_rf_tree_data.
all_rf_tree_data = irf_utils.get_rf_tree_data(
    rf=rf,
    X_train=X_train,
    X_test=X_test,
    y_test=y_test,
)
# (Evaluate `all_rf_tree_data` on its own to inspect the full result.)
In [15]:
# Keep the feature importances and rank order from the most recent
# get_rf_tree_data call under run-specific names, then print the importances.
feature_importances1 = all_rf_tree_data['feature_importances']
feature_importances_rank_idx1 = all_rf_tree_data['feature_importances_rank_idx']
print(feature_importances1)
In [16]:
# Weight setting for the third run: the reference importances themselves.
# This binds the same list object as correct_feature_importance (no copy).
feature_weight2 = correct_feature_importance
In [17]:
# Run 3: same helper and forest size, now weighted by `feature_weight2`.
# This rebinds X_train, X_test, y_train, y_test and rf.
(X_train, X_test, y_train, y_test, rf) = irf_jupyter_utils.generate_rf_example(
    n_estimators=1000,
    feature_weight=feature_weight2,
)
In [19]:
# Collect the forest/tree data for the current `rf`; this overwrites the
# previous contents of all_rf_tree_data.
all_rf_tree_data = irf_utils.get_rf_tree_data(
    rf=rf,
    X_train=X_train,
    X_test=X_test,
    y_test=y_test,
)
# (Evaluate `all_rf_tree_data` on its own to inspect the full result.)
In [20]:
# Feature importances and rank order from the most recent get_rf_tree_data call.
feature_importances_rank_idx2 = all_rf_tree_data['feature_importances_rank_idx']
feature_importances2 = all_rf_tree_data['feature_importances']

# Print the two rankings side by side, one line per rank position:
# left  = feature index and importance taken from feature_importances1,
# right = feature index and importance taken from feature_importances2.
for f in range(X_train.shape[1]):
    print(
        "%2d. feature %2d (%10.9f) and feature %2d (%10.9f)" % (
            f + 1,
            feature_importances_rank_idx1[f],
            feature_importances1[feature_importances_rank_idx1[f]],
            feature_importances_rank_idx2[f],
            feature_importances2[feature_importances_rank_idx2[f]],
        )
    )
In [44]:
def test_iRF_weight1():
    """Run iRF on pure-noise data and check that no feature dominates.

    Features and binary labels are drawn independently of each other, so no
    feature carries signal about the label. The check is that the largest
    entry of ``all_rf_weights['rf_weight5']`` stays below 0.135.

    NOTE(review): 'rf_weight5' is presumably the weight vector after the 5th
    (last, since K=5) iteration -- confirm against irf_utils.run_iRF.
    """
    n_samples = 1000
    n_features = 10
    random_state_classifier = 2018

    # Seed the global NumPy RNG and draw in a fixed order (train X, train y,
    # test X, test y). The order matters: changing it changes the data.
    np.random.seed(random_state_classifier)
    feature_shape = (n_samples, n_features)
    X_train = np.random.uniform(low=0, high=1, size=feature_shape)
    y_train = np.random.choice([0, 1], size=(n_samples,), p=[.5, .5])
    X_test = np.random.uniform(low=0, high=1, size=feature_shape)
    y_test = np.random.choice([0, 1], size=(n_samples,), p=[.5, .5])

    irf_settings = dict(
        K=5,
        n_estimators=20,
        B=30,
        random_state_classifier=random_state_classifier,
        propn_n_samples=.2,
        bin_class_type=1,
        M=20,
        max_depth=5,
        noisy_split=False,
        num_splits=2,
        n_estimators_bootstrap=5,
    )
    irf_output = irf_utils.run_iRF(
        X_train=X_train,
        X_test=X_test,
        y_train=y_train,
        y_test=y_test,
        **irf_settings
    )
    # run_iRF returns five values; only the weights are inspected here.
    (all_rf_weights,
     all_K_iter_rf_data,
     all_rf_bootstrap_output,
     all_rit_bootstrap_output,
     stability_score) = irf_output

    final_weights = all_rf_weights['rf_weight5']
    assert np.max(final_weights) < 0.135


test_iRF_weight1()
In [53]:
def test_iRF_weight2():
    """Run iRF on data where one column separates the classes perfectly.

    All features are first drawn uniformly from [0, 1) and the binary labels
    are drawn independently. The label is then added to the column at index 1
    (i.e. the second column), so in that column class 0 lies in [0, 1) and
    class 1 lies in [1, 2). The check is that the entry at index 1 of
    ``all_rf_weights['rf_weight5']`` equals 1 up to floating-point tolerance.

    NOTE(review): 'rf_weight5' is presumably the weight vector after the 5th
    (last, since K=5) iteration -- confirm against irf_utils.run_iRF.

    Raises:
        AssertionError: if the weight at index 1 is not numerically 1.
    """
    n_samples = 1000
    n_features = 10
    random_state_classifier = 2018
    informative_col = 1  # zero-based column index: the SECOND feature

    # Seed the global NumPy RNG and draw in a fixed order (train X, train y,
    # test X, test y). The order matters: changing it changes the data.
    np.random.seed(random_state_classifier)
    X_train = np.random.uniform(low=0, high=1, size=(n_samples, n_features))
    y_train = np.random.choice([0, 1], size=(n_samples,), p=[.5, .5])
    X_test = np.random.uniform(low=0, high=1, size=(n_samples, n_features))
    y_test = np.random.choice([0, 1], size=(n_samples,), p=[.5, .5])

    # Shift the informative column by the label so it alone determines the class.
    X_train[:, informative_col] = X_train[:, informative_col] + y_train
    X_test[:, informative_col] = X_test[:, informative_col] + y_test

    all_rf_weights, all_K_iter_rf_data, \
        all_rf_bootstrap_output, all_rit_bootstrap_output, \
        stability_score = irf_utils.run_iRF(
            X_train=X_train,
            X_test=X_test,
            y_train=y_train,
            y_test=y_test,
            K=5,
            n_estimators=20,
            B=30,
            random_state_classifier=random_state_classifier,
            propn_n_samples=.2,
            bin_class_type=1,
            M=20,
            max_depth=5,
            noisy_split=False,
            num_splits=2,
            n_estimators_bootstrap=5)

    final_weights = all_rf_weights['rf_weight5']
    print(final_weights)

    # Compare with a tolerance: the weight is a computed float, so an exact
    # `== 1` check can fail on a value such as 0.9999999999999999.
    assert np.isclose(final_weights[informative_col], 1.0), (
        "expected weight of column %d to be 1, got %r"
        % (informative_col, final_weights[informative_col])
    )


test_iRF_weight2()
In [ ]: