Check the output of the weighted random forest


In [3]:
%matplotlib inline
import matplotlib.pyplot as plt

from sklearn.datasets import load_breast_cancer
import numpy as np
from functools import reduce

# Import our custom utilities
from importlib import reload
from utils import irf_jupyter_utils
from utils import irf_utils
reload(irf_jupyter_utils)
reload(irf_utils)

# Import RF related functions
from sklearn.ensemble import RandomForestClassifier


Out[3]:
<module 'utils.irf_utils' from '/home/yu/github/scikit-learn-sandbox/jupyter/utils/irf_utils.py'>

When feature_weight is None, the output should match a plain (unweighted) Random Forest.

The reference RF feature importances from such a run are hard-coded in correct_feature_importance below.


In [4]:
feature_weight0 = None

In [5]:
X_train, X_test, y_train, y_test, rf = irf_jupyter_utils.generate_rf_example(n_estimators=1000, 
                                                                             feature_weight=feature_weight0)

In [8]:
all_rf_tree_data = irf_utils.get_rf_tree_data(rf=rf, X_train=X_train, X_test=X_test, y_test=y_test)
#all_rf_tree_data

In [9]:
# Print the feature importances
feature_importances_rank_idx0 = all_rf_tree_data['feature_importances_rank_idx']
feature_importances0 = all_rf_tree_data['feature_importances']

print(feature_importances0)


[ 0.04633261  0.01279795  0.04649524  0.05372707  0.00613899  0.01410021
  0.04617194  0.08346155  0.00365378  0.00379191  0.01370514  0.00441408
  0.01102853  0.03730455  0.00456819  0.00397247  0.00564707  0.00500305
  0.00441179  0.00429459  0.11649582  0.01827778  0.14157085  0.11806595
  0.01195991  0.01511598  0.03314478  0.11802327  0.00953361  0.00679134]

In [10]:
# Reference feature importances (from an unweighted RF run on the same data)
correct_feature_importance = [0.04153319, 0.0136872,  0.05287382, 0.05537257, 0.00571718, 0.01101297,
                              0.04525511, 0.08925701, 0.00407582, 0.00337926, 0.01301454, 0.00396505,
                              0.01022279, 0.03255195, 0.00498767, 0.00438016, 0.00771317, 0.00459407,
                              0.0037973,  0.00448982, 0.10938616, 0.01690837, 0.14415417, 0.1204331,
                              0.01276175, 0.01472586, 0.03019196, 0.12449026, 0.00858072, 0.00648698]
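
A quick added sanity check (not part of the original run): compare the unweighted importances against this reference. Exact values drift between fits, so a loose elementwise tolerance and a top-feature comparison are used; the tolerance is an illustrative choice.

In [ ]:
# Added check: the feature_weight=None run should be close to the stored reference
ref = np.asarray(correct_feature_importance)
print("max abs difference:", np.max(np.abs(feature_importances0 - ref)))
print("values close (atol=0.02):", np.allclose(feature_importances0, ref, atol=0.02))
print("same top-5 feature set:",
      set(np.argsort(feature_importances0)[-5:]) == set(np.argsort(ref)[-5:]))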

When feature_weight is uniform, it should give essentially the same feature importances.


In [11]:
feature_weight1 = [1]*30

In [12]:
X_train, X_test, y_train, y_test, rf = irf_jupyter_utils.generate_rf_example(n_estimators=1000, 
                                                                             feature_weight=feature_weight1)

In [14]:
all_rf_tree_data = irf_utils.get_rf_tree_data(rf=rf, X_train=X_train, X_test=X_test, y_test=y_test)
#all_rf_tree_data

In [15]:
# Feature importances from the uniform-weight run
feature_importances_rank_idx1 = all_rf_tree_data['feature_importances_rank_idx']
feature_importances1 = all_rf_tree_data['feature_importances']

print(feature_importances1)


[ 0.04156421  0.01371803  0.05286511  0.0553587   0.00568501  0.01101232
  0.04530112  0.08926779  0.0040873   0.00338411  0.01303412  0.00399057
  0.01019653  0.03253443  0.00497113  0.00433343  0.00780205  0.00452657
  0.00379767  0.00448269  0.10939605  0.01686833  0.14411923  0.12039774
  0.01277833  0.01474184  0.03015998  0.12455529  0.00859024  0.00648009]
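
The uniform-weight run can be checked against correct_feature_importance in the same way. The cell below is an added sketch reusing the variables defined above; the tolerance is again an arbitrary but generous choice.

In [ ]:
# Added check: uniform feature_weight should reproduce the reference importances closely
ref = np.asarray(correct_feature_importance)
print("max abs difference:", np.max(np.abs(feature_importances1 - ref)))
print("values close (atol=1e-3):", np.allclose(feature_importances1, ref, atol=1e-3))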

When feature_weight is non-uniform (here, the reference RF importances themselves), it should give roughly the same feature ranking.


In [16]:
feature_weight2 = correct_feature_importance

In [17]:
X_train, X_test, y_train, y_test, rf = irf_jupyter_utils.generate_rf_example(n_estimators=1000, 
                                                                             feature_weight=feature_weight2)

In [19]:
all_rf_tree_data = irf_utils.get_rf_tree_data(rf=rf, X_train=X_train, X_test=X_test, y_test=y_test)
#all_rf_tree_data

In [20]:
# Feature importances from the weighted run
feature_importances_rank_idx2 = all_rf_tree_data['feature_importances_rank_idx']
feature_importances2 = all_rf_tree_data['feature_importances']

# Compare the ranked features: uniform-weight run (left) vs. importance-weighted run (right)
for f in range(X_train.shape[1]):
    print("%2d. feature %2d (%10.9f) and feature %2d (%10.9f)"
          % (f + 1,
             feature_importances_rank_idx1[f],
             feature_importances1[feature_importances_rank_idx1[f]],
             feature_importances_rank_idx2[f],
             feature_importances2[feature_importances_rank_idx2[f]]))


 1. feature 22 (0.144119233) and feature 22 (0.362963504)
 2. feature 27 (0.124555293) and feature 27 (0.188353629)
 3. feature 23 (0.120397738) and feature 23 (0.150679142)
 4. feature 20 (0.109396048) and feature  7 (0.099982502)
 5. feature  7 (0.089267789) and feature 20 (0.093378339)
 6. feature  3 (0.055358700) and feature 26 (0.012814561)
 7. feature  2 (0.052865109) and feature  6 (0.012635612)
 8. feature  6 (0.045301121) and feature  3 (0.012576257)
 9. feature  0 (0.041564208) and feature  2 (0.010215511)
10. feature 13 (0.032534434) and feature 21 (0.009935611)
11. feature 26 (0.030159979) and feature 13 (0.009467799)
12. feature 21 (0.016868331) and feature  1 (0.006934552)
13. feature 25 (0.014741839) and feature  0 (0.006521312)
14. feature  1 (0.013718027) and feature 24 (0.003568667)
15. feature 10 (0.013034123) and feature 25 (0.003392227)
16. feature 24 (0.012778328) and feature 10 (0.002856610)
17. feature  5 (0.011012323) and feature 28 (0.001972390)
18. feature 12 (0.010196528) and feature 12 (0.001924793)
19. feature 28 (0.008590244) and feature 29 (0.001764246)
20. feature 16 (0.007802045) and feature  5 (0.001437076)
21. feature 29 (0.006480088) and feature 16 (0.001074582)
22. feature  4 (0.005685009) and feature  4 (0.000997067)
23. feature 14 (0.004971128) and feature 11 (0.000704047)
24. feature 17 (0.004526568) and feature 18 (0.000688298)
25. feature 19 (0.004482690) and feature 14 (0.000629995)
26. feature 15 (0.004333429) and feature  8 (0.000596955)
27. feature  8 (0.004087299) and feature 17 (0.000582948)
28. feature 11 (0.003990568) and feature 19 (0.000560846)
29. feature 18 (0.003797667) and feature 15 (0.000504888)
30. feature  9 (0.003384114) and feature  9 (0.000286034)
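
The ranking comparison above can be summarized by a single rank-correlation number. The cell below is an added sketch; it assumes scipy is available (it is a scikit-learn dependency) and reuses the importance arrays and ranking indices from the two runs above.

In [ ]:
# Added check: the weighted run should roughly preserve the ranking of the uniform run
from scipy.stats import spearmanr

rho, pval = spearmanr(feature_importances1, feature_importances2)
print("Spearman rank correlation: %.3f (p = %.2e)" % (rho, pval))

# Overlap of the top-10 features between the two runs
top10_1 = set(feature_importances_rank_idx1[:10])
top10_2 = set(feature_importances_rank_idx2[:10])
print("top-10 overlap: %d of 10" % len(top10_1 & top10_2))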

In [44]:
def test_iRF_weight1():
    # Check that when the labels are random, every feature ends up with roughly the same weight
    n_samples = 1000
    n_features = 10
    random_state_classifier = 2018
    np.random.seed(random_state_classifier)
    X_train = np.random.uniform(low=0, high=1, size=(n_samples, n_features))
    y_train = np.random.choice([0, 1], size=(n_samples,), p=[.5, .5])
    X_test = np.random.uniform(low=0, high=1, size=(n_samples, n_features))
    y_test = np.random.choice([0, 1], size=(n_samples,), p=[.5, .5])
    all_rf_weights, all_K_iter_rf_data, \
    all_rf_bootstrap_output, all_rit_bootstrap_output, \
    stability_score = irf_utils.run_iRF(X_train=X_train,
                                        X_test=X_test,
                                        y_train=y_train,
                                        y_test=y_test,
                                        K=5,
                                        n_estimators=20,
                                        B=30,
                                        random_state_classifier=2018,
                                        propn_n_samples=.2,
                                        bin_class_type=1,
                                        M=20,
                                        max_depth=5,
                                        noisy_split=False,
                                        num_splits=2,
                                        n_estimators_bootstrap=5)
    # With 10 uninformative features, a uniform weight would be 0.1 each; allow some sampling noise
    assert np.max(all_rf_weights['rf_weight5']) < .135
test_iRF_weight1()

In [53]:
def test_iRF_weight2():
    # Check that when feature 1 fully predicts the label, its importance converges to 1
    n_samples = 1000
    n_features = 10
    random_state_classifier = 2018
    np.random.seed(random_state_classifier)
    X_train = np.random.uniform(low=0, high=1, size=(n_samples, n_features))
    y_train = np.random.choice([0, 1], size=(n_samples,), p=[.5, .5])
    X_test = np.random.uniform(low=0, high=1, size=(n_samples, n_features))
    y_test = np.random.choice([0, 1], size=(n_samples,), p=[.5, .5])
    # make feature 1 (the second column) fully predictive of the label
    X_train[:, 1] = X_train[:, 1] + y_train
    X_test[:, 1] = X_test[:, 1] + y_test
    all_rf_weights, all_K_iter_rf_data, \
    all_rf_bootstrap_output, all_rit_bootstrap_output, \
    stability_score = irf_utils.run_iRF(X_train=X_train,
                                        X_test=X_test,
                                        y_train=y_train,
                                        y_test=y_test,
                                        K=5,
                                        n_estimators=20,
                                        B=30,
                                        random_state_classifier=2018,
                                        propn_n_samples=.2,
                                        bin_class_type=1,
                                        M=20,
                                        max_depth=5,
                                        noisy_split=False,
                                        num_splits=2,
                                        n_estimators_bootstrap=5)
    print(all_rf_weights['rf_weight5'])
    assert all_rf_weights['rf_weight5'][1] == 1
test_iRF_weight2()


[ 0.  1.  0.  0.  0.  0.  0.  0.  0.  0.]
