Run Benchmarks skeleton code


In [3]:
%matplotlib inline
import matplotlib.pyplot as plt

from sklearn.datasets import load_breast_cancer
import numpy as np
from functools import reduce

# Needed for the scikit-learn wrapper function
from sklearn.tree import irf_utils
from sklearn.ensemble import RandomForestClassifier
from math import ceil
from sklearn.model_selection import train_test_split

import itertools
import py_irf_benchmarks_utils

# Import our custom utilities
from imp import reload
import sys
sys.path.insert(0, '../jupyter')
sys.path.insert(0, '../')

import py_irf_benchmarks_utils
from utils import irf_jupyter_utils
reload(irf_jupyter_utils)


---------------------------------------------------------------------------
ModuleNotFoundError                       Traceback (most recent call last)
<ipython-input-3-23d0ab9dea3f> in <module>()
     13 
     14 import itertools
---> 15 import py_irf_benchmarks_utils
     16 
     17 # Import our custom utilities

ModuleNotFoundError: No module named 'py_irf_benchmarks_utils'

Load data and specs


In [2]:
# Read the feature matrix and the response vector from their
# comma-separated text files (paths are relative to the notebook directory)
features_csv = './data/breast_cancer_features.csv'
responses_csv = './data/breast_cancer_responses.csv'
features, responses = (
    np.loadtxt(csv_path, delimiter=',')
    for csv_path in (features_csv, responses_csv)
)

In [3]:
# Load the benchmark specification (parameter name -> list of candidate values)
# from the YAML file.
# The helper is called through py_irf_benchmarks_utils, the module the import
# cell actually binds; the old name py_irf_benchmarks2 is never imported and
# raised NameError on a fresh kernel.
# NOTE(review): assumes yaml_to_dict is defined in py_irf_benchmarks_utils -- confirm.
specs = py_irf_benchmarks_utils.yaml_to_dict(inp_yaml='./specs/iRF_mod01.yaml')
print(specs)


{'inp_dsname': ['breast_cancer'], 'n_trials': [5], 'n_iter': [5], 'train_split_propn': [0.8], 'n_estimators': [2, 4, 6], 'n_bootstraps': [20], 'propn_n_samples': [0.2], 'bin_class_type': [1], 'n_RIT': [20], 'max_depth': [5], 'noisy_split': [False], 'num_splits': [2], 'n_estimators_bootstrap': [5], 'N_obs': ['all'], 'N_features': ['all']}

Set up loop: build every combination of the spec parameters


In [4]:
# Expand the spec dictionary into one dict per parameter combination:
# take the Cartesian product of the value lists (keys in sorted order)
# and pair every resulting tuple back up with the parameter names.
varNames = sorted(specs)
value_lists = [specs[name] for name in varNames]
spec_comb = [
    dict(zip(varNames, combo))
    for combo in itertools.product(*value_lists)
]
print(spec_comb[0])

len(spec_comb[0])


[{'N_features': 'all', 'N_obs': 'all', 'bin_class_type': 1, 'inp_dsname': 'breast_cancer', 'max_depth': 5, 'n_RIT': 20, 'n_bootstraps': 20, 'n_estimators': 2, 'n_estimators_bootstrap': 5, 'n_iter': 5, 'n_trials': 5, 'noisy_split': False, 'num_splits': 2, 'propn_n_samples': 0.2, 'train_split_propn': 0.8}, {'N_features': 'all', 'N_obs': 'all', 'bin_class_type': 1, 'inp_dsname': 'breast_cancer', 'max_depth': 5, 'n_RIT': 20, 'n_bootstraps': 20, 'n_estimators': 4, 'n_estimators_bootstrap': 5, 'n_iter': 5, 'n_trials': 5, 'noisy_split': False, 'num_splits': 2, 'propn_n_samples': 0.2, 'train_split_propn': 0.8}, {'N_features': 'all', 'N_obs': 'all', 'bin_class_type': 1, 'inp_dsname': 'breast_cancer', 'max_depth': 5, 'n_RIT': 20, 'n_bootstraps': 20, 'n_estimators': 6, 'n_estimators_bootstrap': 5, 'n_iter': 5, 'n_trials': 5, 'noisy_split': False, 'num_splits': 2, 'propn_n_samples': 0.2, 'train_split_propn': 0.8}]
Out[4]:
3

Run iRF


In [5]:
# Run iRF n_trials times for every parameter combination in spec_comb.
# Each run's return value is kept (together with the spec and trial number
# that produced it) so the results can be inspected after the loop instead of
# being discarded.
irf_results = []

for spec in spec_comb:

    print(spec)

    # Split the data for this combination. N_obs / N_features now come from
    # the spec instead of being hard-coded to 'all' (the current YAML sets both
    # to 'all', so behaviour is unchanged for it).
    # parse_data is called through py_irf_benchmarks_utils, the module the
    # import cell binds; py_irf_benchmarks2 is never imported.
    # NOTE(review): assumes parse_data is defined in py_irf_benchmarks_utils -- confirm.
    X_train, X_test, y_train, y_test = py_irf_benchmarks_utils.parse_data(
        features, responses, spec['train_split_propn'],
        N_obs=spec['N_obs'], N_features=spec['N_features'], seed=200)

    # Sanity check: features and responses must have matching row counts
    assert np.shape(X_train)[0] == np.shape(y_train)[0]
    assert np.shape(X_test)[0] == np.shape(y_test)[0]

    for trial in range(spec['n_trials']):
        # NOTE(review): random_state_classifier is the same for every trial;
        # confirm that repeated trials are meant to share this seed.
        trial_output = irf_utils.run_iRF(
            X_train=X_train,
            X_test=X_test,
            y_train=y_train,
            y_test=y_test,
            K=spec['n_iter'],
            n_estimators=spec['n_estimators'],
            B=spec['n_bootstraps'],
            random_state_classifier=152,
            propn_n_samples=spec['propn_n_samples'],
            bin_class_type=spec['bin_class_type'],
            M=spec['n_RIT'],
            max_depth=spec['max_depth'],
            noisy_split=spec['noisy_split'],
            num_splits=spec['num_splits'],
            n_estimators_bootstrap=spec['n_estimators_bootstrap'])

        irf_results.append({'spec': spec, 'trial': trial, 'output': trial_output})


3
{'N_features': 'all', 'N_obs': 'all', 'bin_class_type': 1, 'inp_dsname': 'breast_cancer', 'max_depth': 5, 'n_RIT': 20, 'n_bootstraps': 20, 'n_estimators': 2, 'n_estimators_bootstrap': 5, 'n_iter': 5, 'n_trials': 5, 'noisy_split': False, 'num_splits': 2, 'propn_n_samples': 0.2, 'train_split_propn': 0.8}
{'N_features': 'all', 'N_obs': 'all', 'bin_class_type': 1, 'inp_dsname': 'breast_cancer', 'max_depth': 5, 'n_RIT': 20, 'n_bootstraps': 20, 'n_estimators': 4, 'n_estimators_bootstrap': 5, 'n_iter': 5, 'n_trials': 5, 'noisy_split': False, 'num_splits': 2, 'propn_n_samples': 0.2, 'train_split_propn': 0.8}
---------------------------------------------------------------------------
IndexError                                Traceback (most recent call last)
<ipython-input-5-654775170e55> in <module>()
     24                               noisy_split=spec_comb[i]['noisy_split'],
     25                               num_splits=spec_comb[i]['num_splits'],
---> 26                               n_estimators_bootstrap=spec_comb[i]['n_estimators_bootstrap'])

/home/runjing_liu/Documents/iRF/scikit-learn/sklearn/tree/irf_utils.py in run_iRF(X_train, X_test, y_train, y_test, K, n_estimators, B, random_state_classifier, propn_n_samples, bin_class_type, M, max_depth, noisy_split, num_splits, n_estimators_bootstrap)
   1047             X_train=X_train_rsmpl,
   1048             X_test=X_test,
-> 1049             y_test=y_test)
   1050 
   1051         # Update the rf bootstrap output dictionary

/home/runjing_liu/Documents/iRF/scikit-learn/sklearn/tree/irf_utils.py in get_rf_tree_data(rf, X_train, X_test, y_test)
    481                                    y_test=y_test,
    482                                    dtree=dtree,
--> 483                                    root_node_id=0)
    484 
    485         # Append output to our combined random forest outputs dict

/home/runjing_liu/Documents/iRF/scikit-learn/sklearn/tree/irf_utils.py in _get_tree_data(X_train, X_test, y_test, dtree, root_node_id)
    302     # Start with a range over the total number of features and
    303     # subset the relevant indices from the raw indices array
--> 304     node_features_idx = all_features_idx[np.array(node_features_raw_idx)]
    305 
    306     # Count the unique number of features used

IndexError: index 992 is out of bounds for axis 1 with size 30

In [ ]: