In [8]:
import numpy as np
import time
import matplotlib.pyplot as plt
from sklearn import tree
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from IPython.display import display, Image
from sklearn.datasets import load_breast_cancer
# Import our custom utilities
from importlib import reload  # importlib replaces the deprecated imp module
from utils import irf_jupyter_utils
from utils import irf_utils
from utils import iRF_benchmarks_lib
reload(irf_jupyter_utils)
reload(irf_utils)
Out[8]:
In [9]:
# load breast cancer data
#raw_data = load_breast_cancer()
#responses = raw_data.target
#features = raw_data.data
features = np.loadtxt('data/breast_cancer_features.csv', delimiter=',')
responses = np.loadtxt('data/breast_cancer_responses.csv', delimiter=',')
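The two CSV files mirror the sklearn breast cancer dataset that the commented-out lines above would load directly. If the files are missing, a sketch along these lines should regenerate them (assuming the CSV columns follow sklearn's ordering):
In [ ]:
# regenerate the CSV inputs from sklearn's built-in dataset
# (assumed equivalent to the files read above)
raw_data = load_breast_cancer()
np.savetxt('data/breast_cancer_features.csv', raw_data.data, delimiter=',')
np.savetxt('data/breast_cancer_responses.csv', raw_data.target, delimiter=',')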
In [3]:
# load splicing data
# assumes Y_X_splicing.txt is in same folder as this notebook
data = np.loadtxt('Y_X_splicing.txt', skiprows = 1, usecols = range(1,307))
indices_high = data[:,-1] > 0.7
indices_low = data[:,-1] < 0.3
responses = np.zeros(np.shape(data)[0])
responses[indices_high] = 1
features = data[:, 0:270]
# keep data with high or low responses
responses = responses[np.logical_or(indices_high, indices_low)]
features = features[np.logical_or(indices_high, indices_low), :]
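A quick sanity check on the filtered splicing data: dimensions and class balance after keeping only the high/low-response samples (a convenience check, not part of the benchmark itself):
In [ ]:
# sanity check: dimensions and class balance of the filtered splicing data
print('features shape: ', features.shape)
print('responses shape: ', responses.shape)
print('proportion of positive (high-response) samples: ', np.mean(responses == 1))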
In [3]:
n_trials = 10 # number of times to run random forest for our benchmarks
n_estimators = 20 # number of trees in the random forest
train_split_propn = 0.8
specs = {'n_trials': n_trials,
         'n_estimators': n_estimators,
         'train_split_propn': train_split_propn,
         'N_obs': np.shape(features)[0],       # use all data points
         'N_features': np.shape(features)[1]   # use all features
         }
rf_bm = iRF_benchmarks_lib.consolidate_bm_RF(features, responses, specs, seed = 2017)
print(specs)
"""
n_trials = 10 # number of times to run random forest for our benchmarks
n_estimators = 500 # number of trees in the random forest
train_split_propn = 0.8
metrics_all, metrics_summary, feature_importances = \
    iRF_benchmarks_lib.RF_benchmarks(features, responses,
                                     n_trials=n_trials,
                                     train_split_propn=train_split_propn,
                                     n_estimators=n_estimators,
                                     seed=2017)
"""
Out[3]:
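For reference, each trial inside the RF benchmark amounts to roughly the following: split the data, fit a RandomForestClassifier, and record the fit time plus test-set metrics. The cell below is a minimal sketch of one such trial, not the library implementation; consolidate_bm_RF additionally repeats this n_trials times and aggregates means and standard deviations.
In [ ]:
# minimal sketch of a single benchmark trial (assumed to mirror what
# iRF_benchmarks_lib does internally; not the library code itself)
from sklearn.metrics import accuracy_score, hamming_loss

X_train, X_test, y_train, y_test = train_test_split(
    features, responses, train_size=train_split_propn, random_state=2017)

rf = RandomForestClassifier(n_estimators=n_estimators)
t0 = time.time()
rf.fit(X_train, y_train)
fit_time = time.time() - t0

y_pred = rf.predict(X_test)
print('fit time (s): ', fit_time)
print('accuracy_score: ', accuracy_score(y_test, y_pred))
print('hamming_loss: ', hamming_loss(y_test, y_pred))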
In [4]:
print('Dimensions of full dataset (# samples, # features): ', np.shape(features))
print('Number of training samples: ', np.round(np.shape(features)[0] * specs['train_split_propn'][0]))
print('Number of test samples: ', np.round(np.shape(features)[0] * (1 - specs['train_split_propn'][0])))
print('Number of trees in the random forest: ', specs['n_estimators'][0])
In [5]:
print('time (seconds) to compute RF [mean, std]: ', rf_bm[0]['metrics_summary']['time'])
print('accuracy_score [mean, std]: ', rf_bm[0]['metrics_summary']['accuracy_score'])
print('hamming_loss [mean, std]: ', rf_bm[0]['metrics_summary']['hamming_loss'])
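For binary, single-label classification like this, the Hamming loss is simply one minus the accuracy, so the two summaries above carry the same information. A quick check, assuming the [mean, std] layout printed above:
In [ ]:
# for single-label binary classification, hamming_loss == 1 - accuracy_score
print('1 - accuracy [mean]: ', 1 - rf_bm[0]['metrics_summary']['accuracy_score'][0])
print('hamming_loss [mean]: ', rf_bm[0]['metrics_summary']['hamming_loss'][0])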
In [6]:
print('top five feature importances across trials')
for i in range(n_trials):
    # sort features by importance (descending)
    importances_rank = np.argsort(rf_bm[0]['feature_importances'][i])[::-1]
    print('trial ' + str(i) + ': ', importances_rank[0:5])
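When the breast cancer data is used, the ranked indices above can be mapped back to named features via sklearn's copy of the dataset. This assumes the CSV columns follow the same ordering as load_breast_cancer:
In [ ]:
# map ranked feature indices to names (assumes the CSV columns match
# the column order of sklearn's load_breast_cancer)
feature_names = load_breast_cancer().feature_names
top5 = np.argsort(rf_bm[0]['feature_importances'][0])[::-1][0:5]
print([feature_names[j] for j in top5])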
In [7]:
n_trials = 10 # number of times to run random forest for our benchmarks
n_estimators = [20, 50, 100, 150, 200, 300, 400, 500] # number of trees in the random forest
train_split_propn = 0.8
specs = {'n_trials': n_trials,
         'n_estimators': n_estimators,
         'train_split_propn': train_split_propn,
         'N_obs': np.shape(features)[0],       # use all data points
         'N_features': np.shape(features)[1]   # use all features
         }
rf_bm = iRF_benchmarks_lib.consolidate_bm_RF(features, responses, specs, seed = 2017)
In [10]:
iRF_benchmarks_lib.plot_bm_RF(rf_bm, specs, 'n_estimators', 'time')
iRF_benchmarks_lib.plot_bm_RF(rf_bm, specs, 'n_estimators', 'accuracy_score')
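The same curves can also be drawn directly with matplotlib when more control over the figure is needed. The sketch below assumes rf_bm is a list with one entry per n_estimators setting, each holding a metrics_summary whose values are [mean, std] pairs, as in the prints above:
In [ ]:
# hand-rolled version of the benchmark plot (assumes rf_bm[k]['metrics_summary']
# holds [mean, std] pairs, one entry per n_estimators value)
times_mean = [bm['metrics_summary']['time'][0] for bm in rf_bm]
times_std = [bm['metrics_summary']['time'][1] for bm in rf_bm]
plt.errorbar(specs['n_estimators'], times_mean, yerr=times_std, marker='o')
plt.xlabel('n_estimators')
plt.ylabel('time (s)')
plt.show()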
In [7]:
n_trials = 2
train_split_propn = 0.8
n_estimators = 20
n_bootstraps = 20
n_RIT = 20
max_depth = 5
n_estimators_bootstrap = 5
iRF_specs = {'n_trials': n_trials,
             'n_iter': n_estimators,
             'train_split_propn': train_split_propn,
             'n_estimators': n_estimators,
             'n_bootstraps': n_bootstraps,
             'propn_n_samples': 0.2,
             'bin_class_type': 1,
             'n_RIT': n_RIT,
             'max_depth': max_depth,
             'noisy_split': False,
             'num_splits': 2,
             'n_estimators_bootstrap': n_estimators_bootstrap,
             'N_obs': np.shape(features)[0],       # use all data points
             'N_features': np.shape(features)[1]   # use all features
             }
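For orientation, propn_n_samples controls how large each bootstrap sample fed to the random intersection trees is; with the settings above, each of the n_bootstraps samples draws roughly the following number of observations (this is an interpretation of the parameter names, not taken from the library itself):
In [ ]:
# rough bookkeeping for the bootstrap stage (interpretation of the spec names,
# not taken from iRF_benchmarks_lib)
bootstrap_sample_size = int(iRF_specs['propn_n_samples'] * iRF_specs['N_obs'])
print('observations per bootstrap sample: ', bootstrap_sample_size)
print('total bootstrap samples per trial: ', iRF_specs['n_bootstraps'])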
In [ ]:
iRF_bm = iRF_benchmarks_lib.consolidate_bm_iRF(features, responses, iRF_specs, seed = None)
In [5]:
print('Dimensions of full dataset (# samples, # features): ', np.shape(features))
print('Number of training samples: ', np.round(np.shape(features)[0] * train_split_propn))
print('Number of test samples: ', np.round(np.shape(features)[0] * (1 - train_split_propn)))
print('\n')
print('number of trees in full random forest: ', n_estimators)
print('number of bootstrap samples: ', n_bootstraps)
print('number of random intersection trees (RIT): ', n_RIT)
print('max depth of each RIT: ', max_depth)
print('number of trees in each bootstrap random forest: ', n_estimators_bootstrap)
In [6]:
print('time (seconds) to compute iRF [mean, std]: ', iRF_bm[0]['metrics_summary']['time'])
print('\n')
print('accuracy_score [mean, std]: ', iRF_bm[0]['metrics_summary']['accuracy_score'])
print('hamming_loss [mean, std]: ', iRF_bm[0]['metrics_summary']['hamming_loss'])
Again, feature importances are measured on the random forest from the final iRF iteration.
In [7]:
print('top five important features across trials')
for i in range(n_trials):
    # sort features by importance (descending)
    importances_rank = np.argsort(iRF_bm[0]['feature_importances'][i])[::-1]
    print('trial ' + str(i) + ': ', importances_rank[0:5])
In [8]:
print('top five stable interactions across trials')
for i in range(n_trials):
    # stability scores in descending order (kept for reference, not printed below)
    stability = sorted(iRF_bm[0]['stability_all'][i].values(), reverse=True)
    # interactions ordered by their stability score, most stable first
    interactions = sorted(iRF_bm[0]['stability_all'][i],
                          key=iRF_bm[0]['stability_all'][i].get, reverse=True)
    print('trial ' + str(i) + ': ', interactions[0:5])
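Since stability_all maps each recovered interaction to its stability score, the scores themselves can be printed alongside the interactions with a small variation on the loop above:
In [ ]:
# print each top interaction together with its stability score
for i in range(n_trials):
    stability_scores = iRF_bm[0]['stability_all'][i]
    top = sorted(stability_scores, key=stability_scores.get, reverse=True)[0:5]
    print('trial ' + str(i) + ': ',
          [(interaction, round(stability_scores[interaction], 3)) for interaction in top])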
In [ ]:
n_trials = 3
train_split_propn = 0.8
n_estimators = [20, 50, 100]
n_bootstraps = 20
n_RIT = 20
max_depth = 5
n_estimators_bootstrap = 5
iRF_specs = {'n_trials': n_trials,
             'n_iter': 5,
             'train_split_propn': train_split_propn,
             'n_estimators': n_estimators,
             'n_bootstraps': n_bootstraps,
             'propn_n_samples': 0.2,
             'bin_class_type': 1,
             'n_RIT': n_RIT,
             'max_depth': max_depth,
             'noisy_split': False,
             'num_splits': 2,
             'n_estimators_bootstrap': n_estimators_bootstrap,
             'N_obs': np.shape(features)[0],       # use all data points
             'N_features': np.shape(features)[1]   # use all features
             }
iRF_bm = iRF_benchmarks_lib.consolidate_bm_iRF(features, responses, iRF_specs, seed = 2018)
In [ ]:
# plot iRF run time against the number of trees in the full forest
iRF_benchmarks_lib.plot_bm(iRF_bm, iRF_specs, 'n_estimators', 'time')
In [ ]: