In [1]:
source('utils/iRF_benchmarks_Rlib.R') # provides RF_benchmarks() and iRF_benchmarks()
library(iRF)
In [2]:
# load breast cancer data (WDBC: 569 samples, 30 features)
raw_data <- read.delim("http://ftp.cs.wisc.edu/math-prog/cpo-dataset/machine-learn/cancer/WDBC/WDBC.dat",
                       sep = ",", header = FALSE)
raw_data <- raw_data[, -1] # drop the first column; it's just a sample identifier
# name the columns: features x_0 ... x_29 (0-based, to match the Python output) and the response y
names(raw_data)[2:ncol(raw_data)] <- paste("x_", 0:29, sep = "")
names(raw_data)[1] <- 'y'
head(raw_data)
features <- as.matrix(raw_data[, -1])
# encode the response: 1 = malignant ('M'), 0 = benign ('B')
responses <- rep(0, nrow(features))
responses[raw_data[, 1] == 'M'] <- 1
responses <- as.factor(responses)
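As a quick sanity check on the encoding (a minimal sketch using only base R), the class counts should show the familiar WDBC split of 357 benign to 212 malignant samples:
In [ ]:
# class balance after encoding (0 = benign, 1 = malignant)
table(responses)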
In [36]:
# load the splicing data and binarize the response: exons with
# percent-spliced-in (psi) > 0.7 are labeled 1, those with psi < 0.3
# are labeled 0, and intermediate exons are dropped
splice.data <- read.table('Y_X_splicing.txt')
idcs.high <- splice.data$psi > 0.7
idcs.low <- splice.data$psi < 0.3
splice.data$y <- rep(0, nrow(splice.data))
splice.data$y[idcs.high] <- 1
splice.data <- splice.data[idcs.high | idcs.low, ]

n.full <- nrow(splice.data)
idcs.train <- sample(1:n.full, floor(n.full * 0.5))
idcs.test <- (1:n.full)[-idcs.train]

x <- splice.data[, !colnames(splice.data) %in% c('psi', 'y')]
x <- x[, 1:270] # keep only the first 270 features
features <- as.matrix(x)
responses <- as.factor(splice.data$y)
head(splice.data)
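Before benchmarking, it is worth verifying that the train/test indices actually partition the retained samples and that both classes survive the psi thresholding; a minimal base-R check:
In [ ]:
# the index sets should be disjoint and jointly cover all retained samples
stopifnot(length(intersect(idcs.train, idcs.test)) == 0)
stopifnot(length(idcs.train) + length(idcs.test) == n.full)
table(responses) # class balance after thresholding psi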
In [6]:
train_split_propn <- 0.8
n_estimators <- 100 # trees per forest
n_trials <- 10
# RF_benchmarks() is defined in utils/iRF_benchmarks_Rlib.R (sourced above)
benchmarks <- RF_benchmarks(features, responses, train_split_propn = train_split_propn,
                            n_estimators = n_estimators, n_trials = n_trials)
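The internals of RF_benchmarks live in the sourced utility file and are not shown here. As a rough sketch of what one trial presumably amounts to (the split and accuracy computation below are my assumption, not the utility's actual code; randomForest() is the interface the iRF package is built on):
In [ ]:
# one hand-rolled benchmark trial (sketch, not the RF_benchmarks internals)
n <- nrow(features)
idcs <- sample(n, floor(n * train_split_propn))
rf <- randomForest(x = features[idcs, ], y = responses[idcs], ntree = n_estimators)
preds <- predict(rf, features[-idcs, ])
mean(preds == responses[-idcs]) # test-set accuracy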
In [7]:
cat('Dimensions of full dataset (# samples, # features): ', dim(features), '\n')
cat('Number of training samples: ', round(dim(features)[1] * train_split_propn), '\n')
cat('Number of test samples: ', round(dim(features)[1] * (1 - train_split_propn)), '\n')
cat('Number of trees in the random forest: ', n_estimators, '\n')
In [8]:
print('mean and std of various metrics across trials')
benchmarks$metrics_summary
In [9]:
print('top ten feature importances across trials')
for(i in 1:n_trials){
  # order features by importance, most important first
  imp <- benchmarks$feature_importance[[i]]
  ord <- order(imp, decreasing = TRUE)
  print(ord[1:10] - 1) # 0-based indices, for comparison with the Python output
}
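Printing bare indices makes cross-checking against the Python notebook easy but hides the magnitudes. Assuming feature_importance[[i]] is a numeric vector in column order, a small sketch that pairs the 0-based indices with their scores for the first trial:
In [ ]:
# importance scores next to their 0-based feature indices (first trial)
imp <- benchmarks$feature_importance[[1]]
ord <- order(imp, decreasing = TRUE)[1:10]
data.frame(feature = ord - 1, importance = imp[ord])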
In [3]:
n_trials <- 1
# iRF_benchmarks() is defined in utils/iRF_benchmarks_Rlib.R
iRF_bm <- iRF_benchmarks(features, responses, n_trials = n_trials,
                         K = 5,
                         train_split_propn = 0.8,
                         n_estimators = 20,
                         B = 30,
                         M = 20,
                         max_depth = 5,
                         noisy_split = FALSE, # was Python's False; R logicals are FALSE/TRUE
                         num_splits = 2,
                         seed = 2018)
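Since iRF_benchmarks is defined in the sourced utility file, the argument names above are not documented here. My best guess, based on the notation of the iRF paper, is that K is the number of iterated reweightings, B the number of bootstrap replicates, M the number of random intersection trees, and max_depth/num_splits the RIT depth and branching factor. Under that assumed mapping, a roughly equivalent direct call to iRF::iRF would look like:
In [ ]:
# sketch: assumed mapping of the benchmark arguments onto iRF::iRF
tr <- sample(nrow(features), floor(nrow(features) * 0.8))
fit <- iRF(x = features[tr, ], y = responses[tr],
           n.iter = 5,               # K
           ntree = 20,               # n_estimators
           n.bootstrap = 30,         # B
           interactions.return = 5,  # recover interactions from the last iteration
           rit.param = list(depth = 5,  # max_depth
                            ntree = 20, # M
                            nchild = 2, # num_splits
                            class.id = 1, min.nd = 1))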
In [4]:
print('mean and std of various metrics across trials')
iRF_bm$metrics_summary
In [5]:
print('top ten feature importances across trials')
for(i in 1:n_trials){
  # order features by importance, most important first
  imp <- iRF_bm$feature_importance[[i]]
  ord <- order(imp, decreasing = TRUE)
  print(ord[1:10] - 1) # 0-based indices, for comparison with the Python output
}
In [11]:
for(i in 1:n_trials){
  # stability scores of the interactions recovered in trial i
  stability <- iRF_bm$stability_all[[i]]
  print(stability)
}
barplot(stability) # plots the scores from the last trial only
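The bare barplot call above draws every interaction with unreadable default labels. Assuming stability_all[[i]] comes back as a named numeric vector (interaction names as names, stability scores as values), a sketch that shows only the most stable interactions, sorted:
In [ ]:
# plot the k most stable interactions, names rotated for readability
k <- min(10, length(stability))
top <- sort(stability, decreasing = TRUE)[1:k]
barplot(top, las = 2, cex.names = 0.7, ylab = 'stability score')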