Bagging

Implemented without using packages in R.


In [1]:
suppressMessages(library(tree))
load("../transformed data/golub3571.rda")
load("../transformed data/paper9.rda")
# Settings as specified in the paper
p = 40 # number of genes for FLDA
B = 50 # Aggregation predictors
N = 200 # repeat classification N times
d = c(0.05, 0.1,0.25, 0.5, 0.75, 1) # CPD parameter
set.seed(2017)

In [2]:
cbine_data = data.frame(response = factor(total3571_response), scale_golub_merge)
# initialize the result vector
bg_error = numeric(N)
# implement helper function for each bagging
my_baghelper = function(train, test){
    bg = sample(nrow(train), replace = T)
    temp_md = tree(response~., data = train[bg, ])
    predict(temp_md, test, type = "class")
}

# perform N time classification(bootstrap)
for(i in 1:N){
    bg_index = mysplit(nrow(cbine_data))
    bg_train = cbine_data[-bg_index,]
    bg_test = cbine_data[bg_index,]
    
    # gene selection
    temp_bw = order(BW(bg_train[, -1], bg_train$response), decreasing = T)[1:p]
    bg_train_p = data.frame(response = bg_train$response, bg_train[,temp_bw+1])
    bg_test_p= data.frame(response = bg_test$response, bg_test[,temp_bw+1])
    
    t1 = replicate(B, my_baghelper(bg_train_p, bg_test_p))
    pred = apply(t1, 1, function(x) ifelse(sum(x == "AML")>sum(x =="ALL"), "AML", "ALL"))
    bg_error[i] = sum(pred != bg_test_p$response)
}
resultBag = c(Median = median(bg_error), Upper_quartile = quantile(bg_error, 0.75))

In [3]:
resultBag


Median
2
Upper_quartile.75%
3