Feature Selection

General settings as specified in the paper, plus helper functions for the BW criterion (feature selection) and the train-test split.



In [1]:

    
# Restore the objects stored in the prepared data file into the workspace.
# NOTE(review): presumably this provides `scale_golub_merge` and
# `total3571_response`, which are used further down - confirm.
load(file = "../transformed data/golub3571.rda")
# Fix the RNG seed so the random split is reproducible.
set.seed(seed = 201703)



In [2]:

    
# Settings as specified in the paper

# number of genes for FLDA
p <- 40
# Aggregation predictors
B <- 50
# repeat classification N times
N <- 200
# CPD parameter
d <- c(0.05, 0.1, 0.25, 0.5, 0.75, 1)



In [3]:

    
# Train/test split as specified in the paper: randomly pick one third of the
# observations.
#
# Args:
#   n: number of observations available.
#
# Returns:
#   floor(n / 3) distinct indices drawn at random from 1..n.
mysplit <- function(n) {
    n_pick <- floor(n / 3)
    sample.int(n, size = n_pick)
}
# BW criterion as stated in the paper: for every column (gene) of `predictor`,
# the ratio of the between-group to the within-group sum of squares.
#
# Args:
#   predictor: numeric matrix or data frame with one row per sample and one
#              column per gene.
#   response:  class labels, one per row of `predictor`. Only the labels "ALL"
#              and "AML" form groups; the overall mean still uses every row.
#
# Returns:
#   Numeric vector holding one BW ratio per column of `predictor`.
BW <- function(predictor, response) {
    predictor <- as.matrix(predictor)
    # A `response` of the wrong length would be silently recycled by the
    # logical row indexing below, so reject it up front.
    stopifnot(length(response) == nrow(predictor))

    overall <- colMeans(predictor)
    between <- 0
    within <- 0
    for (label in c("ALL", "AML")) {
        # drop = FALSE keeps a one-sample class as a 1 x p matrix.
        rows <- predictor[response == label, , drop = FALSE]
        centre <- colMeans(rows)
        # Between-group part: class size times squared distance of the class
        # mean from the overall mean.
        between <- between + nrow(rows) * (centre - overall)^2
        # Within-group part: squared deviations from the class mean.
        within <- within + colSums(sweep(rows, 2, centre)^2)
    }
    between / within
}
                     
# Run the feature selection once on a single random split, kept for
# comparison in further study.
id <- mysplit(nrow(scale_golub_merge))
# Rows drawn by mysplit() form the test set; all remaining rows are used for
# training. setdiff() is used instead of `-id` because a negative empty index
# would select no rows at all.
train_rows <- setdiff(seq_len(nrow(scale_golub_merge)), id)
train_p <- scale_golub_merge[train_rows, ]
train_r <- total3571_response[train_rows]
test_p <- scale_golub_merge[id, ]
test_r <- total3571_response[id]
# Column indices of the (at most) 50 genes with the largest BW ratio, computed
# on the training rows only. head() avoids NA indices when fewer than 50
# columns are available.
temp_bw <- head(order(BW(train_p, train_r), decreasing = TRUE), 50)
train_BW_predictor <- train_p[, temp_bw]
test_BW_predictor <- test_p[, temp_bw]
# Persist the reduced data sets together with the two helper functions.
save(train_BW_predictor, train_r, test_BW_predictor, test_r, mysplit, BW,
     file = "../transformed data/paper9.rda")