Feature selection using PLS

We use the package of 'ropls' in bioconductor.

(a). Select the same 50 genes as in Golub paper. See details in Paper1 notebook.


In [5]:
load("DP.rda")
library(ropls)
library(MASS)
suppressMessages(library(caret))
set.seed(201703)

In [6]:
# Neighbourhood analysis
get_p = function(train_d, train_r){
    tr_m_aml =  colMeans(train_d[train_r == "AML",])
    tr_sd_aml = apply(train_d[train_r == "AML",], 2, sd)
    tr_m_all = colMeans(train_d[train_r == "ALL",])
    tr_sd_all = apply(train_d[train_r == "ALL",], 2, sd)
    p = (tr_m_aml-tr_m_all)/(tr_sd_aml+tr_sd_all)
}
nna = matrix(0, 400, 3051)
# Permutation test
for(i in 1:400){
    t_r = sample(golub_train_r)
    nna[i, ] = get_p(golub_train_p_trans, t_r)
}

# Predictor selection based on the result of Neighbourhood analysis
nna_q = apply(nna, 2, quantile, prob = c(0.005, 0.995))
p = get_p(golub_train_p_trans, golub_train_r)

# With 0.01 significant level
index_1 = (1:3051)[p>=nna_q[2,] | p<=nna_q[1,]]
golub_train_p_trans = golub_train_p_trans[, index_1]
train_m_aml = colMeans(golub_train_p_trans[golub_train_r == "AML",])
train_m_all = colMeans(golub_train_p_trans[golub_train_r =="ALL",])
golub_test_p_trans =golub_test_p_trans[, index_1]
p = p[index_1]
cl_index = c(head(order(p), 25), head(order(p, decreasing = T), 25))
p_50 = p[cl_index]
b = (train_m_aml[cl_index]+train_m_all[cl_index])/2
train_cl = golub_train_p_trans[, cl_index]
test_cl = golub_test_p_trans[, cl_index]

(b). PLS selection.


In [7]:
# Number of predictors as recommended in the paper
K = 3
# PLS selection
pls_slt = getLoadingMN(opls(train_cl, golub_train_r, printL = F, predI = K))
pls_train_s = t(t(pls_slt)%*%t(train_cl))
pls_test_s = t(t(pls_slt)%*%t(test_cl))
pls_train = data.frame(response = golub_train_r, pls_train_s)

In [8]:
save(pls_train_s, pls_test_s, pls_train, file = "PLS.rda")