Load data

The paper states nothing about data preprocessing. However, the scale of each predictor is important when calculating SNR. Hence, we just assume the preprocessing is the same as in the Golub. paper. Also, the SNR mentioned in the paper is essentially the same as the PS in the Golub paper.


In [1]:
## Most code is commented in this cell since it is unnecessary and time-consuming to run it everytime.
# options(repos='http://cran.rstudio.com/') 
# source("http://bioconductor.org/biocLite.R")
# biocLite("golubEsets")
suppressMessages(library(golubEsets))
suppressMessages(library(bnlearn))
#Training data predictor and response
data(Golub_Train)
golub_train_p = t(exprs(Golub_Train))
golub_train_r =pData(Golub_Train)[, "ALL.AML"]
#Testing data predictor
data(Golub_Test)
golub_test_p = t(exprs(Golub_Test))
golub_test_r = pData(Golub_Test)[, "ALL.AML"]
#Show summary
rbind(Train = dim(golub_train_p), Test = dim(golub_test_p))
cbind(Train = table(golub_train_r),Test = table(golub_test_r))
# Thresholding
golub_train_pp = golub_train_p
golub_train_pp[golub_train_pp<100] = 100
golub_train_pp[golub_train_pp>16000] = 16000

# Filtering
golub_filter = function(x, r = 5, d=500){
    minval = min(x)
    maxval = max(x)
    (maxval/minval>r)&&(maxval-minval>d)
}
index = apply(golub_train_pp, 2, golub_filter)
golub_index = (1:7129)[index]
golub_train_pp = golub_train_pp[, golub_index]

golub_test_pp = golub_test_p
golub_test_pp[golub_test_pp<100] = 100
golub_test_pp[golub_test_pp>16000] = 16000
golub_test_pp = golub_test_pp[, golub_index]

# Log Transformation
golub_train_p_trans = log10(golub_train_pp)
golub_test_p_trans = log10(golub_test_pp)

# Normalization
train_m = colMeans(golub_train_p_trans)
train_sd = apply(golub_train_p_trans, 2, sd)
golub_train_p_trans = t((t(golub_train_p_trans)-train_m)/train_sd)
golub_test_p_trans  = t((t(golub_test_p_trans)-train_m)/train_sd)
rbind(Train = dim(golub_train_p_trans), Test = dim(golub_test_p_trans))
cbind(Train = table(golub_train_r),Test = table(golub_test_r))


Train38 7129
Test34 7129
TrainTest
ALL2720
AML1114
Train38 3051
Test34 3051
TrainTest
ALL2720
AML1114

In [2]:
save(golub_train_p_trans, golub_train_r,golub_test_p_trans,golub_test_r, file = "DP.rda")