Load and transform dataset.

Install Bioconductor biocLite package in order to access the golubEsets library. golubEsets contains the raw data used by Todd Golub in the original paper.

We use the scale method in the original paper instead of the thresholding algorithm in this paper for now.


In [1]:
## Most code is commented in this cell since it is unnecessary and time-consuming to run it everytime.
# options(repos='http://cran.rstudio.com/') 
# source("http://bioconductor.org/biocLite.R")
# biocLite("golubEsets")
suppressMessages(library(golubEsets))
#Training data predictor and response

data(Golub_Train)
golub_train_p = t(exprs(Golub_Train))
golub_train_r =pData(Golub_Train)[, "ALL.AML"]
#Testing data predictor
data(Golub_Test)
golub_test_p = t(exprs(Golub_Test))
golub_test_r = pData(Golub_Test)[, "ALL.AML"]

# Thresholding
golub_train_pp = golub_train_p
golub_train_pp[golub_train_pp<100] = 100
golub_train_pp[golub_train_pp>16000] = 16000

# Filtering
golub_filter = function(x, r = 5, d=500){
    minval = min(x)
    maxval = max(x)
    (maxval/minval>r)&&(maxval-minval>d)
}
index = apply(golub_train_pp, 2, golub_filter)
golub_index = (1:7129)[index]
golub_train_pp = golub_train_pp[, golub_index]

golub_test_pp = golub_test_p
golub_test_pp[golub_test_pp<100] = 100
golub_test_pp[golub_test_pp>16000] = 16000
golub_test_pp = golub_test_pp[, golub_index]

# Log Transformation
golub_train_p_trans = log10(golub_train_pp)
golub_test_p_trans = log10(golub_test_pp)

# Normalization
train_m = colMeans(golub_train_p_trans)
train_sd = apply(golub_train_p_trans, 2, sd)
golub_train_p_trans = t((t(golub_train_p_trans)-train_m)/train_sd)
golub_test_p_trans  = t((t(golub_test_p_trans)-train_m)/train_sd)
save(golub_train_p_trans, golub_test_p_trans, golub_train_r, golub_test_r, golub_train_pp, golub_test_pp,"DP.rda")


Error in save(golub_train_p_trans, golub_test_p_trans, golub_train_r, : object ‘DP.rda’ not found
Traceback:

1. save(golub_train_p_trans, golub_test_p_trans, golub_train_r, 
 .     golub_test_r, golub_train_pp, golub_test_pp, "DP.rda")
2. stop(sprintf(ngettext(n, "object %s not found", "objects %s not found"), 
 .     paste(sQuote(list[!ok]), collapse = ", ")), domain = NA)

In [ ]: