Load and transform dataset


In [2]:
# The implementation below is very similar to the one in the paper1 notebook; details can be checked there.

## The code below is commented out since it is unnecessary and time-consuming to run it every time. Run it if needed.
#options(repos='http://cran.rstudio.com/') 
#source("http://bioconductor.org/biocLite.R")
#biocLite("golubEsets")
#install.packages("tree")
#install.packages("fastAdaboost")
#install.packages("sparsediscrim", dependencies = T)
suppressMessages(library(sparsediscrim))
suppressMessages(library(tree))
suppressMessages(library(golubEsets))
suppressMessages(library(fastAdaboost))

# load data from golubEsets
data(Golub_Merge)
golub_merge_p = t(exprs(Golub_Merge))
golub_merge_r =pData(Golub_Merge)[, "ALL.AML"]
golub_merge_l = ifelse(golub_merge_r == "AML", 1, 0)

#show summary
dim(golub_merge_p) 
table(golub_merge_r)

# Thresholding
golub_merge_pp = golub_merge_p
golub_merge_pp[golub_merge_pp<100] = 100
golub_merge_pp[golub_merge_pp>16000] = 16000

# Filtering
golub_filter = function(x, r = 5, d=500){
    minval = min(x)
    maxval = max(x)
    (maxval/minval>r)&&(maxval-minval>d)
}
merge_index = apply(golub_merge_pp, 2, golub_filter)
golub_merge_index = (1:7129)[merge_index]
golub_merge_pp = golub_merge_pp[, golub_merge_index]

# Base 10 logarithmic transformation
golub_merge_p_trans = log10(golub_merge_pp)

#show summary again
dim(golub_merge_p_trans)
table(golub_merge_r)

total3571_predictor = golub_merge_p_trans
total3571_response = golub_merge_r
save(total3571_predictor, total3571_response, file = "../transformed data/golub3571.rda")
# Further standardization to mean 0 variance 1.
scale_golub_merge = scale(golub_merge_p_trans)


  1. 72
  2. 7129
golub_merge_r
ALL AML 
 47  25 
  1. 72
  2. 3571
golub_merge_r
ALL AML 
 47  25 

In [ ]: