In [2]:
# golub_train_predict, golub_train_response, golub_test_predict, golub_test_response
load("GolubData.rda")
set.seed(2017)

In [3]:
# Paper 1
## Helper Functions
golub_filter = function(x, r = 5, d=500){
    minval = min(x)
    maxval = max(x)
    (maxval/minval>r)&&(maxval-minval>d)
}
## Neighbourhood analysis
### signal-to-noise ratio/PS in the paper
get_p = function(train_d, train_r){
    tr_m_aml =  colMeans(train_d[train_r == "AML",])
    tr_sd_aml = apply(train_d[train_r == "AML",], 2, sd)
    tr_m_all = colMeans(train_d[train_r == "ALL",])
    tr_sd_all = apply(train_d[train_r == "ALL",], 2, sd)
    p = (tr_m_aml-tr_m_all)/(tr_sd_aml+tr_sd_all)
    return(p)
}
PPFS1 = function(golub_train_predict, golub_train_response, golub_test_predict, golub_test_response){
    "carry out preprocessing on original Golub data and apply feature selection as both were done in Paper 1; 
     output is called TransformedData1"
    # Preprocessing
    ## Thresholding
    golub_train_pp = golub_train_predict
    golub_train_pp[golub_train_pp<100] = 100
    golub_train_pp[golub_train_pp>16000] = 16000

    ## Filtering
    index = apply(golub_train_pp, 2, golub_filter)
    golub_index = (1:7129)[index]
    golub_train_pp = golub_train_pp[, golub_index]

    golub_test_pp = golub_test_predict
    golub_test_pp[golub_test_pp<100] = 100
    golub_test_pp[golub_test_pp>16000] = 16000
    golub_test_pp = golub_test_pp[, golub_index]

    ## Log Transformation
    golub_train_p_trans = log10(golub_train_pp)
    golub_test_p_trans = log10(golub_test_pp)

    ## Normalization
    train_m = colMeans(golub_train_p_trans)
    train_sd = apply(golub_train_p_trans, 2, sd)
    golub_train_p_trans = t((t(golub_train_p_trans)-train_m)/train_sd)
    golub_test_p_trans  = t((t(golub_test_p_trans)-train_m)/train_sd)
    golub_train_3051 = golub_train_p_trans
    golub_test_3051 = golub_test_p_trans
    
    # Feature Selection
    nna = matrix(0, 400, 3051)
    set.seed(201702)
    ## Permutation test
    for(i in 1:400){
        t_r = sample(golub_train_response)
        nna[i, ] = get_p(golub_train_p_trans, t_r)
    }
    
    ## Predictor selection based on the result of Neighbourhood analysis
    nna_q = apply(nna, 2, quantile, prob = c(0.005, 0.995))
    p = get_p(golub_train_p_trans, golub_train_response)
    
    ## Keep the one with 0.01 significant level
    index_1 = (1:3051)[p>=nna_q[2,] | p<=nna_q[1,]]
    golub_train_p_trans = golub_train_p_trans[, index_1]
    train_m_aml = colMeans(golub_train_p_trans[golub_train_response == "AML",])
    train_m_all = colMeans(golub_train_p_trans[golub_train_response =="ALL",])
    golub_test_p_trans =golub_test_p_trans[, index_1]
    p = p[index_1]
    
    ## 50 most informative genes
    cl_index = c(head(order(p), 25), head(order(p, decreasing = T), 25))
    p_50 = p[cl_index]
    b = (train_m_aml[cl_index]+train_m_all[cl_index])/2
    train_cl = golub_train_p_trans[, cl_index]
    test_cl = golub_test_p_trans[, cl_index]
    golub_train_50 = train_cl
    golub_test_50 = test_cl
    train_vote = t(p_50*t(sweep(train_cl, 2, b)))
    TransformedData1 = list(train_predict = golub_train_50, test_predict = golub_test_50, train_response = golub_train_response, test_response = golub_test_response)
    return(TransformedData1)
}