Table 1

Golub Data


In [1]:
# suppressMessages(library(golubEsets))
# data(Golub_Merge)
# golub_merge_predict = t(exprs(Golub_Merge))
# golub_merge_response =pData(Golub_Merge)[, "ALL.AML"]
# #Training data predictor and response
# data(Golub_Train)
# golub_train_predict = t(exprs(Golub_Train))
# golub_train_response =pData(Golub_Train)[, "ALL.AML"]
# #Testing data predictor
# data(Golub_Test)
# golub_test_predict = t(exprs(Golub_Test))
# golub_test_response = pData(Golub_Test)[, "ALL.AML"]
# save(golub_train_predict, golub_train_response, golub_test_predict, golub_test_response, golub_merge_predict, golub_merge_response, file = "GolubData.rda")

In [2]:
# golub_train_predict, golub_train_response, golub_test_predict, golub_test_response
load("GolubData.rda")
suppressMessages(library(e1071))
suppressMessages(library(fastAdaboost))
suppressMessages(library(caret))
suppressMessages(library(sparsediscrim))
suppressMessages(library(tree))
suppressMessages(library(fastAdaboost))
suppressMessages(library(bnlearn))
library(ropls)
library(MASS)
set.seed(2017)

Preprocessing and Feature Selection(PPFS) Functions


In [3]:
# Paper 1
## Helper Functions
golub_filter = function(x, r = 5, d=500){
    minval = min(x)
    maxval = max(x)
    (maxval/minval>r)&&(maxval-minval>d)
}
## Neighbourhood analysis
### signal-to-noise ratio/PS in the paper
get_p = function(train_d, train_r){
    tr_m_aml =  colMeans(train_d[train_r == "AML",])
    tr_sd_aml = apply(train_d[train_r == "AML",], 2, sd)
    tr_m_all = colMeans(train_d[train_r == "ALL",])
    tr_sd_all = apply(train_d[train_r == "ALL",], 2, sd)
    p = (tr_m_aml-tr_m_all)/(tr_sd_aml+tr_sd_all)
    return(p)
}
PPFS1 = function(golub_train_predict, golub_train_response, golub_test_predict, golub_test_response){
    "carry out preprocessing on original Golub data and apply feature selection as both were done in Paper 1; 
     output is called TransformedData1"
    # Preprocessing
    ## Thresholding
    golub_train_pp = golub_train_predict
    golub_train_pp[golub_train_pp<100] = 100
    golub_train_pp[golub_train_pp>16000] = 16000

    ## Filtering
    index = apply(golub_train_pp, 2, golub_filter)
    golub_index = (1:7129)[index]
    golub_train_pp = golub_train_pp[, golub_index]

    golub_test_pp = golub_test_predict
    golub_test_pp[golub_test_pp<100] = 100
    golub_test_pp[golub_test_pp>16000] = 16000
    golub_test_pp = golub_test_pp[, golub_index]

    ## Log Transformation
    golub_train_p_trans = log10(golub_train_pp)
    golub_test_p_trans = log10(golub_test_pp)

    ## Normalization
    train_m = colMeans(golub_train_p_trans)
    train_sd = apply(golub_train_p_trans, 2, sd)
    golub_train_p_trans = t((t(golub_train_p_trans)-train_m)/train_sd)
    golub_test_p_trans  = t((t(golub_test_p_trans)-train_m)/train_sd)
    golub_train_3051 = golub_train_p_trans
    golub_test_3051 = golub_test_p_trans
    
    # Feature Selection
    nna = matrix(0, 400, 3051)
    set.seed(201702)
    ## Permutation test
    for(i in 1:400){
        t_r = sample(golub_train_response)
        nna[i, ] = get_p(golub_train_p_trans, t_r)
    }
    
    ## Predictor selection based on the result of Neighbourhood analysis
    nna_q = apply(nna, 2, quantile, prob = c(0.005, 0.995))
    p = get_p(golub_train_p_trans, golub_train_response)
    
    ## Keep the one with 0.01 significant level
    index_1 = (1:3051)[p>=nna_q[2,] | p<=nna_q[1,]]
    golub_train_p_trans = golub_train_p_trans[, index_1]
    train_m_aml = colMeans(golub_train_p_trans[golub_train_response == "AML",])
    train_m_all = colMeans(golub_train_p_trans[golub_train_response =="ALL",])
    golub_test_p_trans =golub_test_p_trans[, index_1]
    p = p[index_1]
    
    ## 50 most informative genes
    cl_index = c(head(order(p), 25), head(order(p, decreasing = T), 25))
    p_50 = p[cl_index]
    b = (train_m_aml[cl_index]+train_m_all[cl_index])/2
    train_cl = golub_train_p_trans[, cl_index]
    test_cl = golub_test_p_trans[, cl_index]
    golub_train_50 = train_cl
    golub_test_50 = test_cl
    train_vote = t(p_50*t(sweep(train_cl, 2, b)))
    TransformedData1 = list(train_predict = golub_train_50, test_predict = golub_test_50, train_response = golub_train_response, test_response = golub_test_response)
    return(TransformedData1)
}

In [4]:
# Paper 3
## Helper Functions
### TNoM score
r_score = function(slabel){
    total = length(slabel)
    n = sum(slabel == 0) 
    p = sum(slabel == 1) 
    temp = min(n, p)
    d = ifelse(n < p, 1, -1)
    for(i in 1:(total-1)){
        count = sum(slabel[1:i] == 0)
        t_t = min(n-2*count+i, p+2*count-i)
        t_d = ifelse((n-2*count+i)<(p+2*count-i),1,-1)
        if(t_t < temp){
            temp = t_t
            d = t_d
        }
    }
    c(temp, t_d)
}

### Significance (using bootstrap with size 1000 to replace)
r_bootstrap = function(gene, label){
    total = length(label)
    index = order(gene)
    s_l = label[index]
    score= r_score(s_l)
    dist_score = numeric(200)
    for(i in 1:200){
        temp = sample(1:total)
        t = r_score(label[temp])
        dist_score[i] = t[1]
    }
    prob = mean(dist_score<score[1])
    c(score[1], score[2], prob)
}

PPFS3 = function(golub_train_predict, golub_train_response, golub_test_predict, golub_test_response){
    "carry out preprocessing on original Golub data and apply feature selection as both were done in Paper 3; 
    output is called TransformedData3"
    golub_train_l = ifelse(golub_train_response == "AML", 1, 0)
    golub_test_l = ifelse(golub_test_response == "AML", 1, 0)
    set.seed(201703)
    # perform the caculation, this may take a while since the inevitable loops in above functions
    a = apply(golub_train_predict, 2, r_bootstrap, label = golub_train_l)
    # select informative genes and subset the train and test data
    index = (1:7129)[a[1,]<14 & a[3,]<0.01]
    b = order(a[1,index])[1:50]
    
    # data subsetting
    train_cl = golub_train_predict[, index]
    test_cl = golub_test_predict[, index]
    train_paper3 = train_cl[, b]
    test_paper3 = test_cl[, b]
    TransformedData3 = list(train_predict = train_paper3, test_predict = test_paper3, train_response = golub_train_response, test_response = golub_test_response)
    return(TransformedData3)
}

In [5]:
# Paper 6

PPFS6PCA = function(golub_train_predict, golub_train_response, golub_test_predict, golub_test_response, K = 3){
    "carry out preprocessing on original Golub data and apply PCA feature selection as both were done in Paper 6; 
    output is called TransformedData6PCA"
    # First Same 50 genes as in PPFS1
    rslt = PPFS1(golub_train_predict, golub_train_response, golub_test_predict, golub_test_response)
    golub_train_p = rslt$train_predict
    golub_train_r = rslt$train_response
    golub_test_p = rslt$test_predict
    golub_test_r = rslt$test_response
    pca_rslt= getLoadingMN(opls(golub_train_p, printL = F, predI = K))
    pca_train_s = t(t(pca_rslt)%*%t(golub_train_p))
    pca_test_s = t(t(pca_rslt)%*%t(golub_test_p))
    TransformedData6PCA = list(train_predict = pca_train_s, test_predict = pca_test_s, train_response = golub_train_r, test_response = golub_test_r)
    return(TransformedData6PCA)
}

PPFS6PLS =  function(golub_train_predict, golub_train_response, golub_test_predict, golub_test_response, K = 3){
    "carry out preprocessing on original Golub data and apply PLS feature selection as both were done in Paper 6; 
    output is called TransformedData6PLS"
    # First Same 50 genes as in PPFS1
    rslt = PPFS1(golub_train_predict, golub_train_response, golub_test_predict, golub_test_response)
    golub_train_p = rslt$train_predict
    golub_train_r = rslt$train_response
    golub_test_p = rslt$test_predict
    golub_test_r = rslt$test_response
    pls_rslt = getLoadingMN(opls(golub_train_p, golub_train_r, printL = F, predI = K))
    pls_train_s = t(t(pls_rslt)%*%t(golub_train_p))
    pls_test_s = t(t(pls_rslt)%*%t(golub_test_p))
    TransformedData6PLS = list(train_predict = pls_train_s, test_predict = pls_test_s, train_response = golub_train_r, test_response = golub_test_r)
    return(TransformedData6PLS)
}

In [6]:
# Paper 9
## Helper Function
### implement function for calculating BW as stated in the paper(the ratio of between-group to within group sums of squares)
BW = function(predictor, response){
    overall = colMeans(predictor)
    ALL_mean = apply(predictor, 2, function(x) mean(x[response == "ALL"]))
    AML_mean = apply(predictor, 2, function(x) mean(x[response == "AML"]))
    numerator = sum(response == "ALL")*(ALL_mean-overall)^2+sum(response == "AML")*(AML_mean-overall)^2
    denumerator = colSums((t(t(predictor[response == "ALL", ])-ALL_mean))^2)+colSums((t(t(predictor[response == "AML", ])-AML_mean))^2)
    numerator/denumerator
}
                     
PPFS9 =  function(golub_train_predict, golub_train_response, golub_test_predict, golub_test_response, K = 3){
    "carry out preprocessing on original Golub data and apply feature selection as both were done in Paper 9; 
    output is called TransformedData9"
    n_train  = nrow(golub_train_predict)
    golub_merge_predict = rbind(golub_train_predict, golub_test_predict)
    golub_merge_response = c(golub_train_response, golub_test_response)
    golub_merge_l = ifelse(golub_merge_response == "AML", 1, 0)
    n_total = nrow(golub_merge_predict)
    # 
    ## Thresholding
    golub_merge_pp = golub_merge_predict
    golub_merge_pp[golub_merge_pp<100] = 100
    golub_merge_pp[golub_merge_pp>16000] = 16000

    merge_index = apply(golub_merge_pp, 2, golub_filter)
    golub_merge_index = (1:7129)[merge_index]
    golub_merge_pp = golub_merge_pp[, golub_merge_index]
    
    ## Base 10 logarithmic transformation
    golub_merge_p_trans = log10(golub_merge_pp)
    # Further standardization to mean 0 variance 1.
    scale_golub_merge = scale(golub_merge_p_trans)
    # randomly split once to do feature selection for comparison for furthur study
    train_p = scale_golub_merge[1:n_train,]
    train_r = golub_train_response
    test_p = scale_golub_merge[(n_train+1):n_total,]
    test_r = golub_test_response
    temp_bw = order(BW(train_p, train_r), decreasing = T)[1:50]
    train_BW_predictor = train_p[,temp_bw]
    test_BW_predictor = test_p[,temp_bw]
    TransformedData9 = list(train_predict = train_BW_predictor, test_predict = test_BW_predictor, train_response = train_r, test_response = test_r)
    return(TransformedData9)
}

In [7]:
# Paper 29
## Helper Function
### SNR: signal-to-noise ratio
get_SNR = function(train_d, train_r){
    tr_m_aml =  colMeans(train_d[train_r == "AML",])
    tr_sd_aml = apply(train_d[train_r == "AML",], 2, sd)
    tr_m_all = colMeans(train_d[train_r == "ALL",])
    tr_sd_all = apply(train_d[train_r == "ALL",], 2, sd)
    p = (tr_m_aml-tr_m_all)/(tr_sd_aml+tr_sd_all)
    return(p)
}
### Kmeans clustering and then SNR ranking selection
get_kmeans = function(k, train_d, train_r){
    cl = kmeans(t(train_d), k, iter.max=50)$cluster
    result = numeric(k)
    for(i in 1:k){
        id = (cl == i)
        oid = (1:ncol(train_d))[id]
        iSNR = get_SNR(t(t(train_d)[id,]),train_r)
        temp = which.max(abs(iSNR))
        result[i] = oid[temp]
    }
    return(result)
}
PPFS29 = function(golub_train_predict, golub_train_response, golub_test_predict, golub_test_response, K = 50){
    "carry out preprocessing on original Golub data and apply feature selection as both were done in Paper 29;
    output is called TransformedData29"
    # Thresholding
    golub_train_pp = golub_train_predict
    golub_train_pp[golub_train_pp<100] = 100
    golub_train_pp[golub_train_pp>16000] = 16000

    # Filtering
    index = apply(golub_train_pp, 2, golub_filter)
    golub_index = (1:7129)[index]
    golub_train_pp = golub_train_pp[, golub_index]

    golub_test_pp = golub_test_predict
    golub_test_pp[golub_test_pp<100] = 100
    golub_test_pp[golub_test_pp>16000] = 16000
    golub_test_pp = golub_test_pp[, golub_index]

    # Log Transformation
    golub_train_p_trans = log10(golub_train_pp)
    golub_test_p_trans = log10(golub_test_pp)

    # Normalization
    train_m = colMeans(golub_train_p_trans)
    train_sd = apply(golub_train_p_trans, 2, sd)
    golub_train_p_trans = t((t(golub_train_p_trans)-train_m)/train_sd)
    golub_test_p_trans  = t((t(golub_test_p_trans)-train_m)/train_sd)

    kmeans_id = get_kmeans(K, golub_train_p_trans, golub_train_response)
    train_kmeans = golub_train_p_trans[,kmeans_id]
    test_kmeans = golub_test_p_trans[,kmeans_id]
    TransformedData29 = list(train_predict = train_kmeans, test_predict = test_kmeans, train_response = golub_train_response, test_response = golub_test_response)
    return(TransformedData29)
}

Model/Classifier


In [8]:
# Paper 1
## Helper function 
get_p = function(train_d, train_r){
    tr_m_aml =  colMeans(train_d[train_r == "AML",])
    tr_sd_aml = apply(train_d[train_r == "AML",], 2, sd)
    tr_m_all = colMeans(train_d[train_r == "ALL",])
    tr_sd_all = apply(train_d[train_r == "ALL",], 2, sd)
    p = (tr_m_aml-tr_m_all)/(tr_sd_aml+tr_sd_all)
    return(p)
}
classifier1 = function(train_p, train_r, test_p, test_r){
    "carry out classification as in Paper 1; 
     output is called Rates1 and is the value in row 1, column_i 
     call this function 6 times using every TransformedData_i as input in turn to get a 1x6 output for row 1"
    train_m_aml = colMeans(train_p[train_r == "AML",])
    train_m_all = colMeans(train_p[train_r =="ALL",])
    b = (train_m_aml+train_m_all)/2
    p = get_p(train_p, train_r)
    #train
    train_vote = t(p*t(sweep(train_p, 2, b)))
    train_V1 = apply(train_vote, 1, function(x) sum(x[x>0]))
    train_V2 = abs(apply(train_vote, 1, function(x) sum(x[x<=0])))
    train_pred = (train_V1-train_V2)/(train_V1+train_V2)
    train_pred_r = ifelse(abs(train_pred)>0.3, ifelse(train_pred>0, "AML", "ALL"), "Uncertain")
    train_table = table(Train_Predict = train_pred_r, Train_Actual = train_r)
    ##train_table
    #test
    test_vote = t(p*t(sweep(test_p, 2, b)))
    test_V1 = apply(test_vote, 1, function(x) sum(x[x>0]))
    test_V2 = abs(apply(test_vote, 1, function(x) sum(x[x<=0])))
    test_pred = (test_V1-test_V2)/(test_V1+test_V2)
    test_pred_r = ifelse(abs(test_pred)>0.3, ifelse(test_pred>0, "AML", "ALL"), "Uncertain")
    test_table = table(Test_Predict = test_pred_r, Test_Actual = test_r)
    ##test accuracy rate
    Rates1 = mean(test_pred_r == test_r)
    return(Rates1)
}

In [9]:
# Paper 3
## NN
### helper function
cl_nn_helper = function(new_s, train, train_label){
    # use Pearson correlation
    corr = apply(train, 1, cor, new_s)
    train_label[corr==max(corr)]
}

classifier3nn = function(train_p, train_r, test_p, test_r){
    "carry out NN classification as in Paper 3;
    output is called Rates3NN and is the value in row 2, column_i
    call this function 6 times using every TransformedData_i as input in turn to get a 1x6 output for row 2"
    nn_test_pr_pl = apply(test_p,1, cl_nn_helper, train_p, train_r)
    Rates3NN = mean(nn_test_pr_pl == test_r)
    return(Rates3NN)
}

## Linear SVM
classifier3lsvm = function(train_p, train_r, test_p, test_r){
    "carry out linear SVM classification as in Paper 3;
    output is called Rates3LSVM and is the value in row 3, column_i
    call this function 6 times using every TransformedData_i as input in turn to get a 1x6 output for row 3"
    r_train = data.frame(train_p, Y = factor(train_r))
    r_test =data.frame( test_p, Y = factor(test_r))
    svm_linear = svm(Y~., data = r_train)
    svm_l_tepr = predict(svm_linear, newdata = r_test)
    Rates3LSVM = mean(svm_l_tepr == test_r)
    return(Rates3LSVM)
}

## Polynomial SVM
classifier3psvm = function(train_p, train_r, test_p, test_r){
    "carry out polynomial degree 2 SVM classification as in Paper 3;
    output is called Rates3PSVM and is the value in row 4, column_i
    call this function 6 times using every TransformedData_i as input in turn to get a 1x6 output for row 4"
    r_train = data.frame(train_p, Y = factor(train_r))
    r_test =data.frame( test_p, Y = factor(test_r))
    svm_quad = svm(Y~., data = r_train, kernel = "polynomial", degree = 2,  gamma =0.01, coef0 = 100)
    svm_q_tepr = predict(svm_quad, newdata = r_test)
    Rates3PSVM = mean(svm_q_tepr == test_r)
    return(Rates3PSVM)
}

## Adaboost

classifier3Ada = function(train_p, train_r, test_p, test_r){
    "carry out Adaboost classification as in Paper 3;
    output is called Rates3Ada and is the value in row 5, column_i
    call this function 6 times using every TransformedData_i as input in turn to get a 1x6 output for row 5"
    r_train_p1 = data.frame(train_p, Y = factor(train_r))
    r_test_p1 =data.frame( test_p, Y = factor(test_r))
    ada_cl_p1 = adaboost(Y~., data = r_train_p1, 100)
    ada_test_pr_p1 = predict(ada_cl_p1, newdata = r_test_p1)
    Rates3Ada = mean(ada_test_pr_p1$class == test_r)
    return(Rates3Ada)
}

In [10]:
# Paper 6
classifier6logit = function(train_p, train_r, test_p, test_r, K = 3){
    "carry out Logistic Discrimination as in Paper 6;
    output is called Rates6Logit and is the value in row 6, column_i
    call this function 6 times using every TransformedData_i as input in turn to get a 1x6 output for row 6"
    pca_slt = getLoadingMN(opls(train_p, printL = F, predI = K))
    pca_train_s = t(t(pca_slt)%*%t(train_p))
    pca_test_s = t(t(pca_slt)%*%t(test_p))
    train_data = data.frame(response = train_r, pca_train_s)
    test_data = pca_test_s
    ld_s = train(response~., data = train_data, method = "glm", family = "binomial", trControl = trainControl(method = "LOOCV"))
    ld_te = predict(ld_s, data.frame(test_data))
    Rates6Logit = mean(ld_te == test_r)
    return(Rates6Logit)
}
classifier6qda = function(train_p, train_r, test_p, test_r, K = 3){
    "carry out QDA as in Paper 6;
    output is called Rates6QDA and is the value in row 7, column_i
    call this function 6 times using every TransformedData_i as input in turn to get a 1x6 output for row 7"
    pca_slt = getLoadingMN(opls(train_p, printL = F, predI = K))
    pca_train_s = t(t(pca_slt)%*%t(train_p))
    pca_test_s = t(t(pca_slt)%*%t(test_p))
    train_data = data.frame(response = train_r, pca_train_s)
    test_data = pca_test_s
    qda_s = train(response~., data = train_data, method = "qda",  trControl = trainControl(method = "LOOCV"))
    qda_te = predict(qda_s, data.frame(test_data))
    Rates6QDA = mean(qda_te == test_r)
    return(Rates6QDA)
}

In [11]:
# Paper 9

## NN
### Distance measure used in the paper
Distance = function(predictor, test){
    1- apply(predictor, 1, cor, test)
}
### NN classification process
paper9_nn = function(test, pk, learning, response){
     distance = Distance(learning, test)
     index = order(distance)[1:pk]
     cl = ifelse(sum(response[index] == "AML")>sum(response[index]=="ALL"), "AML", "ALL")
     cl
}
# leave-one-cross-validation to tune k
mycv= function(pk,learning,response){
    error = 0
    for(i in 1:nrow(learning)){
        cl = paper9_nn(learning[i,], pk, learning[-i, ], response[-i])
        error = error+(cl == response[i])
    }
    error
}
classifier9nn = function(train_p, train_r, test_p, test_r){
    "carry out Nearest Neighbor as in Paper 9;
    output is called Rates9NN and is the value in row 8, column_i
    call this function 6 times using every TransformedData_i as input in turn to get a 1x6 output for row 8"
    k = seq(1, 21, 2)
    choose_k = sapply(k,mycv, learning = train_p, response= train_r)
    nn_test = apply(test_p ,1, paper9_nn, k[which.min(choose_k)], train_p ,train_r)
    Rates9NN = mean(nn_test == test_r)
    return(Rates9NN)
}

## Decision Tree

### test_paper3 test_response train_paper3 train_response loaded
classifier9dt = function(train_p, train_r, test_p, test_r){
    "carry out Decision Tree as in Paper 9;
    output is called Rates9DT and is the value in row 9, column_i
    call this function 6 times using every TransformedData_i as input in turn to get a 1x6 output for row 9"
    cbine_data = data.frame(response = factor(train_r), train_p)
    tree_mdl = tree(response~.,data = cbine_data)
    tree_te = predict(tree_mdl, data.frame(test_p), type = "class")
    Rates9DT = mean(tree_te == test_r)
    return(Rates9DT)
}

## Bagging

my_baghelper = function(train, test){
    bg = sample(nrow(train), replace = T)
    temp_md = tree(response~., data = train[bg, ])
    predict(temp_md, test, type = "class")
}
classifier9bg = function(train_p, train_r, test_p, test_r, B = 50){
    "carry out Bagging as in Paper 9;
    output is called Rates9BG and is the value in row 10, column_i
    call this function 6 times using every TransformedData_i as input in turn to get a 1x6 output for row 10"
    cbine_data = data.frame(response = factor(train_r), train_p)
    t_te = replicate(B, my_baghelper(cbine_data, data.frame(test_p)))
    pred_te = apply(t_te, 1, function(x) ifelse(sum(x == "AML")>sum(x =="ALL"), "AML", "ALL"))
    Rates9BG = mean(pred_te == test_r)
    return(Rates9BG)
}
                    
## Bagging with CPD
                    
CPD = function(x1, x2, d = 0.75){
    a = runif(nrow(x1), 0, d)
    a*x1+(1-a)*x2
}
### helper function for each bagging with CPD
my_cpdhelper = function(train, test){
    id1 = sample(nrow(train), replace = T)
    id2 = sample(nrow(train), replace = T)
    temp = CPD(train[id1, -1], train[id2,-1])
    temp_md = tree(response~., data = data.frame(temp, response = train$response[id1]))
    predict(temp_md, test, type = "class")
}
classifier9bgcpd = function(train_p, train_r, test_p, test_r, B = 50){
    "carry out Bagging with CPD as in Paper 9;
    output is called Rates9BGCPD and is the value in row 11, column_i
    call this function 6 times using every TransformedData_i as input in turn to get a 1x6 output for row 11"
    cbine_data = data.frame(response = factor(train_r), train_p)
    t_te = replicate(B, my_cpdhelper(cbine_data, data.frame(test_p)))
    pred_te = apply(t_te, 1, function(x) ifelse(sum(x == "AML")>sum(x =="ALL"), "AML", "ALL"))
    Rates9BGCPD = mean(pred_te == test_r)
    return(Rates9BGCPD)
}
                    
## FLDA
classifier9flda = function(train_p, train_r, test_p, test_r){
    "carry out FLDA as in Paper 9;
    output is called Rates9FLDA and is the value in row 12, column_i
    call this function 6 times using every TransformedData_i as input in turn to get a 1x6 output for row 12"
    cbine_data = data.frame(response = factor(train_r), train_p)
    flda_md = MASS::lda(response~., data = cbine_data)
    flda_te = predict(flda_md, data.frame(test_p))$class
    Rates9FLDA = mean(flda_te == test_r)
    return(Rates9FLDA)
}
  
## DLDA
                    
classifier9dlda = function(train_p, train_r, test_p, test_r){
    "carry out DLDA as in Paper 9;
    output is called Rates9DLDA and is the value in row 13, column_i
    call this function 6 times using every TransformedData_i as input in turn to get a 1x6 output for row 13"
    cbine_data = data.frame(response = factor(train_r), train_p)
    dlda_md = dlda(response~., data = cbine_data)
    dlda_te = predict(dlda_md, data.frame(test_p))$class
    Rates9DLDA = mean(dlda_te == test_r)
    return(Rates9DLDA)
}
                    
## DQDA                    
classifier9dqda = function(train_p, train_r, test_p, test_r){
    "carry out DQDA as in Paper 9;
    output is called Rates9DQDA and is the value in row 14, column_i
    call this function 6 times using every TransformedData_i as input in turn to get a 1x6 output for row 14"
    cbine_data = data.frame(response = factor(train_r), train_p)
    dqda_md = dlda(response~., data = cbine_data)
    dqda_te = predict(dqda_md, data.frame(test_p))$class
    Rates9DQDA = mean(dqda_te == test_r)
    return(Rates9DQDA)
}

In [12]:
# Paper 29
classifier29 = function(train_p, train_r, test_p, test_r, B = 50){
    "carry out Bayesian Network as in Paper 29;
    output is called Rates29BN and is the value in row 15, column_i
    call this function 6 times using every TransformedData_i as input in turn to get a 1x6 output for row 15"
    train_data = data.frame(train_p, class = train_r)
    test_data = data.frame(test_p)
    eg = empty.graph(c("class", colnames(train_p)))
    arcs(eg) = matrix(c(rep("class", B), 
                        colnames(train_p)), 
                      ncol = 2, byrow = F, 
                      dimnames = list(c(), c("from", "to")))
    fitted = bn.fit(eg, train_data)
    predict_te = predict(fitted, node = "class", method="bayes-lw", test_data)
    Rates29BN = mean(predict_te == test_r)
    return(Rates29BN)
}

In [13]:
# Selection Result
Paper1Selection = PPFS1(golub_train_predict, golub_train_response, golub_test_predict, golub_test_response)
Paper3Selection = PPFS3(golub_train_predict, golub_train_response, golub_test_predict, golub_test_response)
Paper6SelectionPCA = PPFS6PCA(golub_train_predict, golub_train_response, golub_test_predict, golub_test_response, K = 3)
Paper6SelectionPLS = PPFS6PLS(golub_train_predict, golub_train_response, golub_test_predict, golub_test_response, K = 3)
Paper9Selection = PPFS9(golub_train_predict, golub_train_response, golub_test_predict, golub_test_response, K = 3)
Paper29Selection = PPFS29(golub_train_predict, golub_train_response, golub_test_predict, golub_test_response, K = 50)

In [14]:
# Example for calculating Row 1.
Cell11 = classifier1(Paper1Selection$train_predict, Paper1Selection$train_response, Paper1Selection$test_predict, Paper1Selection$test_response)
Cell12 = classifier1(Paper3Selection$train_predict, Paper3Selection$train_response, Paper3Selection$test_predict, Paper3Selection$test_response)
Cell13 = classifier1(Paper6SelectionPCA$train_predict, Paper6SelectionPCA$train_response, Paper6SelectionPCA$test_predict, Paper6SelectionPCA$test_response)
Cell14 = classifier1(Paper6SelectionPLS$train_predict, Paper6SelectionPLS$train_response, Paper6SelectionPLS$test_predict, Paper6SelectionPLS$test_response)
Cell15 = classifier1(Paper9Selection$train_predict, Paper9Selection$train_response, Paper9Selection$test_predict, Paper9Selection$test_response)
Cell16 = classifier1(Paper29Selection$train_predict, Paper29Selection$train_response, Paper29Selection$test_predict, Paper29Selection$test_response)
round(c(Cell11, Cell12, Cell13, Cell14, Cell15, Cell16),4)


  1. 0.9118
  2. 0.9412
  3. 0.9706
  4. 0.9706
  5. 0.8824
  6. 0.7353

In [15]:
# Example for calculating Row 2.
Cell21 = classifier3nn(Paper1Selection$train_predict, Paper1Selection$train_response, Paper1Selection$test_predict, Paper1Selection$test_response)
Cell22 = classifier3nn(Paper3Selection$train_predict, Paper3Selection$train_response, Paper3Selection$test_predict, Paper3Selection$test_response)
Cell23 = classifier3nn(Paper6SelectionPCA$train_predict, Paper6SelectionPCA$train_response, Paper6SelectionPCA$test_predict, Paper6SelectionPCA$test_response)
Cell24 = classifier3nn(Paper6SelectionPLS$train_predict, Paper6SelectionPLS$train_response, Paper6SelectionPLS$test_predict, Paper6SelectionPLS$test_response)
Cell25 = classifier3nn(Paper9Selection$train_predict, Paper9Selection$train_response, Paper9Selection$test_predict, Paper9Selection$test_response)
Cell26 = classifier3nn(Paper29Selection$train_predict, Paper29Selection$train_response, Paper29Selection$test_predict, Paper29Selection$test_response)
round(c(Cell21, Cell22, Cell23, Cell24, Cell25, Cell26),4)


  1. 0.9706
  2. 0.9412
  3. 0.9118
  4. 0.9412
  5. 0.9706
  6. 0.9706

In [16]:
# Example for calculating Row 3.
Cell31 = classifier3lsvm(Paper1Selection$train_predict, Paper1Selection$train_response, Paper1Selection$test_predict, Paper1Selection$test_response)
Cell32 = classifier3lsvm(Paper3Selection$train_predict, Paper3Selection$train_response, Paper3Selection$test_predict, Paper3Selection$test_response)
Cell33 = classifier3lsvm(Paper6SelectionPCA$train_predict, Paper6SelectionPCA$train_response, Paper6SelectionPCA$test_predict, Paper6SelectionPCA$test_response)
Cell34 = classifier3lsvm(Paper6SelectionPLS$train_predict, Paper6SelectionPLS$train_response, Paper6SelectionPLS$test_predict, Paper6SelectionPLS$test_response)
Cell35 = classifier3lsvm(Paper9Selection$train_predict, Paper9Selection$train_response, Paper9Selection$test_predict, Paper9Selection$test_response)
Cell36 = classifier3lsvm(Paper29Selection$train_predict, Paper29Selection$train_response, Paper29Selection$test_predict, Paper29Selection$test_response)
round(c(Cell31, Cell32, Cell33, Cell34, Cell35, Cell36),4)


  1. 0.9706
  2. 0.9706
  3. 0.9412
  4. 0.9706
  5. 0.9706
  6. 0.7647

In [17]:
# Example for calculating Row 4.
Cell41 = classifier3psvm(Paper1Selection$train_predict, Paper1Selection$train_response, Paper1Selection$test_predict, Paper1Selection$test_response)
Cell42 = classifier3psvm(Paper3Selection$train_predict, Paper3Selection$train_response, Paper3Selection$test_predict, Paper3Selection$test_response)
Cell43 = classifier3psvm(Paper6SelectionPCA$train_predict, Paper6SelectionPCA$train_response, Paper6SelectionPCA$test_predict, Paper6SelectionPCA$test_response)
Cell44 = classifier3psvm(Paper6SelectionPLS$train_predict, Paper6SelectionPLS$train_response, Paper6SelectionPLS$test_predict, Paper6SelectionPLS$test_response)
Cell45 = classifier3psvm(Paper9Selection$train_predict, Paper9Selection$train_response, Paper9Selection$test_predict, Paper9Selection$test_response)
Cell46 = classifier3psvm(Paper29Selection$train_predict, Paper29Selection$train_response, Paper29Selection$test_predict, Paper29Selection$test_response)
round(c(Cell41, Cell42, Cell43, Cell44, Cell45, Cell46),4)


  1. 0.9706
  2. 0.8824
  3. 0.9706
  4. 0.9706
  5. 0.9706
  6. 0.9118

In [18]:
# Example for calculating Row 5.
Cell51 = classifier3Ada (Paper1Selection$train_predict, Paper1Selection$train_response, Paper1Selection$test_predict, Paper1Selection$test_response)
Cell52 = classifier3Ada (Paper3Selection$train_predict, Paper3Selection$train_response, Paper3Selection$test_predict, Paper3Selection$test_response)
Cell53 = classifier3Ada (Paper6SelectionPCA$train_predict, Paper6SelectionPCA$train_response, Paper6SelectionPCA$test_predict, Paper6SelectionPCA$test_response)
Cell54 = classifier3Ada (Paper6SelectionPLS$train_predict, Paper6SelectionPLS$train_response, Paper6SelectionPLS$test_predict, Paper6SelectionPLS$test_response)
Cell55 = classifier3Ada (Paper9Selection$train_predict, Paper9Selection$train_response, Paper9Selection$test_predict, Paper9Selection$test_response)
Cell56 = classifier3Ada (Paper29Selection$train_predict, Paper29Selection$train_response, Paper29Selection$test_predict, Paper29Selection$test_response)
round(c(Cell51, Cell52, Cell53, Cell54, Cell55, Cell56),4)


  1. 0.9118
  2. 0.9118
  3. 0.9706
  4. 0.9706
  5. 0.9118
  6. 0.9118

In [19]:
# Example for calculating Row 6.
options(warn = -1)
Cell61 = classifier6logit(Paper1Selection$train_predict, Paper1Selection$train_response, Paper1Selection$test_predict, Paper1Selection$test_response)
Cell62 = classifier6logit(Paper3Selection$train_predict, Paper3Selection$train_response, Paper3Selection$test_predict, Paper3Selection$test_response)
Cell63 = classifier6logit(Paper6SelectionPCA$train_predict, Paper6SelectionPCA$train_response, Paper6SelectionPCA$test_predict, Paper6SelectionPCA$test_response)
Cell64 = classifier6logit(Paper6SelectionPLS$train_predict, Paper6SelectionPLS$train_response, Paper6SelectionPLS$test_predict, Paper6SelectionPLS$test_response)
Cell65 = classifier6logit(Paper9Selection$train_predict, Paper9Selection$train_response, Paper9Selection$test_predict, Paper9Selection$test_response)
Cell66 = classifier6logit(Paper29Selection$train_predict, Paper29Selection$train_response, Paper29Selection$test_predict, Paper29Selection$test_response)
round(c(Cell61, Cell62, Cell63, Cell64, Cell65, Cell66),4)


  1. 0.9706
  2. 0.9706
  3. 0.9706
  4. 0.9706
  5. 0.9706
  6. 0.8824

In [20]:
# Example for calculating Row 7.
options(warn = -1)
Cell71 = classifier6qda(Paper1Selection$train_predict, Paper1Selection$train_response, Paper1Selection$test_predict, Paper1Selection$test_response)
Cell72 = classifier6qda(Paper3Selection$train_predict, Paper3Selection$train_response, Paper3Selection$test_predict, Paper3Selection$test_response)
Cell73 = classifier6qda(Paper6SelectionPCA$train_predict, Paper6SelectionPCA$train_response, Paper6SelectionPCA$test_predict, Paper6SelectionPCA$test_response)
Cell74 = classifier6qda(Paper6SelectionPLS$train_predict, Paper6SelectionPLS$train_response, Paper6SelectionPLS$test_predict, Paper6SelectionPLS$test_response)
Cell75 = classifier6qda(Paper9Selection$train_predict, Paper9Selection$train_response, Paper9Selection$test_predict, Paper9Selection$test_response)
Cell76 = classifier6qda(Paper29Selection$train_predict, Paper29Selection$train_response, Paper29Selection$test_predict, Paper29Selection$test_response)
round(c(Cell71, Cell72, Cell73, Cell74, Cell75, Cell76),4)


  1. 0.9412
  2. 0.9118
  3. 0.9412
  4. 0.9706
  5. 0.9706
  6. 0.8529

In [21]:
# Example for calculating Row 8.
Cell81 = classifier9nn(Paper1Selection$train_predict, Paper1Selection$train_response, Paper1Selection$test_predict, Paper1Selection$test_response)
Cell82 = classifier9nn(Paper3Selection$train_predict, Paper3Selection$train_response, Paper3Selection$test_predict, Paper3Selection$test_response)
Cell83 = classifier9nn(Paper6SelectionPCA$train_predict, Paper6SelectionPCA$train_response, Paper6SelectionPCA$test_predict, Paper6SelectionPCA$test_response)
Cell84 = classifier9nn(Paper6SelectionPLS$train_predict, Paper6SelectionPLS$train_response, Paper6SelectionPLS$test_predict, Paper6SelectionPLS$test_response)
Cell85 = classifier9nn(Paper9Selection$train_predict, Paper9Selection$train_response, Paper9Selection$test_predict, Paper9Selection$test_response)
Cell86 = classifier9nn(Paper29Selection$train_predict, Paper29Selection$train_response, Paper29Selection$test_predict, Paper29Selection$test_response)
round(c(Cell81, Cell82, Cell83, Cell84, Cell85, Cell86),4)


  1. 0.9706
  2. 0.9118
  3. 0.8529
  4. 0.9706
  5. 0.9412
  6. 0.9412

In [22]:
# Example for calculating Row 9.
Cell91 = classifier9dt(Paper1Selection$train_predict, Paper1Selection$train_response, Paper1Selection$test_predict, Paper1Selection$test_response)
Cell92 = classifier9dt(Paper3Selection$train_predict, Paper3Selection$train_response, Paper3Selection$test_predict, Paper3Selection$test_response)
Cell93 = classifier9dt(Paper6SelectionPCA$train_predict, Paper6SelectionPCA$train_response, Paper6SelectionPCA$test_predict, Paper6SelectionPCA$test_response)
Cell94 = classifier9dt(Paper6SelectionPLS$train_predict, Paper6SelectionPLS$train_response, Paper6SelectionPLS$test_predict, Paper6SelectionPLS$test_response)
Cell95 = classifier9dt(Paper9Selection$train_predict, Paper9Selection$train_response, Paper9Selection$test_predict, Paper9Selection$test_response)
Cell96 = classifier9dt(Paper29Selection$train_predict, Paper29Selection$train_response, Paper29Selection$test_predict, Paper29Selection$test_response)
round(c(Cell91, Cell92, Cell93, Cell94, Cell95, Cell96),4)


  1. 0.9118
  2. 0.9118
  3. 0.9706
  4. 0.9706
  5. 0.9118
  6. 0.7353

In [23]:
# Example for calculating Row 10.
Cell101 = classifier9bg(Paper1Selection$train_predict, Paper1Selection$train_response, Paper1Selection$test_predict, Paper1Selection$test_response)
Cell102 = classifier9bg(Paper3Selection$train_predict, Paper3Selection$train_response, Paper3Selection$test_predict, Paper3Selection$test_response)
Cell103 = classifier9bg(Paper6SelectionPCA$train_predict, Paper6SelectionPCA$train_response, Paper6SelectionPCA$test_predict, Paper6SelectionPCA$test_response)
Cell104 = classifier9bg(Paper6SelectionPLS$train_predict, Paper6SelectionPLS$train_response, Paper6SelectionPLS$test_predict, Paper6SelectionPLS$test_response)
Cell105 = classifier9bg(Paper9Selection$train_predict, Paper9Selection$train_response, Paper9Selection$test_predict, Paper9Selection$test_response)
Cell106 = classifier9bg(Paper29Selection$train_predict, Paper29Selection$train_response, Paper29Selection$test_predict, Paper29Selection$test_response)
round(c(Cell101, Cell102, Cell103, Cell104, Cell105, Cell106),4)


  1. 0.9412
  2. 0.9118
  3. 0.9706
  4. 0.9706
  5. 0.9118
  6. 0.7647

In [24]:
# Example for calculating Row 11.
Cell111 = classifier9bgcpd(Paper1Selection$train_predict, Paper1Selection$train_response, Paper1Selection$test_predict, Paper1Selection$test_response)
Cell112 = classifier9bgcpd(Paper3Selection$train_predict, Paper3Selection$train_response, Paper3Selection$test_predict, Paper3Selection$test_response)
Cell113 = classifier9bgcpd(Paper6SelectionPCA$train_predict, Paper6SelectionPCA$train_response, Paper6SelectionPCA$test_predict, Paper6SelectionPCA$test_response)
Cell114 = classifier9bgcpd(Paper6SelectionPLS$train_predict, Paper6SelectionPLS$train_response, Paper6SelectionPLS$test_predict, Paper6SelectionPLS$test_response)
Cell115 = classifier9bgcpd(Paper9Selection$train_predict, Paper9Selection$train_response, Paper9Selection$test_predict, Paper9Selection$test_response)
Cell116 = classifier9bgcpd(Paper29Selection$train_predict, Paper29Selection$train_response, Paper29Selection$test_predict, Paper29Selection$test_response)
round(c(Cell111, Cell112, Cell113, Cell114, Cell115, Cell116),4)


  1. 0.7353
  2. 0.8529
  3. 0.8235
  4. 0.9118
  5. 0.7647
  6. 0.6765

In [25]:
# Example for calculating Row 12.
Cell121 = classifier9flda(Paper1Selection$train_predict, Paper1Selection$train_response, Paper1Selection$test_predict, Paper1Selection$test_response)
Cell122 = classifier9flda(Paper3Selection$train_predict, Paper3Selection$train_response, Paper3Selection$test_predict, Paper3Selection$test_response)
Cell123 = classifier9flda(Paper6SelectionPCA$train_predict, Paper6SelectionPCA$train_response, Paper6SelectionPCA$test_predict, Paper6SelectionPCA$test_response)
Cell124 = classifier9flda(Paper6SelectionPLS$train_predict, Paper6SelectionPLS$train_response, Paper6SelectionPLS$test_predict, Paper6SelectionPLS$test_response)
Cell125 = classifier9flda(Paper9Selection$train_predict, Paper9Selection$train_response, Paper9Selection$test_predict, Paper9Selection$test_response)
Cell126 = classifier9flda(Paper29Selection$train_predict, Paper29Selection$train_response, Paper29Selection$test_predict, Paper29Selection$test_response)
round(c(Cell121, Cell122, Cell123, Cell124, Cell125, Cell126), 4)


  1. 0.8824
  2. 0.8824
  3. 0.9706
  4. 0.9706
  5. 0.8824
  6. 0.8824

In [26]:
# Example for calculating Row 13.
Cell131 = classifier9dlda(Paper1Selection$train_predict, Paper1Selection$train_response, Paper1Selection$test_predict, Paper1Selection$test_response)
Cell132 = classifier9dlda(Paper3Selection$train_predict, Paper3Selection$train_response, Paper3Selection$test_predict, Paper3Selection$test_response)
Cell133 = classifier9dlda(Paper6SelectionPCA$train_predict, Paper6SelectionPCA$train_response, Paper6SelectionPCA$test_predict, Paper6SelectionPCA$test_response)
Cell134 = classifier9dlda(Paper6SelectionPLS$train_predict, Paper6SelectionPLS$train_response, Paper6SelectionPLS$test_predict, Paper6SelectionPLS$test_response)
Cell135 = classifier9dlda(Paper9Selection$train_predict, Paper9Selection$train_response, Paper9Selection$test_predict, Paper9Selection$test_response)
Cell136 = classifier9dlda(Paper29Selection$train_predict, Paper29Selection$train_response, Paper29Selection$test_predict, Paper29Selection$test_response)
round(c(Cell131, Cell132, Cell133, Cell134, Cell135, Cell136),4)


  1. 0.9706
  2. 0.9412
  3. 0.9706
  4. 0.9706
  5. 0.9706
  6. 0.8824

In [27]:
# Example for calculating Row 14.
Cell141 = classifier9dqda(Paper1Selection$train_predict, Paper1Selection$train_response, Paper1Selection$test_predict, Paper1Selection$test_response)
Cell142 = classifier9dqda(Paper3Selection$train_predict, Paper3Selection$train_response, Paper3Selection$test_predict, Paper3Selection$test_response)
Cell143 = classifier9dqda(Paper6SelectionPCA$train_predict, Paper6SelectionPCA$train_response, Paper6SelectionPCA$test_predict, Paper6SelectionPCA$test_response)
Cell144 = classifier9dqda(Paper6SelectionPLS$train_predict, Paper6SelectionPLS$train_response, Paper6SelectionPLS$test_predict, Paper6SelectionPLS$test_response)
Cell145 = classifier9dqda(Paper9Selection$train_predict, Paper9Selection$train_response, Paper9Selection$test_predict, Paper9Selection$test_response)
Cell146 = classifier9dqda(Paper29Selection$train_predict, Paper29Selection$train_response, Paper29Selection$test_predict, Paper29Selection$test_response)
round(c(Cell141, Cell142, Cell143, Cell144, Cell145, Cell146), 4)


  1. 0.9706
  2. 0.9412
  3. 0.9706
  4. 0.9706
  5. 0.9706
  6. 0.8824

In [28]:
# Example for calculating Row 15.
Cell151 = classifier29(Paper1Selection$train_predict, Paper1Selection$train_response, Paper1Selection$test_predict, Paper1Selection$test_response)
Cell152 = classifier29(data.frame(apply(Paper3Selection$train_predict, 2, as.numeric)), Paper3Selection$train_response, data.frame(apply(Paper3Selection$test_predict , 2, as.numeric)), Paper3Selection$test_response)
Cell153 = classifier29(Paper6SelectionPCA$train_predict, Paper6SelectionPCA$train_response, Paper6SelectionPCA$test_predict, Paper6SelectionPCA$test_response, B = 3)
Cell154 = classifier29(Paper6SelectionPLS$train_predict, Paper6SelectionPLS$train_response, Paper6SelectionPLS$test_predict, Paper6SelectionPLS$test_response, B = 3)
Cell155 = classifier29(data.frame(Paper9Selection$train_predict), Paper9Selection$train_response, data.frame(Paper9Selection$test_predict), Paper9Selection$test_response)
Cell156 = classifier29(data.frame(Paper29Selection$train_predict),Paper29Selection$train_response, data.frame(Paper29Selection$test_predict), Paper29Selection$test_response)
round(c(Cell151, Cell152, Cell153, Cell154, Cell155, Cell156), 4)


  1. 0.7353
  2. 0.8824
  3. 0.9706
  4. 0.9706
  5. 0.8235
  6. 0.6176

Table 1

Paper1 Selection Paper3 Selection Paper6 PCA Selection Paper6 PLS Selection Paper9 Selection Paper29 Selection
Paper1 classifiier 0.912 0.941 0.971 0.971 0.882 0.735
Paper 3 NN 0.971 0.941 0.912 0.941 0.971 0.971
Paper 3 SVM Linear 0.971 0.971 0.941 0.971 0.971 0.765
Paper 3 SVM Quadratic 0.971 0.882 0.971 0.971 0.971 0.912
Paper 3 Adaboost 0.912 0.912 0.971 0.971 0.912 0.912
Paper 6 logit 0.971 0.971 0.971 0.971 0.971 0.882
Paper 6 qda 0.941 0.912 0.941 0.971 0.971 0.853
Paper 9 nn 0.971 0.912 0.853 0.971 0.941 0.941
Paper 9 decision tree 0.912 0.912 0.971 0.971 0.912 0.735
Paper 9 bagging 0.941 0.912 0.971 0.971 0.912 0.765
Paper 9 bagging with CPD 0.735 0.853 0.8235 0.912 0.765 0.677
Paper 9 FLDA 0.882 0.882 0.971 0.971 0.882 0.882
Paper 9 DLDA 0.971 0.941 0.971 0.971 0.971 0.882
Paper 9 DQDA 0.971 0.941 0.971 0.971 0.971 0.882
Paper 29 Bayesian Network 0.735 0.882 0.971 0.971 0.824 0.618

Table 2

Reported Result Reproduced result
Paper1 0.853(29/34) 0.912(31/34)
Paper 3 NN 0.916 0.941
Paper 3 SVM Linear 0.933 0.971
Paper 3 SVM Quadratic 0.944 0.882
Paper 3 Adaboost 0.958 0.912
Paper 6 logit(PCA) 1.00 0.971
Paper 6 qda(PCA) 0.972 0.941
Paper 6 logit(PLS) 0.944 0.971
Paper 6 qda(PLS) 0.944 0.971
Paper 9 nn 0.958 0.941
Paper 9 decision tree 0.875 0.912
Paper 9 bagging 0.917 0.912
Paper 9 bagging with CPD 0.958 0.765
Paper 9 FLDA 0.875 0.882
Paper 9 DLDA 1 0.971
Paper 9 DQDA 0.958 0.971
Paper 29 Bayesian Network 0.941 0.618

In [ ]: