Classification on paper29 feature selection

Summary of paper 29 feature selection:

  • Data:
    • Train: 38*7129 (27ALL/11AML)
    • Test: 34*7129 (20ALL/ 14 AML)
  • Preprocessing:
    • Thresholding
    • Filtering
    • Transformation
    • Normalization
  • Feature Selection:

    • SNR
    • Kmeans Clustering


In [1]:
suppressMessages(library(e1071))
suppressMessages(library(fastAdaboost))
suppressMessages(library(caret))
suppressMessages(library(sparsediscrim))
suppressMessages(library(tree))
suppressMessages(library(fastAdaboost))
suppressMessages(library(bnlearn))
library(ropls)
library(MASS)

set.seed(201703)
Data

In [2]:
# train_kmeans, golub_train_r, test_kmeans, golub_test_r
load("../transformed data/paper29.rda")
Paper1 classifiier

In [3]:
get_p = function(train_d, train_r){
    tr_m_aml =  colMeans(train_d[train_r == "AML",])
    tr_sd_aml = apply(train_d[train_r == "AML",], 2, sd)
    tr_m_all = colMeans(train_d[train_r == "ALL",])
    tr_sd_all = apply(train_d[train_r == "ALL",], 2, sd)
    p = (tr_m_aml-tr_m_all)/(tr_sd_aml+tr_sd_all)
    return(p)
}
classifier1 = function(train_p, train_r, test_p, test_r){
    train_m_aml = colMeans(train_p[train_r == "AML",])
    train_m_all = colMeans(train_p[train_r =="ALL",])
    b = (train_m_aml+train_m_all)/2
    p = get_p(train_p, train_r)
    #train
    train_vote = t(p*t(sweep(train_p, 2, b)))
    train_V1 = apply(train_vote, 1, function(x) sum(x[x>0]))
    train_V2 = abs(apply(train_vote, 1, function(x) sum(x[x<=0])))
    train_pred = (train_V1-train_V2)/(train_V1+train_V2)
    train_pred_r = ifelse(abs(train_pred)>0.3, ifelse(train_pred>0, "AML", "ALL"), "Uncertain")
    train_table = table(Train_Predict = train_pred_r, Train_Actual = train_r)
    ##train_table
    #test
    test_vote = t(p*t(sweep(test_p, 2, b)))
    test_V1 = apply(test_vote, 1, function(x) sum(x[x>0]))
    test_V2 = abs(apply(test_vote, 1, function(x) sum(x[x<=0])))
    test_pred = (test_V1-test_V2)/(test_V1+test_V2)
    test_pred_r = ifelse(abs(test_pred)>0.3, ifelse(test_pred>0, "AML", "ALL"), "Uncertain")
    test_table = table(Test_Predict = test_pred_r, Test_Actual = test_r)
    ##test_table
    return(list(train = train_table, test = test_table))
}

In [4]:
paper1 = classifier1(train_kmeans, golub_train_r, test_kmeans, golub_test_r)
paper1$train
paper1$test


             Train_Actual
Train_Predict ALL AML
          ALL  27   0
          AML   0  11
            Test_Actual
Test_Predict ALL AML
   ALL        18   1
   AML         0   6
   Uncertain   2   7
Paper 3 classifier
  • NN

In [5]:
# helper function
cl_nn_helper = function(new_s, train, train_label){
    # use Pearson correlation
    corr = apply(train, 1, cor, new_s)
    train_label[corr==max(corr)]
}

classifier3nn = function(train_p, train_r, test_p, test_r){
    nn_train_pr_p1 = apply(train_p,1, cl_nn_helper, train_p, train_r)
    nn_test_pr_p1 = apply(test_p,1, cl_nn_helper, train_p, train_r)
    train_table = table(Train_Predict = nn_train_pr_p1, Train_Actual =train_r)
    test_table = table(Test_Predict = nn_test_pr_p1, Test_Actual = test_r)
    return(list(train = train_table, test = test_table))
}
paper3nn = classifier3nn(train_kmeans, golub_train_r, test_kmeans, golub_test_r)
paper3nn$train
paper3nn$test


             Train_Actual
Train_Predict ALL AML
          ALL  27   0
          AML   0  11
            Test_Actual
Test_Predict ALL AML
         ALL  17   0
         AML   3  14
  • SVM

In [6]:
# linear SVM
classifier3lsvm = function(train_p, train_r, test_p, test_r){
    r_train = data.frame(train_p, Y = factor(train_r))
    r_test =data.frame( test_p, Y = factor(test_r))
    svm_linear = svm(Y~., data = r_train)
    svm_l_trpr = predict(svm_linear, r_train)
    svm_l_tepr = predict(svm_linear, newdata = r_test)
    train_table = table(Train_Predicted  = svm_l_trpr, Train_Actual = train_r)
    test_table = table(Test_Predicted = svm_l_tepr, Test_Actual = test_r)
    return(list(train = train_table, test = test_table))
}
paper3lsvm = classifier3lsvm(train_kmeans, golub_train_r, test_kmeans, golub_test_r)
paper3lsvm$train
paper3lsvm$test


               Train_Actual
Train_Predicted ALL AML
            ALL  27   0
            AML   0  11
              Test_Actual
Test_Predicted ALL AML
           ALL  20   8
           AML   0   6

In [7]:
classifier3qsvm = function(train_p, train_r, test_p, test_r){
    r_train = data.frame(train_p, Y = factor(train_r))
    r_test =data.frame( test_p, Y = factor(test_r))
    svm_quad = svm(Y~., data = r_train, kernel = "polynomial", degree = 2,  gamma =0.01, coef0 = 100)
    svm_q_trpr = predict(svm_quad, r_train )
    svm_q_tepr = predict(svm_quad, newdata = r_test)
    train_table = table(Train_Predicted  = svm_q_trpr, Train_Actual = train_r)
    test_table = table(Test_Predicted = svm_q_tepr, Test_Actual = test_r)
    return(list(train = train_table, test = test_table))
}
paper3qsvm = classifier3qsvm(train_kmeans, golub_train_r, test_kmeans, golub_test_r)
paper3qsvm$train
paper3qsvm$test


               Train_Actual
Train_Predicted ALL AML
            ALL  27   0
            AML   0  11
              Test_Actual
Test_Predicted ALL AML
           ALL  20   3
           AML   0  11
  • Adaboost

In [8]:
classifier3Ada = function(train_p, train_r, test_p, test_r){
    r_train_p1 = data.frame(train_p, Y = factor(train_r))
    r_test_p1 =data.frame( test_p, Y = factor(test_r))
    ada_cl_p1 = adaboost(Y~., data = r_train_p1, 100)
    ada_train_pr_p1 = predict(ada_cl_p1, r_train_p1)
    ada_test_pr_p1 = predict(ada_cl_p1, newdata = r_test_p1)
    train_table = table(Train_Predict = ada_train_pr_p1$class, Train_Actual = train_r)
    test_table = table(Test_Predict = ada_test_pr_p1$class, Test_Actual =test_r)
    return(list(train = train_table, test = test_table))
}
paper3Ada = classifier3Ada(train_kmeans, golub_train_r, test_kmeans, golub_test_r)
paper3Ada$train
paper3Ada$test


             Train_Actual
Train_Predict ALL AML
          ALL  27   0
          AML   0  11
            Test_Actual
Test_Predict ALL AML
         ALL  20   2
         AML   0  12
Paper 6 classifier

In [9]:
pca_helper = function(train_p, train_r, test_p, test_r, K=3){
    pca_slt = getLoadingMN(opls(train_p, printL = F, predI = K))
    pca_train_s = t(t(pca_slt)%*%t(train_p))
    pca_test_s = t(t(pca_slt)%*%t(test_p))
    return(list(pca_train = pca_train_s, pca_test = pca_test_s))
}
pls_helper = function(train_p, train_r, test_p, test_r, K=3){
    pls_slt = getLoadingMN(opls(train_p, train_r, printL = F, predI = K))
    pls_train_s = t(t(pls_slt)%*%t(train_p))
    pls_test_s = t(t(pls_slt)%*%t(test_p))
    return(list(pls_train = pls_train_s, pls_test = pls_test_s))
}

classifier6logit = function(train_p, train_r, test_p, test_r, dr = "pca"){
    if(dr == "pca"){
        pca_result = pca_helper(train_p, train_r, test_p, test_r)
        train_data = data.frame(response = train_r, pca_result$pca_train)
        test_data = pca_result$pca_test
    }else{
        pls_result = pls_helper(train_p, train_r, test_p, test_r)
        train_data = data.frame(response = train_r, pls_result$pls_train)
        test_data = pls_result$pls_test
    }   
    ld_s = train(response~., data = train_data, method = "glm", family = "binomial", trControl = trainControl(method = "LOOCV"))
    ld_tr = predict(ld_s)
    ld_te = predict(ld_s, data.frame(test_data))
    ld_ac = mean(ld_te == test_r)
    ld_re = c(LOOCV = ld_s$results$Accuracy, Test = ld_ac)
    train_table = table(Train_Predict = ld_tr, Train_Actual = train_r)
    test_table = table(Test_Predict = ld_te, Test_Actual =test_r)
    return(list(train = train_table, test = test_table, re = ld_re))
}
classifier6qda = function(train_p, train_r, test_p, test_r, dr= "pca"){
    if(dr == "pca"){
        pca_result = pca_helper(train_p, train_r, test_p, test_r)
        train_data = data.frame(response = train_r, pca_result$pca_train)
        test_data = pca_result$pca_test
    }else{
        pls_result = pls_helper(train_p, train_r, test_p, test_r)
        train_data = data.frame(response = train_r, pls_result$pls_train)
        test_data = pls_result$pls_test
    }   
    qda_s = train(response~., data = train_data, method = "qda",  trControl = trainControl(method = "LOOCV"))
    qda_tr = predict(qda_s)
    qda_te = predict(qda_s, data.frame(test_data))
    qda_ac = mean(qda_te == test_r)
    qda_re = c(LOOCV = qda_s$results$Accuracy, Test = qda_ac)
    train_table = table(Train_Predict = qda_tr, Train_Actual = train_r)
    test_table = table(Test_Predict = qda_te, Test_Actual =test_r)
    return(list(train = train_table, test = test_table, re = qda_re))
}

In [10]:
options(warn=-1)
pca_logit = classifier6logit(train_kmeans, golub_train_r, test_kmeans, golub_test_r)
print("pca, logit")
pca_logit$train
pca_logit$test
pca_qda = classifier6qda(train_kmeans, golub_train_r, test_kmeans, golub_test_r)
print("pca, qda")
pca_qda$train
pca_qda$test
pls_logit = classifier6logit(train_kmeans, golub_train_r, test_kmeans, golub_test_r, "pls")
print("pls, logit")
pls_logit$train
pls_logit$test
pls_qda = classifier6qda(train_kmeans, golub_train_r, test_kmeans, golub_test_r, "pls")
print("pls, qda")
pls_qda$train
pls_qda$test


[1] "pca, logit"
             Train_Actual
Train_Predict ALL AML
          ALL  27   0
          AML   0  11
            Test_Actual
Test_Predict ALL AML
         ALL  20   5
         AML   0   9
[1] "pca, qda"
             Train_Actual
Train_Predict ALL AML
          ALL  27   0
          AML   0  11
            Test_Actual
Test_Predict ALL AML
         ALL  20   5
         AML   0   9
[1] "pls, logit"
             Train_Actual
Train_Predict ALL AML
          ALL  27   0
          AML   0  11
            Test_Actual
Test_Predict ALL AML
         ALL  20   5
         AML   0   9
[1] "pls, qda"
             Train_Actual
Train_Predict ALL AML
          ALL  27   0
          AML   0  11
            Test_Actual
Test_Predict ALL AML
         ALL  20   5
         AML   0   9
Paper 9 classifier
  • NN

In [11]:
# Distance measure used in the paper
Distance = function(predictor, test){
    1- apply(predictor, 1, cor, test)
}
# NN classification process
paper9_nn = function(test, pk, learning, response){
     distance = Distance(learning, test)
     index = order(distance)[1:pk]
     cl = ifelse(sum(response[index] == "AML")>sum(response[index]=="ALL"), "AML", "ALL")
     cl
}
# leave-one-cross-validation to tune k
mycv= function(pk,learning,response){
    error = 0
    for(i in 1:nrow(learning)){
        cl = paper9_nn(learning[i,], pk, learning[-i, ], response[-i])
        error = error+(cl == response[i])
    }
    error
}
classifier9nn = function(train_p, train_r, test_p, test_r){
    k = seq(1, 21, 2)
    choose_k = sapply(k,mycv, learning = train_p, response= train_r)
    nn_train = apply(train_p, 1, paper9_nn, k[which.min(choose_k)], train_p, train_r)
    nn_test = apply(test_p ,1, paper9_nn, k[which.min(choose_k)], train_p ,train_r)
    train_table = table(Train_Predict = nn_train, Train_Actual = train_r)
    test_table = table(Test_Predict = nn_test, Test_Actual = test_r)
    return(list(train = train_table, test = test_table))
}
paper9nn = classifier9nn(train_kmeans, golub_train_r, test_kmeans, golub_test_r)
paper9nn$train
paper9nn$test


             Train_Actual
Train_Predict ALL AML
          ALL  27   0
          AML   0  11
            Test_Actual
Test_Predict ALL AML
         ALL  20   1
         AML   0  13
  • Decision Tree

In [12]:
# test_paper3 test_response train_paper3 train_response loaded
classifier9dt = function(train_p, train_r, test_p, test_r){
    cbine_data = data.frame(response = factor(train_r), train_p)
    tree_mdl = tree(response~.,data = cbine_data)
    tree_tr = predict(tree_mdl, data.frame(train_p), type = "class")
    tree_te = predict(tree_mdl, data.frame(test_p), type = "class")
    train_table = table(Train_Predict = tree_tr, Train_Actual = train_r)
    test_table = table(Test_Predict = tree_te, Test_Actual = test_r)
    return(list(train = train_table, test = test_table))
}
paper9dt = classifier9dt(train_kmeans, golub_train_r, test_kmeans, golub_test_r)
paper9dt$train
paper9dt$test


             Train_Actual
Train_Predict ALL AML
          ALL  26   0
          AML   1  11
            Test_Actual
Test_Predict ALL AML
         ALL  19   8
         AML   1   6
  • Bagging

In [13]:
my_baghelper = function(train, test){
    bg = sample(nrow(train), replace = T)
    temp_md = tree(response~., data = train[bg, ])
    predict(temp_md, test, type = "class")
}
classifier9bg = function(train_p, train_r, test_p, test_r, B = 50){
    cbine_data = data.frame(response = factor(train_r), train_p)
    t_tr = replicate(B, my_baghelper(cbine_data, data.frame(train_p)))
    pred_tr = apply(t_tr, 1, function(x) ifelse(sum(x == "AML")>sum(x =="ALL"), "AML", "ALL"))
    t_te = replicate(B, my_baghelper(cbine_data, data.frame(test_p)))
    pred_te = apply(t_te, 1, function(x) ifelse(sum(x == "AML")>sum(x =="ALL"), "AML", "ALL"))
    train_table = table(Train_predict = pred_tr, Train_Actual = train_r)
    test_table = table(Test_predict = pred_te, Test_Actual = test_r)
    return(list(train = train_table, test = test_table))
}
paper9bg = classifier9bg(train_kmeans, golub_train_r, test_kmeans, golub_test_r)
paper9bg$train
paper9bg$test


             Train_Actual
Train_predict ALL AML
          ALL  27   0
          AML   0  11
            Test_Actual
Test_predict ALL AML
         ALL  19   8
         AML   1   6
  • Bagging with CPD

In [14]:
CPD = function(x1, x2, d = 0.75){
    a = runif(nrow(x1), 0, d)
    a*x1+(1-a)*x2
}
# helper function for each bagging with CPD
my_cpdhelper = function(train, test){
    id1 = sample(nrow(train), replace = T)
    id2 = sample(nrow(train), replace = T)
    temp = CPD(train[id1, -1], train[id2,-1])
    temp_md = tree(response~., data = data.frame(temp, response = train$response[id1]))
    predict(temp_md, test, type = "class")
}
classifier9bgcpd = function(train_p, train_r, test_p, test_r, B = 50){
    cbine_data = data.frame(response = factor(train_r), train_p)
    t_tr = replicate(B, my_cpdhelper(cbine_data, data.frame(train_p)))
    pred_tr = apply(t_tr, 1, function(x) ifelse(sum(x == "AML")>sum(x =="ALL"), "AML", "ALL"))
    t_te = replicate(B, my_cpdhelper(cbine_data, data.frame(test_p)))
    pred_te = apply(t_te, 1, function(x) ifelse(sum(x == "AML")>sum(x =="ALL"), "AML", "ALL"))
    train_table = table(Train_predict = pred_tr, Train_Actual = train_r)
    test_table = table(Test_predict = pred_te, Test_Actual = test_r)
    return(list(train = train_table, test = test_table))
}
paper9bgcpd = classifier9bg(train_kmeans, golub_train_r, test_kmeans, golub_test_r)
paper9bgcpd$train
paper9bgcpd$test


             Train_Actual
Train_predict ALL AML
          ALL  26   0
          AML   1  11
            Test_Actual
Test_predict ALL AML
         ALL  20   7
         AML   0   7
  • FLDA

In [15]:
classifier9flda = function(train_p, train_r, test_p, test_r, B = 50){
    cbine_data = data.frame(response = factor(train_r), train_p)
    flda_md = MASS::lda(response~., data = cbine_data)
    flda_tr = predict(flda_md, data.frame(train_p))$class
    flda_te = predict(flda_md, data.frame(test_p))$class
    train_table = table(Train_predict = flda_tr, Train_Actual = train_r)
    test_table = table(Test_predict = flda_te, Test_Actual = test_r)
    return(list(train = train_table, test = test_table))
}
paper9flda = classifier9bg(train_kmeans, golub_train_r, test_kmeans, golub_test_r)
paper9flda$train
paper9flda$test


             Train_Actual
Train_predict ALL AML
          ALL  27   0
          AML   0  11
            Test_Actual
Test_predict ALL AML
         ALL  20   7
         AML   0   7
  • DLDA

In [16]:
classifier9dlda = function(train_p, train_r, test_p, test_r, B = 50){
    cbine_data = data.frame(response = factor(train_r), train_p)
    dlda_md = dlda(response~., data = cbine_data)
    dlda_tr = predict(dlda_md, data.frame(train_p))$class
    dlda_te = predict(dlda_md, data.frame(test_p))$class
    train_table = table(Train_predict = dlda_tr, Train_Actual = train_r)
    test_table = table(Test_predict = dlda_te, Test_Actual = test_r)
    return(list(train = train_table, test = test_table))
}
paper9dlda = classifier9bg(train_kmeans, golub_train_r, test_kmeans, golub_test_r)
paper9dlda$train
paper9dlda$test


             Train_Actual
Train_predict ALL AML
          ALL  27   0
          AML   0  11
            Test_Actual
Test_predict ALL AML
         ALL  20   8
         AML   0   6
  • DQDA

In [17]:
classifier9dqda = function(train_p, train_r, test_p, test_r, B = 50){
    cbine_data = data.frame(response = factor(train_r), train_p)
    dqda_md = dlda(response~., data = cbine_data)
    dqda_tr = predict(dqda_md, data.frame(train_p))$class
    dqda_te = predict(dqda_md, data.frame(test_p))$class
    train_table = table(Train_predict = dqda_tr, Train_Actual = train_r)
    test_table = table(Test_predict = dqda_te, Test_Actual = test_r)
    return(list(train = train_table, test = test_table))
}
paper9dqda = classifier9bg(train_kmeans, golub_train_r, test_kmeans, golub_test_r)
paper9dqda$train
paper9dqda$test


             Train_Actual
Train_predict ALL AML
          ALL  27   0
          AML   0  11
            Test_Actual
Test_predict ALL AML
         ALL  19   8
         AML   1   6
Paper 29 classifier

In [18]:
classifier29 = function(train_p, train_r, test_p, test_r, B = 50){
    train_data = data.frame(train_p, class = train_r)
    test_data = data.frame(test_p)
    eg = empty.graph(c("class", colnames(data.frame(train_p))))
    arcs(eg) = matrix(c(rep("class", 50), 
                        colnames(data.frame(train_p))), 
                      ncol = 2, byrow = F, 
                      dimnames = list(c(), c("from", "to")))
    fitted = bn.fit(eg, train_data)
    predict_tr = predict(fitted, node = "class", method="bayes-lw", train_data)
    predict_te = predict(fitted, node = "class", method="bayes-lw", test_data)
    train_table = table(Train_predict = predict_tr, Train_Actual = train_r)
    test_table = table(Test_predict = predict_te, Test_Actual = test_r)
    return(list(train = train_table, test = test_table))
}
paper29 = classifier29(train_kmeans, golub_train_r, test_kmeans, golub_test_r)
paper29$train
paper29$test


             Train_Actual
Train_predict ALL AML
          ALL  27   0
          AML   0  11
            Test_Actual
Test_predict ALL AML
         ALL  19  11
         AML   1   3