Summary of paper6 feature selection:
Dimension Reduction:
In [3]:
suppressMessages(library(e1071))
suppressMessages(library(fastAdaboost))
suppressMessages(library(caret))
suppressMessages(library(sparsediscrim))
suppressMessages(library(tree))
suppressMessages(library(bnlearn))
library(ropls)
library(MASS)
# Fix the RNG seed so the random steps further down (sample(), runif())
# produce the same draws on every run.
set.seed(201703)
In [5]:
# Restore the saved workspace objects from paper6.rda.
# The cells below use pca_train, pca_test, pls_train and pls_test
# (presumably all four are stored in this file -- confirm against the .rda).
load("paper6.rda")
In [3]:
# Per-feature class-separation score between the "AML" and "ALL" samples.
#   train_d  samples in rows, features in columns
#   train_r  class labels aligned with the rows of train_d
# Returns one value per column:
#   (mean_AML - mean_ALL) / (sd_AML + sd_ALL)
get_p <- function(train_d, train_r) {
  aml_rows <- train_d[train_r == "AML", ]
  all_rows <- train_d[train_r == "ALL", ]

  mean_gap <- colMeans(aml_rows) - colMeans(all_rows)
  spread <- apply(aml_rows, 2, sd) + apply(all_rows, 2, sd)

  mean_gap / spread
}
# Weighted-voting classifier.
# Each feature votes with weight get_p() times its offset from the midpoint of
# the two class means; the positive and negative vote totals are combined into
# a signed strength (V1 - V2) / (V1 + V2). Samples whose absolute strength is
# not above 0.3 are reported as "Uncertain".
#   train_p / test_p  predictors, samples in rows
#   train_r / test_r  class labels ("AML" / "ALL")
# Returns list(train = , test = ) of predicted-vs-actual contingency tables.
classifier1 <- function(train_p, train_r, test_p, test_r) {
  aml_center <- colMeans(train_p[train_r == "AML", ])
  all_center <- colMeans(train_p[train_r == "ALL", ])
  midpoint <- (aml_center + all_center) / 2
  weight <- get_p(train_p, train_r)

  # Same scoring rule for the training and the test samples.
  call_class <- function(samples) {
    votes <- t(weight * t(sweep(samples, 2, midpoint)))
    v_pos <- apply(votes, 1, function(v) sum(v[v > 0]))
    v_neg <- abs(apply(votes, 1, function(v) sum(v[v <= 0])))
    strength <- (v_pos - v_neg) / (v_pos + v_neg)
    ifelse(
      abs(strength) > 0.3,
      ifelse(strength > 0, "AML", "ALL"),
      "Uncertain"
    )
  }

  train_pred_r <- call_class(train_p)
  test_pred_r <- call_class(test_p)

  list(
    train = table(Train_Predict = train_pred_r, Train_Actual = train_r),
    test = table(Test_Predict = test_pred_r, Test_Actual = test_r)
  )
}
In [4]:
# Weighted-voting classifier on the PCA data. Column 1 of each frame is
# dropped from the predictors; `response` supplies the labels.
paper1_pca <- classifier1(
  train_p = pca_train[, -1],
  train_r = pca_train$response,
  test_p = pca_test[, -1],
  test_r = pca_test$response
)
paper1_pca$train
paper1_pca$test
In [5]:
# Weighted-voting classifier on the PLS data. Column 1 of each frame is
# dropped from the predictors; `response` supplies the labels.
paper1_pls <- classifier1(
  train_p = pls_train[, -1],
  train_r = pls_train$response,
  test_p = pls_test[, -1],
  test_r = pls_test$response
)
paper1_pls$train
paper1_pls$test
In [6]:
# Helper: label of the single nearest neighbour under Pearson correlation.
#   new_s        numeric vector, the sample to classify
#   train        reference samples, one per row
#   train_label  labels aligned with the rows of `train`
# Returns exactly one element of train_label: the label of the row of `train`
# with the highest correlation to new_s (first such row on ties), or NA when
# no correlation can be computed.
cl_nn_helper <- function(new_s, train, train_label) {
  # use Pearson correlation
  corr <- apply(train, 1, cor, new_s)
  # `corr == max(corr)` returned several labels on ties and an all-NA vector
  # when any correlation was NA, which breaks the apply()/table() callers.
  # which.max() picks one index and skips NA values.
  best <- which.max(corr)
  if (length(best) == 0L) {
    return(train_label[NA_integer_])
  }
  train_label[best]
}
# Nearest-neighbour classifier built on cl_nn_helper().
# Every training and test sample is labelled from the training sample it
# correlates with most strongly. Training samples are compared against the
# full training set, themselves included.
#   train_p / test_p  predictors, samples in rows
#   train_r / test_r  class labels
# Returns list(train = , test = ) of predicted-vs-actual contingency tables.
classifier3nn <- function(train_p, train_r, test_p, test_r) {
  nearest_label <- function(sample_row) {
    cl_nn_helper(sample_row, train_p, train_r)
  }

  predicted_train <- apply(train_p, 1, nearest_label)
  predicted_test <- apply(test_p, 1, nearest_label)

  list(
    train = table(Train_Predict = predicted_train, Train_Actual = train_r),
    test = table(Test_Predict = predicted_test, Test_Actual = test_r)
  )
}
In [7]:
# Correlation-based nearest-neighbour classifier on the PCA data.
paper3nn_pca <- classifier3nn(
  train_p = pca_train[, -1],
  train_r = pca_train$response,
  test_p = pca_test[, -1],
  test_r = pca_test$response
)
paper3nn_pca$train
paper3nn_pca$test
In [8]:
# Correlation-based nearest-neighbour classifier on the PLS data.
paper3nn_pls <- classifier3nn(
  train_p = pls_train[, -1],
  train_r = pls_train$response,
  test_p = pls_test[, -1],
  test_r = pls_test$response
)
paper3nn_pls$train
paper3nn_pls$test
In [9]:
# Linear SVM (e1071::svm with kernel = "linear").
#   train_p / test_p  predictors, samples in rows
#   train_r / test_r  class labels
# Returns list(train = , test = ) of predicted-vs-actual contingency tables.
classifier3lsvm <- function(train_p, train_r, test_p, test_r) {
  r_train <- data.frame(train_p, Y = factor(train_r))
  r_test <- data.frame(test_p, Y = factor(test_r))

  # svm() fits a radial-basis kernel unless told otherwise, so the linear
  # kernel has to be requested explicitly.
  svm_linear <- svm(Y ~ ., data = r_train, kernel = "linear")

  svm_l_trpr <- predict(svm_linear, r_train)
  svm_l_tepr <- predict(svm_linear, newdata = r_test)

  list(
    train = table(Train_Predicted = svm_l_trpr, Train_Actual = train_r),
    test = table(Test_Predicted = svm_l_tepr, Test_Actual = test_r)
  )
}
In [10]:
# classifier3lsvm on the PCA data.
paper3lsvm_pca <- classifier3lsvm(
  train_p = pca_train[, -1],
  train_r = pca_train$response,
  test_p = pca_test[, -1],
  test_r = pca_test$response
)
paper3lsvm_pca$train
paper3lsvm_pca$test
In [11]:
# classifier3lsvm on the PLS data.
paper3lsvm_pls <- classifier3lsvm(
  train_p = pls_train[, -1],
  train_r = pls_train$response,
  test_p = pls_test[, -1],
  test_r = pls_test$response
)
paper3lsvm_pls$train
paper3lsvm_pls$test
In [12]:
# Quadratic SVM: e1071::svm with a degree-2 polynomial kernel
# (gamma = 0.01, coef0 = 100).
#   train_p / test_p  predictors, samples in rows
#   train_r / test_r  class labels
# Returns list(train = , test = ) of predicted-vs-actual contingency tables.
classifier3qsvm <- function(train_p, train_r, test_p, test_r) {
  fit_frame <- data.frame(train_p, Y = factor(train_r))
  eval_frame <- data.frame(test_p, Y = factor(test_r))

  quad_model <- svm(
    Y ~ .,
    data = fit_frame,
    kernel = "polynomial",
    degree = 2,
    gamma = 0.01,
    coef0 = 100
  )

  fitted_train <- predict(quad_model, fit_frame)
  fitted_test <- predict(quad_model, newdata = eval_frame)

  list(
    train = table(Train_Predicted = fitted_train, Train_Actual = train_r),
    test = table(Test_Predicted = fitted_test, Test_Actual = test_r)
  )
}
In [13]:
# Quadratic-kernel SVM on the PCA data.
paper3qsvm_pca <- classifier3qsvm(
  train_p = pca_train[, -1],
  train_r = pca_train$response,
  test_p = pca_test[, -1],
  test_r = pca_test$response
)
paper3qsvm_pca$train
paper3qsvm_pca$test
In [14]:
# Quadratic-kernel SVM on the PLS data.
paper3qsvm_pls <- classifier3qsvm(
  train_p = pls_train[, -1],
  train_r = pls_train$response,
  test_p = pls_test[, -1],
  test_r = pls_test$response
)
paper3qsvm_pls$train
paper3qsvm_pls$test
In [15]:
# AdaBoost classifier (fastAdaboost::adaboost) with 100 boosting iterations.
#   train_p / test_p  predictors, samples in rows
#   train_r / test_r  class labels
# Returns list(train = , test = ) of predicted-vs-actual contingency tables.
classifier3Ada <- function(train_p, train_r, test_p, test_r) {
  fit_frame <- data.frame(train_p, Y = factor(train_r))
  eval_frame <- data.frame(test_p, Y = factor(test_r))

  boosted <- adaboost(Y ~ ., data = fit_frame, nIter = 100)

  # predict() returns a list; only its `class` element is tabulated.
  fitted_train <- predict(boosted, fit_frame)
  fitted_test <- predict(boosted, newdata = eval_frame)

  list(
    train = table(Train_Predict = fitted_train$class, Train_Actual = train_r),
    test = table(Test_Predict = fitted_test$class, Test_Actual = test_r)
  )
}
In [16]:
# AdaBoost on the PCA data.
paper3Ada_pca <- classifier3Ada(
  train_p = pca_train[, -1],
  train_r = pca_train$response,
  test_p = pca_test[, -1],
  test_r = pca_test$response
)
paper3Ada_pca$train
paper3Ada_pca$test
In [17]:
# AdaBoost on the PLS data.
paper3Ada_pls <- classifier3Ada(
  train_p = pls_train[, -1],
  train_r = pls_train$response,
  test_p = pls_test[, -1],
  test_r = pls_test$response
)
paper3Ada_pls$train
paper3Ada_pls$test
In [18]:
# Logistic regression fitted through caret::train (method = "glm",
# family = "binomial") with leave-one-out cross-validation.
#   train_p / test_p  predictors, samples in rows
#   train_r / test_r  class labels
# Returns list(train = , test = , re = ):
#   train, test  predicted-vs-actual contingency tables
#   re           named vector c(LOOCV = <CV accuracy>, Test = <test accuracy>)
classifier6logit <- function(train_p, train_r, test_p, test_r) {
  fit_frame <- data.frame(response = train_r, train_p)
  fit <- train(
    response ~ .,
    data = fit_frame,
    method = "glm",
    family = "binomial",
    trControl = trainControl(method = "LOOCV")
  )

  fitted_train <- predict(fit)
  fitted_test <- predict(fit, data.frame(test_p))
  test_accuracy <- mean(fitted_test == test_r)

  list(
    train = table(Train_Predict = fitted_train, Train_Actual = train_r),
    test = table(Test_Predict = fitted_test, Test_Actual = test_r),
    re = c(LOOCV = fit$results$Accuracy, Test = test_accuracy)
  )
}
# Quadratic discriminant analysis fitted through caret::train
# (method = "qda") with leave-one-out cross-validation.
#   train_p / test_p  predictors, samples in rows
#   train_r / test_r  class labels
# Returns list(train = , test = , re = ):
#   train, test  predicted-vs-actual contingency tables
#   re           named vector c(LOOCV = <CV accuracy>, Test = <test accuracy>)
classifier6qda <- function(train_p, train_r, test_p, test_r) {
  fit_frame <- data.frame(response = train_r, train_p)
  fit <- train(
    response ~ .,
    data = fit_frame,
    method = "qda",
    trControl = trainControl(method = "LOOCV")
  )

  fitted_train <- predict(fit)
  fitted_test <- predict(fit, data.frame(test_p))
  test_accuracy <- mean(fitted_test == test_r)

  list(
    train = table(Train_Predict = fitted_train, Train_Actual = train_r),
    test = table(Test_Predict = fitted_test, Test_Actual = test_r),
    re = c(LOOCV = fit$results$Accuracy, Test = test_accuracy)
  )
}
In [19]:
# Logistic regression and QDA on the PCA and PLS data.
# Warnings are silenced only while these four models are fitted; the previous
# setting is restored afterwards so later cells report their warnings again.
old_opts <- options(warn = -1)

pca_logit <- classifier6logit(
  pca_train[, -1], pca_train$response, pca_test[, -1], pca_test$response
)
print("pca, logit")
pca_logit$train
pca_logit$test

pca_qda <- classifier6qda(
  pca_train[, -1], pca_train$response, pca_test[, -1], pca_test$response
)
print("pca, qda")
pca_qda$train
pca_qda$test

pls_logit <- classifier6logit(
  pls_train[, -1], pls_train$response, pls_test[, -1], pls_test$response
)
print("pls, logit")
pls_logit$train
pls_logit$test

pls_qda <- classifier6qda(
  pls_train[, -1], pls_train$response, pls_test[, -1], pls_test$response
)
print("pls, qda")
pls_qda$train
pls_qda$test

# Put the warning level back to what it was before this cell.
options(old_opts)
In [20]:
# Correlation distance: 1 minus the Pearson correlation between `test` and
# each row of `predictor`.
#   predictor  reference samples, one per row
#   test       numeric vector to compare against every row
# Returns one distance per row of `predictor`.
Distance <- function(predictor, test) {
  similarity <- apply(predictor, 1, function(sample_row) cor(sample_row, test))
  1 - similarity
}
# k-nearest-neighbour vote for a single sample.
#   test      numeric vector, the sample to classify
#   pk        number of neighbours to consult
#   learning  reference samples, one per row
#   response  labels aligned with the rows of `learning`
# Returns "AML" when strictly more of the pk closest rows (by Distance()) are
# labelled "AML" than "ALL", otherwise "ALL".
paper9_nn <- function(test, pk, learning, response) {
  closest <- order(Distance(learning, test))[1:pk]
  neighbour_labels <- response[closest]

  n_aml <- sum(neighbour_labels == "AML")
  n_all <- sum(neighbour_labels == "ALL")

  ifelse(n_aml > n_all, "AML", "ALL")
}
# Leave-one-out cross-validation error of the k-NN rule, used to tune k.
#   pk        number of neighbours to evaluate
#   learning  training samples, one per row
#   response  labels aligned with the rows of `learning`
# Returns the number of held-out samples that paper9_nn() misclassifies.
mycv <- function(pk, learning, response) {
  n_errors <- 0
  for (i in seq_len(nrow(learning))) {
    predicted <- paper9_nn(
      learning[i, ], pk,
      learning[-i, , drop = FALSE], response[-i]
    )
    # Count a miss, not a hit: the caller minimises this value, so counting
    # correct predictions here selected the worst-performing k.
    n_errors <- n_errors + (predicted != response[i])
  }
  n_errors
}
# k-nearest-neighbour classifier with k tuned over the odd values 1..21.
# The k giving the smallest mycv() value is used to label both the training
# and the test samples against the full training set.
#   train_p / test_p  predictors, samples in rows
#   train_r / test_r  class labels
# Returns list(train = , test = ) of predicted-vs-actual contingency tables.
classifier9nn <- function(train_p, train_r, test_p, test_r) {
  candidate_k <- seq(1, 21, 2)
  cv_score <- vapply(
    candidate_k, mycv, numeric(1),
    learning = train_p, response = train_r
  )
  chosen_k <- candidate_k[which.min(cv_score)]

  fitted_train <- apply(train_p, 1, paper9_nn, chosen_k, train_p, train_r)
  fitted_test <- apply(test_p, 1, paper9_nn, chosen_k, train_p, train_r)

  list(
    train = table(Train_Predict = fitted_train, Train_Actual = train_r),
    test = table(Test_Predict = fitted_test, Test_Actual = test_r)
  )
}
In [21]:
# Tuned k-NN on the PCA data; predictors are passed as matrices.
paper9nn_pca <- classifier9nn(
  train_p = as.matrix(pca_train[, -1]),
  train_r = pca_train$response,
  test_p = as.matrix(pca_test[, -1]),
  test_r = pca_test$response
)
paper9nn_pca$train
paper9nn_pca$test
In [22]:
# Tuned k-NN on the PLS data; predictors are passed as matrices.
paper9nn_pls <- classifier9nn(
  train_p = as.matrix(pls_train[, -1]),
  train_r = pls_train$response,
  test_p = as.matrix(pls_test[, -1]),
  test_r = pls_test$response
)
paper9nn_pls$train
paper9nn_pls$test
In [23]:
# Single classification tree (tree::tree) fitted on all predictors.
#   train_p / test_p  predictors, samples in rows
#   train_r / test_r  class labels
# Returns list(train = , test = ) of predicted-vs-actual contingency tables.
classifier9dt <- function(train_p, train_r, test_p, test_r) {
  fit_frame <- data.frame(response = factor(train_r), train_p)
  fitted_tree <- tree(response ~ ., data = fit_frame)

  fitted_train <- predict(fitted_tree, data.frame(train_p), type = "class")
  fitted_test <- predict(fitted_tree, data.frame(test_p), type = "class")

  list(
    train = table(Train_Predict = fitted_train, Train_Actual = train_r),
    test = table(Test_Predict = fitted_test, Test_Actual = test_r)
  )
}
In [24]:
# Classification tree on the PCA data.
paper9dt_pca <- classifier9dt(
  train_p = pca_train[, -1],
  train_r = pca_train$response,
  test_p = pca_test[, -1],
  test_r = pca_test$response
)
paper9dt_pca$train
paper9dt_pca$test
In [25]:
# Classification tree on the PLS data.
paper9dt_pls <- classifier9dt(
  train_p = pls_train[, -1],
  train_r = pls_train$response,
  test_p = pls_test[, -1],
  test_r = pls_test$response
)
paper9dt_pls$train
paper9dt_pls$test
In [26]:
# One bagging round: fit a tree on a bootstrap resample of `train` and
# predict the class of every row of `test`.
#   train  data frame with a `response` column plus the predictors
#   test   data frame of predictors to classify
# Returns the predicted classes for `test`.
my_baghelper <- function(train, test) {
  boot_rows <- sample(nrow(train), replace = TRUE)
  boot_tree <- tree(response ~ ., data = train[boot_rows, ])
  predict(boot_tree, test, type = "class")
}
# Bagged classification trees with majority voting.
# B bootstrap trees are grown via my_baghelper() for the training predictions
# and another B for the test predictions. A sample is called "AML" only when
# strictly more trees vote "AML" than "ALL"; otherwise it is called "ALL".
#   train_p / test_p  predictors, samples in rows
#   train_r / test_r  class labels
#   B                 number of bootstrap trees per prediction set
# Returns list(train = , test = ) of predicted-vs-actual contingency tables.
classifier9bg <- function(train_p, train_r, test_p, test_r, B = 50) {
  labelled <- data.frame(response = factor(train_r), train_p)
  majority <- function(votes) {
    ifelse(sum(votes == "AML") > sum(votes == "ALL"), "AML", "ALL")
  }

  # One column of votes per bootstrap tree, one row per sample.
  train_votes <- replicate(B, my_baghelper(labelled, data.frame(train_p)))
  pred_tr <- apply(train_votes, 1, majority)

  test_votes <- replicate(B, my_baghelper(labelled, data.frame(test_p)))
  pred_te <- apply(test_votes, 1, majority)

  list(
    train = table(Train_predict = pred_tr, Train_Actual = train_r),
    test = table(Test_predict = pred_te, Test_Actual = test_r)
  )
}
In [27]:
# Bagged trees on the PCA data (default B).
paper9bg_pca <- classifier9bg(
  train_p = pca_train[, -1],
  train_r = pca_train$response,
  test_p = pca_test[, -1],
  test_r = pca_test$response
)
paper9bg_pca$train
paper9bg_pca$test
In [28]:
# Bagged trees on the PLS data (default B).
paper9bg_pls <- classifier9bg(
  train_p = pls_train[, -1],
  train_r = pls_train$response,
  test_p = pls_test[, -1],
  test_r = pls_test$response
)
paper9bg_pls$train
paper9bg_pls$test
In [29]:
# Convex pseudo-data: a random convex combination of two sample sets.
#   x1, x2  predictor sets of the same shape, samples in rows
#   d       upper bound of the uniform weight given to x1
# One weight per row is drawn from U(0, d); the result is
# weight * x1 + (1 - weight) * x2.
CPD <- function(x1, x2, d = 0.75) {
  mix_weight <- runif(nrow(x1), min = 0, max = d)
  mix_weight * x1 + (1 - mix_weight) * x2
}
# Helper for one bagging round with convex pseudo-data (CPD).
# Two bootstrap index sets are drawn, their predictors are blended by CPD(),
# a tree is fitted on the blended samples and used to classify `test`.
#   train  data frame whose FIRST column is `response`, followed by predictors
#   test   data frame of predictors to classify
# Returns the predicted classes for `test`.
my_cpdhelper <- function(train, test) {
  id1 <- sample(nrow(train), replace = TRUE)
  id2 <- sample(nrow(train), replace = TRUE)
  # CPD() weights the first argument by a ~ U(0, 0.75) and the second by
  # 1 - a, so train[id2, ] is the dominant parent of each pseudo-sample.
  temp <- CPD(train[id1, -1], train[id2, -1])
  # The pseudo-sample keeps the label of the parent weighted 1 - a, as in
  # the CPD scheme; labelling from id1 attached the minor parent's class.
  pseudo <- data.frame(temp, response = train$response[id2])
  temp_md <- tree(response ~ ., data = pseudo)
  predict(temp_md, test, type = "class")
}
# Bagged classification trees grown on convex pseudo-data, majority voting.
# B trees are grown via my_cpdhelper() for the training predictions and
# another B for the test predictions. A sample is called "AML" only when
# strictly more trees vote "AML" than "ALL"; otherwise it is called "ALL".
#   train_p / test_p  predictors, samples in rows
#   train_r / test_r  class labels
#   B                 number of trees per prediction set
# Returns list(train = , test = ) of predicted-vs-actual contingency tables.
classifier9bgcpd <- function(train_p, train_r, test_p, test_r, B = 50) {
  labelled <- data.frame(response = factor(train_r), train_p)
  majority <- function(votes) {
    ifelse(sum(votes == "AML") > sum(votes == "ALL"), "AML", "ALL")
  }

  # One column of votes per tree, one row per sample.
  train_votes <- replicate(B, my_cpdhelper(labelled, data.frame(train_p)))
  pred_tr <- apply(train_votes, 1, majority)

  test_votes <- replicate(B, my_cpdhelper(labelled, data.frame(test_p)))
  pred_te <- apply(test_votes, 1, majority)

  list(
    train = table(Train_predict = pred_tr, Train_Actual = train_r),
    test = table(Test_predict = pred_te, Test_Actual = test_r)
  )
}
In [30]:
# Bagging with convex pseudo-data on the PCA data.
# Uses classifier9bgcpd; this cell previously re-ran plain bagging
# (classifier9bg) under the CPD result name.
paper9bgcpd_pca <- classifier9bgcpd(
  pca_train[, -1], pca_train$response, pca_test[, -1], pca_test$response
)
paper9bgcpd_pca$train
paper9bgcpd_pca$test
In [31]:
# Bagging with convex pseudo-data on the PLS data.
# Uses classifier9bgcpd; this cell previously re-ran plain bagging
# (classifier9bg) under the CPD result name.
paper9bgcpd_pls <- classifier9bgcpd(
  pls_train[, -1], pls_train$response, pls_test[, -1], pls_test$response
)
paper9bgcpd_pls$train
paper9bgcpd_pls$test
In [32]:
# Linear discriminant analysis via MASS::lda.
#   train_p / test_p  predictors, samples in rows
#   train_r / test_r  class labels
#   B                 accepted but not used by this classifier
# Returns list(train = , test = ) of predicted-vs-actual contingency tables.
classifier9flda <- function(train_p, train_r, test_p, test_r, B = 50) {
  fit_frame <- data.frame(response = factor(train_r), train_p)
  lda_fit <- MASS::lda(response ~ ., data = fit_frame)

  # predict() returns a list; only the `class` element is tabulated.
  fitted_train <- predict(lda_fit, data.frame(train_p))$class
  fitted_test <- predict(lda_fit, data.frame(test_p))$class

  list(
    train = table(Train_predict = fitted_train, Train_Actual = train_r),
    test = table(Test_predict = fitted_test, Test_Actual = test_r)
  )
}
In [33]:
# LDA on the PCA data.
# Uses classifier9flda; this cell previously called the bagging classifier
# (classifier9bg) by mistake.
paper9flda_pca <- classifier9flda(
  pca_train[, -1], pca_train$response, pca_test[, -1], pca_test$response
)
paper9flda_pca$train
paper9flda_pca$test
In [34]:
# LDA on the PLS data.
# Uses classifier9flda; this cell previously called the bagging classifier
# (classifier9bg) by mistake.
paper9flda_pls <- classifier9flda(
  pls_train[, -1], pls_train$response, pls_test[, -1], pls_test$response
)
paper9flda_pls$train
paper9flda_pls$test
In [35]:
# Diagonal linear discriminant analysis via dlda().
#   train_p / test_p  predictors, samples in rows
#   train_r / test_r  class labels
#   B                 accepted but not used by this classifier
# Returns list(train = , test = ) of predicted-vs-actual contingency tables.
classifier9dlda <- function(train_p, train_r, test_p, test_r, B = 50) {
  fit_frame <- data.frame(response = factor(train_r), train_p)
  dlda_fit <- dlda(response ~ ., data = fit_frame)

  # Only the `class` element of each prediction is tabulated.
  fitted_train <- predict(dlda_fit, data.frame(train_p))$class
  fitted_test <- predict(dlda_fit, data.frame(test_p))$class

  list(
    train = table(Train_predict = fitted_train, Train_Actual = train_r),
    test = table(Test_predict = fitted_test, Test_Actual = test_r)
  )
}
In [36]:
# Diagonal LDA on the PCA data.
# Uses classifier9dlda; this cell previously called the bagging classifier
# (classifier9bg) by mistake.
paper9dlda_pca <- classifier9dlda(
  pca_train[, -1], pca_train$response, pca_test[, -1], pca_test$response
)
paper9dlda_pca$train
paper9dlda_pca$test
In [37]:
# Diagonal LDA on the PLS data.
# Uses classifier9dlda; this cell previously called the bagging classifier
# (classifier9bg) by mistake.
paper9dlda_pls <- classifier9dlda(
  pls_train[, -1], pls_train$response, pls_test[, -1], pls_test$response
)
paper9dlda_pls$train
paper9dlda_pls$test
In [38]:
# Diagonal quadratic discriminant analysis via dqda().
#   train_p / test_p  predictors, samples in rows
#   train_r / test_r  class labels
#   B                 accepted but not used by this classifier
# Returns list(train = , test = ) of predicted-vs-actual contingency tables.
classifier9dqda <- function(train_p, train_r, test_p, test_r, B = 50) {
  cbine_data <- data.frame(response = factor(train_r), train_p)
  # Fit the quadratic (per-class variance) model; this function previously
  # called dlda(), which made it a duplicate of classifier9dlda.
  dqda_md <- dqda(response ~ ., data = cbine_data)

  dqda_tr <- predict(dqda_md, data.frame(train_p))$class
  dqda_te <- predict(dqda_md, data.frame(test_p))$class

  list(
    train = table(Train_predict = dqda_tr, Train_Actual = train_r),
    test = table(Test_predict = dqda_te, Test_Actual = test_r)
  )
}
In [39]:
# Diagonal QDA on the PCA data.
# Uses classifier9dqda; this cell previously called the bagging classifier
# (classifier9bg) by mistake.
paper9dqda_pca <- classifier9dqda(
  pca_train[, -1], pca_train$response, pca_test[, -1], pca_test$response
)
paper9dqda_pca$train
paper9dqda_pca$test
In [40]:
# Diagonal QDA on the PLS data.
# Uses classifier9dqda; this cell previously called the bagging classifier
# (classifier9bg) by mistake.
paper9dqda_pls <- classifier9dqda(
  pls_train[, -1], pls_train$response, pls_test[, -1], pls_test$response
)
paper9dqda_pls$train
paper9dqda_pls$test
In [41]:
# Naive-Bayes-shaped Bayesian network (bnlearn): one `class` node with an arc
# to every predictor, predicted with likelihood weighting ("bayes-lw").
#   train_p / test_p  numeric predictors with column names, samples in rows
#   train_r / test_r  class labels
#   B                 accepted but not used by this classifier
# Returns list(train = , test = ) of predicted-vs-actual contingency tables.
classifier29 <- function(train_p, train_r, test_p, test_r, B = 50) {
  # bnlearn accepts only numeric or factor columns, so coerce the labels.
  train_data <- data.frame(train_p, class = factor(train_r))
  test_data <- data.frame(test_p)

  features <- colnames(train_p)
  eg <- empty.graph(c("class", features))
  # One class -> feature arc per predictor. The arc count follows the number
  # of predictors instead of the previously hard-coded 3.
  arcs(eg) <- matrix(
    c(rep("class", length(features)), features),
    ncol = 2, byrow = FALSE,
    dimnames = list(NULL, c("from", "to"))
  )

  fitted <- bn.fit(eg, train_data)
  predict_tr <- predict(fitted, node = "class", method = "bayes-lw", train_data)
  predict_te <- predict(fitted, node = "class", method = "bayes-lw", test_data)

  list(
    train = table(Train_predict = predict_tr, Train_Actual = train_r),
    test = table(Test_predict = predict_te, Test_Actual = test_r)
  )
}
In [42]:
# Bayesian-network classifier on the PCA data; every predictor column is
# converted with as.numeric before being passed in.
paper29_pca <- classifier29(
  train_p = data.frame(apply(pca_train[, -1], 2, as.numeric)),
  train_r = pca_train$response,
  test_p = data.frame(apply(pca_test[, -1], 2, as.numeric)),
  test_r = pca_test$response
)
paper29_pca$train
paper29_pca$test
In [43]:
# Bayesian-network classifier on the PLS data; every predictor column is
# converted with as.numeric before being passed in.
# This cell previously passed the PCA frames, so it duplicated paper29_pca.
paper29_pls <- classifier29(
  data.frame(apply(pls_train[, -1], 2, as.numeric)),
  pls_train$response,
  data.frame(apply(pls_test[, -1], 2, as.numeric)),
  pls_test$response
)
paper29_pls$train
paper29_pls$test