In [1]:
# suppressMessages(library(golubEsets))
# data(Golub_Merge)
# golub_merge_predict = t(exprs(Golub_Merge))
# golub_merge_response =pData(Golub_Merge)[, "ALL.AML"]
# #Training data predictor and response
# data(Golub_Train)
# golub_train_predict = t(exprs(Golub_Train))
# golub_train_response =pData(Golub_Train)[, "ALL.AML"]
# #Testing data predictor
# data(Golub_Test)
# golub_test_predict = t(exprs(Golub_Test))
# golub_test_response = pData(Golub_Test)[, "ALL.AML"]
# save(golub_train_predict, golub_train_response, golub_test_predict, golub_test_response, golub_merge_predict, golub_merge_response, file = "GolubData.rda")
In [2]:
# golub_train_predict, golub_train_response, golub_test_predict, golub_test_response
load("GolubData.rda")
suppressMessages(library(e1071))
suppressMessages(library(fastAdaboost))
suppressMessages(library(caret))
suppressMessages(library(sparsediscrim))
suppressMessages(library(tree))
suppressMessages(library(fastAdaboost))
suppressMessages(library(bnlearn))
library(ropls)
library(MASS)
set.seed(2017)
In [3]:
# Paper 1
## Helper Functions
golub_filter = function(x, r = 5, d=500){
minval = min(x)
maxval = max(x)
(maxval/minval>r)&&(maxval-minval>d)
}
## Neighbourhood analysis
### signal-to-noise ratio/PS in the paper
get_p = function(train_d, train_r){
tr_m_aml = colMeans(train_d[train_r == "AML",])
tr_sd_aml = apply(train_d[train_r == "AML",], 2, sd)
tr_m_all = colMeans(train_d[train_r == "ALL",])
tr_sd_all = apply(train_d[train_r == "ALL",], 2, sd)
p = (tr_m_aml-tr_m_all)/(tr_sd_aml+tr_sd_all)
return(p)
}
PPFS1 = function(golub_train_predict, golub_train_response, golub_test_predict, golub_test_response){
"carry out preprocessing on original Golub data and apply feature selection as both were done in Paper 1;
output is called TransformedData1"
# Preprocessing
## Thresholding
golub_train_pp = golub_train_predict
golub_train_pp[golub_train_pp<100] = 100
golub_train_pp[golub_train_pp>16000] = 16000
## Filtering
index = apply(golub_train_pp, 2, golub_filter)
golub_index = (1:7129)[index]
golub_train_pp = golub_train_pp[, golub_index]
golub_test_pp = golub_test_predict
golub_test_pp[golub_test_pp<100] = 100
golub_test_pp[golub_test_pp>16000] = 16000
golub_test_pp = golub_test_pp[, golub_index]
## Log Transformation
golub_train_p_trans = log10(golub_train_pp)
golub_test_p_trans = log10(golub_test_pp)
## Normalization
train_m = colMeans(golub_train_p_trans)
train_sd = apply(golub_train_p_trans, 2, sd)
golub_train_p_trans = t((t(golub_train_p_trans)-train_m)/train_sd)
golub_test_p_trans = t((t(golub_test_p_trans)-train_m)/train_sd)
golub_train_3051 = golub_train_p_trans
golub_test_3051 = golub_test_p_trans
# Feature Selection
nna = matrix(0, 400, 3051)
set.seed(201702)
## Permutation test
for(i in 1:400){
t_r = sample(golub_train_response)
nna[i, ] = get_p(golub_train_p_trans, t_r)
}
## Predictor selection based on the result of Neighbourhood analysis
nna_q = apply(nna, 2, quantile, prob = c(0.005, 0.995))
p = get_p(golub_train_p_trans, golub_train_response)
## Keep the one with 0.01 significant level
index_1 = (1:3051)[p>=nna_q[2,] | p<=nna_q[1,]]
golub_train_p_trans = golub_train_p_trans[, index_1]
train_m_aml = colMeans(golub_train_p_trans[golub_train_response == "AML",])
train_m_all = colMeans(golub_train_p_trans[golub_train_response =="ALL",])
golub_test_p_trans =golub_test_p_trans[, index_1]
p = p[index_1]
## 50 most informative genes
cl_index = c(head(order(p), 25), head(order(p, decreasing = T), 25))
p_50 = p[cl_index]
b = (train_m_aml[cl_index]+train_m_all[cl_index])/2
train_cl = golub_train_p_trans[, cl_index]
test_cl = golub_test_p_trans[, cl_index]
golub_train_50 = train_cl
golub_test_50 = test_cl
train_vote = t(p_50*t(sweep(train_cl, 2, b)))
TransformedData1 = list(train_predict = golub_train_50, test_predict = golub_test_50, train_response = golub_train_response, test_response = golub_test_response)
return(TransformedData1)
}
In [4]:
# Paper 3
## Helper Functions
### TNoM score
r_score = function(slabel){
total = length(slabel)
n = sum(slabel == 0)
p = sum(slabel == 1)
temp = min(n, p)
d = ifelse(n < p, 1, -1)
for(i in 1:(total-1)){
count = sum(slabel[1:i] == 0)
t_t = min(n-2*count+i, p+2*count-i)
t_d = ifelse((n-2*count+i)<(p+2*count-i),1,-1)
if(t_t < temp){
temp = t_t
d = t_d
}
}
c(temp, t_d)
}
### Significance (using bootstrap with size 1000 to replace)
r_bootstrap = function(gene, label){
total = length(label)
index = order(gene)
s_l = label[index]
score= r_score(s_l)
dist_score = numeric(200)
for(i in 1:200){
temp = sample(1:total)
t = r_score(label[temp])
dist_score[i] = t[1]
}
prob = mean(dist_score<score[1])
c(score[1], score[2], prob)
}
PPFS3 = function(golub_train_predict, golub_train_response, golub_test_predict, golub_test_response){
"carry out preprocessing on original Golub data and apply feature selection as both were done in Paper 3;
output is called TransformedData3"
golub_train_l = ifelse(golub_train_response == "AML", 1, 0)
golub_test_l = ifelse(golub_test_response == "AML", 1, 0)
set.seed(201703)
# perform the caculation, this may take a while since the inevitable loops in above functions
a = apply(golub_train_predict, 2, r_bootstrap, label = golub_train_l)
# select informative genes and subset the train and test data
index = (1:7129)[a[1,]<14 & a[3,]<0.01]
b = order(a[1,index])[1:50]
# data subsetting
train_cl = golub_train_predict[, index]
test_cl = golub_test_predict[, index]
train_paper3 = train_cl[, b]
test_paper3 = test_cl[, b]
TransformedData3 = list(train_predict = train_paper3, test_predict = test_paper3, train_response = golub_train_response, test_response = golub_test_response)
return(TransformedData3)
}
In [5]:
# Paper 6
PPFS6PCA = function(golub_train_predict, golub_train_response, golub_test_predict, golub_test_response, K = 3){
"carry out preprocessing on original Golub data and apply PCA feature selection as both were done in Paper 6;
output is called TransformedData6PCA"
# First Same 50 genes as in PPFS1
rslt = PPFS1(golub_train_predict, golub_train_response, golub_test_predict, golub_test_response)
golub_train_p = rslt$train_predict
golub_train_r = rslt$train_response
golub_test_p = rslt$test_predict
golub_test_r = rslt$test_response
pca_rslt= getLoadingMN(opls(golub_train_p, printL = F, predI = K))
pca_train_s = t(t(pca_rslt)%*%t(golub_train_p))
pca_test_s = t(t(pca_rslt)%*%t(golub_test_p))
TransformedData6PCA = list(train_predict = pca_train_s, test_predict = pca_test_s, train_response = golub_train_r, test_response = golub_test_r)
return(TransformedData6PCA)
}
PPFS6PLS = function(golub_train_predict, golub_train_response, golub_test_predict, golub_test_response, K = 3){
"carry out preprocessing on original Golub data and apply PLS feature selection as both were done in Paper 6;
output is called TransformedData6PLS"
# First Same 50 genes as in PPFS1
rslt = PPFS1(golub_train_predict, golub_train_response, golub_test_predict, golub_test_response)
golub_train_p = rslt$train_predict
golub_train_r = rslt$train_response
golub_test_p = rslt$test_predict
golub_test_r = rslt$test_response
pls_rslt = getLoadingMN(opls(golub_train_p, golub_train_r, printL = F, predI = K))
pls_train_s = t(t(pls_rslt)%*%t(golub_train_p))
pls_test_s = t(t(pls_rslt)%*%t(golub_test_p))
TransformedData6PLS = list(train_predict = pls_train_s, test_predict = pls_test_s, train_response = golub_train_r, test_response = golub_test_r)
return(TransformedData6PLS)
}
In [6]:
# Paper 9
## Helper Function
### implement function for calculating BW as stated in the paper(the ratio of between-group to within group sums of squares)
BW = function(predictor, response){
overall = colMeans(predictor)
ALL_mean = apply(predictor, 2, function(x) mean(x[response == "ALL"]))
AML_mean = apply(predictor, 2, function(x) mean(x[response == "AML"]))
numerator = sum(response == "ALL")*(ALL_mean-overall)^2+sum(response == "AML")*(AML_mean-overall)^2
denumerator = colSums((t(t(predictor[response == "ALL", ])-ALL_mean))^2)+colSums((t(t(predictor[response == "AML", ])-AML_mean))^2)
numerator/denumerator
}
PPFS9 = function(golub_train_predict, golub_train_response, golub_test_predict, golub_test_response, K = 3){
"carry out preprocessing on original Golub data and apply feature selection as both were done in Paper 9;
output is called TransformedData9"
n_train = nrow(golub_train_predict)
golub_merge_predict = rbind(golub_train_predict, golub_test_predict)
golub_merge_response = c(golub_train_response, golub_test_response)
golub_merge_l = ifelse(golub_merge_response == "AML", 1, 0)
n_total = nrow(golub_merge_predict)
#
## Thresholding
golub_merge_pp = golub_merge_predict
golub_merge_pp[golub_merge_pp<100] = 100
golub_merge_pp[golub_merge_pp>16000] = 16000
merge_index = apply(golub_merge_pp, 2, golub_filter)
golub_merge_index = (1:7129)[merge_index]
golub_merge_pp = golub_merge_pp[, golub_merge_index]
## Base 10 logarithmic transformation
golub_merge_p_trans = log10(golub_merge_pp)
# Further standardization to mean 0 variance 1.
scale_golub_merge = scale(golub_merge_p_trans)
# randomly split once to do feature selection for comparison for furthur study
train_p = scale_golub_merge[1:n_train,]
train_r = golub_train_response
test_p = scale_golub_merge[(n_train+1):n_total,]
test_r = golub_test_response
temp_bw = order(BW(train_p, train_r), decreasing = T)[1:50]
train_BW_predictor = train_p[,temp_bw]
test_BW_predictor = test_p[,temp_bw]
TransformedData9 = list(train_predict = train_BW_predictor, test_predict = test_BW_predictor, train_response = train_r, test_response = test_r)
return(TransformedData9)
}
In [7]:
# Paper 29
## Helper Function
### SNR: signal-to-noise ratio
get_SNR = function(train_d, train_r){
tr_m_aml = colMeans(train_d[train_r == "AML",])
tr_sd_aml = apply(train_d[train_r == "AML",], 2, sd)
tr_m_all = colMeans(train_d[train_r == "ALL",])
tr_sd_all = apply(train_d[train_r == "ALL",], 2, sd)
p = (tr_m_aml-tr_m_all)/(tr_sd_aml+tr_sd_all)
return(p)
}
### Kmeans clustering and then SNR ranking selection
get_kmeans = function(k, train_d, train_r){
cl = kmeans(t(train_d), k, iter.max=50)$cluster
result = numeric(k)
for(i in 1:k){
id = (cl == i)
oid = (1:ncol(train_d))[id]
iSNR = get_SNR(t(t(train_d)[id,]),train_r)
temp = which.max(abs(iSNR))
result[i] = oid[temp]
}
return(result)
}
PPFS29 = function(golub_train_predict, golub_train_response, golub_test_predict, golub_test_response, K = 50){
"carry out preprocessing on original Golub data and apply feature selection as both were done in Paper 29;
output is called TransformedData29"
# Thresholding
golub_train_pp = golub_train_predict
golub_train_pp[golub_train_pp<100] = 100
golub_train_pp[golub_train_pp>16000] = 16000
# Filtering
index = apply(golub_train_pp, 2, golub_filter)
golub_index = (1:7129)[index]
golub_train_pp = golub_train_pp[, golub_index]
golub_test_pp = golub_test_predict
golub_test_pp[golub_test_pp<100] = 100
golub_test_pp[golub_test_pp>16000] = 16000
golub_test_pp = golub_test_pp[, golub_index]
# Log Transformation
golub_train_p_trans = log10(golub_train_pp)
golub_test_p_trans = log10(golub_test_pp)
# Normalization
train_m = colMeans(golub_train_p_trans)
train_sd = apply(golub_train_p_trans, 2, sd)
golub_train_p_trans = t((t(golub_train_p_trans)-train_m)/train_sd)
golub_test_p_trans = t((t(golub_test_p_trans)-train_m)/train_sd)
kmeans_id = get_kmeans(K, golub_train_p_trans, golub_train_response)
train_kmeans = golub_train_p_trans[,kmeans_id]
test_kmeans = golub_test_p_trans[,kmeans_id]
TransformedData29 = list(train_predict = train_kmeans, test_predict = test_kmeans, train_response = golub_train_response, test_response = golub_test_response)
return(TransformedData29)
}
In [8]:
# Paper 1
## Helper function
get_p = function(train_d, train_r){
tr_m_aml = colMeans(train_d[train_r == "AML",])
tr_sd_aml = apply(train_d[train_r == "AML",], 2, sd)
tr_m_all = colMeans(train_d[train_r == "ALL",])
tr_sd_all = apply(train_d[train_r == "ALL",], 2, sd)
p = (tr_m_aml-tr_m_all)/(tr_sd_aml+tr_sd_all)
return(p)
}
classifier1 = function(train_p, train_r, test_p, test_r){
"carry out classification as in Paper 1;
output is called Rates1 and is the value in row 1, column_i
call this function 6 times using every TransformedData_i as input in turn to get a 1x6 output for row 1"
train_m_aml = colMeans(train_p[train_r == "AML",])
train_m_all = colMeans(train_p[train_r =="ALL",])
b = (train_m_aml+train_m_all)/2
p = get_p(train_p, train_r)
#train
train_vote = t(p*t(sweep(train_p, 2, b)))
train_V1 = apply(train_vote, 1, function(x) sum(x[x>0]))
train_V2 = abs(apply(train_vote, 1, function(x) sum(x[x<=0])))
train_pred = (train_V1-train_V2)/(train_V1+train_V2)
train_pred_r = ifelse(abs(train_pred)>0.3, ifelse(train_pred>0, "AML", "ALL"), "Uncertain")
train_table = table(Train_Predict = train_pred_r, Train_Actual = train_r)
##train_table
#test
test_vote = t(p*t(sweep(test_p, 2, b)))
test_V1 = apply(test_vote, 1, function(x) sum(x[x>0]))
test_V2 = abs(apply(test_vote, 1, function(x) sum(x[x<=0])))
test_pred = (test_V1-test_V2)/(test_V1+test_V2)
test_pred_r = ifelse(abs(test_pred)>0.3, ifelse(test_pred>0, "AML", "ALL"), "Uncertain")
test_table = table(Test_Predict = test_pred_r, Test_Actual = test_r)
##test accuracy rate
Rates1 = mean(test_pred_r == test_r)
return(Rates1)
}
In [9]:
# Paper 3
## NN
### helper function
cl_nn_helper = function(new_s, train, train_label){
# use Pearson correlation
corr = apply(train, 1, cor, new_s)
train_label[corr==max(corr)]
}
classifier3nn = function(train_p, train_r, test_p, test_r){
"carry out NN classification as in Paper 3;
output is called Rates3NN and is the value in row 2, column_i
call this function 6 times using every TransformedData_i as input in turn to get a 1x6 output for row 2"
nn_test_pr_pl = apply(test_p,1, cl_nn_helper, train_p, train_r)
Rates3NN = mean(nn_test_pr_pl == test_r)
return(Rates3NN)
}
## Linear SVM
classifier3lsvm = function(train_p, train_r, test_p, test_r){
"carry out linear SVM classification as in Paper 3;
output is called Rates3LSVM and is the value in row 3, column_i
call this function 6 times using every TransformedData_i as input in turn to get a 1x6 output for row 3"
r_train = data.frame(train_p, Y = factor(train_r))
r_test =data.frame( test_p, Y = factor(test_r))
svm_linear = svm(Y~., data = r_train)
svm_l_tepr = predict(svm_linear, newdata = r_test)
Rates3LSVM = mean(svm_l_tepr == test_r)
return(Rates3LSVM)
}
## Polynomial SVM
classifier3psvm = function(train_p, train_r, test_p, test_r){
"carry out polynomial degree 2 SVM classification as in Paper 3;
output is called Rates3PSVM and is the value in row 4, column_i
call this function 6 times using every TransformedData_i as input in turn to get a 1x6 output for row 4"
r_train = data.frame(train_p, Y = factor(train_r))
r_test =data.frame( test_p, Y = factor(test_r))
svm_quad = svm(Y~., data = r_train, kernel = "polynomial", degree = 2, gamma =0.01, coef0 = 100)
svm_q_tepr = predict(svm_quad, newdata = r_test)
Rates3PSVM = mean(svm_q_tepr == test_r)
return(Rates3PSVM)
}
## Adaboost
classifier3Ada = function(train_p, train_r, test_p, test_r){
"carry out Adaboost classification as in Paper 3;
output is called Rates3Ada and is the value in row 5, column_i
call this function 6 times using every TransformedData_i as input in turn to get a 1x6 output for row 5"
r_train_p1 = data.frame(train_p, Y = factor(train_r))
r_test_p1 =data.frame( test_p, Y = factor(test_r))
ada_cl_p1 = adaboost(Y~., data = r_train_p1, 100)
ada_test_pr_p1 = predict(ada_cl_p1, newdata = r_test_p1)
Rates3Ada = mean(ada_test_pr_p1$class == test_r)
return(Rates3Ada)
}
In [10]:
# Paper 6
classifier6logit = function(train_p, train_r, test_p, test_r, K = 3){
"carry out Logistic Discrimination as in Paper 6;
output is called Rates6Logit and is the value in row 6, column_i
call this function 6 times using every TransformedData_i as input in turn to get a 1x6 output for row 6"
pca_slt = getLoadingMN(opls(train_p, printL = F, predI = K))
pca_train_s = t(t(pca_slt)%*%t(train_p))
pca_test_s = t(t(pca_slt)%*%t(test_p))
train_data = data.frame(response = train_r, pca_train_s)
test_data = pca_test_s
ld_s = train(response~., data = train_data, method = "glm", family = "binomial", trControl = trainControl(method = "LOOCV"))
ld_te = predict(ld_s, data.frame(test_data))
Rates6Logit = mean(ld_te == test_r)
return(Rates6Logit)
}
classifier6qda = function(train_p, train_r, test_p, test_r, K = 3){
"carry out QDA as in Paper 6;
output is called Rates6QDA and is the value in row 7, column_i
call this function 6 times using every TransformedData_i as input in turn to get a 1x6 output for row 7"
pca_slt = getLoadingMN(opls(train_p, printL = F, predI = K))
pca_train_s = t(t(pca_slt)%*%t(train_p))
pca_test_s = t(t(pca_slt)%*%t(test_p))
train_data = data.frame(response = train_r, pca_train_s)
test_data = pca_test_s
qda_s = train(response~., data = train_data, method = "qda", trControl = trainControl(method = "LOOCV"))
qda_te = predict(qda_s, data.frame(test_data))
Rates6QDA = mean(qda_te == test_r)
return(Rates6QDA)
}
In [11]:
# Paper 9
## NN
### Distance measure used in the paper
Distance = function(predictor, test){
1- apply(predictor, 1, cor, test)
}
### NN classification process
paper9_nn = function(test, pk, learning, response){
distance = Distance(learning, test)
index = order(distance)[1:pk]
cl = ifelse(sum(response[index] == "AML")>sum(response[index]=="ALL"), "AML", "ALL")
cl
}
# leave-one-cross-validation to tune k
mycv= function(pk,learning,response){
error = 0
for(i in 1:nrow(learning)){
cl = paper9_nn(learning[i,], pk, learning[-i, ], response[-i])
error = error+(cl == response[i])
}
error
}
classifier9nn = function(train_p, train_r, test_p, test_r){
"carry out Nearest Neighbor as in Paper 9;
output is called Rates9NN and is the value in row 8, column_i
call this function 6 times using every TransformedData_i as input in turn to get a 1x6 output for row 8"
k = seq(1, 21, 2)
choose_k = sapply(k,mycv, learning = train_p, response= train_r)
nn_test = apply(test_p ,1, paper9_nn, k[which.min(choose_k)], train_p ,train_r)
Rates9NN = mean(nn_test == test_r)
return(Rates9NN)
}
## Decision Tree
### test_paper3 test_response train_paper3 train_response loaded
classifier9dt = function(train_p, train_r, test_p, test_r){
"carry out Decision Tree as in Paper 9;
output is called Rates9DT and is the value in row 9, column_i
call this function 6 times using every TransformedData_i as input in turn to get a 1x6 output for row 9"
cbine_data = data.frame(response = factor(train_r), train_p)
tree_mdl = tree(response~.,data = cbine_data)
tree_te = predict(tree_mdl, data.frame(test_p), type = "class")
Rates9DT = mean(tree_te == test_r)
return(Rates9DT)
}
## Bagging
my_baghelper = function(train, test){
bg = sample(nrow(train), replace = T)
temp_md = tree(response~., data = train[bg, ])
predict(temp_md, test, type = "class")
}
classifier9bg = function(train_p, train_r, test_p, test_r, B = 50){
"carry out Bagging as in Paper 9;
output is called Rates9BG and is the value in row 10, column_i
call this function 6 times using every TransformedData_i as input in turn to get a 1x6 output for row 10"
cbine_data = data.frame(response = factor(train_r), train_p)
t_te = replicate(B, my_baghelper(cbine_data, data.frame(test_p)))
pred_te = apply(t_te, 1, function(x) ifelse(sum(x == "AML")>sum(x =="ALL"), "AML", "ALL"))
Rates9BG = mean(pred_te == test_r)
return(Rates9BG)
}
## Bagging with CPD
CPD = function(x1, x2, d = 0.75){
a = runif(nrow(x1), 0, d)
a*x1+(1-a)*x2
}
### helper function for each bagging with CPD
my_cpdhelper = function(train, test){
id1 = sample(nrow(train), replace = T)
id2 = sample(nrow(train), replace = T)
temp = CPD(train[id1, -1], train[id2,-1])
temp_md = tree(response~., data = data.frame(temp, response = train$response[id1]))
predict(temp_md, test, type = "class")
}
classifier9bgcpd = function(train_p, train_r, test_p, test_r, B = 50){
"carry out Bagging with CPD as in Paper 9;
output is called Rates9BGCPD and is the value in row 11, column_i
call this function 6 times using every TransformedData_i as input in turn to get a 1x6 output for row 11"
cbine_data = data.frame(response = factor(train_r), train_p)
t_te = replicate(B, my_cpdhelper(cbine_data, data.frame(test_p)))
pred_te = apply(t_te, 1, function(x) ifelse(sum(x == "AML")>sum(x =="ALL"), "AML", "ALL"))
Rates9BGCPD = mean(pred_te == test_r)
return(Rates9BGCPD)
}
## FLDA
classifier9flda = function(train_p, train_r, test_p, test_r){
"carry out FLDA as in Paper 9;
output is called Rates9FLDA and is the value in row 12, column_i
call this function 6 times using every TransformedData_i as input in turn to get a 1x6 output for row 12"
cbine_data = data.frame(response = factor(train_r), train_p)
flda_md = MASS::lda(response~., data = cbine_data)
flda_te = predict(flda_md, data.frame(test_p))$class
Rates9FLDA = mean(flda_te == test_r)
return(Rates9FLDA)
}
## DLDA
classifier9dlda = function(train_p, train_r, test_p, test_r){
"carry out DLDA as in Paper 9;
output is called Rates9DLDA and is the value in row 13, column_i
call this function 6 times using every TransformedData_i as input in turn to get a 1x6 output for row 13"
cbine_data = data.frame(response = factor(train_r), train_p)
dlda_md = dlda(response~., data = cbine_data)
dlda_te = predict(dlda_md, data.frame(test_p))$class
Rates9DLDA = mean(dlda_te == test_r)
return(Rates9DLDA)
}
## DQDA
classifier9dqda = function(train_p, train_r, test_p, test_r){
"carry out DQDA as in Paper 9;
output is called Rates9DQDA and is the value in row 14, column_i
call this function 6 times using every TransformedData_i as input in turn to get a 1x6 output for row 14"
cbine_data = data.frame(response = factor(train_r), train_p)
dqda_md = dlda(response~., data = cbine_data)
dqda_te = predict(dqda_md, data.frame(test_p))$class
Rates9DQDA = mean(dqda_te == test_r)
return(Rates9DQDA)
}
In [12]:
# Paper 29
classifier29 = function(train_p, train_r, test_p, test_r, B = 50){
"carry out Bayesian Network as in Paper 29;
output is called Rates29BN and is the value in row 15, column_i
call this function 6 times using every TransformedData_i as input in turn to get a 1x6 output for row 15"
train_data = data.frame(train_p, class = train_r)
test_data = data.frame(test_p)
eg = empty.graph(c("class", colnames(train_p)))
arcs(eg) = matrix(c(rep("class", B),
colnames(train_p)),
ncol = 2, byrow = F,
dimnames = list(c(), c("from", "to")))
fitted = bn.fit(eg, train_data)
predict_te = predict(fitted, node = "class", method="bayes-lw", test_data)
Rates29BN = mean(predict_te == test_r)
return(Rates29BN)
}
In [13]:
# Selection Result
Paper1Selection = PPFS1(golub_train_predict, golub_train_response, golub_test_predict, golub_test_response)
Paper3Selection = PPFS3(golub_train_predict, golub_train_response, golub_test_predict, golub_test_response)
Paper6SelectionPCA = PPFS6PCA(golub_train_predict, golub_train_response, golub_test_predict, golub_test_response, K = 3)
Paper6SelectionPLS = PPFS6PLS(golub_train_predict, golub_train_response, golub_test_predict, golub_test_response, K = 3)
Paper9Selection = PPFS9(golub_train_predict, golub_train_response, golub_test_predict, golub_test_response, K = 3)
Paper29Selection = PPFS29(golub_train_predict, golub_train_response, golub_test_predict, golub_test_response, K = 50)
In [14]:
# Example for calculating Row 1.
Cell11 = classifier1(Paper1Selection$train_predict, Paper1Selection$train_response, Paper1Selection$test_predict, Paper1Selection$test_response)
Cell12 = classifier1(Paper3Selection$train_predict, Paper3Selection$train_response, Paper3Selection$test_predict, Paper3Selection$test_response)
Cell13 = classifier1(Paper6SelectionPCA$train_predict, Paper6SelectionPCA$train_response, Paper6SelectionPCA$test_predict, Paper6SelectionPCA$test_response)
Cell14 = classifier1(Paper6SelectionPLS$train_predict, Paper6SelectionPLS$train_response, Paper6SelectionPLS$test_predict, Paper6SelectionPLS$test_response)
Cell15 = classifier1(Paper9Selection$train_predict, Paper9Selection$train_response, Paper9Selection$test_predict, Paper9Selection$test_response)
Cell16 = classifier1(Paper29Selection$train_predict, Paper29Selection$train_response, Paper29Selection$test_predict, Paper29Selection$test_response)
round(c(Cell11, Cell12, Cell13, Cell14, Cell15, Cell16),4)
In [15]:
# Example for calculating Row 2.
Cell21 = classifier3nn(Paper1Selection$train_predict, Paper1Selection$train_response, Paper1Selection$test_predict, Paper1Selection$test_response)
Cell22 = classifier3nn(Paper3Selection$train_predict, Paper3Selection$train_response, Paper3Selection$test_predict, Paper3Selection$test_response)
Cell23 = classifier3nn(Paper6SelectionPCA$train_predict, Paper6SelectionPCA$train_response, Paper6SelectionPCA$test_predict, Paper6SelectionPCA$test_response)
Cell24 = classifier3nn(Paper6SelectionPLS$train_predict, Paper6SelectionPLS$train_response, Paper6SelectionPLS$test_predict, Paper6SelectionPLS$test_response)
Cell25 = classifier3nn(Paper9Selection$train_predict, Paper9Selection$train_response, Paper9Selection$test_predict, Paper9Selection$test_response)
Cell26 = classifier3nn(Paper29Selection$train_predict, Paper29Selection$train_response, Paper29Selection$test_predict, Paper29Selection$test_response)
round(c(Cell21, Cell22, Cell23, Cell24, Cell25, Cell26),4)
In [16]:
# Example for calculating Row 3.
Cell31 = classifier3lsvm(Paper1Selection$train_predict, Paper1Selection$train_response, Paper1Selection$test_predict, Paper1Selection$test_response)
Cell32 = classifier3lsvm(Paper3Selection$train_predict, Paper3Selection$train_response, Paper3Selection$test_predict, Paper3Selection$test_response)
Cell33 = classifier3lsvm(Paper6SelectionPCA$train_predict, Paper6SelectionPCA$train_response, Paper6SelectionPCA$test_predict, Paper6SelectionPCA$test_response)
Cell34 = classifier3lsvm(Paper6SelectionPLS$train_predict, Paper6SelectionPLS$train_response, Paper6SelectionPLS$test_predict, Paper6SelectionPLS$test_response)
Cell35 = classifier3lsvm(Paper9Selection$train_predict, Paper9Selection$train_response, Paper9Selection$test_predict, Paper9Selection$test_response)
Cell36 = classifier3lsvm(Paper29Selection$train_predict, Paper29Selection$train_response, Paper29Selection$test_predict, Paper29Selection$test_response)
round(c(Cell31, Cell32, Cell33, Cell34, Cell35, Cell36),4)
In [17]:
# Example for calculating Row 4.
Cell41 = classifier3psvm(Paper1Selection$train_predict, Paper1Selection$train_response, Paper1Selection$test_predict, Paper1Selection$test_response)
Cell42 = classifier3psvm(Paper3Selection$train_predict, Paper3Selection$train_response, Paper3Selection$test_predict, Paper3Selection$test_response)
Cell43 = classifier3psvm(Paper6SelectionPCA$train_predict, Paper6SelectionPCA$train_response, Paper6SelectionPCA$test_predict, Paper6SelectionPCA$test_response)
Cell44 = classifier3psvm(Paper6SelectionPLS$train_predict, Paper6SelectionPLS$train_response, Paper6SelectionPLS$test_predict, Paper6SelectionPLS$test_response)
Cell45 = classifier3psvm(Paper9Selection$train_predict, Paper9Selection$train_response, Paper9Selection$test_predict, Paper9Selection$test_response)
Cell46 = classifier3psvm(Paper29Selection$train_predict, Paper29Selection$train_response, Paper29Selection$test_predict, Paper29Selection$test_response)
round(c(Cell41, Cell42, Cell43, Cell44, Cell45, Cell46),4)
In [18]:
# Example for calculating Row 5.
Cell51 = classifier3Ada (Paper1Selection$train_predict, Paper1Selection$train_response, Paper1Selection$test_predict, Paper1Selection$test_response)
Cell52 = classifier3Ada (Paper3Selection$train_predict, Paper3Selection$train_response, Paper3Selection$test_predict, Paper3Selection$test_response)
Cell53 = classifier3Ada (Paper6SelectionPCA$train_predict, Paper6SelectionPCA$train_response, Paper6SelectionPCA$test_predict, Paper6SelectionPCA$test_response)
Cell54 = classifier3Ada (Paper6SelectionPLS$train_predict, Paper6SelectionPLS$train_response, Paper6SelectionPLS$test_predict, Paper6SelectionPLS$test_response)
Cell55 = classifier3Ada (Paper9Selection$train_predict, Paper9Selection$train_response, Paper9Selection$test_predict, Paper9Selection$test_response)
Cell56 = classifier3Ada (Paper29Selection$train_predict, Paper29Selection$train_response, Paper29Selection$test_predict, Paper29Selection$test_response)
round(c(Cell51, Cell52, Cell53, Cell54, Cell55, Cell56),4)
In [19]:
# Example for calculating Row 6.
options(warn = -1)
Cell61 = classifier6logit(Paper1Selection$train_predict, Paper1Selection$train_response, Paper1Selection$test_predict, Paper1Selection$test_response)
Cell62 = classifier6logit(Paper3Selection$train_predict, Paper3Selection$train_response, Paper3Selection$test_predict, Paper3Selection$test_response)
Cell63 = classifier6logit(Paper6SelectionPCA$train_predict, Paper6SelectionPCA$train_response, Paper6SelectionPCA$test_predict, Paper6SelectionPCA$test_response)
Cell64 = classifier6logit(Paper6SelectionPLS$train_predict, Paper6SelectionPLS$train_response, Paper6SelectionPLS$test_predict, Paper6SelectionPLS$test_response)
Cell65 = classifier6logit(Paper9Selection$train_predict, Paper9Selection$train_response, Paper9Selection$test_predict, Paper9Selection$test_response)
Cell66 = classifier6logit(Paper29Selection$train_predict, Paper29Selection$train_response, Paper29Selection$test_predict, Paper29Selection$test_response)
round(c(Cell61, Cell62, Cell63, Cell64, Cell65, Cell66),4)
In [20]:
# Example for calculating Row 7.
options(warn = -1)
Cell71 = classifier6qda(Paper1Selection$train_predict, Paper1Selection$train_response, Paper1Selection$test_predict, Paper1Selection$test_response)
Cell72 = classifier6qda(Paper3Selection$train_predict, Paper3Selection$train_response, Paper3Selection$test_predict, Paper3Selection$test_response)
Cell73 = classifier6qda(Paper6SelectionPCA$train_predict, Paper6SelectionPCA$train_response, Paper6SelectionPCA$test_predict, Paper6SelectionPCA$test_response)
Cell74 = classifier6qda(Paper6SelectionPLS$train_predict, Paper6SelectionPLS$train_response, Paper6SelectionPLS$test_predict, Paper6SelectionPLS$test_response)
Cell75 = classifier6qda(Paper9Selection$train_predict, Paper9Selection$train_response, Paper9Selection$test_predict, Paper9Selection$test_response)
Cell76 = classifier6qda(Paper29Selection$train_predict, Paper29Selection$train_response, Paper29Selection$test_predict, Paper29Selection$test_response)
round(c(Cell71, Cell72, Cell73, Cell74, Cell75, Cell76),4)
In [21]:
# Example for calculating Row 8.
Cell81 = classifier9nn(Paper1Selection$train_predict, Paper1Selection$train_response, Paper1Selection$test_predict, Paper1Selection$test_response)
Cell82 = classifier9nn(Paper3Selection$train_predict, Paper3Selection$train_response, Paper3Selection$test_predict, Paper3Selection$test_response)
Cell83 = classifier9nn(Paper6SelectionPCA$train_predict, Paper6SelectionPCA$train_response, Paper6SelectionPCA$test_predict, Paper6SelectionPCA$test_response)
Cell84 = classifier9nn(Paper6SelectionPLS$train_predict, Paper6SelectionPLS$train_response, Paper6SelectionPLS$test_predict, Paper6SelectionPLS$test_response)
Cell85 = classifier9nn(Paper9Selection$train_predict, Paper9Selection$train_response, Paper9Selection$test_predict, Paper9Selection$test_response)
Cell86 = classifier9nn(Paper29Selection$train_predict, Paper29Selection$train_response, Paper29Selection$test_predict, Paper29Selection$test_response)
round(c(Cell81, Cell82, Cell83, Cell84, Cell85, Cell86),4)
In [22]:
# Example for calculating Row 9.
Cell91 = classifier9dt(Paper1Selection$train_predict, Paper1Selection$train_response, Paper1Selection$test_predict, Paper1Selection$test_response)
Cell92 = classifier9dt(Paper3Selection$train_predict, Paper3Selection$train_response, Paper3Selection$test_predict, Paper3Selection$test_response)
Cell93 = classifier9dt(Paper6SelectionPCA$train_predict, Paper6SelectionPCA$train_response, Paper6SelectionPCA$test_predict, Paper6SelectionPCA$test_response)
Cell94 = classifier9dt(Paper6SelectionPLS$train_predict, Paper6SelectionPLS$train_response, Paper6SelectionPLS$test_predict, Paper6SelectionPLS$test_response)
Cell95 = classifier9dt(Paper9Selection$train_predict, Paper9Selection$train_response, Paper9Selection$test_predict, Paper9Selection$test_response)
Cell96 = classifier9dt(Paper29Selection$train_predict, Paper29Selection$train_response, Paper29Selection$test_predict, Paper29Selection$test_response)
round(c(Cell91, Cell92, Cell93, Cell94, Cell95, Cell96),4)
In [23]:
# Example for calculating Row 10.
Cell101 = classifier9bg(Paper1Selection$train_predict, Paper1Selection$train_response, Paper1Selection$test_predict, Paper1Selection$test_response)
Cell102 = classifier9bg(Paper3Selection$train_predict, Paper3Selection$train_response, Paper3Selection$test_predict, Paper3Selection$test_response)
Cell103 = classifier9bg(Paper6SelectionPCA$train_predict, Paper6SelectionPCA$train_response, Paper6SelectionPCA$test_predict, Paper6SelectionPCA$test_response)
Cell104 = classifier9bg(Paper6SelectionPLS$train_predict, Paper6SelectionPLS$train_response, Paper6SelectionPLS$test_predict, Paper6SelectionPLS$test_response)
Cell105 = classifier9bg(Paper9Selection$train_predict, Paper9Selection$train_response, Paper9Selection$test_predict, Paper9Selection$test_response)
Cell106 = classifier9bg(Paper29Selection$train_predict, Paper29Selection$train_response, Paper29Selection$test_predict, Paper29Selection$test_response)
round(c(Cell101, Cell102, Cell103, Cell104, Cell105, Cell106),4)
In [24]:
# Example for calculating Row 11.
Cell111 = classifier9bgcpd(Paper1Selection$train_predict, Paper1Selection$train_response, Paper1Selection$test_predict, Paper1Selection$test_response)
Cell112 = classifier9bgcpd(Paper3Selection$train_predict, Paper3Selection$train_response, Paper3Selection$test_predict, Paper3Selection$test_response)
Cell113 = classifier9bgcpd(Paper6SelectionPCA$train_predict, Paper6SelectionPCA$train_response, Paper6SelectionPCA$test_predict, Paper6SelectionPCA$test_response)
Cell114 = classifier9bgcpd(Paper6SelectionPLS$train_predict, Paper6SelectionPLS$train_response, Paper6SelectionPLS$test_predict, Paper6SelectionPLS$test_response)
Cell115 = classifier9bgcpd(Paper9Selection$train_predict, Paper9Selection$train_response, Paper9Selection$test_predict, Paper9Selection$test_response)
Cell116 = classifier9bgcpd(Paper29Selection$train_predict, Paper29Selection$train_response, Paper29Selection$test_predict, Paper29Selection$test_response)
round(c(Cell111, Cell112, Cell113, Cell114, Cell115, Cell116),4)
In [25]:
# Example for calculating Row 12.
Cell121 = classifier9flda(Paper1Selection$train_predict, Paper1Selection$train_response, Paper1Selection$test_predict, Paper1Selection$test_response)
Cell122 = classifier9flda(Paper3Selection$train_predict, Paper3Selection$train_response, Paper3Selection$test_predict, Paper3Selection$test_response)
Cell123 = classifier9flda(Paper6SelectionPCA$train_predict, Paper6SelectionPCA$train_response, Paper6SelectionPCA$test_predict, Paper6SelectionPCA$test_response)
Cell124 = classifier9flda(Paper6SelectionPLS$train_predict, Paper6SelectionPLS$train_response, Paper6SelectionPLS$test_predict, Paper6SelectionPLS$test_response)
Cell125 = classifier9flda(Paper9Selection$train_predict, Paper9Selection$train_response, Paper9Selection$test_predict, Paper9Selection$test_response)
Cell126 = classifier9flda(Paper29Selection$train_predict, Paper29Selection$train_response, Paper29Selection$test_predict, Paper29Selection$test_response)
round(c(Cell121, Cell122, Cell123, Cell124, Cell125, Cell126), 4)
In [26]:
# Example for calculating Row 13.
Cell131 = classifier9dlda(Paper1Selection$train_predict, Paper1Selection$train_response, Paper1Selection$test_predict, Paper1Selection$test_response)
Cell132 = classifier9dlda(Paper3Selection$train_predict, Paper3Selection$train_response, Paper3Selection$test_predict, Paper3Selection$test_response)
Cell133 = classifier9dlda(Paper6SelectionPCA$train_predict, Paper6SelectionPCA$train_response, Paper6SelectionPCA$test_predict, Paper6SelectionPCA$test_response)
Cell134 = classifier9dlda(Paper6SelectionPLS$train_predict, Paper6SelectionPLS$train_response, Paper6SelectionPLS$test_predict, Paper6SelectionPLS$test_response)
Cell135 = classifier9dlda(Paper9Selection$train_predict, Paper9Selection$train_response, Paper9Selection$test_predict, Paper9Selection$test_response)
Cell136 = classifier9dlda(Paper29Selection$train_predict, Paper29Selection$train_response, Paper29Selection$test_predict, Paper29Selection$test_response)
round(c(Cell131, Cell132, Cell133, Cell134, Cell135, Cell136),4)
In [27]:
# Example for calculating Row 14.
Cell141 = classifier9dqda(Paper1Selection$train_predict, Paper1Selection$train_response, Paper1Selection$test_predict, Paper1Selection$test_response)
Cell142 = classifier9dqda(Paper3Selection$train_predict, Paper3Selection$train_response, Paper3Selection$test_predict, Paper3Selection$test_response)
Cell143 = classifier9dqda(Paper6SelectionPCA$train_predict, Paper6SelectionPCA$train_response, Paper6SelectionPCA$test_predict, Paper6SelectionPCA$test_response)
Cell144 = classifier9dqda(Paper6SelectionPLS$train_predict, Paper6SelectionPLS$train_response, Paper6SelectionPLS$test_predict, Paper6SelectionPLS$test_response)
Cell145 = classifier9dqda(Paper9Selection$train_predict, Paper9Selection$train_response, Paper9Selection$test_predict, Paper9Selection$test_response)
Cell146 = classifier9dqda(Paper29Selection$train_predict, Paper29Selection$train_response, Paper29Selection$test_predict, Paper29Selection$test_response)
round(c(Cell141, Cell142, Cell143, Cell144, Cell145, Cell146), 4)
In [28]:
# Example for calculating Row 15.
Cell151 = classifier29(Paper1Selection$train_predict, Paper1Selection$train_response, Paper1Selection$test_predict, Paper1Selection$test_response)
Cell152 = classifier29(data.frame(apply(Paper3Selection$train_predict, 2, as.numeric)), Paper3Selection$train_response, data.frame(apply(Paper3Selection$test_predict , 2, as.numeric)), Paper3Selection$test_response)
Cell153 = classifier29(Paper6SelectionPCA$train_predict, Paper6SelectionPCA$train_response, Paper6SelectionPCA$test_predict, Paper6SelectionPCA$test_response, B = 3)
Cell154 = classifier29(Paper6SelectionPLS$train_predict, Paper6SelectionPLS$train_response, Paper6SelectionPLS$test_predict, Paper6SelectionPLS$test_response, B = 3)
Cell155 = classifier29(data.frame(Paper9Selection$train_predict), Paper9Selection$train_response, data.frame(Paper9Selection$test_predict), Paper9Selection$test_response)
Cell156 = classifier29(data.frame(Paper29Selection$train_predict),Paper29Selection$train_response, data.frame(Paper29Selection$test_predict), Paper29Selection$test_response)
round(c(Cell151, Cell152, Cell153, Cell154, Cell155, Cell156), 4)
Paper1 Selection | Paper3 Selection | Paper6 PCA Selection | Paper6 PLS Selection | Paper9 Selection | Paper29 Selection | |
---|---|---|---|---|---|---|
Paper1 classifiier | 0.912 | 0.941 | 0.971 | 0.971 | 0.882 | 0.735 |
Paper 3 NN | 0.971 | 0.941 | 0.912 | 0.941 | 0.971 | 0.971 |
Paper 3 SVM Linear | 0.971 | 0.971 | 0.941 | 0.971 | 0.971 | 0.765 |
Paper 3 SVM Quadratic | 0.971 | 0.882 | 0.971 | 0.971 | 0.971 | 0.912 |
Paper 3 Adaboost | 0.912 | 0.912 | 0.971 | 0.971 | 0.912 | 0.912 |
Paper 6 logit | 0.971 | 0.971 | 0.971 | 0.971 | 0.971 | 0.882 |
Paper 6 qda | 0.941 | 0.912 | 0.941 | 0.971 | 0.971 | 0.853 |
Paper 9 nn | 0.971 | 0.912 | 0.853 | 0.971 | 0.941 | 0.941 |
Paper 9 decision tree | 0.912 | 0.912 | 0.971 | 0.971 | 0.912 | 0.735 |
Paper 9 bagging | 0.941 | 0.912 | 0.971 | 0.971 | 0.912 | 0.765 |
Paper 9 bagging with CPD | 0.735 | 0.853 | 0.8235 | 0.912 | 0.765 | 0.677 |
Paper 9 FLDA | 0.882 | 0.882 | 0.971 | 0.971 | 0.882 | 0.882 |
Paper 9 DLDA | 0.971 | 0.941 | 0.971 | 0.971 | 0.971 | 0.882 |
Paper 9 DQDA | 0.971 | 0.941 | 0.971 | 0.971 | 0.971 | 0.882 |
Paper 29 Bayesian Network | 0.735 | 0.882 | 0.971 | 0.971 | 0.824 | 0.618 |
Reported Result | Reproduced result | |
---|---|---|
Paper1 | 0.853(29/34) | 0.912(31/34) |
Paper 3 NN | 0.916 | 0.941 |
Paper 3 SVM Linear | 0.933 | 0.971 |
Paper 3 SVM Quadratic | 0.944 | 0.882 |
Paper 3 Adaboost | 0.958 | 0.912 |
Paper 6 logit(PCA) | 1.00 | 0.971 |
Paper 6 qda(PCA) | 0.972 | 0.941 |
Paper 6 logit(PLS) | 0.944 | 0.971 |
Paper 6 qda(PLS) | 0.944 | 0.971 |
Paper 9 nn | 0.958 | 0.941 |
Paper 9 decision tree | 0.875 | 0.912 |
Paper 9 bagging | 0.917 | 0.912 |
Paper 9 bagging with CPD | 0.958 | 0.765 |
Paper 9 FLDA | 0.875 | 0.882 |
Paper 9 DLDA | 1 | 0.971 |
Paper 9 DQDA | 0.958 | 0.971 |
Paper 29 Bayesian Network | 0.941 | 0.618 |
In [ ]: