Nearest Neighbor
Because the distance measure defined in the paper is not the standard distance measure used by R's knn function, we reimplement KNN in R; this cell may be a little slow.
In [1]:
# Load the preprocessed Golub data and the paper's gene subsets
load("../transformed data/golub3571.rda")
load("../transformed data/paper9.rda")
# Settings as specified in the paper
p <- 40                                 # number of genes kept for FLDA
B <- 50                                 # number of aggregated predictors
N <- 200                                # number of classification repetitions
d <- c(0.05, 0.1, 0.25, 0.5, 0.75, 1)   # CPD parameter grid
set.seed(2017)                          # reproducibility of splits/CV
In [ ]:
# Candidate neighborhood sizes for cross-validation (odd values 1..21)
k <- seq(1, 21, 2)
# Correlation-based dissimilarity from the paper: 1 minus the Pearson
# correlation between the test profile and each row of `predictor`.
# Returns a numeric vector with one dissimilarity per row of `predictor`.
Distance <- function(predictor, test) {
  correlations <- apply(predictor, 1, cor, y = test)
  1 - correlations
}
# Classify a single test sample by majority vote among its pk nearest
# training samples, using the paper's correlation-based Distance().
#
# test     - predictor vector of the sample to classify
# pk       - number of neighbors to consider
# learning - matrix of training predictors (rows = samples)
# response - class labels ("AML"/"ALL") for the rows of `learning`
#
# Returns "AML" or "ALL"; ties default to "ALL" (cannot occur for odd pk
# with a binary response).
nn <- function(test, pk, learning, response) {
  dissim <- Distance(learning, test)
  nearest <- order(dissim)[seq_len(pk)]
  votes <- response[nearest]
  if (sum(votes == "AML") > sum(votes == "ALL")) "AML" else "ALL"
}
# Leave-one-out cross-validation: returns the number of misclassified
# training samples for a given neighborhood size pk.
#
# pk       - number of neighbors to consider
# learning - matrix of training predictors (rows = samples)
# response - class labels ("AML"/"ALL") for the rows of `learning`
#
# The caller selects k via k[which.min(choose_k)], so this must return an
# ERROR count (smaller = better).
mycv= function(pk,learning,response){
error = 0
for(i in seq_len(nrow(learning))){
cl = nn(learning[i,], pk, learning[-i, ], response[-i])
# BUG FIX: the original accumulated (cl == response[i]), i.e. the number
# of CORRECT predictions; combined with which.min() downstream this
# selected the worst-performing k. Count disagreements instead.
error = error+(cl != response[i])
}
error
}
# Repeat the split / gene-selection / CV / classification pipeline N times,
# recording the number of test-set misclassifications per repetition.
error_count <- numeric(N)
for (rep in seq_len(N)) {
  # Randomly split samples into learning and test sets
  test_idx <- mysplit(nrow(scale_golub_merge))
  train_x <- scale_golub_merge[-test_idx, ]
  train_y <- total3571_response[-test_idx]
  test_x <- scale_golub_merge[test_idx, ]
  test_y <- total3571_response[test_idx]
  # Feature selection: keep the p genes with the largest BW ratio,
  # computed on the learning set only
  top_genes <- order(BW(train_x, train_y), decreasing = TRUE)[1:p]
  train_x <- train_x[, top_genes]
  test_x <- test_x[, top_genes]
  # Leave-one-out cross-validation over the candidate neighborhood sizes
  cv_scores <- sapply(k, mycv, learning = train_x, response = train_y)
  best_k <- k[which.min(cv_scores)]
  # Classify every test sample with the selected neighborhood size
  predictions <- apply(test_x, 1, nn, best_k, train_x, train_y)
  error_count[rep] <- sum(predictions != test_y)
}
# Summarize the N repetitions as reported in the paper
resultNN <- c(Median = median(error_count),
              Upper_quartile = quantile(error_count, 0.75))