Nearest Neighbor

Because the distance measure defined in the paper is not the standard distance measure used by R's built-in knn function, we reimplement KNN in R ourselves; as a result, this cell may be a little slow.


In [1]:
# Load the preprocessed Golub leukemia data and the paper-specific objects
# (presumably providing scale_golub_merge, total3571_response, mysplit, BW
#  used below -- defined in these .rda files; TODO confirm).
load("../transformed data/golub3571.rda")
load("../transformed data/paper9.rda")
# Settings as specified in the paper
p <- 40  # number of genes for FLDA / feature selection
B <- 50  # aggregation predictors
N <- 200 # repeat classification N times
d <- c(0.05, 0.1, 0.25, 0.5, 0.75, 1) # CPD parameter
set.seed(2017) # fix RNG for reproducibility of the N random splits

In [ ]:
# Candidate odd neighborhood sizes for KNN (odd k avoids ties in the 2-class vote)
k <- seq(1, 21, 2)
# Correlation-based distance from the paper: for each row (sample) of
# `predictor`, the distance to `test` is 1 minus the Pearson correlation,
# so perfectly correlated profiles have distance 0 and anti-correlated
# profiles have distance 2.
Distance <- function(predictor, test) {
    correlations <- apply(predictor, 1, function(row) cor(row, test))
    1 - correlations
}
# NN classification process: classify one test sample by majority vote
# among its pk nearest learning samples under the correlation distance.
#   test     - one sample's expression profile (numeric vector)
#   pk       - number of neighbors (odd values avoid vote ties)
#   learning - matrix of training profiles, one sample per row
#   response - class labels for the rows of `learning`
#              (assumed to take values "AML"/"ALL" -- TODO confirm)
# Returns "AML" or "ALL"; a tie (possible only for even pk) defaults to "ALL".
nn <- function(test, pk, learning, response) {
     distance <- Distance(learning, test)
     index <- order(distance)[seq_len(pk)]
     # plain if/else instead of scalar ifelse(): condition is always length 1
     if (sum(response[index] == "AML") > sum(response[index] == "ALL")) {
         "AML"
     } else {
         "ALL"
     }
}
# Leave-one-out cross-validation to tune k: classify each learning sample
# using all the others and count the misclassifications.
#   pk       - number of neighbors to evaluate
#   learning - training matrix, one sample per row
#   response - class labels for the rows of `learning`
# Returns the LOOCV error count, to be minimized over candidate k values.
mycv <- function(pk, learning, response) {
    error <- 0
    for (i in seq_len(nrow(learning))) {
        cl <- nn(learning[i, ], pk, learning[-i, ], response[-i])
        # BUG FIX: the original used (cl == response[i]), which counts
        # CORRECT predictions; the caller then takes which.min() over these
        # counts and would pick the WORST k. Count misclassifications.
        error <- error + (cl != response[i])
    }
    error
}
# Repeat the random split / feature selection / KNN cycle N times,
# recording the test-set error count of each run.
error_count <- numeric(N)
for (i in seq_len(N)) {
    # Split data: mysplit presumably returns test-set row indices
    # (defined in the loaded .rda files -- TODO confirm)
    nn_index <- mysplit(nrow(scale_golub_merge))
    nn_train_p <- scale_golub_merge[-nn_index, ]
    nn_train_r <- total3571_response[-nn_index]
    nn_test_p <- scale_golub_merge[nn_index, ]
    nn_test_r <- total3571_response[nn_index]
    # Gene/feature selection: keep the p genes with the largest BW ratio,
    # computed on the training split only (no test-set leakage)
    temp_bw <- order(BW(nn_train_p, nn_train_r), decreasing = TRUE)[1:p]
    nn_train_p <- nn_train_p[, temp_bw]
    nn_test_p <- nn_test_p[, temp_bw]
    # Cross-validation to choose k; vapply pins the return type, unlike sapply
    choose_k <- vapply(k, mycv, numeric(1),
                       learning = nn_train_p, response = nn_train_r)
    # NN classification of each test sample with the best (lowest-error) k
    nn_r <- apply(nn_test_p, 1, nn, k[which.min(choose_k)],
                  nn_train_p, nn_train_r)
    error_count[i] <- sum(nn_r != nn_test_r)
}
# Summarize the N runs by the median and upper quartile of the error counts
resultNN <- c(Median = median(error_count),
              Upper_quartile = quantile(error_count, 0.75))