Loading Data


In [1]:
Pkg.add("Images")
Pkg.add("DataFrames")
using Images
using DataFrames

#Read every image listed in labelsInfo into one data matrix.
#  typeData   - either "train" or "test"; selects the "<typeData>Resized" folder.
#  labelsInfo - table whose "ID" column holds the ID of each image to be read.
#  imageSize  - number of pixels per image. The images in the trainResized and
#               testResized data files are 20x20 pixels, so imageSize is 400.
#  path       - location of the data files.
#Returns a matrix with one row per image and imageSize columns.
function read_data(typeData, labelsInfo, imageSize, path)
    nImages = size(labelsInfo, 1)
    x = zeros(nImages, imageSize)

    row = 0
    for idImage in labelsInfo["ID"]
        row += 1

        #Read the image file and convert it to float values
        pixels = float32sc(imread("$(path)/$(typeData)Resized/$(idImage).Bmp"))

        #A 3-dimensional array is a color image: average over the
        #first dimension (the color scales) to obtain a gray image.
        if ndims(pixels) == 3
            pixels = mean(pixels.data, 1)
        end

        #Flatten the image and store it as one row of the data matrix
        x[row, :] = reshape(pixels, 1, imageSize)
    end
    return x
end

#Each image is 20 x 20 pixels
imageSize = 400

#Location of the data files and folders (to be filled in)
path = ...

#Training data: read the IDs and labels first, then the pixel matrix
labelsInfoTrain = readtable("$(path)/trainLabels.csv")
xTrain = read_data("train", labelsInfoTrain, imageSize, path)

#Test data: the IDs come from the sample submission file
labelsInfoTest = readtable("$(path)/sampleSubmission.csv")
xTest = read_data("test", labelsInfoTest, imageSize, path)

#Keep only the first character of every element of the column "Class"
#(string to character) and convert that character to an integer.
yTrain = int(map(label -> label[1], labelsInfoTrain["Class"]))

#read_data stores one image per row; transpose so that
#every column holds one image.
xTrain = xTrain'
xTest = xTest'


INFO: Nothing to be done.
INFO: Nothing to be done.
Warning: New definition 
    scale(ScaleNone{T<:Integer},S<:FloatingPoint) at C:\Users\Luis Tandalla\.julia\v0.2\Images\src\scaling.jl:49
is ambiguous with: 
    scale(ScaleNone{T<:Real},T<:Real) at C:\Users\Luis Tandalla\.julia\v0.2\Images\src\scaling.jl:46.
To fix, define 
    scale(ScaleNone{_<:Integer},_<:Integer)
before the new definition.
Warning: New definition 
    .==(AbstractArray{T,N},DataArray{T,N}) at C:\Users\Luis Tandalla\.julia\v0.2\DataArrays\src\operators.jl:335
is ambiguous with: 
    .==(AbstractImageDirect{Bool,N},AbstractArray{Bool,N}) at C:\Users\Luis Tandalla\.julia\v0.2\Images\src\algorithms.jl:74.
To fix, define 
    .==(AbstractImageDirect{Bool,N},DataArray{Bool,N})
before the new definition.
Warning: New definition 
    .==(AbstractArray{T,N},DataArray{T,N}) at C:\Users\Luis Tandalla\.julia\v0.2\DataArrays\src\operators.jl:335
is ambiguous with: 
    .==(AbstractImageDirect{T,N},AbstractArray{T,N}) at C:\Users\Luis Tandalla\.julia\v0.2\Images\src\algorithms.jl:75.
To fix, define 
    .==(AbstractImageDirect{T,N},DataArray{T,N})
before the new definition.
Warning: New definition 
    .==(AbstractArray{T,N},AbstractDataArray{T,N}) at C:\Users\Luis Tandalla\.julia\v0.2\DataArrays\src\operators.jl:356
is ambiguous with: 
    .==(AbstractImageDirect{Bool,N},AbstractArray{Bool,N}) at C:\Users\Luis Tandalla\.julia\v0.2\Images\src\algorithms.jl:74.
To fix, define 
    .==(AbstractImageDirect{Bool,N},AbstractDataArray{Bool,N})
before the new definition.
Warning: New definition 
    .==(AbstractArray{T,N},AbstractDataArray{T,N}) at C:\Users\Luis Tandalla\.julia\v0.2\DataArrays\src\operators.jl:356
is ambiguous with: 
    .==(AbstractImageDirect{T,N},AbstractArray{T,N}) at C:\Users\Luis Tandalla\.julia\v0.2\Images\src\algorithms.jl:75.
To fix, define 
    .==(AbstractImageDirect{T,N},AbstractDataArray{T,N})
before the new definition.
Warning: New definition 
    .>(AbstractArray{T,N},DataArray{T,N}) at C:\Users\Luis Tandalla\.julia\v0.2\DataArrays\src\operators.jl:335
is ambiguous with: 
    .>(AbstractImageDirect{T,N},AbstractArray{T,N}) at C:\Users\Luis Tandalla\.julia\v0.2\Images\src\algorithms.jl:72.
To fix, define 
    .>(AbstractImageDirect{T,N},DataArray{T,N})
before the new definition.
Warning: New definition 
    .>(AbstractArray{T,N},AbstractDataArray{T,N}) at C:\Users\Luis Tandalla\.julia\v0.2\DataArrays\src\operators.jl:356
is ambiguous with: 
    .>(AbstractImageDirect{T,N},AbstractArray{T,N}) at C:\Users\Luis Tandalla\.julia\v0.2\Images\src\algorithms.jl:72.
To fix, define 
    .>(AbstractImageDirect{T,N},AbstractDataArray{T,N})
before the new definition.
Warning: New definition 
    .<(AbstractArray{T,N},DataArray{T,N}) at C:\Users\Luis Tandalla\.julia\v0.2\DataArrays\src\operators.jl:335
is ambiguous with: 
    .<(AbstractImageDirect{Bool,N},AbstractArray{Bool,N}) at C:\Users\Luis Tandalla\.julia\v0.2\Images\src\algorithms.jl:70.
To fix, define 
    .<(AbstractImageDirect{Bool,N},DataArray{Bool,N})
before the new definition.
Warning: New definition 
    .<(AbstractArray{T,N},DataArray{T,N}) at C:\Users\Luis Tandalla\.julia\v0.2\DataArrays\src\operators.jl:335
is ambiguous with: 
    .<(AbstractImageDirect{T,N},AbstractArray{T,N}) at C:\Users\Luis Tandalla\.julia\v0.2\Images\src\algorithms.jl:71.
To fix, define 
    .<(AbstractImageDirect{T,N},DataArray{T,N})
before the new definition.
Warning: New definition 
    .<(AbstractArray{T,N},AbstractDataArray{T,N}) at C:\Users\Luis Tandalla\.julia\v0.2\DataArrays\src\operators.jl:356
is ambiguous with: 
    .<(AbstractImageDirect{Bool,N},AbstractArray{Bool,N}) at C:\Users\Luis Tandalla\.julia\v0.2\Images\src\algorithms.jl:70.
To fix, define 
    .<(AbstractImageDirect{Bool,N},AbstractDataArray{Bool,N})
before the new definition.
Warning: New definition 
    .<(AbstractArray{T,N},AbstractDataArray{T,N}) at C:\Users\Luis Tandalla\.julia\v0.2\DataArrays\src\operators.jl:356
is ambiguous with: 
    .<(AbstractImageDirect{T,N},AbstractArray{T,N}) at C:\Users\Luis Tandalla\.julia\v0.2\Images\src\algorithms.jl:71.
To fix, define 
    .<(AbstractImageDirect{T,N},AbstractDataArray{T,N})
before the new definition.
Warning: New definition 
    +(AbstractArray{T,N},DataArray{T,N}) at C:\Users\Luis Tandalla\.julia\v0.2\DataArrays\src\operators.jl:335
is ambiguous with: 
    +(AbstractImageDirect{T,N},AbstractArray{T,N}) at C:\Users\Luis Tandalla\.julia\v0.2\Images\src\algorithms.jl:12.
To fix, define 
    +(AbstractImageDirect{T,N},DataArray{T,N})
before the new definition.
Warning: New definition 
    +(AbstractArray{T,N},AbstractDataArray{T,N}) at C:\Users\Luis Tandalla\.julia\v0.2\DataArrays\src\operators.jl:356
is ambiguous with: 
    +(AbstractImageDirect{T,N},AbstractArray{T,N}) at C:\Users\Luis Tandalla\.julia\v0.2\Images\src\algorithms.jl:12.
To fix, define 
    +(AbstractImageDirect{T,N},AbstractDataArray{T,N})
before the new definition.
Warning: New definition 
    .+(AbstractArray{T,N},DataArray{T,N}) at C:\Users\Luis Tandalla\.julia\v0.2\DataArrays\src\operators.jl:335
is ambiguous with: 
    .+(AbstractImageDirect{T,N},AbstractArray{T,N}) at C:\Users\Luis Tandalla\.julia\v0.2\Images\src\algorithms.jl:14.
To fix, define 
    .+(AbstractImageDirect{T,N},DataArray{T,N})
before the new definition.
Warning: New definition 
    .+(AbstractArray{T,N},AbstractDataArray{T,N}) at C:\Users\Luis Tandalla\.julia\v0.2\DataArrays\src\operators.jl:356
is ambiguous with: 
    .+(AbstractImageDirect{T,N},AbstractArray{T,N}) at C:\Users\Luis Tandalla\.julia\v0.2\Images\src\algorithms.jl:14.
To fix, define 
    .+(AbstractImageDirect{T,N},AbstractDataArray{T,N})
before the new definition.
Warning: New definition 
    -(AbstractArray{T,N},DataArray{T,N}) at C:\Users\Luis Tandalla\.julia\v0.2\DataArrays\src\operators.jl:335
is ambiguous with: 
    -(AbstractImageDirect{T,N},AbstractArray{T,N}) at C:\Users\Luis Tandalla\.julia\v0.2\Images\src\algorithms.jl:21.
To fix, define 
    -(AbstractImageDirect{T,N},DataArray{T,N})
before the new definition.
Warning: New definition 
    -(AbstractArray{T,N},AbstractDataArray{T,N}) at C:\Users\Luis Tandalla\.julia\v0.2\DataArrays\src\operators.jl:356
is ambiguous with: 
    -(AbstractImageDirect{T,N},AbstractArray{T,N}) at C:\Users\Luis Tandalla\.julia\v0.2\Images\src\algorithms.jl:21.
To fix, define 
    -(AbstractImageDirect{T,N},AbstractDataArray{T,N})
before the new definition.
Warning: New definition 
    .-(AbstractArray{T,N},DataArray{T,N}) at C:\Users\Luis Tandalla\.julia\v0.2\DataArrays\src\operators.jl:335
is ambiguous with: 
    .-(AbstractImageDirect{T,N},AbstractArray{T,N}) at C:\Users\Luis Tandalla\.julia\v0.2\Images\src\algorithms.jl:24.
To fix, define 
    .-(AbstractImageDirect{T,N},DataArray{T,N})
before the new definition.
Warning: New definition 
    .-(AbstractArray{T,N},AbstractDataArray{T,N}) at C:\Users\Luis Tandalla\.julia\v0.2\DataArrays\src\operators.jl:356
is ambiguous with: 
    .-(AbstractImageDirect{T,N},AbstractArray{T,N}) at C:\Users\Luis Tandalla\.julia\v0.2\Images\src\algorithms.jl:24.
To fix, define 
    .-(AbstractImageDirect{T,N},AbstractDataArray{T,N})
before the new definition.
Warning: New definition 
    .*(DataArray{T,N},AbstractArray{T,N}) at C:\Users\Luis Tandalla\.julia\v0.2\DataArrays\src\operators.jl:335
is ambiguous with: 
    .*(AbstractArray{T,N},AbstractImageDirect{T,N}) at C:\Users\Luis Tandalla\.julia\v0.2\Images\src\algorithms.jl:36.
To fix, define 
    .*(DataArray{T,N},AbstractImageDirect{T,N})
before the new definition.
Warning: New definition 
    .*(AbstractArray{T,N},DataArray{T,N}) at C:\Users\Luis Tandalla\.julia\v0.2\DataArrays\src\operators.jl:335
is ambiguous with: 
    .*(AbstractImageDirect{T,N},AbstractArray{T,N}) at C:\Users\Luis Tandalla\.julia\v0.2\Images\src\algorithms.jl:35.
To fix, define 
    .*(AbstractImageDirect{T,N},DataArray{T,N})
before the new definition.
Warning: New definition 
    .*(AbstractDataArray{T,N},AbstractArray{T,N}) at C:\Users\Luis Tandalla\.julia\v0.2\DataArrays\src\operators.jl:356
is ambiguous with: 
    .*(AbstractArray{T,N},AbstractImageDirect{T,N}) at C:\Users\Luis Tandalla\.julia\v0.2\Images\src\algorithms.jl:36.
To fix, define 
    .*(AbstractDataArray{T,N},AbstractImageDirect{T,N})
before the new definition.
Warning: New definition 
    .*(AbstractArray{T,N},AbstractDataArray{T,N}) at C:\Users\Luis Tandalla\.julia\v0.2\DataArrays\src\operators.jl:356
is ambiguous with: 
    .*(AbstractImageDirect{T,N},AbstractArray{T,N}) at C:\Users\Luis Tandalla\.julia\v0.2\Images\src\algorithms.jl:35.
To fix, define 
    .*(AbstractImageDirect{T,N},AbstractDataArray{T,N})
before the new definition.
Warning: New definition 
    ./(AbstractArray{T,N},DataArray{T,N}) at C:\Users\Luis Tandalla\.julia\v0.2\DataArrays\src\operators.jl:335
is ambiguous with: 
    ./(AbstractImageDirect{T,N},AbstractArray{T,N}) at C:\Users\Luis Tandalla\.julia\v0.2\Images\src\algorithms.jl:39.
To fix, define 
    ./(AbstractImageDirect{T,N},DataArray{T,N})
before the new definition.
Warning: New definition 
    ./(AbstractArray{T,N},AbstractDataArray{T,N}) at C:\Users\Luis Tandalla\.julia\v0.2\DataArrays\src\operators.jl:356
is ambiguous with: 
    ./(AbstractImageDirect{T,N},AbstractArray{T,N}) at C:\Users\Luis Tandalla\.julia\v0.2\Images\src\algorithms.jl:39.
To fix, define 
    ./(AbstractImageDirect{T,N},AbstractDataArray{T,N})
Out[1]:
400x6220 Array{Float64,2}:
 0.470588  0.284967  0.115033  0.586928  …  0.379085  0.626144  0.437909
 0.469281  0.291503  0.15817   0.546405     0.393464  0.59085   0.462745
 0.46536   0.315033  0.160784  0.571242     0.396078  0.639216  0.431373
 0.469281  0.30719   0.162092  0.526797     0.397386  0.607843  0.413072
 0.462745  0.31634   0.162092  0.569935     0.393464  0.597386  0.389542
 0.456209  0.346405  0.162092  0.603922  …  0.392157  0.597386  0.392157
 0.48366   0.355556  0.166013  0.562092     0.396078  0.602614  0.431373
 0.48366   0.342484  0.16732   0.571242     0.381699  0.6       0.512418
 0.470588  0.34902   0.168627  0.535948     0.384314  0.620915  0.694118
 0.48366   0.372549  0.164706  0.560784     0.389542  0.631373  0.819608
 0.479739  0.372549  0.164706  0.581699  …  0.393464  0.6       0.862745
 0.477124  0.385621  0.164706  0.589543     0.384314  0.635294  0.870588
 0.47451   0.396078  0.163399  0.586928     0.396078  0.611765  0.803922
 ⋮                                       ⋱                              
 0.486275  0.456209  0.162092  0.484967     0.380392  0.618301  0.443137
 0.494118  0.46536   0.159477  0.373856     0.379085  0.613072  0.456209
 0.509804  0.449673  0.15817   0.252288  …  0.376471  0.630065  0.435294
 0.501961  0.448366  0.155556  0.28366      0.368627  0.601307  0.464052
 0.501961  0.453595  0.151634  0.237908     0.369935  0.624837  0.464052
 0.498039  0.454902  0.147712  0.237908     0.376471  0.594771  0.464052
 0.498039  0.454902  0.146405  0.262745     0.373856  0.624837  0.453595
 0.508497  0.444445  0.146405  0.257516  …  0.36732   0.610457  0.445752
 0.50719   0.454902  0.147712  0.394771     0.369935  0.598693  0.413072
 0.504575  0.435294  0.147712  0.552941     0.381699  0.652288  0.520261
 0.499346  0.287582  0.147712  0.576471     0.381699  0.568627  0.623529
 0.490196  0.301961  0.105882  0.581699     0.368627  0.563399  0.420915

Defining main functions


In [2]:
#Sum of the squared element-wise differences between a and b, i.e. the
#squared Euclidean distance (no square root is taken).
#Elements 1 through size(a, 1) of both arguments are compared.
function euclidean_distance(a, b)
    total = 0.0
    for idx in 1:size(a, 1)
        delta = a[idx] - b[idx]
        total += delta * delta
    end
    return total
end

#Find the k nearest neighbors of the ith data point among all the
#other data points stored in x.
#  x - matrix holding one data point per column
#  i - column index of the query point
#  k - number of neighbors to return
#Returns the column indices of the k closest points, closest first.
#The query point i itself is never part of the result.
function get_k_nearest_neighbors(x, i, k)
    nRows, nCols = size(x)

    #Copy the ith image out of the main matrix once, so that it is
    #not re-read from x for every comparison.
    imageI = zeros(Float32, nRows)
    for index in 1:nRows
        imageI[index] = x[index, i]
    end

    #Buffer that is refilled with the jth data point on every iteration
    imageJ = zeros(Float32, nRows)

    #distances[j] is the distance between the ith and the jth data point
    distances = zeros(Float32, nCols)
    for j in 1:nCols
        for index in 1:nRows
            imageJ[index] = x[index, j]
        end
        distances[j] = euclidean_distance(imageI, imageJ)
    end

    #Indices of all data points ordered by increasing distance
    sortedNeighbors = sortperm(distances)

    #Remove the query point by index instead of assuming that it sorted
    #first: an exact duplicate of point i stored in an earlier column
    #also has distance zero and sorts ahead of it, which would otherwise
    #return point i as its own neighbor.
    otherPoints = filter(j -> j != i, sortedNeighbors)
    return otherPoints[1:k]
end

#Assign a label to the ith data point by majority vote among the
#labels of its k nearest neighbors. The training data is stored in
#the x matrix and its labels are stored in y.
#When several labels are tied, the label that reached the highest
#count first wins. Returns 0 when there are no neighbors.
function assign_label(x, y, k, i)
    neighbors = get_k_nearest_neighbors(x, i, k)

    #votes[label] is the number of neighbors seen so far with that label
    votes = Dict{Int, Int}()

    #Current leader of the vote and its count
    winner = 0
    winnerVotes = 0

    for neighbor in neighbors
        label = y[neighbor]

        #A label that has not been seen yet starts from a count of zero
        tally = get(votes, label, 0) + 1
        votes[label] = tally

        if tally > winnerVotes
            winnerVotes = tally
            winner = label
        end
    end
    return winner
end


Out[2]:
assign_label (generic function with 1 method)
before the new definition.

Running LOOF-CV with 1NN sequentially


In [3]:
#Time the whole sequential run: tic() starts the timer and toc()
#prints and returns the elapsed time.
tic()
#Use a single nearest neighbor
k=1 
#Predict the label of every column i of xTrain from the other columns
yPredictions = [assign_label(xTrain, yTrain, k, i) for i in 1:size(xTrain, 2)]
#Fraction of predictions that are equal to the true labels
loofCvAccuracy = mean(yPredictions .== yTrain) 
println("The LOOF-CV accuracy of 1NN is $(loofCvAccuracy)")
toc()


The LOOF-CV accuracy of 1NN is 0.42798026420499763
elapsed time: 89.379442979 seconds
Out[3]:
89.379442979

Preparing Julia to run in parallel


In [4]:
#Start three additional worker processes for the parallel runs.
addprocs(3) 

#Defined on every process.
#Sum of the squared element-wise differences between a and b, i.e. the
#squared Euclidean distance (no square root is taken).
#Elements 1 through size(a, 1) of both arguments are compared.
@everywhere function euclidean_distance(a, b)
    total = 0.0
    for idx in 1:size(a, 1)
        delta = a[idx] - b[idx]
        total += delta * delta
    end
    return total
end

#Defined on every process.
#Find the k nearest neighbors of the ith data point among all the
#other data points stored in x.
#  x - matrix holding one data point per column
#  i - column index of the query point
#  k - number of neighbors to return
#Returns the column indices of the k closest points, closest first.
#The query point i itself is never part of the result.
@everywhere function get_k_nearest_neighbors(x, i, k)
    nRows, nCols = size(x)

    #Copy the ith image out of the main matrix once
    imageI = zeros(Float32, nRows)
    for index in 1:nRows
        imageI[index] = x[index, i]
    end

    #Buffer that is refilled with the jth data point on every iteration
    imageJ = zeros(Float32, nRows)

    #distances[j] is the distance between the ith and the jth data point
    distances = zeros(Float32, nCols)
    for j in 1:nCols
        for index in 1:nRows
            imageJ[index] = x[index, j]
        end
        distances[j] = euclidean_distance(imageI, imageJ)
    end

    #Indices of all data points ordered by increasing distance
    sortedNeighbors = sortperm(distances)

    #Remove the query point by index instead of assuming that it sorted
    #first: an exact duplicate of point i stored in an earlier column
    #also has distance zero and sorts ahead of it, which would otherwise
    #return point i as its own neighbor.
    otherPoints = filter(j -> j != i, sortedNeighbors)
    return otherPoints[1:k]
end

#Defined on every process.
#Assign a label to the ith data point by majority vote among the
#labels (taken from y) of its k nearest neighbors in x.
#When several labels are tied, the label that reached the highest
#count first wins. Returns 0 when there are no neighbors.
@everywhere function assign_label(x, y, k, i)
    neighbors = get_k_nearest_neighbors(x, i, k)

    #votes[label] is the number of neighbors seen so far with that label
    votes = Dict{Int, Int}()
    winner = 0
    winnerVotes = 0

    for neighbor in neighbors
        label = y[neighbor]
        tally = get(votes, label, 0) + 1
        votes[label] = tally
        if tally > winnerVotes
            winnerVotes = tally
            winner = label
        end
    end
    return winner
end

Running LOOF-CV with 1NN in parallel


In [5]:
#Time the whole parallel run: tic() starts the timer and toc()
#prints and returns the elapsed time.
tic()
k = 1
#@parallel distributes the loop iterations over the processes; the value
#of each iteration (one predicted label) is combined with vcat, which
#yields the vector of predictions.
yPredictions = @parallel (vcat) for i in 1:size(xTrain, 2)
 assign_label(xTrain, yTrain, k, i)
end
#Fraction of predictions that are equal to the true labels
loofCvAccuracy = mean(yPredictions .== yTrain) 
println("The LOOF-CV accuracy of 1NN is $(loofCvAccuracy)")
toc()


The LOOF-CV accuracy of 1NN is 0.42798026420499763
elapsed time: 45.727314347 seconds
Out[5]:
45.727314347

In [6]:
#Same run as with the vcat reducer, but the predictions are not kept:
#each iteration yields true/false and the (+) reducer adds them up.
tic()
k = 1
#sumValues ends up as the number of correctly predicted data points
sumValues = @parallel (+) for i in 1:size(xTrain, 2)
 assign_label(xTrain, yTrain, k, i) == yTrain[i, 1]
end
#Divide the number of correct predictions by the number of data points
loofCvAccuracy = sumValues / size(xTrain, 2)
println("The LOOF-CV accuracy of 1NN is $(loofCvAccuracy)")
toc()


elapsed time: 45.046256153 seconds
Out[6]:
45.046256153

Tuning the value for k


In [8]:
#Defined on every process.
#Variant of assign_label that computes, in a single pass over the maxK
#nearest neighbors of the ith data point, the label that would be
#assigned for every number of neighbors from 1 to maxK.
#Returns a 1 x maxK matrix of integers; entry k is the most popular
#label among the k nearest neighbors.
@everywhere function assign_label_each_k(x, y, maxK, i)
    neighbors = get_k_nearest_neighbors(x, i, maxK)

    #One slot per value of k
    labelsK = zeros(Int, 1, maxK)

    #votes[label] is the number of neighbors seen so far with that label
    votes = Dict{Int, Int}()
    winner = 0
    winnerVotes = 0

    #k counts how many neighbors have been processed so far
    k = 0
    for neighbor in neighbors
        k += 1
        label = y[neighbor]
        tally = get(votes, label, 0) + 1
        votes[label] = tally
        if tally > winnerVotes
            winnerVotes = tally
            winner = label
        end
        #The leader after k neighbors is the prediction of k-NN
        labelsK[k] = winner
    end
    return labelsK
end

In [9]:
#Time the whole tuning run, including the printing of the results.
tic()
maxK = 20 #Any value can be chosen
#Every iteration yields a 1 x maxK matrix of labels; vcat stacks them
#into a matrix with one row per data point and one column per value of k.
yPredictionsK = @parallel (vcat) for i in 1:size(xTrain, 2)
 assign_label_each_k(xTrain, yTrain, maxK, i)
end
#Column k holds the predictions of k-NN; compare it with the true labels
for k in 1:maxK
 accuracyK = mean(yTrain .== yPredictionsK[:, k])
 println("The LOOF-CV accuracy of $(k)-NN is $(accuracyK)")
end
toc()


The LOOF-CV accuracy of 1-NN is 0.42798026420499763
The LOOF-CV accuracy of 2-NN is 0.42798026420499763
The LOOF-CV accuracy of 3-NN is 0.4286169027534617
The LOOF-CV accuracy of 4-NN is 0.4211363998090084
The LOOF-CV accuracy of 5-NN is 0.41301925831609104
The LOOF-CV accuracy of 6-NN is 0.40713035174279805
The LOOF-CV accuracy of 7-NN is 0.3983765717014165
The LOOF-CV accuracy of 8-NN is 0.3961483367817921
The LOOF-CV accuracy of 9-NN is 0.3923285054910075
The LOOF-CV accuracy of 10-NN is 0.3886678338373388
The LOOF-CV accuracy of 11-NN is 0.3865987585548305
The LOOF-CV accuracy of 12-NN is 0.38309724653827787
The LOOF-CV accuracy of 13-NN is 0.378004138150565
The LOOF-CV accuracy of 14-NN is 0.37657170141652074
The LOOF-CV accuracy of 15-NN is 0.37593506286805667
The LOOF-CV accuracy of 16-NN is 0.3729110297628521
The LOOF-CV accuracy of 17-NN is 0.3695686773834156
The LOOF-CV accuracy of 18-NN is 0.3679770810122553
The LOOF-CV accuracy of 19-NN is 0.3692503581091835
The LOOF-CV accuracy of 20-NN is 0.36718128282667517
elapsed time: 85.727564674 seconds
Out[9]:
85.727564674

Running kNN on the test data


In [10]:
#Defined on every process.
#Find the k training points that are nearest to a given image.
#  xTrain - matrix holding one training point per column
#  imageI - vector with the pixels of the query image
#  k      - number of neighbors to return
#Returns the column indices of the k closest training points,
#closest first.
#
#imageI is restricted to vectors so that this method is selected by
#dispatch only when an image is passed. Without the restriction this
#definition replaces any earlier three-argument method of the same
#name that takes a column index as second argument, and a later call
#with an index would silently treat that number as a one-element image.
@everywhere function get_k_nearest_neighbors(xTrain, imageI::AbstractVector, k)
    nRows, nCols = size(xTrain)

    #Buffer that is refilled with the jth training point on every iteration
    imageJ = zeros(Float32, nRows)

    #distances[j] is the distance between imageI and the jth training point
    distances = zeros(Float32, nCols)
    for j in 1:nCols
        for index in 1:nRows
            imageJ[index] = xTrain[index, j]
        end
        distances[j] = euclidean_distance(imageI, imageJ)
    end

    #The query image is not a column of xTrain, so the closest
    #training point is a valid neighbor: start from the first index.
    sortedNeighbors = sortperm(distances)
    kNearestNeighbors = sortedNeighbors[1:k]
    return kNearestNeighbors
end

#Defined on every process.
#Assign a label to imageI by majority vote among the labels (taken
#from yTrain) of its k nearest neighbors in xTrain. imageI is passed
#on unchanged to get_k_nearest_neighbors.
#When several labels are tied, the label that reached the highest
#count first wins. Returns 0 when there are no neighbors.
@everywhere function assign_label(xTrain, yTrain, k, imageI)
    neighbors = get_k_nearest_neighbors(xTrain, imageI, k)

    #votes[label] is the number of neighbors seen so far with that label
    votes = Dict{Int, Int}()
    winner = 0
    winnerVotes = 0

    for neighbor in neighbors
        label = yTrain[neighbor]
        tally = get(votes, label, 0) + 1
        votes[label] = tally
        if tally > winnerVotes
            winnerVotes = tally
            winner = label
        end
    end
    return winner
end

In [11]:
#Time the prediction of all test images.
tic()
k = 3 # The CV accuracy shows this value to be the best.
#One iteration per column of xTest; the predicted labels are
#combined with vcat into a single vector.
yPredictions = @parallel (vcat) for i in 1:size(xTest, 2)
 nRows = size(xTrain, 1)
 #Copy the ith test image into its own Float32 vector
 imageI = Array(Float32, nRows)
 for index in 1:nRows
  imageI[index] = xTest[index, i]
 end
 #The value of the iteration is the label predicted for that image
 assign_label(xTrain, yTrain, k, imageI)
end
toc()


elapsed time: 46.272848076 seconds
Out[11]:
46.272848076

In [12]:
#Convert integer predictions to character and store them in the
#"Class" column of the table that was read from the sample submission.
labelsInfoTest["Class"] = char(yPredictions)

#Save predictions as a comma-separated file with a header row,
#next to the data files.
writetable("$(path)/juliaKNNSubmission.csv", labelsInfoTest, separator=',', header=true)
println("Submission file saved in $(path)/juliaKNNSubmission.csv")