In [ ]:
#Pkg.add("MNIST");
#Pkg.add("PyPlot")
In [1]:
using MNIST
using PyPlot
In [2]:
# ===================
# load training data
# ===================
X,y = traindata(); # X: (784x60000), y: 60000-element label vector
X /= 255.0; # scale the inputs to the range [0, 1]
X = X'; # X: (60000x784), one instance per row
y = y+1; # shift labels by 1 for 1-based indexing, so value 1 now represents digit 0
# number of instances
println(size(y,1));
In [3]:
const inputLayerSize = size(X,2);
const hiddenLayerSize = 100;
const outputLayerSize = 10;
In [4]:
# define a network
network = [];
numIterations = 1000;
const alpha = 1e-0; # step size
const lambda = 1e-3; # regularization factor
In [5]:
# add first layer to the network
layer1 = Array(Matrix{Float64}, 1, 2);
# theta1: weights, inputLayerSize x hiddenLayerSize
layer1[1] = 0.01*randn(inputLayerSize, hiddenLayerSize);
# bias1: 1 x hiddenLayerSize
layer1[2] = zeros(1, hiddenLayerSize);
push!(network, layer1);
In [6]:
# add second layer to the network
layer2 = Array(Matrix{Float64}, 1, 2);
# theta2: weights, hiddenLayerSize x outputLayerSize
layer2[1] = 0.01*randn(hiddenLayerSize, outputLayerSize);
# bias2: 1 x outputLayerSize
layer2[2] = zeros(1, outputLayerSize);
push!(network, layer2);
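The two cells above repeat the same initialization pattern; a small helper (a sketch, not used by the rest of the notebook) would express it once:
In [ ]:
# a sketch: factor the layer initialization above into a helper
function makeLayer(nIn::Int, nOut::Int)
    layer = Array(Matrix{Float64}, 1, 2);
    layer[1] = 0.01*randn(nIn, nOut); # theta: small random weights
    layer[2] = zeros(1, nOut);        # bias: zeros
    return layer;
end
# the network could then be built as:
# network = Any[makeLayer(inputLayerSize, hiddenLayerSize), makeLayer(hiddenLayerSize, outputLayerSize)];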
In [7]:
function forwardNN(x::Matrix{Float64})
    global network;
    # collect the input to each layer
    activation = Matrix{Float64}[];
    # initialize the first activation with the actual data
    push!(activation, x);
    for layer in 1:length(network)-1
        # affine transform followed by the ReLU non-linearity
        push!(activation, max(0.0, (activation[layer]*network[layer][1]) .+ network[layer][2]));
    end
    # compute the class probabilities: softmax on the last layer
    score = activation[length(network)]*network[length(network)][1] .+ network[length(network)][2];
    exp_scores = exp(score);
    probs = exp_scores ./ sum(exp_scores, 2); # [N x K]
    return activation, probs;
end
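The softmax above exponentiates the raw scores directly, which can overflow once the scores grow large. A minimal sketch of a numerically stable variant (not in the original notebook): subtracting the per-row maximum before exponentiating leaves the probabilities unchanged.
In [ ]:
# numerically stable softmax: a sketch; forwardNN could use this in place of its
# exp_scores / probs lines
function stableSoftmax(score::Matrix{Float64})
    shifted = score .- maximum(score, 2);    # subtract the per-row maximum
    exp_scores = exp(shifted);
    return exp_scores ./ sum(exp_scores, 2); # [N x K]
end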
In [8]:
function backwardNN(a::Vector{Matrix{Float64}}, y::Vector{Float64}, dscores::Matrix{Float64})
    global network;
    m = size(y,1);
    delta = Array(Matrix{Float64}, 1, length(network));
    # start from the last layer to backpropagate the error
    # gradient on the scores: softmax output minus one-hot truth (dscores is modified in place)
    for j in 1:size(dscores,1)
        dscores[j,convert(Int32, y[j])] -= 1;
    end
    delta[length(network)] = dscores / m;
    for j in length(network)-1:-1:1
        # backpropagate the gradient to the previous layer
        dhidden = delta[j+1]*network[j+1][1]';
        # backprop through the ReLU non-linearity
        dhidden[a[j+1] .<= 0] = 0;
        delta[j] = dhidden;
    end
    return delta;
end
In [9]:
function updateThetas(a::Vector{Matrix{Float64}}, delta::Matrix{Matrix{Float64}})
    global network;
    for j in 1:length(network)
        # update theta
        network[j][1] = network[j][1] - alpha*(a[j]'*delta[j] + lambda*network[j][1]);
        # update bias
        network[j][2] = network[j][2] - alpha*sum(delta[j],1);
    end
end
In [10]:
function costFunction(truth::Vector{Float64}, probability::Matrix{Float64})
    global network;
    # compute the loss: average cross-entropy loss plus L2 regularization
    m = size(truth,1);
    correct_logprobs = [-log(probability[j,convert(Int32, truth[j])]) for j in 1:m];
    data_loss = sum(correct_logprobs)/m;
    reg_loss = 0;
    for j in 1:length(network)
        reg_loss = reg_loss + 0.5*lambda*sum(network[j][1].^2);
    end
    loss = data_loss + reg_loss;
    return loss;
end
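Before running the training loop below, a finite-difference check is a quick way to validate the analytic gradients from backwardNN. The sketch below is not part of the original notebook; the perturbed entry (1,1) of theta1 and the step epsilon are arbitrary choices, and the analytic value mirrors the expression used in updateThetas.
In [ ]:
# gradient check: a sketch, assuming the network has been initialized as above
epsilon = 1e-5;
i, k = 1, 1;                                  # arbitrary entry of theta1
acts, probs0 = forwardNN(X);
deltas0 = backwardNN(acts, y, copy(probs0));  # copy: backwardNN modifies its third argument
analytic = (acts[1]'*deltas0[1] + lambda*network[1][1])[i,k];
orig = network[1][1][i,k];
network[1][1][i,k] = orig + epsilon;
costPlus = costFunction(y, forwardNN(X)[2]);
network[1][1][i,k] = orig - epsilon;
costMinus = costFunction(y, forwardNN(X)[2]);
network[1][1][i,k] = orig;                    # restore the weight
numeric = (costPlus - costMinus)/(2*epsilon);
println("analytic: ", analytic, ", numeric: ", numeric);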
In [11]:
J = zeros(numIterations,1);
for itr in 1:numIterations
    # feedforward
    activations, probs = forwardNN(X);
    # cost (logged every 100 iterations)
    if itr%100 == 0
        J[itr] = costFunction(y, probs);
        println("cost: ", J[itr]);
    end
    # backpropagation (note: backwardNN modifies probs in place)
    deltas = backwardNN(activations, y, probs);
    # update parameters
    updateThetas(activations, deltas);
end
# print network to weight_file
# print ids to id_file
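Since PyPlot is already loaded, the cost values logged every 100 iterations can be plotted to confirm that the loss is decreasing; a minimal sketch:
In [ ]:
# plot the logged cost values (a sketch using PyPlot's plot/xlabel/ylabel)
loggedIters = collect(100:100:numIterations);
plot(loggedIters, J[loggedIters]);
xlabel("iteration");
ylabel("cost");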
In [12]:
function predict(data::Matrix{Float64})
    activation, probs = forwardNN(data);
    predicted_class = [indmax(probs[j,:]) for j in 1:size(probs,1)];
    return predicted_class;
end
In [13]:
pred = predict(X);
In [14]:
function accuracy(truth, prediction)
    correct = 0;
    for j in 1:length(truth)
        if truth[j] == prediction[j]
            correct = correct + 1;
        end
    end
    # used for both the training and the test predictions
    println("accuracy (%): ", correct/length(truth)*100);
end
In [15]:
accuracy(y, pred)
In [ ]:
# plt.matshow
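# a sketch of the intended visualization; matshow and title are exported by PyPlot,
# and the transpose depends on how MNIST.jl stores the flattened 28x28 pixels
matshow(reshape(X[1,:], 28, 28)');
title(string("predicted digit: ", pred[1]-1)); # subtract 1 to undo the label shift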
In [16]:
# test data
XTest,yTest = testdata();
In [17]:
XTest /= 255.0; # scale the inputs to the range [0, 1], as for training
XTest = XTest'; # one instance per row
yTest = yTest+1; # shift labels by 1, matching the training labels
In [18]:
predTest = predict(XTest);
In [19]:
accuracy(yTest, predTest);
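A per-class confusion matrix gives a finer view than the single accuracy number; a short sketch (not part of the original notebook), with rows indexed by the true label and columns by the prediction (both shifted by 1):
In [ ]:
# confusion matrix on the test set
confusion = zeros(Int, outputLayerSize, outputLayerSize);
for j in 1:length(yTest)
    confusion[convert(Int, yTest[j]), predTest[j]] += 1;
end
println(confusion);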
In [ ]: