In [ ]:
#Pkg.add("MNIST");
#Pkg.add("PyPlot")

In [1]:
using MNIST
using PyPlot

In [2]:
# ===================
# load training data
# ===================
X,y = traindata(); #X:(784x60000), y:(60000x1)
X /= 255.0; # scale the input to the range [0, 1]
X = X'; #X:(60000x784)
y = y+1; # shift the labels to 1-based indexing, so label 1 now represents digit 0
# number of instances
println(size(y,1));


60000

In [3]:
const inputLayerSize = size(X,2);
const hiddenLayerSize = 100;
const outputLayerSize = 10;
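
This defines a 784-100-10 fully connected network: one ReLU hidden layer and a softmax output (see forwardNN below). As a quick sanity check on the sizes, a sketch of the parameter count:

In [ ]:
# parameters: theta1 (784*100) + bias1 (100) + theta2 (100*10) + bias2 (10) = 79510
println(inputLayerSize*hiddenLayerSize + hiddenLayerSize +
        hiddenLayerSize*outputLayerSize + outputLayerSize);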

In [4]:
# define a network
network = [];
numIterations = 1000;
const alpha = 1e-0; # step size
const lambda = 1e-3; # regularization factor


Out[4]:
0.001

In [5]:
# add first layer to the network
layer1 = Array(Matrix{Float64}, 1, 2);
#theta1
layer1[1] = 0.01*randn(inputLayerSize, hiddenLayerSize); 
#bias1
layer1[2] = zeros(1, hiddenLayerSize); 
push!(network,layer1);

In [6]:
# add second layer to the network
layer2 = Array(Matrix{Float64}, 1, 2);
#theta2
layer2[1] = 0.01*randn(hiddenLayerSize, outputLayerSize); 
#bias2
layer2[2] = zeros(1, outputLayerSize); 
push!(network,layer2);
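
The two cells above repeat the same initialization pattern. A small helper along these lines (only a sketch; the name addLayer! is made up and not part of the notebook) could append a layer of any size in the same [theta, bias] layout:

In [ ]:
# hypothetical helper: append a randomly initialized [theta, bias] layer
function addLayer!(net, nIn::Int, nOut::Int)
    layer = Array(Matrix{Float64}, 1, 2);
    layer[1] = 0.01*randn(nIn, nOut); # theta
    layer[2] = zeros(1, nOut);        # bias
    push!(net, layer);
end
# usage: addLayer!(network, inputLayerSize, hiddenLayerSize);
#        addLayer!(network, hiddenLayerSize, outputLayerSize);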

In [7]:
function forwardNN(x::Matrix{Float64})
    global network;
    # collect input for each layer
    activation = Matrix{Float64}[];
    # initialize input vector with the actual data
    push!(activation, x);
    for layer in 1:length(network)-1
        # hidden layer: affine transform followed by the ReLU non-linearity
        push!(activation, max(0.0, (activation[layer]*network[layer][1]) .+ network[layer][2]))
    end
    # compute the class probabilities
    # softmax on last layer
    score = activation[length(network)]*network[length(network)][1] .+ network[length(network)][2]
    exp_scores = exp(score);
    probs = exp_scores ./ sum(exp_scores, 2); # [N x K]

    return activation,probs;
end


Out[7]:
forwardNN (generic function with 1 method)
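
One caveat: exp(score) can overflow when the raw scores get large. A common remedy (shown here only as a sketch; the cells below keep the simpler version) is to subtract the row-wise maximum before exponentiating, which leaves the probabilities unchanged:

In [ ]:
# numerically stable softmax: exp(s - max)/sum(exp(s - max)) equals exp(s)/sum(exp(s))
function stableSoftmax(score::Matrix{Float64})
    e = exp(score .- maximum(score, 2));
    return e ./ sum(e, 2);
end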

In [8]:
function backwardNN(a::Vector{Matrix{Float64}}, y::Vector{Float64}, dscores::Matrix{Float64})
    global network;
    m = size(y,1);
    delta = Array(Matrix{Float64}, 1, length(network));
    # start from the last layer to backpropagate the error
    # compute the gradient on scores
    for j in 1:size(dscores,1)
        dscores[j,convert(Int32, y[j])] -= 1;
    end
    delta[length(network)] = dscores / m;
    
    for j in length(network)-1:-1:1
        # backpropagate the gradient to the previous layer
        dhidden = delta[j+1]*network[j+1][1]';
        # backprop the ReLU non-linearity
        dhidden[a[j+1] .<= 0] = 0;
        delta[j] = dhidden;
    end
    return delta;
end


Out[8]:
backwardNN (generic function with 1 method)
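
The line dscores[j, y[j]] -= 1 relies on the fact that, for a softmax output with cross-entropy loss, the gradient of the loss with respect to the raw scores is probs minus a one-hot encoding of the label. A tiny numeric check of that identity (made-up scores, one example with true class 2):

In [ ]:
s = [1.0 2.0 0.5];                  # raw scores for one example, three classes
p = exp(s) ./ sum(exp(s));          # softmax probabilities
g = copy(p); g[2] -= 1;             # claimed gradient dL/ds at the true class
# finite-difference check of the middle entry, with L(s) = -log(softmax(s)[2])
L(v) = -log((exp(v) ./ sum(exp(v)))[2]);
h = 1e-6;
sp = copy(s); sp[2] += h;
sm = copy(s); sm[2] -= h;
println(g[2], " vs ", (L(sp) - L(sm))/(2h));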

In [9]:
function updateThetas(a::Vector{Matrix{Float64}}, delta::Matrix{Matrix{Float64}})
    global network;
    for j in 1:length(network)
        # update theta: data gradient plus the L2 regularization term
        network[j][1] =  network[j][1] - alpha*(a[j]'*delta[j] + lambda*network[j][1])
        # update bias (no regularization on the bias)
        network[j][2] =  network[j][2] - alpha*sum(delta[j],1);
    end
end


Out[9]:
updateThetas (generic function with 1 method)

In [10]:
function costFunction(truth::Vector{Float64}, probability::Matrix{Float64})
    global network;
    # compute the loss: average cross-entropy loss and regularization
    m = size(truth,1)
    
    correct_logprobs = [-log(probability[j,convert(Int32, truth[j])]) for j in 1:m];
    data_loss = sum(correct_logprobs)/m;
    
    reg_loss = 0;
    for j in 1:length(network)
        reg_loss = reg_loss + 0.5*lambda*sum(network[j][1].^2);
    end
    
    loss = data_loss + reg_loss;
    return loss;
end


Out[10]:
costFunction (generic function with 1 method)
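
With the forward pass, backward pass, and cost in place, a finite-difference check on a single weight is a quick way to gain confidence in the gradients before training. The sketch below (the variable names are made up) uses a 200-example subset; the two printed numbers should be very close:

In [ ]:
# finite-difference gradient check on network[1][1][1,1]
Xsub = X[1:200, :];
ysub = y[1:200];
a, p = forwardNN(Xsub);
d = backwardNN(a, ysub, copy(p)); # copy: backwardNN modifies its dscores argument
analytic = (a[1]'*d[1])[1, 1] + lambda*network[1][1][1, 1];

h = 1e-5;
network[1][1][1, 1] += h;
lplus = costFunction(ysub, forwardNN(Xsub)[2]);
network[1][1][1, 1] -= 2h;
lminus = costFunction(ysub, forwardNN(Xsub)[2]);
network[1][1][1, 1] += h; # restore the original weight

println("analytic: ", analytic, "  numeric: ", (lplus - lminus)/(2h));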

In [11]:
J = zeros(numIterations,1);
for itr in 1:numIterations
    # feed forward
    activations, probs = forwardNN(X); 
    # cost, logged every 100 iterations
    if itr % 100 == 0
        J[itr,:] = costFunction(y, probs);
        println("cost: ", J[itr,:]);
    end
    # backpropagation (probs doubles as dscores and is modified in place)
    deltas = backwardNN(activations, y, probs);
    # update parameters
    updateThetas(activations, deltas);
end
# print network to weight_file
# print ids to id_file


cost: [0.3425860661080752]
cost: [0.27579192159873106]
cost: [0.2272569768576536]
cost: [0.20965762253801098]
cost: [0.20493261136365065]
cost: [0.19439090346226265]
cost: [0.2789278047077673]
cost: [0.24548923798270034]
cost: [0.22795710495999277]
cost: [0.20856739145022474]
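
Each iteration above pushes all 60000 examples through the network, which is slow. A mini-batch variant is a common speed-up; the sketch below reuses the same functions and globals (batchSize = 256 is an arbitrary choice, and alpha may need retuning for the noisier gradients):

In [ ]:
# hypothetical mini-batch version of the training loop above
batchSize = 256;
m = size(X, 1);
for itr in 1:numIterations
    idx = rand(1:m, batchSize); # sample a mini-batch with replacement
    activations, probs = forwardNN(X[idx, :]);
    deltas = backwardNN(activations, y[idx], probs);
    updateThetas(activations, deltas);
end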

In [12]:
function predict(data::Matrix{Float64})
    activation, probs = forwardNN(data);
    predicted_class = [indmax(probs[j,:]) for j in 1:size(probs,1)]
    return predicted_class;
end


Out[12]:
predict (generic function with 1 method)
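
As a quick usage check, the prediction for a single image can be compared with its label (both are shifted by one, so subtract 1 to get back to digits):

In [ ]:
# predicted digit vs. actual digit for the first training image
println("predicted: ", predict(X[1:1, :])[1] - 1, "  actual: ", convert(Int, y[1]) - 1);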

In [13]:
pred = predict(X);

In [14]:
function accuracy(truth, prediction)
    correct = 0;
    for j in 1:length(truth)
        if truth[j] == prediction[j]
            correct = correct + 1;
        end
    end
    println("training accuracy: ", correct/length(truth)*100);
end


Out[14]:
accuracy (generic function with 1 method)
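
Beyond a single accuracy number, a confusion matrix shows which digits get mixed up. A short sketch built on the training predictions computed above (the name confusion is made up):

In [ ]:
# confusion matrix: rows = true digit (0-9), columns = predicted digit (0-9)
confusion = zeros(Int, outputLayerSize, outputLayerSize);
for j in 1:length(y)
    confusion[convert(Int, y[j]), pred[j]] += 1;
end
println(confusion);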

In [15]:
accuracy(y, pred)


accuracy: 96.86833333333334

In [ ]:
# plt.matshow
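# A possible completion of this note (a sketch, not from the original run):
# show the first training image with PyPlot's matshow. Each row of X is a
# flattened 28x28 image; depending on how MNIST.jl lays out the pixels, the
# reshaped matrix may need a transpose for the digit to appear upright.
matshow(reshape(X[1, :], 28, 28));
title(string("label: ", convert(Int, y[1]) - 1));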

In [16]:
# test data
XTest,yTest = testdata();

In [17]:
XTest /= 255.0; # scale the input to the range [0, 1]
XTest = XTest';
yTest = yTest+1; # shift the labels to 1-based indexing, same as the training labels

In [18]:
predTest = predict(XTest);

In [19]:
accuracy(yTest, predTest);


accuracy: 96.19

In [ ]: