In [ ]:
#Pkg.add("MNIST");
#Pkg.add("PyPlot")

In [1]:
using MNIST
using PyPlot

In [2]:
# ===================
# load training data
# ===================
X,y = traindata(); #X:(784x60000), y:(60000x1)
X /= 255.0; # scale the input to the range [0, 1]
X = X'; #X:(60000x784)
y = y+1; # shift the labels to 1-based indexing, so label 1 now represents digit 0
# number of instances
println(size(y,1));


60000

In [3]:
const inputLayerSize = size(X,2);
const hiddenLayerSize = 100;
const outputLayerSize = 10;
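
This defines a 784-100-10 fully connected network: one ReLU hidden layer and a softmax output (see forwardNN below). As a quick sanity check on the sizes, a sketch of the parameter count:

In [ ]:
# parameters: theta1 (784*100) + bias1 (100) + theta2 (100*10) + bias2 (10) = 79510
println(inputLayerSize*hiddenLayerSize + hiddenLayerSize +
        hiddenLayerSize*outputLayerSize + outputLayerSize);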

In [4]:
# define a network
network = [];
numIterations = 1000;
const alpha = 1e-0; # step size
const lambda = 1e-3; # regularization factor


Out[4]:
0.001

In [5]:
# add first layer to the network
layer1 = Array(Matrix{Float64}, 1, 2);
#theta1
layer1[1] = 0.01*randn(inputLayerSize, hiddenLayerSize); 
#bias1
layer1[2] = zeros(1, hiddenLayerSize); 
push!(network,layer1);

In [6]:
# add second layer to the network
layer2 = Array(Matrix{Float64}, 1, 2);
#theta2
layer2[1] = 0.01*randn(hiddenLayerSize, outputLayerSize); 
#bias2
layer2[2] = zeros(1, outputLayerSize); 
push!(network,layer2);
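
The two cells above repeat the same initialization pattern. A small helper along these lines (only a sketch; the name addLayer! is made up and not part of the notebook) could append a layer of any size in the same [theta, bias] layout:

In [ ]:
# hypothetical helper: append a randomly initialized [theta, bias] layer
function addLayer!(net, nIn::Int, nOut::Int)
    layer = Array(Matrix{Float64}, 1, 2);
    layer[1] = 0.01*randn(nIn, nOut); # theta
    layer[2] = zeros(1, nOut);        # bias
    push!(net, layer);
end
# usage: addLayer!(network, inputLayerSize, hiddenLayerSize);
#        addLayer!(network, hiddenLayerSize, outputLayerSize);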

In [7]:
function forwardNN(x::Matrix{Float64})
    global network;
    # collect input for each layer
    activation = Matrix{Float64}[];
    # initialize input vector with the actual data
    push!(activation, x);
    for layer in 1:length(network)-1
        # hidden layer: affine transform followed by the ReLU non-linearity
        push!(activation, max(0.0, (activation[layer]*network[layer][1]) .+ network[layer][2]))
    end
    # compute the class probabilities
    # softmax on last layer
    score = activation[length(network)]*network[length(network)][1] .+ network[length(network)][2]
    exp_scores = exp(score);
    probs = exp_scores ./ sum(exp_scores, 2); # [N x K]

    return activation,probs;
end


Out[7]:
forwardNN (generic function with 1 method)
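
One caveat: exp(score) can overflow when the raw scores get large. A common remedy (shown here only as a sketch; the cells below keep the simpler version) is to subtract the row-wise maximum before exponentiating, which leaves the probabilities unchanged:

In [ ]:
# numerically stable softmax: exp(s - max)/sum(exp(s - max)) equals exp(s)/sum(exp(s))
function stableSoftmax(score::Matrix{Float64})
    e = exp(score .- maximum(score, 2));
    return e ./ sum(e, 2);
end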

In [8]:
function backwardNN(a::Vector{Matrix{Float64}}, y::Vector{Float64}, dscores::Matrix{Float64})
    global network;
    m = size(y,1);
    delta = Array(Matrix{Float64}, 1, length(network));
    # start from the last layer to backpropagate the error
    # compute the gradient on scores
    for j in 1:size(dscores,1)
        dscores[j,convert(Int32, y[j])] -= 1;
    end
    delta[length(network)] = dscores / m;
    
    for j in length(network)-1:-1:1
        # backpropagate the gradient to the previous layer
        dhidden = delta[j+1]*network[j+1][1]';
        # backprop the ReLU non-linearity
        dhidden[a[j+1] .<= 0] = 0;
        delta[j] = dhidden;
    end
    return delta;
end


Out[8]:
backwardNN (generic function with 1 method)
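
The line dscores[j, y[j]] -= 1 relies on the fact that, for a softmax output with cross-entropy loss, the gradient of the loss with respect to the raw scores is probs minus a one-hot encoding of the label. A tiny numeric check of that identity (made-up scores, one example with true class 2):

In [ ]:
s = [1.0 2.0 0.5];                  # raw scores for one example, three classes
p = exp(s) ./ sum(exp(s));          # softmax probabilities
g = copy(p); g[2] -= 1;             # claimed gradient dL/ds at the true class
# finite-difference check of the middle entry, with L(s) = -log(softmax(s)[2])
L(v) = -log((exp(v) ./ sum(exp(v)))[2]);
h = 1e-6;
sp = copy(s); sp[2] += h;
sm = copy(s); sm[2] -= h;
println(g[2], " vs ", (L(sp) - L(sm))/(2h));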

In [9]:
function updateThetas(a::Vector{Matrix{Float64}}, delta::Matrix{Matrix{Float64}})
    global network;
    for j in 1:length(network)
        # update theta: data gradient plus the L2 regularization term
        network[j][1] =  network[j][1] - alpha*(a[j]'*delta[j] + lambda*network[j][1])
        # update bias (no regularization on the bias)
        network[j][2] =  network[j][2] - alpha*sum(delta[j],1);
    end
end


Out[9]:
updateThetas (generic function with 1 method)

In [10]:
function costFunction(truth::Vector{Float64}, probability::Matrix{Float64})
    global network;
    # compute the loss: average cross-entropy loss and regularization
    m = size(truth,1)
    
    correct_logprobs = [-log(probability[j,convert(Int32, truth[j])]) for j in 1:m];
    data_loss = sum(correct_logprobs)/m;
    
    reg_loss = 0;
    for j in 1:length(network)
        reg_loss = reg_loss + 0.5*lambda*sum(network[j][1].^2);
    end
    
    loss = data_loss + reg_loss;
    return loss;
end


Out[10]:
costFunction (generic function with 1 method)
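
With the forward pass, backward pass, and cost in place, a finite-difference check on a single weight is a quick way to gain confidence in the gradients before training. The sketch below (the variable names are made up) uses a 200-example subset; the two printed numbers should be very close:

In [ ]:
# finite-difference gradient check on network[1][1][1,1]
Xsub = X[1:200, :];
ysub = y[1:200];
a, p = forwardNN(Xsub);
d = backwardNN(a, ysub, copy(p)); # copy: backwardNN modifies its dscores argument
analytic = (a[1]'*d[1])[1, 1] + lambda*network[1][1][1, 1];

h = 1e-5;
network[1][1][1, 1] += h;
lplus = costFunction(ysub, forwardNN(Xsub)[2]);
network[1][1][1, 1] -= 2h;
lminus = costFunction(ysub, forwardNN(Xsub)[2]);
network[1][1][1, 1] += h; # restore the original weight

println("analytic: ", analytic, "  numeric: ", (lplus - lminus)/(2h));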

In [11]:
J = zeros(numIterations,1);
for itr in 1:numIterations
    # feed forward
    activations, probs = forwardNN(X); 
    # cost, logged every 100 iterations
    if itr % 100 == 0
        J[itr,:] = costFunction(y, probs);
        println("cost: ", J[itr,:]);
    end
    # backpropagation (probs doubles as dscores and is modified in place)
    deltas = backwardNN(activations, y, probs);
    # update parameters
    updateThetas(activations, deltas);
end
# print network to weight_file
# print ids to id_file


cost: [0.3425860661080752]
cost: [0.27579192159873106]
cost: [0.2272569768576536]
cost: [0.20965762253801098]
cost: [0.20493261136365065]
cost: [0.19439090346226265]
cost: [0.2789278047077673]
cost: [0.24548923798270034]
cost: [0.22795710495999277]
cost: [0.20856739145022474]
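
Each iteration above pushes all 60000 examples through the network, which is slow. A mini-batch variant is a common speed-up; the sketch below reuses the same functions and globals (batchSize = 256 is an arbitrary choice, and alpha may need retuning for the noisier gradients):

In [ ]:
# hypothetical mini-batch version of the training loop above
batchSize = 256;
m = size(X, 1);
for itr in 1:numIterations
    idx = rand(1:m, batchSize); # sample a mini-batch with replacement
    activations, probs = forwardNN(X[idx, :]);
    deltas = backwardNN(activations, y[idx], probs);
    updateThetas(activations, deltas);
end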

In [12]:
function predict(data::Matrix{Float64})
    activation, probs = forwardNN(data);
    predicted_class = [indmax(probs[j,:]) for j in 1:size(probs,1)]
    return predicted_class;
end


Out[12]:
predict (generic function with 1 method)
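
As a quick usage check, the prediction for a single image can be compared with its label (both are shifted by one, so subtract 1 to get back to digits):

In [ ]:
# predicted digit vs. actual digit for the first training image
println("predicted: ", predict(X[1:1, :])[1] - 1, "  actual: ", convert(Int, y[1]) - 1);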

In [13]:
pred = predict(X);

In [14]:
function accuracy(truth, prediction)
    correct = 0;
    for j in 1:length(truth)
        if truth[j] == prediction[j]
            correct = correct + 1;
        end
    end
    println("training accuracy: ", correct/length(truth)*100);
end


Out[14]:
accuracy (generic function with 1 method)
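
Beyond a single accuracy number, a confusion matrix shows which digits get mixed up. A short sketch built on the training predictions computed above (the name confusion is made up):

In [ ]:
# confusion matrix: rows = true digit (0-9), columns = predicted digit (0-9)
confusion = zeros(Int, outputLayerSize, outputLayerSize);
for j in 1:length(y)
    confusion[convert(Int, y[j]), pred[j]] += 1;
end
println(confusion);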

In [15]:
accuracy(y, pred)


accuracy: 96.86833333333334

In [ ]:
# plt.matshow
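# A possible completion of this note (a sketch, not from the original run):
# show the first training image with PyPlot's matshow. Each row of X is a
# flattened 28x28 image; depending on how MNIST.jl lays out the pixels, the
# reshaped matrix may need a transpose for the digit to appear upright.
matshow(reshape(X[1, :], 28, 28));
title(string("label: ", convert(Int, y[1]) - 1));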

In [16]:
# test data
XTest,yTest = testdata();

In [17]:
XTest /= 255.0; # scale the input to the range [0, 1]
XTest = XTest';
yTest = yTest+1; # shift the labels to 1-based indexing, same as the training labels

In [18]:
predTest = predict(XTest);

In [19]:
accuracy(yTest, predTest);


accuracy: 96.19

In [ ]: