In [1]:
# install the MNIST data package (uncomment on the first run)
#Pkg.add("MNIST");
using MNIST

In [2]:
# load the training data: columns of X are examples, y holds the digit labels (0-9)
X,y = traindata();
m = size(X, 2); # number of training examples

In [3]:
inputLayerSize = size(X,1);  # one input neuron per pixel
hiddenLayerSize = 25;
outputLayerSize = 10;        # one output neuron per digit class

In [4]:
# one-hot encode the labels: each label becomes a column of length outputLayerSize
Y = zeros(outputLayerSize, m);
for i = 1:m
    Y[y[i]+1,i] = 1;   # digit d is marked in row d+1 (row 1 corresponds to digit 0)
end
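
In [ ]:
# Illustrative check (assumes the cells above have run): the first label and
# its one-hot encoding; since row 1 corresponds to digit 0, a label d sets row d+1.
println(y[1]);     # raw label of the first training example
println(Y[:,1]);   # its one-hot encoded column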

In [5]:
# logistic sigmoid activation
function sigmoid(z)
    g = 1.0 ./ (1.0 + exp(-z));
    return g;
end


Out[5]:
sigmoid (generic function with 1 method)
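
In [ ]:
# Quick sanity check (illustrative): sigmoid(0) should be 0.5, and the output
# saturates towards 0 and 1 for large negative and positive scalar inputs.
println(sigmoid(0.0));     # expect 0.5
println(sigmoid(10.0));    # close to 1
println(sigmoid(-10.0));   # close to 0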

In [6]:
# derivative of the sigmoid: g'(z) = g(z).*(1 - g(z))
function sigmoidGradient(z)
    return sigmoid(z).*(1-sigmoid(z));
end


Out[6]:
sigmoidGradient (generic function with 1 method)
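
In [ ]:
# Optional numerical check (illustrative): compare sigmoidGradient against a
# central finite difference at an arbitrary test point; the two should agree
# to several decimal places.
z0 = 0.3;
h = 1e-6;
numericalGrad = (sigmoid(z0 + h) - sigmoid(z0 - h)) / (2*h);
println(sigmoidGradient(z0), " vs ", numericalGrad);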

In [7]:
# weight regularization parameter
lambda = 2;
function costFunction(truth, prediction)
    # cross-entropy cost for each training example
    cost = zeros(m,1);
    for i=1:m
        cost[i,:] = (-truth[:,i]'*log(prediction[:,i])) - ((1-truth[:,i]')*log(1-prediction[:,i]));
    end
    # regularization term (the bias weights in the first row are excluded)
    regularization = (lambda/(2*m))*(sum(sum(Theta1[2:end,:].^2)) + sum(sum(Theta2[2:end,:].^2)));

    return (1/m)*sum(cost) + regularization; # regularized cost
end


Out[7]:
costFunction (generic function with 1 method)
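
In [ ]:
# A vectorized sketch of the same unregularized cross-entropy term, kept only
# for illustration; costFunction above remains the one used during training.
function crossEntropy(truth, prediction)
    return (1/size(truth,2)) * sum(-truth.*log(prediction) - (1-truth).*log(1-prediction));
end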

In [8]:
function predict(Theta1Wt, Theta2Wt, data)
    dataSize = size(data, 2);
    p = zeros(dataSize, 1);
    # feedforward through both layers, prepending a bias row of ones
    h1 = sigmoid(Theta1Wt'*[ones(1,size(data,2)); data]);
    h2 = sigmoid(Theta2Wt'*[ones(1,size(h1,2)); h1]);
    # row 1 corresponds to digit 0, row 2 to digit 1, and so on
    for i=1:dataSize
        p[i,:] = indmax(h2[:,i])-1;
    end
    return p;
end

function accuracy(truth, prediction)
    dataSize = length(truth);
    match = 0;
    for i=1:dataSize
        if truth[i] == prediction[i]
            match = match + 1;
        end
    end
    return (match/dataSize)*100;
end


Out[8]:
accuracy (generic function with 1 method)
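
In [ ]:
# A compact alternative (illustrative): accuracy as the fraction of elementwise
# matches between the flattened label and prediction vectors.
function accuracyVec(truth, prediction)
    return 100 * sum(vec(truth) .== vec(prediction)) / length(truth);
end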

In [ ]:
# including one bias neuron in input layer
# weights for the links connecting input layer to the hidden layer
Theta1 = randn(inputLayerSize+1, hiddenLayerSize); #(785x25)
# including one bias neuron in hidden layer
# weights for the links connecting hidden layer to the output layer
Theta2 = randn(hiddenLayerSize+1, outputLayerSize); #(26x10)
# learning rate
alpha = 0.1;
# number of training epochs (full passes over the training set)
epoch = 1000;
# cost recorded for each epoch
J = zeros(epoch,1);
# ====================================================================
# Train the neural network using feedforward-backpropagation algorithm
# ====================================================================
for i = 1:epoch
    # gradient accumulators with the same shapes as the weight matrices
    Delta1 = zeros(size(Theta1));
    Delta2 = zeros(size(Theta2));
    for j = 1:m # for each input
        # ===================
        # Feedforward process
        # ===================
        # input layer
        # add one bias element
        x1 = [1; X[:,j]];

        # hidden layer
        s2 = Theta1'*x1;
        x2 = sigmoid(s2);
        # add one bias element
        x2 = [1; x2];

        # output layer
        s3 = Theta2'*x2;
        x3 = sigmoid(s3);
        
        # =======================
        # Backpropagation process
        # =======================
        # delta for the output layer
        delta3 = x3 - Y[:,j];
        # delta for the hidden layer (the bias row of Theta2 is excluded)
        delta2 = (Theta2[2:end,:]*delta3).*sigmoidGradient(s2);
        # there is no delta term for the input layer
        
        # accumulate the gradient contributions of this example
        Delta1 = Delta1 + x1*delta2';
        Delta2 = Delta2 + x2*delta3';
    end
    
    reg_theta1 = ((lambda/m)*Theta1);
    reg_theta1[1,:] = 0;   # do not regularize the bias weights
    Theta1 = Theta1 - ((alpha/m)*Delta1 + reg_theta1);
    
    reg_theta2 = ((lambda/m)*Theta2);
    reg_theta2[1,:] = 0;   # do not regularize the bias weights
    Theta2 = Theta2 - ((alpha/m)* Delta2 + reg_theta2);
    
    # feedforward over the full training set to record the cost for this epoch
    h1 = sigmoid(Theta1'*[ones(1,size(X,2)); X]);
    h2 = sigmoid(Theta2'*[ones(1,size(h1,2)); h1]);
    J[i,:] = costFunction(Y, h2);
end
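
In [ ]:
# Quick convergence check (illustrative): for a small enough learning rate the
# recorded cost should decrease from the first epoch to the last.
println("first cost: ", J[1], "   last cost: ", J[end]);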

In [13]:
J


Out[13]:
20x1 Array{Float64,2}:
 12.669  
 12.4574 
 12.2516 
 12.0514 
 11.8556 
 11.6646 
 11.477  
 11.2909 
 11.1161 
 10.9428 
 10.7719 
 10.608  
 10.4477 
 10.292  
 10.1387 
  9.98914
  9.84468
  9.70279
  9.56375
  9.43196

In [14]:
pred = predict(Theta1, Theta2, X);
println("train accuracy: ", accuracy(y, pred));


train accuracy: 13.686666666666666