In [179]:
import matplotlib.pyplot as plt
import numpy as np
import matplotlib.pyplot as plt
import math


def softmax(z):
    z -= np.max(z, axis=1, keepdims=True)
    ans = np.exp(z) / np.sum(np.exp(z), axis=1, keepdims=True)
    return ans


def relu(z):
    z[z < 0] = 0
    return z


def relu_prime(z):
    z[z <= 0] = 0
    z[z > 0] = 1
    return z


# Just for the sake of saving  computing time to recalculate y_hat in each mini_batch
def stochastic_J(w1, w2, y_hat, images, labels, alpha=0.):
    x = images
    y = labels
    m = x.shape[0]
    cost_mat = np.multiply(y, np.log(y_hat))
    cost = (-1. / m) * np.sum(np.sum(cost_mat, axis=1))
    # Regularize
    cost += (alpha/(2 * m)) * (np.linalg.norm(w1) + np.linalg.norm(w2))
    return cost


def J(w1, w2, b1, b2, images, labels, alpha=0.):
    x = images
    y = labels
    m = x.shape[0]
    h1, y_hat = feedforward(w1, w2, b1, b2, x, y)
    cost_mat = np.multiply(y, np.log(y_hat))
    cost = (-1. / m) * np.sum(np.sum(cost_mat, axis=1))
    # Regularize
    cost += (alpha/(2 * m)) * (np.linalg.norm(w1) + np.linalg.norm(w2))
    return cost


def feedforward(w1, w2, b1, b2, images, labels):
    x = images
    h1 = relu(x.dot(w1.T) + b1)
    y_hat = softmax(h1.dot(w2.T) + b2)
    return h1, y_hat


def grad_layer2(h1, y_hat, w1, w2, images, labels, alpha=0.):
    y = labels
    m = y.shape[0]
    dJ_dz2 = (y_hat - y)/m
    dJ_dw2 = dJ_dz2.T.dot(h1)
    #Regularize
    dJ_dw2 += (alpha/m) * w2
    dJ_b2 = np.sum(dJ_dz2, axis=0, keepdims=True)
    return dJ_dw2, dJ_b2


def grad_layer1(h1, y_hat, w1, w2, images, labels, alpha=0.):
    x = images
    y = labels
    m = y.shape[0]
    dJ_dz2 = (y_hat - y)/m
    dJ_dh1 = dJ_dz2.dot(w2)
    g = dJ_dh1 * relu_prime(x.dot(w1.T))  # dJ/dz1
    dJ_dw1 = g.T.dot(x)
    # Regularize
    dJ_dw1 += (alpha/m) * w1
    dJ_db1 = np.sum(g, axis=0, keepdims=True)
    return dJ_dw1, dJ_db1


def sgd(train_images, train_labels, val_images, val_labels, h_nodes, epsilon, batch_size, epochs, alpha=0., searching=False):
    x = train_images
    y = train_labels
    sample_size, dimensions = x.shape
    classes = y.shape[1]
    cost_history = np.array([])

    w1_range = math.sqrt(1./dimensions)
    w1 = np.random.uniform(-w1_range, w1_range, (h_nodes, dimensions))
    b1 = np.ones((1, h_nodes)) * 0.1
    w2_range = math.sqrt(1./h_nodes)
    w2 = np.random.uniform(-w2_range, w2_range, (classes, h_nodes))
    b2 = np.ones((1, classes)) * 0.1

    num_batches = sample_size / batch_size
    for e in xrange(epochs):
        batch_history = np.array([])

        x_y = np.append(x, y, axis=1)
        np.random.shuffle(x_y)
        x_s = x_y[:, :dimensions]
        y_s = x_y[:, dimensions:]
        for i in xrange(num_batches):
            start = i * batch_size
            end = start + batch_size
            x_batch = x_s[start:end]
            y_batch = y_s[start:end]
            # Do feedforward pass
            h1, y_hat = feedforward(w1, w2, b1, b2, x_batch, y_batch)
            # Find dJ/dw1 and dJ/d1
            gradw1, gradb1 = grad_layer1(h1, y_hat, w1, w2, x_batch, y_batch, alpha)
            # Find dJ/dw2 and dJ/dwb2
            gradw2, gradb2 = grad_layer2(h1, y_hat, w1, w2, x_batch, y_batch, alpha)
            w1 -= (epsilon * gradw1)
            b1 -= (epsilon * gradb1)
            w2 -= (epsilon * gradw2)
            b2 -= (epsilon * gradb2)
            # Cost of current mini-batch
            cost = stochastic_J(w1, w2, y_hat, x_batch, y_batch, alpha)
            # List of costs for all the mini-batches in current epoch
            batch_history = np.append(batch_history, cost)

        epoch_cost = np.mean(batch_history)
        # List of costs for epochs
        cost_history = np.append(cost_history, epoch_cost)
        # Accuracy of on validation data set after each epoch
        validation_acc = report_accuracy(w1, w2, b1, b2, val_images, val_labels)
        if e % 2 == 0:
            print("Epochs: %d Cost: %.5f Validation Acc: %.2f ||w1||: %.5f ||w2||: %.5f" % (
                e, epoch_cost, validation_acc, np.linalg.norm(w1), np.linalg.norm(w2)))

    if searching:
        # When searching best hyperparameters, return final cost (last element in cost_history) and
        # final validation accuracy. These will be heuristics when choosing the best hyperparameters
        return cost_history[-1], validation_acc

    plt.plot(np.linspace(epochs-20, epochs, 20), cost_history[-20:], label="Training Cost")
    plt.legend('Cost')
    plt.ylabel('Training Cost')
    plt.xlabel('Epochs')
    plt.title("Cross-entropy cost values")
    plt.show()
    return w1, w2, b1, b2


def reportCosts(w1, w2, b1, b2, trainImg , trainLbl, valiImg, valiLbl, testImg, testLbl, alpha=0.):
    print "Training cost: {}".format(J(w1, w2, b1, b2, trainImg, trainLbl, alpha))
    print "Validation cost:  {}".format(J(w1, w2, b1, b2, valiImg, valiLbl, alpha))
    print "Testing cost:  {}".format(J(w1, w2, b1, b2, testImg, testLbl, alpha))


def report_accuracy(w1, w2, b1, b2, images, labels):
    h1, y_hat = feedforward(w1, w2, b1, b2, images, labels)
    acc = np.mean(np.argmax(y_hat, axis=1) == np.argmax(labels, axis=1))
    return acc * 100


def predict(images, labels, w1, w2, b1, b2):
    h1, y_hat = feedforward(w1, w2, b1, b2, images, labels)
    predicted = np.argmax(y_hat)
    real = np.argmax(labels)
    return predicted, real


def findBestHyperparameters(train_images, train_labels, val_images, val_labels):
    h_nodes = [20, 20, 60, 50, 80, 30, 60, 40, 60, 20, 80, 80]
    l_rate = [0.33, 1e-2, 0.4, 0.15, 0.001, 0.01, 0.006, 0.06, 0.2, 0.007, 0.15, 0.1]
    b_size = [100, 250, 50, 125, 500, 275, 88, 40, 100, 625, 50, 25]
    alpha = [0, 0.8, 0.02, 1e2, 5.0, 0.2, 0.3, 0.7, 0.05, 0.1, 0.02, 0.9]
    epochs = [10, 20, 50, 10, 30, 40, 30, 20, 50, 10, 30, 40]
    min_cost = 100
    max_acc = 0
    best = 0
    for i in range(12):
        print ("[%d] Trying parameters - Hidden Nodes: %d Learning Rate: %.6f Batch Size: %d Alpha: %.3f Epochs: %d" % (
            i, h_nodes[i], l_rate[i], b_size[i], alpha[i], epochs[i]))
        cost, acc = sgd(train_images, train_labels, val_images, val_labels,
                        h_nodes[i], l_rate[i], b_size[i], epochs[i], alpha[i], searching=True)
        if cost < min_cost:
            min_cost = cost
            best = i
        if acc > max_acc:
            max_acc = acc
            best = i
        print ("Current best parameters - Hidden Nodes: %d Learning Rate: %.6f Batch Size: %d Alpha: %.3f Epochs: %d" %
               (h_nodes[best], l_rate[best], b_size[best], alpha[best], epochs[best]))

    return h_nodes[best], l_rate[best], b_size[best], alpha[best], epochs[best]

#####CHECK GRAD######
def checkgrad_unpack(w, images, labels):
    w1_idx = images.shape[1] * hidden_nodes
    b1_idx = hidden_nodes
    w2_idx = labels.shape[1] * hidden_nodes
    b2_idx = 10
    w1 = np.reshape(w[:w1_idx], (hidden_nodes, images.shape[1]))
    b1 = np.reshape(w[w1_idx:w1_idx + b1_idx], (1, hidden_nodes))
    w2 = np.reshape(w[w1_idx + b1_idx:w1_idx + b1_idx + w2_idx], (labels.shape[1], hidden_nodes))
    b2 = np.reshape(w[w1_idx + b1_idx + w2_idx:], (1, 10))
    return w1, b1, w2, b2


def checkgrad_cost(w, images, labels, alpha=.0):
    w1, b1, w2, b2 = checkgrad_unpack(w, images, labels)
    cost = J(w1, w2, b1, b2, images, labels, alpha)
    return cost


def checkgrad_grad(w, images, labels, alpha=.0):
    w1, b1, w2, b2 = checkgrad_unpack(w, images, labels)
    h1, y_hat = feedforward(w1, w2, b1, b2, images, labels)
    gw2, gb2 = grad_layer2(h1, y_hat, w1, w2, images, labels, alpha)
    gw1, gb1 = grad_layer1(h1, y_hat, w1, w2, images, labels, alpha)
    final_w = np.concatenate((gw1.flatten(), gb1.flatten(), gw2.flatten(), gb2.flatten()))
    return final_w
#####END OF CHECK GRAD######

if __name__ == "__main__":
    # Load data
    if 'trainingImages' not in globals():
        trainingImages = np.load("datasets/mnist_train_images.npy")
        trainingLabels = np.load("datasets/mnist_train_labels.npy")
        validationImages = np.load("datasets/mnist_validation_images.npy")
        validationLabels = np.load("datasets/mnist_validation_labels.npy")
        testingImages = np.load("datasets/mnist_test_images.npy")
        testingLabels = np.load("datasets/mnist_test_labels.npy")

    import time

    start = time.time()
#     hidden_nodes, learning_rate, batch_size, ridge_term, epochs = findBestHyperparameters(trainingImages, trainingLabels,
#                                                                                           validationImages, validationLabels)
    hidden_nodes, learning_rate, batch_size, ridge_term, epochs = 80, 0.15, 50, 0.020, 30
    print ("Best parameters - Hidden Nodes: %d Learning Rate: %.6f Batch Size: %d Alpha: %.3f Epochs: %d" %
           (hidden_nodes, learning_rate, batch_size, ridge_term, epochs))
    w_1, w_2, b_1, b_2 = sgd(trainingImages, trainingLabels, validationImages, validationLabels,
                             hidden_nodes, learning_rate, batch_size, epochs, ridge_term)

    # from scipy.optimize import check_grad
    # checkgrad_w = np.concatenate((w_1.flatten(), b_1.flatten(), w_2.flatten(), b_2.flatten()))
    # print ("Check grad value is ", check_grad(checkgrad_cost, checkgrad_grad, checkgrad_w,
    #                                           trainingImages[1:100], trainingLabels[1:100],ridge_term))

    dt = int(time.time() - start)
    print("Execution time %d sec" % dt)

    reportCosts(w_1, w_2, b_1, b_2, trainingImages, trainingLabels, validationImages, validationLabels, testingImages, testingLabels)
    print "Accuracy on Validation set: ", report_accuracy(w_1, w_2, b_1, b_2, validationImages, validationLabels), "%"
    print "Accuracy on Testing set: ", report_accuracy(w_1, w_2, b_1, b_2, testingImages, testingLabels), "%"


Best parameters - Hidden Nodes: 80 Learning Rate: 0.150000 Batch Size: 50 Alpha: 0.020 Epochs: 30
Epochs: 0 Cost: 0.37458 Validation Acc: 94.24 ||w1||: 8.24261 ||w2||: 6.38255
Epochs: 2 Cost: 0.14060 Validation Acc: 96.58 ||w1||: 10.07440 ||w2||: 8.23535
Epochs: 4 Cost: 0.10092 Validation Acc: 97.24 ||w1||: 10.99758 ||w2||: 9.16372
Epochs: 6 Cost: 0.08188 Validation Acc: 97.24 ||w1||: 11.56093 ||w2||: 9.75170
Epochs: 8 Cost: 0.07057 Validation Acc: 97.42 ||w1||: 11.90471 ||w2||: 10.12030
Epochs: 10 Cost: 0.06322 Validation Acc: 97.52 ||w1||: 12.16546 ||w2||: 10.41706
Epochs: 12 Cost: 0.05848 Validation Acc: 97.68 ||w1||: 12.31993 ||w2||: 10.61274
Epochs: 14 Cost: 0.05476 Validation Acc: 97.74 ||w1||: 12.43382 ||w2||: 10.76787
Epochs: 16 Cost: 0.05203 Validation Acc: 97.80 ||w1||: 12.51797 ||w2||: 10.88632
Epochs: 18 Cost: 0.05049 Validation Acc: 97.62 ||w1||: 12.59070 ||w2||: 10.98034
Epochs: 20 Cost: 0.04852 Validation Acc: 97.88 ||w1||: 12.64291 ||w2||: 11.05597
Epochs: 22 Cost: 0.04727 Validation Acc: 97.90 ||w1||: 12.68203 ||w2||: 11.11782
Epochs: 24 Cost: 0.04627 Validation Acc: 97.40 ||w1||: 12.71705 ||w2||: 11.17078
Epochs: 26 Cost: 0.04552 Validation Acc: 97.92 ||w1||: 12.74085 ||w2||: 11.20406
Epochs: 28 Cost: 0.04512 Validation Acc: 97.78 ||w1||: 12.76470 ||w2||: 11.23364
Execution time 59 sec
Training cost: 0.0351485060428
Validation cost:  0.0693459503114
Testing cost:  0.0711645923269
Accuracy on Validation set:  97.8 %
Accuracy on Testing set:  97.86 %

In [174]:
w_1.shape


Out[174]:
(80, 784)

In [175]:
w_2.shape


Out[175]:
(10, 80)

In [176]:
plt.show(plt.imshow(w_2, cmap='gray'))



In [177]:
image = np.zeros((28,28))
for i in xrange(0,10):
    temp = w_1[i].reshape(28,28)
    image = np.concatenate((image,temp), axis=1)
img_plt = plt.imshow(image, cmap='gray')
plt.show(img_plt)



In [178]:
image = np.zeros((28,28))
for i in xrange(10,20):
    temp = w_1[i].reshape(28,28)
    image = np.concatenate((image,temp), axis=1)
img_plt = plt.imshow(image, cmap='gray')
plt.show(img_plt)



In [167]:
image = np.zeros((28,28))
for i in xrange(20,30):
    temp = w_1[i].reshape(28,28)
    image = np.concatenate((image,temp), axis=1)
img_plt = plt.imshow(image, cmap='gray')
plt.show(img_plt)



In [168]:
image = w_1[9].reshape(28,28)
img_plt = plt.imshow(image, cmap='gray')
plt.show(img_plt)



In [169]:
def draw_image(x, y, title_str, drawTest = False):
    for c in range(1, 10):
        plt.subplot(3, 3,c)
        i = np.random.randint(x.shape[0]) 
        predicted, real = predict(x[i], y[i], w1, w2, b1, b2)
        im = testingImages[i].reshape((28,28)) 
        plt.axis("off")
        label = predicted
        plt.title("{} = {}".format(title_str, label))
        plt.imshow(im, cmap='gray')

In [170]:
draw_image(testingImages, testingLabels, "Label")