``````

In [179]:

import matplotlib.pyplot as plt
import numpy as np
import matplotlib.pyplot as plt
import math

def softmax(z):
z -= np.max(z, axis=1, keepdims=True)
ans = np.exp(z) / np.sum(np.exp(z), axis=1, keepdims=True)
return ans

def relu(z):
z[z < 0] = 0
return z

def relu_prime(z):
z[z <= 0] = 0
z[z > 0] = 1
return z

# Just for the sake of saving  computing time to recalculate y_hat in each mini_batch
def stochastic_J(w1, w2, y_hat, images, labels, alpha=0.):
x = images
y = labels
m = x.shape[0]
cost_mat = np.multiply(y, np.log(y_hat))
cost = (-1. / m) * np.sum(np.sum(cost_mat, axis=1))
# Regularize
cost += (alpha/(2 * m)) * (np.linalg.norm(w1) + np.linalg.norm(w2))
return cost

def J(w1, w2, b1, b2, images, labels, alpha=0.):
x = images
y = labels
m = x.shape[0]
h1, y_hat = feedforward(w1, w2, b1, b2, x, y)
cost_mat = np.multiply(y, np.log(y_hat))
cost = (-1. / m) * np.sum(np.sum(cost_mat, axis=1))
# Regularize
cost += (alpha/(2 * m)) * (np.linalg.norm(w1) + np.linalg.norm(w2))
return cost

def feedforward(w1, w2, b1, b2, images, labels):
x = images
h1 = relu(x.dot(w1.T) + b1)
y_hat = softmax(h1.dot(w2.T) + b2)
return h1, y_hat

def grad_layer2(h1, y_hat, w1, w2, images, labels, alpha=0.):
y = labels
m = y.shape[0]
dJ_dz2 = (y_hat - y)/m
dJ_dw2 = dJ_dz2.T.dot(h1)
#Regularize
dJ_dw2 += (alpha/m) * w2
dJ_b2 = np.sum(dJ_dz2, axis=0, keepdims=True)
return dJ_dw2, dJ_b2

def grad_layer1(h1, y_hat, w1, w2, images, labels, alpha=0.):
x = images
y = labels
m = y.shape[0]
dJ_dz2 = (y_hat - y)/m
dJ_dh1 = dJ_dz2.dot(w2)
g = dJ_dh1 * relu_prime(x.dot(w1.T))  # dJ/dz1
dJ_dw1 = g.T.dot(x)
# Regularize
dJ_dw1 += (alpha/m) * w1
dJ_db1 = np.sum(g, axis=0, keepdims=True)
return dJ_dw1, dJ_db1

def sgd(train_images, train_labels, val_images, val_labels, h_nodes, epsilon, batch_size, epochs, alpha=0., searching=False):
x = train_images
y = train_labels
sample_size, dimensions = x.shape
classes = y.shape[1]
cost_history = np.array([])

w1_range = math.sqrt(1./dimensions)
w1 = np.random.uniform(-w1_range, w1_range, (h_nodes, dimensions))
b1 = np.ones((1, h_nodes)) * 0.1
w2_range = math.sqrt(1./h_nodes)
w2 = np.random.uniform(-w2_range, w2_range, (classes, h_nodes))
b2 = np.ones((1, classes)) * 0.1

num_batches = sample_size / batch_size
for e in xrange(epochs):
batch_history = np.array([])

x_y = np.append(x, y, axis=1)
np.random.shuffle(x_y)
x_s = x_y[:, :dimensions]
y_s = x_y[:, dimensions:]
for i in xrange(num_batches):
start = i * batch_size
end = start + batch_size
x_batch = x_s[start:end]
y_batch = y_s[start:end]
# Do feedforward pass
h1, y_hat = feedforward(w1, w2, b1, b2, x_batch, y_batch)
# Find dJ/dw1 and dJ/d1
gradw1, gradb1 = grad_layer1(h1, y_hat, w1, w2, x_batch, y_batch, alpha)
# Find dJ/dw2 and dJ/dwb2
gradw2, gradb2 = grad_layer2(h1, y_hat, w1, w2, x_batch, y_batch, alpha)
w1 -= (epsilon * gradw1)
b1 -= (epsilon * gradb1)
w2 -= (epsilon * gradw2)
b2 -= (epsilon * gradb2)
# Cost of current mini-batch
cost = stochastic_J(w1, w2, y_hat, x_batch, y_batch, alpha)
# List of costs for all the mini-batches in current epoch
batch_history = np.append(batch_history, cost)

epoch_cost = np.mean(batch_history)
# List of costs for epochs
cost_history = np.append(cost_history, epoch_cost)
# Accuracy of on validation data set after each epoch
validation_acc = report_accuracy(w1, w2, b1, b2, val_images, val_labels)
if e % 2 == 0:
print("Epochs: %d Cost: %.5f Validation Acc: %.2f ||w1||: %.5f ||w2||: %.5f" % (
e, epoch_cost, validation_acc, np.linalg.norm(w1), np.linalg.norm(w2)))

if searching:
# When searching best hyperparameters, return final cost (last element in cost_history) and
# final validation accuracy. These will be heuristics when choosing the best hyperparameters
return cost_history[-1], validation_acc

plt.plot(np.linspace(epochs-20, epochs, 20), cost_history[-20:], label="Training Cost")
plt.legend('Cost')
plt.ylabel('Training Cost')
plt.xlabel('Epochs')
plt.title("Cross-entropy cost values")
plt.show()
return w1, w2, b1, b2

def reportCosts(w1, w2, b1, b2, trainImg , trainLbl, valiImg, valiLbl, testImg, testLbl, alpha=0.):
print "Training cost: {}".format(J(w1, w2, b1, b2, trainImg, trainLbl, alpha))
print "Validation cost:  {}".format(J(w1, w2, b1, b2, valiImg, valiLbl, alpha))
print "Testing cost:  {}".format(J(w1, w2, b1, b2, testImg, testLbl, alpha))

def report_accuracy(w1, w2, b1, b2, images, labels):
h1, y_hat = feedforward(w1, w2, b1, b2, images, labels)
acc = np.mean(np.argmax(y_hat, axis=1) == np.argmax(labels, axis=1))
return acc * 100

def predict(images, labels, w1, w2, b1, b2):
h1, y_hat = feedforward(w1, w2, b1, b2, images, labels)
predicted = np.argmax(y_hat)
real = np.argmax(labels)
return predicted, real

def findBestHyperparameters(train_images, train_labels, val_images, val_labels):
h_nodes = [20, 20, 60, 50, 80, 30, 60, 40, 60, 20, 80, 80]
l_rate = [0.33, 1e-2, 0.4, 0.15, 0.001, 0.01, 0.006, 0.06, 0.2, 0.007, 0.15, 0.1]
b_size = [100, 250, 50, 125, 500, 275, 88, 40, 100, 625, 50, 25]
alpha = [0, 0.8, 0.02, 1e2, 5.0, 0.2, 0.3, 0.7, 0.05, 0.1, 0.02, 0.9]
epochs = [10, 20, 50, 10, 30, 40, 30, 20, 50, 10, 30, 40]
min_cost = 100
max_acc = 0
best = 0
for i in range(12):
print ("[%d] Trying parameters - Hidden Nodes: %d Learning Rate: %.6f Batch Size: %d Alpha: %.3f Epochs: %d" % (
i, h_nodes[i], l_rate[i], b_size[i], alpha[i], epochs[i]))
cost, acc = sgd(train_images, train_labels, val_images, val_labels,
h_nodes[i], l_rate[i], b_size[i], epochs[i], alpha[i], searching=True)
if cost < min_cost:
min_cost = cost
best = i
if acc > max_acc:
max_acc = acc
best = i
print ("Current best parameters - Hidden Nodes: %d Learning Rate: %.6f Batch Size: %d Alpha: %.3f Epochs: %d" %
(h_nodes[best], l_rate[best], b_size[best], alpha[best], epochs[best]))

return h_nodes[best], l_rate[best], b_size[best], alpha[best], epochs[best]

#####CHECK GRAD######
def checkgrad_unpack(w, images, labels):
w1_idx = images.shape[1] * hidden_nodes
b1_idx = hidden_nodes
w2_idx = labels.shape[1] * hidden_nodes
b2_idx = 10
w1 = np.reshape(w[:w1_idx], (hidden_nodes, images.shape[1]))
b1 = np.reshape(w[w1_idx:w1_idx + b1_idx], (1, hidden_nodes))
w2 = np.reshape(w[w1_idx + b1_idx:w1_idx + b1_idx + w2_idx], (labels.shape[1], hidden_nodes))
b2 = np.reshape(w[w1_idx + b1_idx + w2_idx:], (1, 10))
return w1, b1, w2, b2

def checkgrad_cost(w, images, labels, alpha=.0):
w1, b1, w2, b2 = checkgrad_unpack(w, images, labels)
cost = J(w1, w2, b1, b2, images, labels, alpha)
return cost

def checkgrad_grad(w, images, labels, alpha=.0):
w1, b1, w2, b2 = checkgrad_unpack(w, images, labels)
h1, y_hat = feedforward(w1, w2, b1, b2, images, labels)
gw2, gb2 = grad_layer2(h1, y_hat, w1, w2, images, labels, alpha)
gw1, gb1 = grad_layer1(h1, y_hat, w1, w2, images, labels, alpha)
final_w = np.concatenate((gw1.flatten(), gb1.flatten(), gw2.flatten(), gb2.flatten()))
return final_w
#####END OF CHECK GRAD######

if __name__ == "__main__":
# Load data
if 'trainingImages' not in globals():
trainingImages = np.load("datasets/mnist_train_images.npy")
trainingLabels = np.load("datasets/mnist_train_labels.npy")
validationImages = np.load("datasets/mnist_validation_images.npy")
validationLabels = np.load("datasets/mnist_validation_labels.npy")
testingImages = np.load("datasets/mnist_test_images.npy")
testingLabels = np.load("datasets/mnist_test_labels.npy")

import time

start = time.time()
#     hidden_nodes, learning_rate, batch_size, ridge_term, epochs = findBestHyperparameters(trainingImages, trainingLabels,
#                                                                                           validationImages, validationLabels)
hidden_nodes, learning_rate, batch_size, ridge_term, epochs = 80, 0.15, 50, 0.020, 30
print ("Best parameters - Hidden Nodes: %d Learning Rate: %.6f Batch Size: %d Alpha: %.3f Epochs: %d" %
(hidden_nodes, learning_rate, batch_size, ridge_term, epochs))
w_1, w_2, b_1, b_2 = sgd(trainingImages, trainingLabels, validationImages, validationLabels,
hidden_nodes, learning_rate, batch_size, epochs, ridge_term)

# from scipy.optimize import check_grad
# checkgrad_w = np.concatenate((w_1.flatten(), b_1.flatten(), w_2.flatten(), b_2.flatten()))
# print ("Check grad value is ", check_grad(checkgrad_cost, checkgrad_grad, checkgrad_w,
#                                           trainingImages[1:100], trainingLabels[1:100],ridge_term))

dt = int(time.time() - start)
print("Execution time %d sec" % dt)

reportCosts(w_1, w_2, b_1, b_2, trainingImages, trainingLabels, validationImages, validationLabels, testingImages, testingLabels)
print "Accuracy on Validation set: ", report_accuracy(w_1, w_2, b_1, b_2, validationImages, validationLabels), "%"
print "Accuracy on Testing set: ", report_accuracy(w_1, w_2, b_1, b_2, testingImages, testingLabels), "%"

``````
``````

Best parameters - Hidden Nodes: 80 Learning Rate: 0.150000 Batch Size: 50 Alpha: 0.020 Epochs: 30
Epochs: 0 Cost: 0.37458 Validation Acc: 94.24 ||w1||: 8.24261 ||w2||: 6.38255
Epochs: 2 Cost: 0.14060 Validation Acc: 96.58 ||w1||: 10.07440 ||w2||: 8.23535
Epochs: 4 Cost: 0.10092 Validation Acc: 97.24 ||w1||: 10.99758 ||w2||: 9.16372
Epochs: 6 Cost: 0.08188 Validation Acc: 97.24 ||w1||: 11.56093 ||w2||: 9.75170
Epochs: 8 Cost: 0.07057 Validation Acc: 97.42 ||w1||: 11.90471 ||w2||: 10.12030
Epochs: 10 Cost: 0.06322 Validation Acc: 97.52 ||w1||: 12.16546 ||w2||: 10.41706
Epochs: 12 Cost: 0.05848 Validation Acc: 97.68 ||w1||: 12.31993 ||w2||: 10.61274
Epochs: 14 Cost: 0.05476 Validation Acc: 97.74 ||w1||: 12.43382 ||w2||: 10.76787
Epochs: 16 Cost: 0.05203 Validation Acc: 97.80 ||w1||: 12.51797 ||w2||: 10.88632
Epochs: 18 Cost: 0.05049 Validation Acc: 97.62 ||w1||: 12.59070 ||w2||: 10.98034
Epochs: 20 Cost: 0.04852 Validation Acc: 97.88 ||w1||: 12.64291 ||w2||: 11.05597
Epochs: 22 Cost: 0.04727 Validation Acc: 97.90 ||w1||: 12.68203 ||w2||: 11.11782
Epochs: 24 Cost: 0.04627 Validation Acc: 97.40 ||w1||: 12.71705 ||w2||: 11.17078
Epochs: 26 Cost: 0.04552 Validation Acc: 97.92 ||w1||: 12.74085 ||w2||: 11.20406
Epochs: 28 Cost: 0.04512 Validation Acc: 97.78 ||w1||: 12.76470 ||w2||: 11.23364

Execution time 59 sec
Training cost: 0.0351485060428
Validation cost:  0.0693459503114
Testing cost:  0.0711645923269
Accuracy on Validation set:  97.8 %
Accuracy on Testing set:  97.86 %

``````
``````

In [174]:

w_1.shape

``````
``````

Out[174]:

(80, 784)

``````
``````

In [175]:

w_2.shape

``````
``````

Out[175]:

(10, 80)

``````
``````

In [176]:

plt.show(plt.imshow(w_2, cmap='gray'))

``````
``````

``````
``````

In [177]:

image = np.zeros((28,28))
for i in xrange(0,10):
temp = w_1[i].reshape(28,28)
image = np.concatenate((image,temp), axis=1)
img_plt = plt.imshow(image, cmap='gray')
plt.show(img_plt)

``````
``````

``````
``````

In [178]:

image = np.zeros((28,28))
for i in xrange(10,20):
temp = w_1[i].reshape(28,28)
image = np.concatenate((image,temp), axis=1)
img_plt = plt.imshow(image, cmap='gray')
plt.show(img_plt)

``````
``````

``````
``````

In [167]:

image = np.zeros((28,28))
for i in xrange(20,30):
temp = w_1[i].reshape(28,28)
image = np.concatenate((image,temp), axis=1)
img_plt = plt.imshow(image, cmap='gray')
plt.show(img_plt)

``````
``````

``````
``````

In [168]:

image = w_1[9].reshape(28,28)
img_plt = plt.imshow(image, cmap='gray')
plt.show(img_plt)

``````
``````

``````
``````

In [169]:

def draw_image(x, y, title_str, drawTest = False):
for c in range(1, 10):
plt.subplot(3, 3,c)
i = np.random.randint(x.shape[0])
predicted, real = predict(x[i], y[i], w1, w2, b1, b2)
im = testingImages[i].reshape((28,28))
plt.axis("off")
label = predicted
plt.title("{} = {}".format(title_str, label))
plt.imshow(im, cmap='gray')

``````
``````

In [170]:

draw_image(testingImages, testingLabels, "Label")

``````
``````

``````