In [4]:
# Load the pre-split MNIST arrays from disk.
import numpy as np
import matplotlib
import matplotlib.pyplot as plt

mnist = np.load('mnist_data.npz')
X_train, X_test = mnist['X_train'], mnist['X_test']
y_train, y_test = mnist['y_train'], mnist['y_test']
We have 60000 28x28 images in our training set and 10000 in the test set.
In [5]:
# Per the text above: 60000 training and 10000 test images, each 28x28.
print(X_train.shape)
print(X_test.shape)
The pixel values range from 0 to 255:
In [6]:
# Raw pixel values of the first training digit (0-255 per the text above).
print(X_train[0])
In [7]:
# Render the first training digit as a grayscale image, axes hidden.
fig, ax = plt.subplots()
ax.imshow(X_train[0], cmap='gray')
ax.axis('off')
plt.show()
Let's see what they look like:
In [8]:
# Display the first sz*sz training digits in an sz-by-sz grid.
# (sz is reused below to print the matching labels.)
sz = 5
for idx in range(sz * sz):
    ax = plt.subplot(sz, sz, idx + 1)
    ax.imshow(X_train[idx], cmap='gray')
    ax.axis('off')
plt.show()
The dataset also includes a label for each digit:
In [9]:
# Labels for the 25 digits shown above, arranged in the same 5x5 grid.
print(y_train[:sz*sz].reshape([sz,sz]))
In [10]:
# Flatten each 28x28 image into a 784-dimensional row vector.
train_data = X_train.reshape(-1, 28 * 28)
test_data = X_test.reshape(-1, 28 * 28)
print(train_data.shape)
print(test_data.shape)
We will normalize the data to the [0,1] range:
In [11]:
def normalize_data(x):
    """Linearly rescale the values of `x` into the [0, 1] range.

    The minimum of `x` maps to 0.0 and the maximum to 1.0; the result is
    float32. If all values are equal, returns zeros (the original divided
    by zero in that case). The original also computed an unused flattened
    copy (`flat_x`), removed here.
    """
    lo = np.min(x)
    hi = np.max(x)
    if hi == lo:
        # Constant input: no dynamic range to rescale.
        return np.zeros_like(x, dtype=np.float32)
    return (x.astype(np.float32) - lo) / (hi - lo)
# Normalize both splits; each spans the full 0-255 pixel range, so the
# two calls apply the same scaling.
train_data = normalize_data(train_data)
test_data = normalize_data(test_data)
print(train_data[0])
We also need to convert the labels into a 1-hot representation:
In [12]:
def to_one_hot(labels, num):
    """Convert a 1-D integer label array into a (len(labels), num) 1-hot matrix.

    Row i is all zeros except for a 1.0 in column labels[i]; dtype float64.
    """
    return np.eye(num)[labels]
# Encode the digit labels as 1-hot vectors over the 10 classes.
train_labels = to_one_hot(y_train,10)
test_labels = to_one_hot(y_test,10)
print('Initial labels:')
print(y_train[:5])
print('1-hot representation:')
print(train_labels[:5])
First let's import tensorflow:
In [13]:
import tensorflow as tf
The model will take the flattened digit as input. An input is declared as a "placeholder" variable meaning that the value of this tensor will be provided at run-time. For the computation of the loss function the class labels are also considered inputs to the model:
In [14]:
# TF1.x graph inputs, supplied at run time through feed_dict:
# x: flattened images, shape (batch, 784); y: 1-hot labels, shape (batch, 10).
x = tf.placeholder(tf.float32, [None, 784])
y = tf.placeholder(tf.float32, [None, 10])
We will feed the input through two dense layers with a tanh() non-linearity. Each layer consists of a weight matrix W and a bias vector b. These have to be declared as variables:
In [15]:
# Two fully connected layers: 784 -> h1_sz with a tanh non-linearity,
# then h1_sz -> 10 producing the class logits h2.
h1_sz = 64;
# Weights via tf.get_variable with no explicit initializer (TF1 default
# applies); biases start at zero.
# NOTE(review): re-running this cell in the same graph will hit a
# variable-reuse error for names "W1"/"W2" — assumes single execution.
W1 = tf.get_variable("W1", [784,h1_sz])
b1 = tf.Variable(tf.zeros([h1_sz]))
h1 = tf.matmul(x,W1) + b1
h1 = tf.tanh(h1)
W2 = tf.get_variable("W2", [h1_sz,10])
b2 = tf.Variable(tf.zeros([10]))
h2 = tf.matmul(h1,W2) + b2
The activations of the layer are fed into a soft-max layer that outputs class probabilities:
In [16]:
# Softmax over the logits gives per-class probabilities.
class_probs = tf.nn.softmax(h2)
We will use the cross-entropy between the predicted and the actual labels as our loss function:
In [17]:
# Mean cross-entropy between the 1-hot labels and the predicted class
# probabilities. BUG FIX: the original took tf.nn.softmax(class_probs),
# but class_probs (defined above) is already a softmax output — applying
# softmax twice flattens the distribution and distorts the loss.
# NOTE(review): tf.nn.softmax_cross_entropy_with_logits on h2 would be
# the numerically stable formulation.
cross_entropy_loss = tf.reduce_mean(-tf.reduce_sum(y * tf.log(class_probs), axis=[1]))
We also need to define what a training step looks like. The command below tells tensorflow to optimize the loss function using a Stochastic Gradient Descent (SGD) step with a learning rate of 0.5:
In [18]:
# One SGD update on the cross-entropy loss with a learning rate of 0.5.
train_step = tf.train.GradientDescentOptimizer(0.5).minimize(cross_entropy_loss)
We need to create a session that will run on the computational graph:
In [19]:
# InteractiveSession installs itself as the default session, which is why
# the .run() calls below work without passing a session explicitly.
sess = tf.InteractiveSession()
We also need to initialize the variables of the model:
In [20]:
# Initialize all model variables (W1, b1, W2, b2) in the default session.
tf.global_variables_initializer().run()
We can use the session to feed the input to the model and get the value of a specific node:
In [21]:
# Forward pass on the first 5 training examples. Only x is fed:
# class_probs does not depend on the label placeholder y.
batch_x = train_data[:5]
batch_y = train_labels[:5]
vis_probs = sess.run([class_probs], feed_dict={x:batch_x})
print(vis_probs)
Notice how we did not provide the labels since these are not part of the slice of the computational graph for the class probabilities.
We can also perform an SGD step on this batch by running the session on the training step:
In [22]:
# Take 200 SGD steps on this single 5-example batch, then inspect the
# predicted probabilities (they shift toward the 1-hot labels).
for _ in range(200):
    _, vis_probs = sess.run([train_step, class_probs],
                            feed_dict={x: batch_x, y: batch_y})
print(vis_probs)
Notice how re-running the command above shifts the output of the model towards the actual labels. Of course this instance of the model will be horribly overfit to these few digits. Let's re-initialize the model:
In [23]:
# Reset the variables, discarding the overfit weights from the cell above.
tf.global_variables_initializer().run()
Instead we will cycle over the whole training dataset a few times. We will process the dataset in mini-batches and take an SGD step for each such mini-batch.
In [24]:
# Mini-batch SGD over the full training set for a few epochs, recording
# the per-batch loss. Fixes vs. the original: integer floor division for
# the batch count (instead of int(N/batch_size)), list.append (instead of
# `hist_loss += [..]`), `_` for the unused train_step result, and axis
# labels on the loss plot.
epochs = 5
batch_size = 32
N = train_data.shape[0]
hist_loss = []
for epoch in range(epochs):
    print("Epoch:", epoch)
    # N // batch_size full batches; any trailing partial batch
    # (N % batch_size examples) is dropped, as in the original.
    for index in range(N // batch_size):
        batch_x = train_data[index * batch_size:(index + 1) * batch_size]
        batch_y = train_labels[index * batch_size:(index + 1) * batch_size]
        vis_loss, _ = sess.run([cross_entropy_loss, train_step],
                               feed_dict={x: batch_x, y: batch_y})
        hist_loss.append(vis_loss)
plt.plot(hist_loss)
plt.xlabel('SGD step')
plt.ylabel('cross-entropy loss')
plt.show()
Let's see what the model learns:
In [25]:
# Show the first 5 test digits with their 1-hot label and the model's
# predicted class probabilities. FIX: the original ended each iteration
# with a blocking input() call, which stalls "Restart & Run All" in a
# notebook; removed here.
for j in range(5):
    digit = test_data[j].reshape([1, 784])
    actual_label = test_labels[j]
    plt.imshow(digit.reshape([28, 28]), cmap='gray')
    plt.show()
    print(actual_label)
    [vis_probs] = sess.run([class_probs], feed_dict={x: digit})
    print(vis_probs)
To test the model more 'formally' we can compute the accuracy on the test dataset:
In [26]:
# Accuracy = fraction of examples whose argmax predicted class matches
# the argmax of the true 1-hot label.
is_correct = tf.equal(tf.argmax(y, 1), tf.argmax(class_probs, 1))
accuracy = tf.reduce_mean(tf.cast(is_correct, tf.float32))

# Evaluate on both splits to compare train vs. test performance.
train_accuracy = sess.run(accuracy, feed_dict={x: train_data, y: train_labels})
test_accuracy = sess.run(accuracy, feed_dict={x: test_data, y: test_labels})
print("Accuracy on training data:", train_accuracy)
print("Accuracy on test data:", test_accuracy)
In [ ]: