In [ ]:
import tensorflow as tf
# Fix TensorFlow's graph-level random seed; presumably meant to make the
# random weight initialisation used later repeatable across runs.
tf.set_random_seed(1337)
In [ ]:
from tensorflow.examples.tutorials.mnist import input_data
# Read the MNIST data set from the MNIST_data/ directory, with the labels
# requested as one-hot vectors (one_hot=True).
mnist = input_data.read_data_sets("MNIST_data/", one_hot=True)
Every MNIST sample has two parts: an image (vectorized, raster-scanned) of a handwritten digit and a corresponding label.
In [ ]:
import matplotlib.pyplot as plt
def show_sample(index):
    """Plot one MNIST training digit and print its label.

    `index` selects the sample from `mnist.train`. The flat 784-element
    image vector is viewed as a 28x28 picture before it is drawn, and the
    label row is printed after the figure has been shown and closed.
    """
    pixels = mnist.train.images[index].reshape(28, 28)  # 784 -> 28x28
    target = mnist.train.labels[index]

    plt.imshow(pixels, cmap='Greys')
    plt.show()

    # tear down the current figure and axes once the image has been shown
    for teardown in (plt.clf, plt.cla, plt.close):
        teardown()

    print('label[%d]: %s' % (index, str(target)))
# Display a few training samples, printing a separator line after each one.
for sample_index in (10, 24, 12, 11, 18):
    show_sample(sample_index)
    print('------------------------------------------------------------')
In [ ]:
def build_inputs():
    """Create the two feed placeholders of the model.

    Returns a pair `(x, y)`:
      * `x` takes a batch of input images, each flattened into a vector
        (28x28 -> 784 floats per row);
      * `y` takes the matching batch of 10-element label rows.
    The batch dimension of both placeholders is left unspecified (`None`).
    """
    images = tf.placeholder(tf.float32, shape=[None, 784])
    labels = tf.placeholder(tf.float32, shape=[None, 10])
    return images, labels
We're going to train a model to look at images and predict what digits they are.
A function $M: \mathbb{R}^{28\times 28}\rightarrow \mathbb{R}^{10}$ outputs a vector of classification scores for each input image. In other words, $M(\text{image})=\text{a vector of per-class scores}$. We want a higher score for class $c$ to translate to higher confidence that $c$ is the correct class.
For example, if $M$ outputs $$ (0.05, 0.03, 0.82, 0.02, 0.01, 0.02, 0.01, 0.02, 0.01, 0.1) $$ for an input image, it classifies that image as a $2$.
Let us choose a very simple classification model first: $$ M(\mathbf{x})= \mathbf{x}\cdot\mathbf{W} + \mathbf{b} , $$ where $\mathbf{x}\in\mathbb{R}^{784}$ is a vectorized input image, and $\mathbf{W}\in\mathbb{R}^{784\times 10}$ and $\mathbf{b}\in\mathbb{R}^{10}$ are the model parameters. The elements of $M(\mathbf{x})$ are sometimes called logits.
In [ ]:
def build_affine(x):
    """Build the affine classifier `x . W + b` and return its logits.

    `x` is a batch of flattened images with 784 values per row. The weight
    matrix (784x10) starts from small normally distributed values
    (mean 0, stddev 0.01) and the bias vector (10 entries) starts at zero.
    """
    weights = tf.Variable(tf.random_normal([784, 10], mean=0.0, stddev=0.01))
    bias = tf.Variable(tf.zeros([10]))
    scores = tf.matmul(x, weights)
    return scores + bias
Initially, $\mathbf{W}$ and $\mathbf{b}$ contain random values that will not produce correct classification results.
We have to tune these tensors by minimizing an appropriate loss function that will "measure" the quality of classification.
We will use the cross entropy criterion: $$ L(\mathbf{x}, c)= -\log p_c(\mathbf{x}) , $$ where $p_c(\mathbf{x})$ is the probability assigned by the model that $\mathbf{x}$ belongs to class $c$, $$ p_c= \frac{e^{l_c}}{\sum_{j=0}^{9} e^{l_j}} , $$ and $(l_0, l_1, \ldots, l_9)=M(\mathbf{x})$ are the logits output by the model.
The derivatives can now be computed by TensorFlow and the model can be tuned with stochastic gradient descent ($k=0, 1, 2, \ldots$): $$ \mathbf{W}_{k+1}= \mathbf{W}_k - \eta\frac{\partial L}{\partial\mathbf{W}_k} $$ $$ \mathbf{b}_{k+1}= \mathbf{b}_k - \eta\frac{\partial L}{\partial\mathbf{b}_k} $$
In [ ]:
def build_loss(logits, y):
    """Softmax cross-entropy between the model `logits` and the labels `y`.

    `y` holds one-hot encoded class indicators (`c` in the text above).
    The result of `tf.nn.softmax_cross_entropy_with_logits` is returned
    as is; no further reduction over the batch is applied here.
    """
    cross_entropy = tf.nn.softmax_cross_entropy_with_logits(
        logits=logits,
        labels=y,
    )
    return cross_entropy
The loss $L$ is usually approximated on a batch of images. The code above can handle this case as well. We set the batch size to $100$ in our experiment.
In [ ]:
def build_ncorrect(logits, y):
    """Build a tensor counting the correctly classified samples of a batch.

    A sample counts as correct when the position of the largest logit
    equals the position of the largest entry in its label row. The count
    is returned as a float32 scalar tensor.
    """
    predicted_class = tf.argmax(logits, 1)
    true_class = tf.argmax(y, 1)
    # booleans become 0.0 / 1.0 so that they can be summed
    hits = tf.cast(tf.equal(predicted_class, true_class), tf.float32)
    return tf.reduce_sum(hits)
def get_accuracy(ncorrect, batchsize=100):
    """Return the classification accuracy on the test set, in percent.

    `ncorrect` is evaluated on consecutive slices of `mnist.test` through
    the module-level session `sess`, feeding the module-level placeholders
    `x` and `y`. The per-slice counts are summed and divided by the number
    of test samples.

    The number of samples is taken from the data itself rather than being
    hard-coded, so the result stays correct for test sets whose size is
    not exactly 100 * 100; a trailing partial slice is evaluated as well.

    Args:
        ncorrect: tensor giving the number of correct predictions for the
            batch that is fed in.
        batchsize: number of test samples evaluated per `sess.run` call.

    Returns:
        The accuracy as a percentage, or 0.0 when the test set is empty.
    """
    images = mnist.test.images
    labels = mnist.test.labels
    total = len(images)
    if total == 0:
        # nothing to evaluate; avoid dividing by zero
        return 0.0

    hits = 0.0
    for start in range(0, total, batchsize):
        stop = start + batchsize
        hits = hits + sess.run(
            ncorrect,
            feed_dict={x: images[start:stop, :], y: labels[start:stop, :]},
        )
    return 100 * hits / total
In [ ]:
def run_training_loop(step, ncorrect, batchsize, niters):
    """Run `niters` optimisation steps and report test accuracy periodically.

    Each iteration draws `batchsize` training samples from `mnist.train`
    and runs the training op `step` on them through the module-level
    session `sess` and placeholders `x`, `y`. After every 500th iteration
    (starting with iteration 0) the test-set accuracy computed from
    `ncorrect` is printed.
    """
    report_every = 500
    for iteration in range(niters):
        batch_images, batch_labels = mnist.train.next_batch(batchsize)
        sess.run(step, feed_dict={x: batch_images, y: batch_labels})
        if iteration % report_every != 0:
            continue
        accuracy = get_accuracy(ncorrect)
        print('* iter %d: test set accuracy=%.2f %%' % (iteration, accuracy))
In [ ]:
# inputs and model outputs
x, y = build_inputs()
logits = build_affine(x)
# loss-computation graph
loss = build_loss(logits, y)
# graph for calculating the number of correct classifications
ncorrect = build_ncorrect(logits, y)
# we use SGD to gradually tune the model parameters
step = tf.train.GradientDescentOptimizer(1e-4).minimize(loss)
# final preparations for learning
sess = tf.Session()
try:
    tf.global_variables_initializer().run(session=sess)
    # start the learning process: batch size=100, 30001 iterations
    # (iterations 0..30000, so the last accuracy report is at 30000)
    run_training_loop(step, ncorrect, 100, 30001)
finally:
    # clear the current session (so we can start another one later);
    # done in `finally` so that it also happens when training fails or is
    # interrupted part-way through
    sess.close()
    tf.reset_default_graph()
The obtained classification accuracy on the test set should be between 91 and 93 percent. This is a pretty bad result. Can we do better than that?
In [ ]:
def build_convnet(x):
    """Build a small convolutional classifier and return its logits.

    The flat input rows are reshaped to 28x28 single-channel images and
    passed through two blocks of (5x5 'SAME' convolution -> ReLU -> 2x2
    max-pooling with stride 2) using 32 and 64 filters, then through a
    128-unit dense layer with ReLU and a final 10-unit dense layer.
    """
    images = tf.reshape(x, [-1, 28, 28, 1])

    # first conv block
    conv1 = tf.layers.conv2d(images, filters=32, kernel_size=5, padding='SAME')
    act1 = tf.nn.relu(conv1)
    pool1 = tf.layers.max_pooling2d(act1, pool_size=2, strides=2)

    # second conv block
    conv2 = tf.layers.conv2d(pool1, filters=64, kernel_size=5, padding='SAME')
    act2 = tf.nn.relu(conv2)
    pool2 = tf.layers.max_pooling2d(act2, pool_size=2, strides=2)

    # flatten the feature maps into one vector per sample (7*7*64 values)
    flat = tf.reshape(pool2, [-1, 7 * 7 * 64])
    hidden = tf.nn.relu(tf.layers.dense(flat, 128))

    # final projection down to the 10 class scores
    return tf.layers.dense(hidden, 10)
In [ ]:
# inputs and model outputs
x, y = build_inputs()
logits = build_convnet(x)
# loss-computation graph
loss = build_loss(logits, y)
# testing-accuracy graph
ncorrect = build_ncorrect(logits, y)
# we use RMSProp to gradually tune the model parameters (similar to SGD, but better in most cases)
step = tf.train.RMSPropOptimizer(1e-3).minimize(loss)
# final preparations for learning
sess = tf.Session()
try:
    tf.global_variables_initializer().run(session=sess)
    # start the learning process: batch size=100, 5001 iterations
    # (iterations 0..5000, so the last accuracy report is at 5000)
    run_training_loop(step, ncorrect, 100, 5001)
finally:
    # clear the current session (so we can start another one later);
    # done in `finally` so that it also happens when training fails or is
    # interrupted part-way through
    sess.close()
    tf.reset_default_graph()
The obtained classification accuracy should be well over 99%.