Deep Learning

Assignment 4

Previously in 2_fullyconnected.ipynb and 3_regularization.ipynb, we trained fully connected networks to classify notMNIST characters.

The goal of this assignment is to make the neural network convolutional.


In [1]:
# These are all the modules we'll be using later. Make sure you can import them
# before proceeding further.
from __future__ import print_function
import numpy as np
import tensorflow as tf
from six.moves import cPickle as pickle
from six.moves import range
import os

In [2]:
# Create data directory path
dpath = os.path.abspath(os.path.join(os.getcwd(), os.pardir))
dpath = os.path.join(dpath, 'data')
# create pickle data file path
pickle_file = os.path.join(dpath,'notMNIST.pickle')

with open(pickle_file, 'rb') as f:
    save = pickle.load(f)
    train_dataset = save['train_dataset']
    train_labels = save['train_labels']
    valid_dataset = save['valid_dataset']
    valid_labels = save['valid_labels']
    test_dataset = save['test_dataset']
    test_labels = save['test_labels']
    del save  # hint to help gc free up memory
    # Note: exiting the `with` block only closes the file; `save` would otherwise
    # stay in scope (with-blocks don't create a new scope), hence the explicit del.
    
print('Training set', train_dataset.shape, train_labels.shape)
print('Validation set', valid_dataset.shape, valid_labels.shape)
print('Test set', test_dataset.shape, test_labels.shape)


Training set (500000, 28, 28) (500000,)
Validation set (29000, 28, 28) (29000,)
Test set (18000, 28, 28) (18000,)

Reformat into a TensorFlow-friendly shape:

  • convolutions need the image data formatted as a cube (width by height by #channels)
  • labels as float 1-hot encodings.

In [3]:
image_size = 28
num_labels = 10
num_channels = 1 # grayscale

import numpy as np

def reformat(dataset, labels):
    dataset = dataset.reshape(
        (-1, image_size, image_size, num_channels)).astype(np.float32)
    labels = (np.arange(num_labels) == labels[:,None]).astype(np.float32)
    return (dataset, labels)

train_dataset, train_labels = reformat(train_dataset, train_labels)
valid_dataset, valid_labels = reformat(valid_dataset, valid_labels)
test_dataset, test_labels = reformat(test_dataset, test_labels)
print('Training set', train_dataset.shape, train_labels.shape)
print('Validation set', valid_dataset.shape, valid_labels.shape)
print('Test set', test_dataset.shape, test_labels.shape)


Training set (500000, 28, 28, 1) (500000, 10)
Validation set (29000, 28, 28, 1) (29000, 10)
Test set (18000, 28, 28, 1) (18000, 10)
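
The 1-hot encoding in reformat relies on NumPy broadcasting; a minimal sketch with made-up example labels (not part of the original notebook):

example_labels = np.array([2, 0, 1])
one_hot = (np.arange(num_labels) == example_labels[:, None]).astype(np.float32)
# one_hot[0] is all zeros except a 1.0 at index 2:
# [0., 0., 1., 0., 0., 0., 0., 0., 0., 0.]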

In [4]:
def accuracy(predictions, labels):
    return (100.0 * np.sum(np.argmax(predictions, 1) == np.argmax(labels, 1))
            / predictions.shape[0])

Let's build a small network with two convolutional layers, followed by one fully connected layer. Convolutional networks are more expensive computationally, so we'll limit the depth and the number of fully connected nodes.

Note: the initial configuration was changed slightly while experimenting.


In [5]:
batch_size = 64
# patch_size is the side length of the square 2d convolution kernel
patch_size = 5
depth = 16
num_hidden = 64

graph = tf.Graph()

with graph.as_default():

    # Input data.
    tf_train_dataset = tf.placeholder(
      tf.float32, shape=(batch_size, image_size, image_size, num_channels))
    tf_train_labels = tf.placeholder(tf.float32, shape=(batch_size, num_labels))
    tf_valid_dataset = tf.constant(valid_dataset)
    tf_test_dataset = tf.constant(test_dataset)
  
    # Variables.
    # layerX_weights are the kernels of the convolution
    layer1_weights = tf.Variable(tf.truncated_normal(
        [patch_size, patch_size, num_channels, depth], stddev=0.1))
    layer1_biases = tf.Variable(tf.zeros([depth]))
    layer2_weights = tf.Variable(tf.truncated_normal(
        [patch_size, patch_size, depth, depth], stddev=0.1))
    layer2_biases = tf.Variable(tf.constant(1.0, shape=[depth]))
    layer3_weights = tf.Variable(tf.truncated_normal(
        [image_size // 4 * image_size // 4 * depth, num_hidden], stddev=0.1))
    layer3_biases = tf.Variable(tf.constant(1.0, shape=[num_hidden]))
    layer4_weights = tf.Variable(tf.truncated_normal(
        [num_hidden, num_labels], stddev=0.1))
    layer4_biases = tf.Variable(tf.constant(1.0, shape=[num_labels]))
  
    # Model.
    def model(data):
        conv = tf.nn.conv2d(data, layer1_weights, [1, 2, 2, 1], padding='SAME')
        hidden = tf.nn.relu(conv + layer1_biases)
        conv = tf.nn.conv2d(hidden, layer2_weights, [1, 2, 2, 1], padding='SAME')
        hidden = tf.nn.relu(conv + layer2_biases)
        shape = hidden.get_shape().as_list()
        # debug:
        # print(shape)
        # exit()
        # shape[0] is batch size !
        # we reshape to feed the output of the convolution layer to the fully connected hidden layer.
        reshape = tf.reshape(hidden, [shape[0], shape[1] * shape[2] * shape[3]])
        hidden = tf.nn.relu(tf.matmul(reshape, layer3_weights) + layer3_biases)
        return(tf.matmul(hidden, layer4_weights) + layer4_biases)
  
    # Training computation.
    logits = model(tf_train_dataset)
    loss = tf.reduce_mean(
      tf.nn.softmax_cross_entropy_with_logits(labels=tf_train_labels, logits=logits))
    
    # Optimizer.
    optimizer = tf.train.GradientDescentOptimizer(0.05).minimize(loss)
  
    # Predictions for the training, validation, and test data.
    train_prediction = tf.nn.softmax(logits)
    valid_prediction = tf.nn.softmax(model(tf_valid_dataset))
    test_prediction = tf.nn.softmax(model(tf_test_dataset))

In [6]:
num_steps = 5001

with tf.Session(graph=graph) as session:
    tf.global_variables_initializer().run()
    print('Initialized')
    for step in range(num_steps):
        offset = (step * batch_size) % (train_labels.shape[0] - batch_size)
        batch_data = train_dataset[offset:(offset + batch_size), :, :, :]
        batch_labels = train_labels[offset:(offset + batch_size), :]
        feed_dict = {tf_train_dataset : batch_data, tf_train_labels : batch_labels}
        _, l, predictions = session.run(
            [optimizer, loss, train_prediction], feed_dict=feed_dict)
        if (step % 500 == 0):
            print('Minibatch loss at step %d: %f' % (step, l))
            print('Minibatch accuracy: %.1f%%' % accuracy(predictions, batch_labels))
            print('Validation accuracy: %.1f%%' % accuracy(
                valid_prediction.eval(), valid_labels))
    
    print('Test accuracy: %.1f%%' % accuracy(test_prediction.eval(), test_labels))


Initialized
Minibatch loss at step 0: 3.385154
Minibatch accuracy: 15.6%
Validation accuracy: 10.2%
Minibatch loss at step 500: 1.206689
Minibatch accuracy: 68.8%
Validation accuracy: 81.3%
Minibatch loss at step 1000: 0.387424
Minibatch accuracy: 89.1%
Validation accuracy: 84.2%
Minibatch loss at step 1500: 0.502751
Minibatch accuracy: 85.9%
Validation accuracy: 85.3%
Minibatch loss at step 2000: 0.475271
Minibatch accuracy: 87.5%
Validation accuracy: 85.9%
Minibatch loss at step 2500: 0.355538
Minibatch accuracy: 89.1%
Validation accuracy: 86.6%
Minibatch loss at step 3000: 0.447590
Minibatch accuracy: 85.9%
Validation accuracy: 86.9%
Minibatch loss at step 3500: 0.595857
Minibatch accuracy: 79.7%
Validation accuracy: 87.3%
Minibatch loss at step 4000: 0.516294
Minibatch accuracy: 85.9%
Validation accuracy: 87.4%
Minibatch loss at step 4500: 0.471127
Minibatch accuracy: 87.5%
Validation accuracy: 87.7%
Minibatch loss at step 5000: 0.328227
Minibatch accuracy: 89.1%
Validation accuracy: 87.8%
Test accuracy: 93.9%

Problem 1

The convolutional model above uses convolutions with stride 2 to reduce the dimensionality. Replace the strides by a max pooling operation (nn.max_pool()) of stride 2 and kernel size 2.
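
For reference, a minimal sketch of the pooling call used below (our reading of the TF1 API; the placeholder x is just an illustrative NHWC tensor):

x = tf.placeholder(tf.float32, shape=(64, 28, 28, 16))  # [batch, height, width, channels]
pooled = tf.nn.max_pool(x, ksize=[1, 2, 2, 1], strides=[1, 2, 2, 1], padding='SAME')
print(pooled.get_shape().as_list())  # [64, 14, 14, 16]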



In [7]:
batch_size = 64
patch_size = 5
depth = 16
num_hidden = 64

graph = tf.Graph()

with graph.as_default():

    # Input data.
    tf_train_dataset = tf.placeholder(
      tf.float32, shape=(batch_size, image_size, image_size, num_channels))
    tf_train_labels = tf.placeholder(tf.float32, shape=(batch_size, num_labels))
    tf_valid_dataset = tf.constant(valid_dataset)
    tf_test_dataset = tf.constant(test_dataset)
  
    # Variables.
    # layerX_weights are the kernels of the convolution
    layer1_weights = tf.Variable(tf.truncated_normal(
        [patch_size, patch_size, num_channels, depth], stddev=0.1))
    layer1_biases = tf.Variable(tf.zeros([depth]))
    layer2_weights = tf.Variable(tf.truncated_normal(
        [patch_size, patch_size, depth, depth], stddev=0.1))
    layer2_biases = tf.Variable(tf.constant(1.0, shape=[depth]))
    # layer 3's input size changes because of the added max_pool: after two
    # stride-2 convolutions and two stride-2 pools the 28x28 image becomes 2x2
    # ('SAME' padding gives ceil(28 / 16) = 2, whereas 28 // 16 = 1).
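    # A more general way to derive this size (a sketch; the hard-coded value is
    # kept below, as in the original): with 'SAME' padding every stride-2 stage
    # outputs ceil(size / stride) values per dimension.
    def same_padding_size(size, strides=(2, 2, 2, 2)):  # conv1, pool1, conv2, pool2
        for s in strides:
            size = -(-size // s)  # ceil division: 28 -> 14 -> 7 -> 4 -> 2
        return size
    assert same_padding_size(image_size) == image_size // 16 + 1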
    layer3_weights = tf.Variable(tf.truncated_normal(
        [(image_size // 16 +1) * (image_size // 16 +1) * depth, num_hidden], stddev=0.1))
    layer3_biases = tf.Variable(tf.constant(1.0, shape=[num_hidden]))
    layer4_weights = tf.Variable(tf.truncated_normal(
        [num_hidden, num_labels], stddev=0.1))
    layer4_biases = tf.Variable(tf.constant(1.0, shape=[num_labels]))
  
    # Model.
    def model(data):
        # print(data.get_shape().as_list())
        # [64, 28, 28, 1]
        conv = tf.nn.conv2d(data, layer1_weights, [1, 2, 2, 1], padding='SAME')
        hidden = tf.nn.relu(conv + layer1_biases)
        # print(hidden.get_shape().as_list())
        # [64, 14, 14, 16]
        mpool = tf.nn.max_pool(hidden, [1, 2, 2, 1], [1, 2, 2, 1], padding='SAME')
        # print(mpool.get_shape().as_list())
        # [64, 7, 7, 16]
        conv = tf.nn.conv2d(mpool, layer2_weights, [1, 2, 2, 1], padding='SAME')
        hidden = tf.nn.relu(conv + layer2_biases)
        # print(hidden.get_shape().as_list())
        # [64, 4, 4, 16]
        mpool = tf.nn.max_pool(hidden, [1, 2, 2, 1], [1, 2, 2, 1], padding='SAME')
        shape = mpool.get_shape().as_list()
        # print(shape)
        # [64, 2, 2, 16]
        reshape = tf.reshape(mpool, [shape[0], shape[1] * shape[2] * shape[3]])
        # print(reshape.get_shape().as_list())
        # [64, 64]
        hidden = tf.nn.relu(tf.matmul(reshape, layer3_weights) + layer3_biases)
        return(tf.matmul(hidden, layer4_weights) + layer4_biases)
  
    # Training computation.
    logits = model(tf_train_dataset)
    loss = tf.reduce_mean(
      tf.nn.softmax_cross_entropy_with_logits(labels=tf_train_labels, logits=logits))
    
    # Optimizer.
    optimizer = tf.train.GradientDescentOptimizer(0.05).minimize(loss)
  
    # Predictions for the training, validation, and test data.
    train_prediction = tf.nn.softmax(logits)
    valid_prediction = tf.nn.softmax(model(tf_valid_dataset))
    test_prediction = tf.nn.softmax(model(tf_test_dataset))

In [8]:
num_steps = 5001

with tf.Session(graph=graph) as session:
    tf.global_variables_initializer().run()
    print('Initialized')
    for step in range(num_steps):
        offset = (step * batch_size) % (train_labels.shape[0] - batch_size)
        batch_data = train_dataset[offset:(offset + batch_size), :, :, :]
        batch_labels = train_labels[offset:(offset + batch_size), :]
        feed_dict = {tf_train_dataset : batch_data, tf_train_labels : batch_labels}
        _, l, predictions = session.run(
            [optimizer, loss, train_prediction], feed_dict=feed_dict)
        if (step % 500 == 0):
            print('Minibatch loss at step %d: %f' % (step, l))
            print('Minibatch accuracy: %.1f%%' % accuracy(predictions, batch_labels))
            print('Validation accuracy: %.1f%%' % accuracy(
                valid_prediction.eval(), valid_labels))
    
    print('Test accuracy: %.1f%%' % accuracy(test_prediction.eval(), test_labels))


Initialized
Minibatch loss at step 0: 2.664329
Minibatch accuracy: 4.7%
Validation accuracy: 10.2%
Minibatch loss at step 500: 1.174007
Minibatch accuracy: 68.8%
Validation accuracy: 78.9%
Minibatch loss at step 1000: 0.436153
Minibatch accuracy: 85.9%
Validation accuracy: 83.3%
Minibatch loss at step 1500: 0.511909
Minibatch accuracy: 82.8%
Validation accuracy: 83.9%
Minibatch loss at step 2000: 0.567806
Minibatch accuracy: 85.9%
Validation accuracy: 85.3%
Minibatch loss at step 2500: 0.434193
Minibatch accuracy: 89.1%
Validation accuracy: 86.0%
Minibatch loss at step 3000: 0.440116
Minibatch accuracy: 87.5%
Validation accuracy: 86.5%
Minibatch loss at step 3500: 0.587870
Minibatch accuracy: 78.1%
Validation accuracy: 86.6%
Minibatch loss at step 4000: 0.484286
Minibatch accuracy: 84.4%
Validation accuracy: 86.8%
Minibatch loss at step 4500: 0.483584
Minibatch accuracy: 85.9%
Validation accuracy: 87.0%
Minibatch loss at step 5000: 0.320805
Minibatch accuracy: 90.6%
Validation accuracy: 87.4%
Test accuracy: 93.5%

We got almost the same accuracy as in the previous run, so we probably didn't make efficient use of max pooling. Let's try again:

  • We split the pooling stride across the two spatial dimensions: the first pool downsamples only the width (strides [1, 1, 2, 1]) and the second only the height (strides [1, 2, 1, 1]); see the shape trace below.
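
As a sanity check (a sketch, assuming 'SAME' padding gives ceil(size / stride) per stage), the expected shapes are:

import math

def same_size(size, stride):
    return math.ceil(size / stride)

h = w = image_size                       # 28 x 28 input
h, w = same_size(h, 2), same_size(w, 2)  # conv1, stride 2 in both dims -> 14 x 14
w = same_size(w, 2)                      # pool1, stride 2 on width only -> 14 x 7
h, w = same_size(h, 2), same_size(w, 2)  # conv2, stride 2 in both dims -> 7 x 4
h = same_size(h, 2)                      # pool2, stride 2 on height only -> 4 x 4
print(h, w, h * w * depth)               # 4 4 256, the input size of layer 3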

In [9]:
batch_size = 64
patch_size = 5
depth = 16
num_hidden = 64

graph = tf.Graph()

with graph.as_default():

    # Input data.
    tf_train_dataset = tf.placeholder(
      tf.float32, shape=(batch_size, image_size, image_size, num_channels))
    tf_train_labels = tf.placeholder(tf.float32, shape=(batch_size, num_labels))
    tf_valid_dataset = tf.constant(valid_dataset)
    tf_test_dataset = tf.constant(test_dataset)
  
    # Variables.
    # layerX_weights are the kernels of the convolution
    layer1_weights = tf.Variable(tf.truncated_normal(
        [patch_size, patch_size, num_channels, depth], stddev=0.1))
    layer1_biases = tf.Variable(tf.zeros([depth]))
    layer2_weights = tf.Variable(tf.truncated_normal(
        [patch_size, patch_size, depth, depth], stddev=0.1))
    layer2_biases = tf.Variable(tf.constant(1.0, shape=[depth]))
    # layer 3's input size changes again: each spatial dimension is halved three
    # times (the two stride-2 convolutions plus one of the two pools), so the
    # final map is ceil(28 / 8) = 4 per side; round(28 / 8) gives the same value.
    layer3_weights = tf.Variable(tf.truncated_normal(
        [round(image_size / 8) * round(image_size / 8) * depth, num_hidden], stddev=0.1))
    layer3_biases = tf.Variable(tf.constant(1.0, shape=[num_hidden]))
    layer4_weights = tf.Variable(tf.truncated_normal(
        [num_hidden, num_labels], stddev=0.1))
    layer4_biases = tf.Variable(tf.constant(1.0, shape=[num_labels]))
  
    # Model.
    def model(data):
        # print(data.get_shape().as_list())
        # [64, 28, 28, 1]
        conv = tf.nn.conv2d(data, layer1_weights, [1, 2, 2, 1], padding='SAME')
        hidden = tf.nn.relu(conv + layer1_biases)
        # print(hidden.get_shape().as_list())
        # [64, 14, 14, 16]
        mpool = tf.nn.max_pool(hidden, [1, 2, 2, 1], [1, 1, 2, 1], padding='SAME')
        # print(mpool.get_shape().as_list())
        # [64, 14, 7, 16]  (only the width is pooled here)
        conv = tf.nn.conv2d(mpool, layer2_weights, [1, 2, 2, 1], padding='SAME')
        hidden = tf.nn.relu(conv + layer2_biases)
        # print(hidden.get_shape().as_list())
        # [64, 7, 4, 16]
        mpool = tf.nn.max_pool(hidden, [1, 2, 2, 1], [1, 2, 1, 1], padding='SAME')
        shape = mpool.get_shape().as_list()
        # print(shape)
        # [64, 4, 4, 16]  (only the height is pooled here)
        reshape = tf.reshape(mpool, [shape[0], shape[1] * shape[2] * shape[3]])
        # print(reshape.get_shape().as_list())
        # [64, 256]
        hidden = tf.nn.relu(tf.matmul(reshape, layer3_weights) + layer3_biases)
        return(tf.matmul(hidden, layer4_weights) + layer4_biases)
  
    # Training computation.
    logits = model(tf_train_dataset)
    loss = tf.reduce_mean(
      tf.nn.softmax_cross_entropy_with_logits(labels=tf_train_labels, logits=logits))
    
    # Optimizer.
    optimizer = tf.train.GradientDescentOptimizer(0.05).minimize(loss)
  
    # Predictions for the training, validation, and test data.
    train_prediction = tf.nn.softmax(logits)
    valid_prediction = tf.nn.softmax(model(tf_valid_dataset))
    test_prediction = tf.nn.softmax(model(tf_test_dataset))

In [10]:
num_steps = 5001

with tf.Session(graph=graph) as session:
    tf.global_variables_initializer().run()
    print('Initialized')
    for step in range(num_steps):
        offset = (step * batch_size) % (train_labels.shape[0] - batch_size)
        batch_data = train_dataset[offset:(offset + batch_size), :, :, :]
        batch_labels = train_labels[offset:(offset + batch_size), :]
        feed_dict = {tf_train_dataset : batch_data, tf_train_labels : batch_labels}
        _, l, predictions = session.run(
            [optimizer, loss, train_prediction], feed_dict=feed_dict)
        if (step % 500 == 0):
            print('Minibatch loss at step %d: %f' % (step, l))
            print('Minibatch accuracy: %.1f%%' % accuracy(predictions, batch_labels))
            print('Validation accuracy: %.1f%%' % accuracy(
                valid_prediction.eval(), valid_labels))
    
    print('Test accuracy: %.1f%%' % accuracy(test_prediction.eval(), test_labels))


Initialized
Minibatch loss at step 0: 2.357803
Minibatch accuracy: 10.9%
Validation accuracy: 10.0%
Minibatch loss at step 500: 1.087885
Minibatch accuracy: 73.4%
Validation accuracy: 82.9%
Minibatch loss at step 1000: 0.397545
Minibatch accuracy: 85.9%
Validation accuracy: 84.7%
Minibatch loss at step 1500: 0.440976
Minibatch accuracy: 84.4%
Validation accuracy: 85.9%
Minibatch loss at step 2000: 0.463475
Minibatch accuracy: 85.9%
Validation accuracy: 86.6%
Minibatch loss at step 2500: 0.359310
Minibatch accuracy: 87.5%
Validation accuracy: 87.4%
Minibatch loss at step 3000: 0.472556
Minibatch accuracy: 81.2%
Validation accuracy: 87.6%
Minibatch loss at step 3500: 0.613864
Minibatch accuracy: 81.2%
Validation accuracy: 87.7%
Minibatch loss at step 4000: 0.557986
Minibatch accuracy: 79.7%
Validation accuracy: 87.8%
Minibatch loss at step 4500: 0.418163
Minibatch accuracy: 89.1%
Validation accuracy: 88.3%
Minibatch loss at step 5000: 0.290997
Minibatch accuracy: 90.6%
Validation accuracy: 88.4%
Test accuracy: 94.2%

We got slightly better results. It seems that a full 2x2, stride-2 max pool after each stride-2 convolution was excessive downsampling for such a small image.

  • Let's retry with a single max pool, applied after the first convolution only (see the size calculation below).
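
With only one pool, each spatial dimension is halved three times (conv1, pool, conv2), so the last feature map is ceil(28 / 8) = 4 x 4 and the fully connected layer sees 4 * 4 * 16 = 256 inputs, versus 2 * 2 * 16 = 64 when both pools were applied.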

In [11]:
batch_size = 64
patch_size = 5
depth = 16
num_hidden = 64

graph = tf.Graph()

with graph.as_default():

    # Input data.
    tf_train_dataset = tf.placeholder(
      tf.float32, shape=(batch_size, image_size, image_size, num_channels))
    tf_train_labels = tf.placeholder(tf.float32, shape=(batch_size, num_labels))
    tf_valid_dataset = tf.constant(valid_dataset)
    tf_test_dataset = tf.constant(test_dataset)
  
    # Variables.
    # layerX_weights are the kernels of the convolution
    layer1_weights = tf.Variable(tf.truncated_normal(
        [patch_size, patch_size, num_channels, depth], stddev=0.1))
    layer1_biases = tf.Variable(tf.zeros([depth]))
    layer2_weights = tf.Variable(tf.truncated_normal(
        [patch_size, patch_size, depth, depth], stddev=0.1))
    layer2_biases = tf.Variable(tf.constant(1.0, shape=[depth]))
    # layer 3's input size: each spatial dimension is halved three times
    # (conv, pool, conv), so the final map is ceil(28 / 8) = 4 per side;
    # round(28 / 8) gives the same value.
    layer3_weights = tf.Variable(tf.truncated_normal(
        [round(image_size / 8) * round(image_size / 8) * depth, num_hidden], stddev=0.1))
    layer3_biases = tf.Variable(tf.constant(1.0, shape=[num_hidden]))
    layer4_weights = tf.Variable(tf.truncated_normal(
        [num_hidden, num_labels], stddev=0.1))
    layer4_biases = tf.Variable(tf.constant(1.0, shape=[num_labels]))
  
    # Model.
    def model(data):
        # print(data.get_shape().as_list())
        # [64, 28, 28, 1]
        conv = tf.nn.conv2d(data, layer1_weights, [1, 2, 2, 1], padding='SAME')
        hidden = tf.nn.relu(conv + layer1_biases)
        # print(hidden.get_shape().as_list())
        # [64, 14, 14, 16]
        mpool = tf.nn.max_pool(hidden, [1, 2, 2, 1], [1, 2, 2, 1], padding='SAME')
        # print(mpool.get_shape().as_list())
        # [64, 7, 7, 16]
        conv = tf.nn.conv2d(mpool, layer2_weights, [1, 2, 2, 1], padding='SAME')
        hidden = tf.nn.relu(conv + layer2_biases)
        shape = hidden.get_shape().as_list()
        # print(shape)
        # [64, 4, 4, 16]
        reshape = tf.reshape(hidden, [shape[0], shape[1] * shape[2] * shape[3]])
        # print(reshape.get_shape().as_list())
        # [64, 256]
        hidden = tf.nn.relu(tf.matmul(reshape, layer3_weights) + layer3_biases)
        return(tf.matmul(hidden, layer4_weights) + layer4_biases)
  
    # Training computation.
    logits = model(tf_train_dataset)
    loss = tf.reduce_mean(
      tf.nn.softmax_cross_entropy_with_logits(labels=tf_train_labels, logits=logits))
    
    # Optimizer.
    optimizer = tf.train.GradientDescentOptimizer(0.05).minimize(loss)
  
    # Predictions for the training, validation, and test data.
    train_prediction = tf.nn.softmax(logits)
    valid_prediction = tf.nn.softmax(model(tf_valid_dataset))
    test_prediction = tf.nn.softmax(model(tf_test_dataset))

In [12]:
num_steps = 5001

with tf.Session(graph=graph) as session:
    tf.global_variables_initializer().run()
    print('Initialized')
    for step in range(num_steps):
        offset = (step * batch_size) % (train_labels.shape[0] - batch_size)
        batch_data = train_dataset[offset:(offset + batch_size), :, :, :]
        batch_labels = train_labels[offset:(offset + batch_size), :]
        feed_dict = {tf_train_dataset : batch_data, tf_train_labels : batch_labels}
        _, l, predictions = session.run(
            [optimizer, loss, train_prediction], feed_dict=feed_dict)
        if (step % 500 == 0):
            print('Minibatch loss at step %d: %f' % (step, l))
            print('Minibatch accuracy: %.1f%%' % accuracy(predictions, batch_labels))
            print('Validation accuracy: %.1f%%' % accuracy(
                valid_prediction.eval(), valid_labels))
    
    print('Test accuracy: %.1f%%' % accuracy(test_prediction.eval(), test_labels))


Initialized
Minibatch loss at step 0: 2.839052
Minibatch accuracy: 14.1%
Validation accuracy: 10.0%
Minibatch loss at step 500: 1.113260
Minibatch accuracy: 68.8%
Validation accuracy: 81.5%
Minibatch loss at step 1000: 0.402020
Minibatch accuracy: 87.5%
Validation accuracy: 84.0%
Minibatch loss at step 1500: 0.469976
Minibatch accuracy: 89.1%
Validation accuracy: 85.4%
Minibatch loss at step 2000: 0.539048
Minibatch accuracy: 85.9%
Validation accuracy: 85.9%
Minibatch loss at step 2500: 0.447509
Minibatch accuracy: 85.9%
Validation accuracy: 86.5%
Minibatch loss at step 3000: 0.455593
Minibatch accuracy: 82.8%
Validation accuracy: 86.9%
Minibatch loss at step 3500: 0.545252
Minibatch accuracy: 81.2%
Validation accuracy: 87.4%
Minibatch loss at step 4000: 0.476494
Minibatch accuracy: 84.4%
Validation accuracy: 87.6%
Minibatch loss at step 4500: 0.438607
Minibatch accuracy: 87.5%
Validation accuracy: 88.0%
Minibatch loss at step 5000: 0.299811
Minibatch accuracy: 93.8%
Validation accuracy: 88.1%
Test accuracy: 94.1%

Looks like using a single max-pool operation offers almost, but not exactly, the same performance as splitting the pooling across the two dimensions.


Problem 2

Try to get the best performance you can using a convolutional net. Look for example at the classic LeNet5 architecture, adding Dropout, and/or adding learning rate decay.


We use figure 2 from Gradient-Based Learning Applied to Document Recognition as a guide for the model architecture.

  • We add a decaying learning rate to improve training accuracy; the schedule is sketched below.
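
tf.train.exponential_decay without staircase computes lr = ilrate * decay_rate ** (global_step / decay_steps); a quick check against the printed rates of the run below (a sketch, not part of the original):

for step in (0, 4000, 8000):
    print(step, 0.01 * 0.6 ** (step / 4000))
# 0.01, 0.006, 0.0036 -- close to the printed 0.00999..., 0.00599..., 0.00359...
# (the graph evaluates the rate one step later, hence the tiny offset)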

In [13]:
batch_size = 64
patch_size = 5
depth1 = 6
depth2 = 16
num_hidden1 = 120
num_hidden2 = 84

graph = tf.Graph()

with graph.as_default():

    # Input data.
    tf_train_dataset = tf.placeholder(
        tf.float32, shape=(batch_size, image_size, image_size, num_channels))
    tf_train_labels = tf.placeholder(tf.float32, shape=(batch_size, num_labels))
    tf_valid_dataset = tf.constant(valid_dataset)
    tf_test_dataset = tf.constant(test_dataset)
  
    # Variables.
    # C1 layer:
    layer1_weights = tf.Variable(tf.truncated_normal(
        [patch_size, patch_size, num_channels, depth1], stddev=0.1))
    layer1_biases = tf.Variable(tf.zeros([depth1]))
    # S2 pooling (max_pool here, average pooling in LeNet-5) - no weights needed
    # C3 layer:
    layer2_weights = tf.Variable(tf.truncated_normal(
        [patch_size, patch_size, depth1, depth2], stddev=0.1))
    layer2_biases = tf.Variable(tf.constant(1.0, shape=[depth2]))
    # S4 pooling (max_pool here, average pooling in LeNet-5) - no weights needed
    # C5 hidden1
    size = ((image_size - patch_size + 1) // 2 - patch_size + 1) // 2
    layer3_weights = tf.Variable(tf.truncated_normal(
        [size * size * depth2, num_hidden1], stddev=np.sqrt(2.0 / num_hidden1)))
    layer3_biases = tf.Variable(tf.constant(1.0, shape=[num_hidden1]))
    # F6 hidden2
    layer4_weights = tf.Variable(tf.truncated_normal(
        [num_hidden1, num_hidden2], stddev=np.sqrt(2.0 / num_hidden2)))
    layer4_biases = tf.Variable(tf.constant(1.0, shape=[num_hidden2]))
    # Output
    layer5_weights = tf.Variable(tf.truncated_normal(
        [num_hidden2, num_labels], stddev=0.1))
    layer5_biases = tf.Variable(tf.constant(1.0, shape=[num_labels]))
  
    # Model.
    def model(data):
        # C1 input 28 x 28
        conv = tf.nn.conv2d(data, layer1_weights, [1, 1, 1, 1], padding='VALID')
        layer = tf.nn.relu(conv + layer1_biases)
        # S2 input 24 x 24
        pool = tf.nn.max_pool(layer, [1, 2, 2, 1], [1, 2, 2, 1], padding='VALID')
        # C3 input 12 x 12
        conv = tf.nn.conv2d(pool, layer2_weights, [1, 1, 1, 1], padding='VALID')
        layer = tf.nn.relu(conv + layer2_biases)
        # S4 input 8 x 8
        pool = tf.nn.max_pool(layer, [1, 2, 2, 1], [1, 2, 2, 1], padding='VALID')
        # C5 input 4 x 4
        shape = pool.get_shape().as_list()
        reshape = tf.reshape(pool, [shape[0], shape[1] * shape[2] * shape[3]])
        hidden = tf.nn.relu(tf.matmul(reshape, layer3_weights) + layer3_biases)
        # F6
        hidden = tf.nn.relu(tf.matmul(hidden, layer4_weights) + layer4_biases)
        # return output logits
        return (tf.matmul(hidden, layer5_weights) + layer5_biases)
  
    # Training computation.
    logits = model(tf_train_dataset)
    loss = tf.reduce_mean(
        tf.nn.softmax_cross_entropy_with_logits(logits=logits,
                                                labels=tf_train_labels))
    # Optimizer - with variable learning rate.
    gstep = tf.Variable(0)  # steps taken
    ilrate = tf.placeholder(tf.float32)
    flrate = tf.train.exponential_decay(ilrate, gstep, 4000, 0.6)
    
    optimizer = tf.train.MomentumOptimizer(flrate, momentum=0.9, use_nesterov=True).minimize(
        loss, global_step=gstep)
    
    # # Optimizer.
    # optimizer = tf.train.GradientDescentOptimizer(0.005).minimize(loss)

    # Predictions for the training, validation, and test data.
    train_prediction = tf.nn.softmax(logits)
    valid_prediction = tf.nn.softmax(model(tf_valid_dataset))
    test_prediction = tf.nn.softmax(model(tf_test_dataset))

In [14]:
num_steps = 20001
# learning rate (initial)
learning_rate_i = 0.01

with tf.Session(graph=graph) as session:
    tf.global_variables_initializer().run()
    print('Initialized')
    for step in range(num_steps):
        offset = (step * batch_size) % (train_labels.shape[0] - batch_size)
        batch_data = train_dataset[offset:(offset + batch_size), :, :, :]
        batch_labels = train_labels[offset:(offset + batch_size), :]
        feed_dict = {tf_train_dataset : batch_data,
                     tf_train_labels : batch_labels,
                     ilrate : learning_rate_i}
        _, l, predictions = session.run(
            [optimizer, loss, train_prediction], feed_dict=feed_dict)
        if (step % 1000 == 0):
            print('Minibatch loss at step %d: %f' % (step, l))
            print('Minibatch accuracy: %.1f%%' % accuracy(predictions, batch_labels))
            print('Validation accuracy: %.1f%%' % accuracy(
                valid_prediction.eval(), valid_labels))
            print("Current learning rate: {}".format(flrate.eval(feed_dict=feed_dict)))
    
    print('Test accuracy: %.1f%%' % accuracy(test_prediction.eval(), test_labels))


Initialized
Minibatch loss at step 0: 3.209201
Minibatch accuracy: 15.6%
Validation accuracy: 10.0%
Current learning rate: 0.00999872200191021
Minibatch loss at step 1000: 0.365294
Minibatch accuracy: 89.1%
Validation accuracy: 85.9%
Current learning rate: 0.008799993433058262
Minibatch loss at step 2000: 0.395343
Minibatch accuracy: 85.9%
Validation accuracy: 87.3%
Current learning rate: 0.007744977250695229
Minibatch loss at step 3000: 0.407963
Minibatch accuracy: 89.1%
Validation accuracy: 88.3%
Current learning rate: 0.006816445384174585
Minibatch loss at step 4000: 0.495282
Minibatch accuracy: 85.9%
Validation accuracy: 88.3%
Current learning rate: 0.005999234039336443
Minibatch loss at step 5000: 0.317762
Minibatch accuracy: 89.1%
Validation accuracy: 88.7%
Current learning rate: 0.005279996432363987
Minibatch loss at step 6000: 0.437098
Minibatch accuracy: 90.6%
Validation accuracy: 89.3%
Current learning rate: 0.004646986722946167
Minibatch loss at step 7000: 0.396564
Minibatch accuracy: 87.5%
Validation accuracy: 89.3%
Current learning rate: 0.0040898676961660385
Minibatch loss at step 8000: 0.300358
Minibatch accuracy: 90.6%
Validation accuracy: 89.6%
Current learning rate: 0.0035995400976389647
Minibatch loss at step 9000: 0.331443
Minibatch accuracy: 87.5%
Validation accuracy: 89.7%
Current learning rate: 0.00316799758002162
Minibatch loss at step 10000: 0.313100
Minibatch accuracy: 90.6%
Validation accuracy: 89.7%
Current learning rate: 0.0027881916612386703
Minibatch loss at step 11000: 0.451407
Minibatch accuracy: 85.9%
Validation accuracy: 89.9%
Current learning rate: 0.002453920431435108
Minibatch loss at step 12000: 0.138213
Minibatch accuracy: 95.3%
Validation accuracy: 89.9%
Current learning rate: 0.0021597242448478937
Minibatch loss at step 13000: 0.312804
Minibatch accuracy: 89.1%
Validation accuracy: 90.1%
Current learning rate: 0.0019007986411452293
Minibatch loss at step 14000: 0.243872
Minibatch accuracy: 93.8%
Validation accuracy: 90.2%
Current learning rate: 0.0016729151830077171
Minibatch loss at step 15000: 0.252324
Minibatch accuracy: 93.8%
Validation accuracy: 90.2%
Current learning rate: 0.0014723524218425155
Minibatch loss at step 16000: 0.256924
Minibatch accuracy: 92.2%
Validation accuracy: 90.3%
Current learning rate: 0.0012958347797393799
Minibatch loss at step 17000: 0.352299
Minibatch accuracy: 89.1%
Validation accuracy: 90.3%
Current learning rate: 0.00114047946408391
Minibatch loss at step 18000: 0.246187
Minibatch accuracy: 92.2%
Validation accuracy: 90.4%
Current learning rate: 0.0010037493193522096
Minibatch loss at step 19000: 0.218907
Minibatch accuracy: 93.8%
Validation accuracy: 90.5%
Current learning rate: 0.000883411499671638
Minibatch loss at step 20000: 0.342937
Minibatch accuracy: 89.1%
Validation accuracy: 90.4%
Current learning rate: 0.0007775008562020957
Test accuracy: 95.6%

Even though this is not as accurate as what we achieved with 4 fully connected layers (97.4%), it is close (and we have not fully optimised this network).


Let's try tweaking the LeNet-5 architecture:

  • Let's change the activation functions
  • Let's change batch size
  • Let's change the depth and number of hidden nodes
  • Let's reduce the standard deviation of the initialisation weights (compared numerically below)
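
For scale (arithmetic only), the first fully connected layer's initial stddev goes from np.sqrt(2.0 / 120), about 0.129, in the previous run to np.sqrt(0.025 / 240), about 0.0102, here: roughly 13 times smaller.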

In [15]:
batch_size = 128
patch_size = 5
depth1 = 10
depth2 = 30
num_hidden1 = 240
num_hidden2 = 160

graph = tf.Graph()

with graph.as_default():

    # Input data.
    tf_train_dataset = tf.placeholder(
        tf.float32, shape=(batch_size, image_size, image_size, num_channels))
    tf_train_labels = tf.placeholder(tf.float32, shape=(batch_size, num_labels))
    tf_valid_dataset = tf.constant(valid_dataset)
    tf_test_dataset = tf.constant(test_dataset)
  
    # Variables.
    # C1 layer:
    layer1_weights = tf.Variable(tf.truncated_normal(
        [patch_size, patch_size, num_channels, depth1], stddev=0.1))
    layer1_biases = tf.Variable(tf.zeros([depth1]))
    # S2 pooling (max_pool here, average pooling in LeNet-5) - no weights needed
    # C3 layer:
    layer2_weights = tf.Variable(tf.truncated_normal(
        [patch_size, patch_size, depth1, depth2], stddev=0.1))
    layer2_biases = tf.Variable(tf.constant(1.0, shape=[depth2]))
    # S4 pooling (max_pool here, average pooling in LeNet-5) - no weights needed
    # C5 hidden1
    size = ((image_size - patch_size + 1) // 2 - patch_size + 1) // 2
    layer3_weights = tf.Variable(tf.truncated_normal(
        [size * size * depth2, num_hidden1], stddev=np.sqrt(0.025 / num_hidden1)))
    layer3_biases = tf.Variable(tf.constant(1.0, shape=[num_hidden1]))
    # F6 hidden2
    layer4_weights = tf.Variable(tf.truncated_normal(
        [num_hidden1, num_hidden2], stddev=np.sqrt(0.025 / num_hidden2)))
    layer4_biases = tf.Variable(tf.constant(1.0, shape=[num_hidden2]))
    # Output
    layer5_weights = tf.Variable(tf.truncated_normal(
        [num_hidden2, num_labels], stddev=np.sqrt(0.025 / num_labels)))
    layer5_biases = tf.Variable(tf.constant(1.0, shape=[num_labels]))
  
    # Model.
    def model(data):
        # C1 input 28 x 28
        conv = tf.nn.conv2d(data, layer1_weights, [1, 1, 1, 1], padding='VALID')
        layer = tf.nn.relu(conv + layer1_biases)
        # S2 input 24 x 24
        pool = tf.nn.max_pool(layer, [1, 2, 2, 1], [1, 2, 2, 1], padding='VALID')
        # C3 input 12 x 12
        conv = tf.nn.conv2d(pool, layer2_weights, [1, 1, 1, 1], padding='VALID')
        layer = tf.nn.relu(conv + layer2_biases)
        # S4 input 8 x 8
        pool = tf.nn.max_pool(layer, [1, 2, 2, 1], [1, 2, 2, 1], padding='VALID')
        # C5 input 4 x 4
        shape = pool.get_shape().as_list()
        reshape = tf.reshape(pool, [shape[0], shape[1] * shape[2] * shape[3]])
        hidden = tf.nn.relu(tf.matmul(reshape, layer3_weights) + layer3_biases)
        # F6
        hidden = tf.nn.relu(tf.matmul(hidden, layer4_weights) + layer4_biases)
        # return output logits
        return (tf.matmul(hidden, layer5_weights) + layer5_biases)
  
    # Training computation.
    logits = model(tf_train_dataset)
    loss = tf.reduce_mean(
        tf.nn.softmax_cross_entropy_with_logits(logits=logits,
                                                labels=tf_train_labels))
    
    # Optimizer - with variable learning rate.
    gstep = tf.Variable(0)  # steps taken
    ilrate = tf.placeholder(tf.float32)
    flrate = tf.train.exponential_decay(ilrate, gstep, 8000, 0.5)
    
    optimizer = tf.train.MomentumOptimizer(flrate, momentum=0.9, use_nesterov=True).minimize(
        loss, global_step=gstep)
    
    # # Optimizer.
    # optimizer = tf.train.GradientDescentOptimizer(0.005).minimize(loss)

    # Predictions for the training, validation, and test data.
    train_prediction = tf.nn.softmax(logits)
    valid_prediction = tf.nn.softmax(model(tf_valid_dataset))
    test_prediction = tf.nn.softmax(model(tf_test_dataset))

In [16]:
num_steps = 20001
# learning rate (initial)
learning_rate_i = 0.01

with tf.Session(graph=graph) as session:
    tf.global_variables_initializer().run()
    print('Initialized')
    for step in range(num_steps):
        offset = (step * batch_size) % (train_labels.shape[0] - batch_size)
        batch_data = train_dataset[offset:(offset + batch_size), :, :, :]
        batch_labels = train_labels[offset:(offset + batch_size), :]
        feed_dict = {tf_train_dataset : batch_data,
                     tf_train_labels : batch_labels,
                     ilrate : learning_rate_i}
        _, l, predictions = session.run(
            [optimizer, loss, train_prediction], feed_dict=feed_dict)
        if (step % 1000 == 0):
            print('Minibatch loss at step %d: %f' % (step, l))
            print('Minibatch accuracy: %.1f%%' % accuracy(predictions, batch_labels))
            print('Validation accuracy: %.1f%%' % accuracy(
                valid_prediction.eval(), valid_labels))
            print("Current learning rate: {}".format(flrate.eval(feed_dict=feed_dict)))
    
    print('Test accuracy: %.1f%%' % accuracy(test_prediction.eval(), test_labels))


Initialized
Minibatch loss at step 0: 2.399433
Minibatch accuracy: 5.5%
Validation accuracy: 10.0%
Current learning rate: 0.009999132715165615
Minibatch loss at step 1000: 0.431082
Minibatch accuracy: 89.8%
Validation accuracy: 86.2%
Current learning rate: 0.009169245138764381
Minibatch loss at step 2000: 0.491584
Minibatch accuracy: 85.2%
Validation accuracy: 88.4%
Current learning rate: 0.008408235386013985
Minibatch loss at step 3000: 0.412086
Minibatch accuracy: 87.5%
Validation accuracy: 89.5%
Current learning rate: 0.007710385601967573
Minibatch loss at step 4000: 0.233656
Minibatch accuracy: 92.2%
Validation accuracy: 89.9%
Current learning rate: 0.007070454768836498
Minibatch loss at step 5000: 0.374780
Minibatch accuracy: 88.3%
Validation accuracy: 90.6%
Current learning rate: 0.006483635865151882
Minibatch loss at step 6000: 0.267530
Minibatch accuracy: 93.8%
Validation accuracy: 90.9%
Current learning rate: 0.005945519544184208
Minibatch loss at step 7000: 0.270144
Minibatch accuracy: 91.4%
Validation accuracy: 91.1%
Current learning rate: 0.005452065262943506
Minibatch loss at step 8000: 0.237299
Minibatch accuracy: 93.0%
Validation accuracy: 91.4%
Current learning rate: 0.0049995663575828075
Minibatch loss at step 9000: 0.348732
Minibatch accuracy: 87.5%
Validation accuracy: 91.4%
Current learning rate: 0.004584622569382191
Minibatch loss at step 10000: 0.266315
Minibatch accuracy: 89.8%
Validation accuracy: 91.5%
Current learning rate: 0.004204117693006992
Minibatch loss at step 11000: 0.285902
Minibatch accuracy: 93.0%
Validation accuracy: 91.8%
Current learning rate: 0.0038551928009837866
Minibatch loss at step 12000: 0.214064
Minibatch accuracy: 92.2%
Validation accuracy: 91.8%
Current learning rate: 0.0035352271515876055
Minibatch loss at step 13000: 0.289196
Minibatch accuracy: 91.4%
Validation accuracy: 91.8%
Current learning rate: 0.003241817932575941
Minibatch loss at step 14000: 0.257250
Minibatch accuracy: 90.6%
Validation accuracy: 92.0%
Current learning rate: 0.002972759772092104
Minibatch loss at step 15000: 0.161104
Minibatch accuracy: 95.3%
Validation accuracy: 92.0%
Current learning rate: 0.002726032631471753
Minibatch loss at step 16000: 0.245270
Minibatch accuracy: 94.5%
Validation accuracy: 92.1%
Current learning rate: 0.0024997834116220474
Minibatch loss at step 17000: 0.205510
Minibatch accuracy: 96.1%
Validation accuracy: 92.0%
Current learning rate: 0.0022923112846910954
Minibatch loss at step 18000: 0.194940
Minibatch accuracy: 94.5%
Validation accuracy: 92.2%
Current learning rate: 0.002102058846503496
Minibatch loss at step 19000: 0.334199
Minibatch accuracy: 92.2%
Validation accuracy: 92.2%
Current learning rate: 0.0019275964004918933
Minibatch loss at step 20000: 0.273460
Minibatch accuracy: 91.4%
Validation accuracy: 92.2%
Current learning rate: 0.0017676136922091246
Test accuracy: 96.8%

  • We observe that the smaller the standard deviation of the initialisation, the better the performance we get. This suggests that we should regularise the weights (see the note on the L2 penalty below).
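
tf.nn.l2_loss(w) computes sum(w ** 2) / 2, so the cell below adds regconst * (l2_loss(layer3_weights) + l2_loss(layer4_weights) + l2_loss(layer5_weights)) to the cross-entropy loss, penalising large fully connected weights.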

In [17]:
batch_size = 128
patch_size = 5
depth1 = 10
depth2 = 30
num_hidden1 = 240
num_hidden2 = 160

graph = tf.Graph()

with graph.as_default():

    # Input data.
    tf_train_dataset = tf.placeholder(
        tf.float32, shape=(batch_size, image_size, image_size, num_channels))
    tf_train_labels = tf.placeholder(tf.float32, shape=(batch_size, num_labels))
    tf_valid_dataset = tf.constant(valid_dataset)
    tf_test_dataset = tf.constant(test_dataset)
  
    # Variables.
    # C1 layer:
    layer1_weights = tf.Variable(tf.truncated_normal(
        [patch_size, patch_size, num_channels, depth1], stddev=0.1))
    layer1_biases = tf.Variable(tf.zeros([depth1]))
    # S2 pooling (max_pool here, average pooling in LeNet-5) - no weights needed
    # C3 layer:
    layer2_weights = tf.Variable(tf.truncated_normal(
        [patch_size, patch_size, depth1, depth2], stddev=0.1))
    layer2_biases = tf.Variable(tf.constant(1.0, shape=[depth2]))
    # S4 pooling (max_pool here, average pooling in LeNet-5) - no weights needed
    # C5 hidden1
    size = ((image_size - patch_size + 1) // 2 - patch_size + 1) // 2
    layer3_weights = tf.Variable(tf.truncated_normal(
        [size * size * depth2, num_hidden1], stddev=np.sqrt(1 / num_hidden1)))
    layer3_biases = tf.Variable(tf.constant(0.01, shape=[num_hidden1]))
    # F6 hidden2
    layer4_weights = tf.Variable(tf.truncated_normal(
        [num_hidden1, num_hidden2], stddev=np.sqrt(1 / num_hidden2)))
    layer4_biases = tf.Variable(tf.constant(0.01, shape=[num_hidden2]))
    # Output
    layer5_weights = tf.Variable(tf.truncated_normal(
        [num_hidden2, num_labels], stddev=np.sqrt(1 / num_labels)))
    layer5_biases = tf.Variable(tf.constant(0.01, shape=[num_labels]))
  
    # Model.
    def model(data):
        # C1 input 28 x 28
        conv = tf.nn.conv2d(data, layer1_weights, [1, 1, 1, 1], padding='VALID')
        layer = tf.nn.relu(conv + layer1_biases)
        # S2 input 24 x 24
        pool = tf.nn.max_pool(layer, [1, 2, 2, 1], [1, 2, 2, 1], padding='VALID')
        # C3 input 12 x 12
        conv = tf.nn.conv2d(pool, layer2_weights, [1, 1, 1, 1], padding='VALID')
        layer = tf.nn.relu(conv + layer2_biases)
        # S4 input 8 x 8
        pool = tf.nn.max_pool(layer, [1, 2, 2, 1], [1, 2, 2, 1], padding='VALID')
        # C5 input 4 x 4
        shape = pool.get_shape().as_list()
        reshape = tf.reshape(pool, [shape[0], shape[1] * shape[2] * shape[3]])
        hidden = tf.nn.relu(tf.matmul(reshape, layer3_weights) + layer3_biases)
        # F6
        hidden = tf.nn.relu(tf.matmul(hidden, layer4_weights) + layer4_biases)
        # return output logits
        return (tf.matmul(hidden, layer5_weights) + layer5_biases)
  
    # Training computation.
    logits = model(tf_train_dataset)
    loss = tf.reduce_mean(
        tf.nn.softmax_cross_entropy_with_logits(logits=logits,
                                                labels=tf_train_labels))
        
    # add regularisation for all weights.
    regconst = tf.placeholder(tf.float32)
    loss = loss + regconst * (
        tf.nn.l2_loss(layer3_weights) + tf.nn.l2_loss(layer4_weights) +
        tf.nn.l2_loss(layer5_weights))
    
    # Optimizer - with variable learning rate.
    gstep = tf.Variable(0)  # steps taken
    ilrate = tf.placeholder(tf.float32)
    flrate = tf.train.exponential_decay(ilrate, gstep, 8000, 0.75)
    
    optimizer = tf.train.MomentumOptimizer(flrate, momentum=0.9, use_nesterov=True).minimize(
        loss, global_step=gstep)
    
    # # Optimizer.
    # optimizer = tf.train.GradientDescentOptimizer(0.005).minimize(loss)

    # Predictions for the training, validation, and test data.
    train_prediction = tf.nn.softmax(logits)
    valid_prediction = tf.nn.softmax(model(tf_valid_dataset))
    test_prediction = tf.nn.softmax(model(tf_test_dataset))

In [18]:
num_steps = 20001
# learning rate (initial)
learning_rate_i = 0.01
# regularisation constant
gamma = 0.00001

with tf.Session(graph=graph) as session:
    tf.global_variables_initializer().run()
    print('Initialized')
    for step in range(num_steps):
        offset = (step * batch_size) % (train_labels.shape[0] - batch_size)
        batch_data = train_dataset[offset:(offset + batch_size), :, :, :]
        batch_labels = train_labels[offset:(offset + batch_size), :]
        feed_dict = {tf_train_dataset : batch_data,
                     tf_train_labels : batch_labels,
                     regconst : gamma,
                     ilrate : learning_rate_i}
        _, l, predictions = session.run(
            [optimizer, loss, train_prediction], feed_dict=feed_dict)
        if (step % 2000 == 0 or step in ([250, 500, 750, 1000])):
            print('Minibatch loss at step %d: %f' % (step, l))
            print('Minibatch accuracy: %.1f%%' % accuracy(predictions, batch_labels))
            print('Validation accuracy: %.1f%%' % accuracy(
                valid_prediction.eval(), valid_labels))
            print("Current learning rate: {}".format(flrate.eval(feed_dict=feed_dict)))
    
    print('Test accuracy: %.1f%%' % accuracy(test_prediction.eval(), test_labels))


Initialized
Minibatch loss at step 0: 3.671459
Minibatch accuracy: 10.9%
Validation accuracy: 10.0%
Current learning rate: 0.009999639354646206
Minibatch loss at step 250: 0.924837
Minibatch accuracy: 72.7%
Validation accuracy: 81.6%
Current learning rate: 0.00991014577448368
Minibatch loss at step 500: 0.401814
Minibatch accuracy: 88.3%
Validation accuracy: 84.2%
Current learning rate: 0.009821451269090176
Minibatch loss at step 750: 0.492037
Minibatch accuracy: 84.4%
Validation accuracy: 85.6%
Current learning rate: 0.009733552113175392
Minibatch loss at step 1000: 0.387445
Minibatch accuracy: 88.3%
Validation accuracy: 86.4%
Current learning rate: 0.009646438993513584
Minibatch loss at step 2000: 0.504136
Minibatch accuracy: 82.8%
Validation accuracy: 88.1%
Current learning rate: 0.009305713698267937
Minibatch loss at step 4000: 0.151501
Minibatch accuracy: 95.3%
Validation accuracy: 89.6%
Current learning rate: 0.008659943006932735
Minibatch loss at step 6000: 0.323024
Minibatch accuracy: 91.4%
Validation accuracy: 90.5%
Current learning rate: 0.008058983832597733
Minibatch loss at step 8000: 0.255916
Minibatch accuracy: 93.8%
Validation accuracy: 91.3%
Current learning rate: 0.007499729748815298
Minibatch loss at step 10000: 0.257315
Minibatch accuracy: 91.4%
Validation accuracy: 91.3%
Current learning rate: 0.006979284808039665
Minibatch loss at step 12000: 0.217794
Minibatch accuracy: 93.0%
Validation accuracy: 91.7%
Current learning rate: 0.006494956556707621
Minibatch loss at step 14000: 0.258305
Minibatch accuracy: 93.0%
Validation accuracy: 91.5%
Current learning rate: 0.006044237874448299
Minibatch loss at step 16000: 0.236297
Minibatch accuracy: 93.0%
Validation accuracy: 91.9%
Current learning rate: 0.005624797195196152
Minibatch loss at step 18000: 0.204073
Minibatch accuracy: 93.8%
Validation accuracy: 92.0%
Current learning rate: 0.005234464071691036
Minibatch loss at step 20000: 0.248450
Minibatch accuracy: 92.2%
Validation accuracy: 92.1%
Current learning rate: 0.004871217533946037
Test accuracy: 96.8%

Regularisation didn't significantly improve performance. Let's try dropout.
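
tf.nn.dropout(x, keep_prob) zeroes each element with probability 1 - keep_prob and scales the kept elements by 1 / keep_prob, so the expected activation is unchanged and no rescaling is needed at evaluation time. A standalone sketch (not part of the graph below):

drop = tf.nn.dropout(tf.ones([4]), 0.8)
with tf.Session() as s:
    print(s.run(drop))  # e.g. [1.25 1.25 0. 1.25]: surviving entries become 1 / 0.8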


In [19]:
batch_size = 128
patch_size = 5
depth1 = 10
depth2 = 30
num_hidden1 = 240
num_hidden2 = 160

graph = tf.Graph()

with graph.as_default():

    # Input data.
    tf_train_dataset = tf.placeholder(
        tf.float32, shape=(batch_size, image_size, image_size, num_channels))
    tf_train_labels = tf.placeholder(tf.float32, shape=(batch_size, num_labels))
    tf_valid_dataset = tf.constant(valid_dataset)
    tf_test_dataset = tf.constant(test_dataset)
  
    # Variables.
    # C1 layer:
    layer1_weights = tf.Variable(tf.truncated_normal(
        [patch_size, patch_size, num_channels, depth1], stddev=0.1))
    layer1_biases = tf.Variable(tf.zeros([depth1]))
    # S2 pooling (max_pool here, average pooling in LeNet-5) - no weights needed
    # C3 layer:
    layer2_weights = tf.Variable(tf.truncated_normal(
        [patch_size, patch_size, depth1, depth2], stddev=0.1))
    layer2_biases = tf.Variable(tf.constant(1.0, shape=[depth2]))
    # S4 pooling (max_pool here, average pooling in LeNet-5) - no weights needed
    # C5 hidden1
    size = ((image_size - patch_size + 1) // 2 - patch_size + 1) // 2
    layer3_weights = tf.Variable(tf.truncated_normal(
        [size * size * depth2, num_hidden1], stddev=np.sqrt(1 / num_hidden1)))
    layer3_biases = tf.Variable(tf.constant(0.01, shape=[num_hidden1]))
    # F6 hidden2
    layer4_weights = tf.Variable(tf.truncated_normal(
        [num_hidden1, num_hidden2], stddev=np.sqrt(1 / num_hidden2)))
    layer4_biases = tf.Variable(tf.constant(0.01, shape=[num_hidden2]))
    # Output
    layer5_weights = tf.Variable(tf.truncated_normal(
        [num_hidden2, num_labels], stddev=np.sqrt(1 / num_labels)))
    layer5_biases = tf.Variable(tf.constant(0.01, shape=[num_labels]))
    
    # introduce dropout
    keep_prob = tf.placeholder(tf.float32)
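    # The model below builds two forward paths: one with dropout (used for the
    # training loss) and one without (used for all reported predictions).
    # Feeding keep_prob = 1.0 at evaluation time would be an equivalent alternative.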
  
    # Model.
    def model(data):
        # C1 input 28 x 28
        conv = tf.nn.conv2d(data, layer1_weights, [1, 1, 1, 1], padding='VALID')
        layer = tf.nn.relu(conv + layer1_biases)
        # S2 input 24 x 24
        pool = tf.nn.max_pool(layer, [1, 2, 2, 1], [1, 2, 2, 1], padding='VALID')
        # C3 input 12 x 12
        conv = tf.nn.conv2d(pool, layer2_weights, [1, 1, 1, 1], padding='VALID')
        layer = tf.nn.relu(conv + layer2_biases)
        # S4 input 8 x 8
        pool = tf.nn.max_pool(layer, [1, 2, 2, 1], [1, 2, 2, 1], padding='VALID')
        # C5 input 4 x 4
        shape = pool.get_shape().as_list()
        reshape = tf.reshape(pool, [shape[0], shape[1] * shape[2] * shape[3]])
        hidden = tf.nn.relu(tf.matmul(reshape, layer3_weights) + layer3_biases)
        hidden_d = tf.nn.dropout(hidden, keep_prob)
        # F6
        hidden = tf.nn.relu(tf.matmul(hidden, layer4_weights) + layer4_biases)
        hidden2 = tf.nn.relu(tf.matmul(hidden_d, layer4_weights) + layer4_biases)
        hidden_d = tf.nn.dropout(hidden2, keep_prob)
        # return output logits
        output = (tf.matmul(hidden, layer5_weights) + layer5_biases)
        output_d = (tf.matmul(hidden_d, layer5_weights) + layer5_biases)
        # dropout passes through both fully connected layers.
        return ([output_d, output])
  
    # Training computation.
    logits = model(tf_train_dataset)[0]
    loss = tf.reduce_mean(
        tf.nn.softmax_cross_entropy_with_logits(logits=logits,
                                                labels=tf_train_labels))
        
    # add regularisation for all weights.
    regconst = tf.placeholder(tf.float32)
    loss = loss + regconst * (
        tf.nn.l2_loss(layer3_weights) + tf.nn.l2_loss(layer4_weights) + 
        tf.nn.l2_loss(layer5_weights))
    
    # Optimizer - with variable learning rate.
    gstep = tf.Variable(0)  # steps taken
    ilrate = tf.placeholder(tf.float32)
    flrate = tf.train.exponential_decay(ilrate, gstep, 8000, 0.75)
    
    optimizer = tf.train.MomentumOptimizer(flrate, momentum=0.9, use_nesterov=True).minimize(
        loss, global_step=gstep)
    
    # # Optimizer.
    # optimizer = tf.train.GradientDescentOptimizer(0.005).minimize(loss)

    # Predictions for the training, validation, and test data.
    train_prediction = tf.nn.softmax(model(tf_train_dataset)[1])
    valid_prediction = tf.nn.softmax(model(tf_valid_dataset)[1])
    test_prediction = tf.nn.softmax(model(tf_test_dataset)[1])

In [20]:
num_steps = 20001
# learning rate (initial)
learning_rate_i = 0.01
# regularisation constant
gamma = 1e-5
# dropout layer keep probability
keep_probl = 0.8 # must not shadow the keep_prob placeholder defined in the graph

with tf.Session(graph=graph) as session:
    tf.global_variables_initializer().run()
    print('Initialized')
    for step in range(num_steps):
        offset = (step * batch_size) % (train_labels.shape[0] - batch_size)
        batch_data = train_dataset[offset:(offset + batch_size), :, :, :]
        batch_labels = train_labels[offset:(offset + batch_size), :]
        feed_dict = {tf_train_dataset : batch_data,
                     tf_train_labels : batch_labels,
                     regconst : gamma,
                     keep_prob : keep_probl,
                     ilrate : learning_rate_i}
        _, l, predictions = session.run(
            [optimizer, loss, train_prediction], feed_dict=feed_dict)
        if (step % 2000 == 0 or step in ([250, 500, 750, 1000])):
            print('Minibatch loss at step %d: %f' % (step, l))
            print('Minibatch accuracy: %.1f%%' % accuracy(predictions, batch_labels))
            print('Validation accuracy: %.1f%%' % accuracy(
                valid_prediction.eval(), valid_labels))
            print("Current learning rate: {}".format(flrate.eval(feed_dict=feed_dict)))
    
    print('Test accuracy: %.1f%%' % accuracy(test_prediction.eval(), test_labels))


Initialized
Minibatch loss at step 0: 5.213852
Minibatch accuracy: 5.5%
Validation accuracy: 13.0%
Current learning rate: 0.009999639354646206
Minibatch loss at step 250: 0.984002
Minibatch accuracy: 75.8%
Validation accuracy: 82.7%
Current learning rate: 0.00991014577448368
Minibatch loss at step 500: 0.450437
Minibatch accuracy: 89.1%
Validation accuracy: 85.3%
Current learning rate: 0.009821451269090176
Minibatch loss at step 750: 0.428279
Minibatch accuracy: 88.3%
Validation accuracy: 86.3%
Current learning rate: 0.009733552113175392
Minibatch loss at step 1000: 0.449292
Minibatch accuracy: 88.3%
Validation accuracy: 86.9%
Current learning rate: 0.009646438993513584
Minibatch loss at step 2000: 0.461160
Minibatch accuracy: 84.4%
Validation accuracy: 88.5%
Current learning rate: 0.009305713698267937
Minibatch loss at step 4000: 0.203412
Minibatch accuracy: 93.8%
Validation accuracy: 89.7%
Current learning rate: 0.008659943006932735
Minibatch loss at step 6000: 0.322330
Minibatch accuracy: 91.4%
Validation accuracy: 90.4%
Current learning rate: 0.008058983832597733
Minibatch loss at step 8000: 0.269513
Minibatch accuracy: 91.4%
Validation accuracy: 91.0%
Current learning rate: 0.007499729748815298
Minibatch loss at step 10000: 0.306383
Minibatch accuracy: 90.6%
Validation accuracy: 91.2%
Current learning rate: 0.006979284808039665
Minibatch loss at step 12000: 0.241794
Minibatch accuracy: 92.2%
Validation accuracy: 91.4%
Current learning rate: 0.006494956556707621
Minibatch loss at step 14000: 0.255972
Minibatch accuracy: 93.0%
Validation accuracy: 91.5%
Current learning rate: 0.006044237874448299
Minibatch loss at step 16000: 0.346515
Minibatch accuracy: 91.4%
Validation accuracy: 91.8%
Current learning rate: 0.005624797195196152
Minibatch loss at step 18000: 0.244449
Minibatch accuracy: 93.8%
Validation accuracy: 91.9%
Current learning rate: 0.005234464071691036
Minibatch loss at step 20000: 0.276668
Minibatch accuracy: 92.2%
Validation accuracy: 92.0%
Current learning rate: 0.004871217533946037
Test accuracy: 96.7%