In [1]:
# These are all the modules we'll be using later. Make sure you can import them
# before proceeding further.
from __future__ import print_function

import numpy as np
import tensorflow as tf

from tensorflow.examples.tutorials.mnist import input_data
from sklearn.linear_model import LogisticRegression

Some utility functions


In [2]:
def accuracy(predictions, labels):
    """Return the percentage of rows where prediction and label agree.

    Both arguments are 2-D arrays with one row per example; the class of a
    row is the column index of its largest entry.
    """
    predicted_classes = np.argmax(predictions, 1)
    true_classes = np.argmax(labels, 1)
    num_correct = np.sum(predicted_classes == true_classes)
    return 100.0 * num_correct / predictions.shape[0]

# Reshape flat image rows into the 4-D layout used by the convolutional networks.
def reformat(dataset):
    """Return `dataset` reshaped to (-1, image_size, image_size, num_channels) as float32.

    `image_size` and `num_channels` are read from notebook-level globals when the
    function is called, so both must be defined before the first call.
    """
    target_shape = (-1, image_size, image_size, num_channels)
    return dataset.reshape(target_shape).astype(np.float32)

Load the MNIST dataset


In [3]:
# Load MNIST from MNIST_data/ with one-hot encoded labels.
mnist = input_data.read_data_sets("MNIST_data/", one_hot=True)

# Each image is 28*28 pixels and carries one of 10 labels.
image_size = 28
num_labels = 10

train_dataset, train_labels = mnist.train.images, mnist.train.labels

# NOTE(review): `perm` is computed but never applied, so the validation/test
# split below is purely positional (not shuffled). Confirm whether shuffling
# was intended before relying on it.
perm = np.random.permutation(mnist.test.images.shape[0])

# The first 10% of the MNIST test set becomes the validation set; the
# remaining 90% stays as the test set.
split_point = int(mnist.test.images.shape[0] * 0.1)
valid_dataset = mnist.test.images[:split_point]
valid_labels = mnist.test.labels[:split_point]
test_dataset = mnist.test.images[split_point:]
test_labels = mnist.test.labels[split_point:]


Extracting MNIST_data/train-images-idx3-ubyte.gz
Extracting MNIST_data/train-labels-idx1-ubyte.gz
Extracting MNIST_data/t10k-images-idx3-ubyte.gz
Extracting MNIST_data/t10k-labels-idx1-ubyte.gz

Simple logistic regression à la sklearn

Let's set a baseline with a simple logistic regression, omitting regularization, just to see where we are starting from.


In [7]:
# scikit-learn expects one integer class id per sample instead of one-hot rows.
# The ids are derived from the same arrays that are paired with train_dataset /
# test_dataset, so they cannot drift out of sync with the validation/test split,
# and argmax always yields exactly one id per row.
train_labels_not_hot = np.argmax(train_labels, axis=1)
test_labels_not_hot = np.argmax(test_labels, axis=1)

In [10]:
# Baseline classifier built with scikit-learn's default hyper-parameters.
# NOTE(review): the library defaults apply an L2 penalty (C=1.0), so this
# baseline is regularized even though the text above says regularization is
# omitted -- confirm which one is intended.
lr = LogisticRegression()

In [11]:
# Fit on the training split, then display the mean accuracy on the test split.
# `fit` returns the estimator itself, so the score is the value of the cell.
lr.fit(train_dataset, train_labels_not_hot).score(test_dataset, test_labels_not_hot)


Out[11]:
0.9211111111111111

1-hidden layer neural network with rectified linear units nn.relu() and 1024 hidden nodes.

Building the model


In [12]:
batch_size = 128
hidden_units = 1024  # width of the single hidden layer

graph = tf.Graph()
with graph.as_default():

    # Input data. The training minibatch is fed at run time through
    # placeholders; the validation and test sets are embedded as constants.
    with tf.name_scope('input'):
        tf_train_dataset = tf.placeholder(tf.float32,
                                          shape=(batch_size, image_size * image_size))
        tf_train_labels = tf.placeholder(tf.float32, shape=(batch_size, num_labels))
        tf_valid_dataset = tf.constant(valid_dataset)
        tf_test_dataset = tf.constant(test_dataset)

    # Variables and the training-time forward pass.
    with tf.name_scope('hidden'):
        weights_hidden = tf.Variable(
            tf.truncated_normal([image_size * image_size, hidden_units], stddev=0.1),
            name='weights')
        biases_hidden = tf.Variable(tf.constant(0.1, shape=[hidden_units]), name='biases')
        relu_output = tf.nn.relu(tf.matmul(tf_train_dataset, weights_hidden) + biases_hidden)

    with tf.name_scope('output'):
        weights_output = tf.Variable(
            tf.truncated_normal([hidden_units, num_labels], stddev=0.1), name='weights')
        biases_output = tf.Variable(tf.constant(0.1, shape=[num_labels]), name='biases')
        logits = tf.matmul(relu_output, weights_output) + biases_output

    # Pass logits/labels by keyword: the positional (logits, labels) order is
    # rejected by newer TensorFlow releases, while keywords work on old and new.
    loss = tf.reduce_mean(
        tf.nn.softmax_cross_entropy_with_logits(logits=logits, labels=tf_train_labels))

    # Optimizer.
    optimizer = tf.train.GradientDescentOptimizer(0.5).minimize(loss)

    def predict(data):
        """Return softmax class probabilities of the network for `data`.

        Re-uses the hidden/output variables defined above, so the result
        always reflects the current state of training.
        """
        hidden = tf.nn.relu(tf.matmul(data, weights_hidden) + biases_hidden)
        return tf.nn.softmax(tf.matmul(hidden, weights_output) + biases_output)

    # Predictions for the training, validation, and test data.
    train_prediction = tf.nn.softmax(logits)
    valid_prediction = predict(tf_valid_dataset)
    test_prediction = predict(tf_test_dataset)

Training the model


In [13]:
num_steps = 3001

with tf.Session(graph=graph) as session:
    tf.initialize_all_variables().run()
    print("Initialized")
    for step in range(num_steps):
        # Walk through the training set in order, wrapping around so that every
        # slice still holds a full minibatch.
        # Note: we could use better randomization across epochs.
        offset = (step * batch_size) % (train_labels.shape[0] - batch_size)

        # Generate a minibatch.
        batch_data = train_dataset[offset:(offset + batch_size), :]
        batch_labels = train_labels[offset:(offset + batch_size), :]

        # Prepare a dictionary telling the session where to feed the minibatch.
        # The key of the dictionary is the placeholder node of the graph to be fed,
        # and the value is the numpy array to feed to it.
        feed_dict = {tf_train_dataset : batch_data,
                     tf_train_labels : batch_labels}

        _, l, predictions = session.run([optimizer, loss, train_prediction],
                                        feed_dict=feed_dict)

        # Report progress every 500 steps.
        if (step % 500 == 0):
            print("Minibatch loss at step %d: %f" % (step, l))
            print("Minibatch accuracy: %.1f%%" % accuracy(predictions, batch_labels))
            print("Validation accuracy: %.1f%%" % accuracy(valid_prediction.eval(), valid_labels))

    print("Test accuracy: %.1f%%" % accuracy(test_prediction.eval(), test_labels))

    # Merge all the summaries and open event writers under ./train and ./test.
    # NOTE(review): this cell never runs `merged` or adds a summary to either
    # writer, so only the graph definition handed to the ./train writer is
    # recorded.
    merged = tf.merge_all_summaries()
    train_writer = tf.train.SummaryWriter('./train',
                                          session.graph)
    test_writer = tf.train.SummaryWriter('./test')

    # Flush pending events and release the event files instead of leaving
    # the writers open for the lifetime of the kernel.
    train_writer.close()
    test_writer.close()


Initialized
Minibatch loss at step 0: 2.947026
Minibatch accuracy: 6.2%
Validation accuracy: 13.7%
Minibatch loss at step 500: 0.117899
Minibatch accuracy: 96.1%
Validation accuracy: 96.0%
Minibatch loss at step 1000: 0.018200
Minibatch accuracy: 100.0%
Validation accuracy: 97.0%
Minibatch loss at step 1500: 0.094146
Minibatch accuracy: 97.7%
Validation accuracy: 97.3%
Minibatch loss at step 2000: 0.052630
Minibatch accuracy: 99.2%
Validation accuracy: 97.6%
Minibatch loss at step 2500: 0.018096
Minibatch accuracy: 100.0%
Validation accuracy: 97.9%
Minibatch loss at step 3000: 0.004794
Minibatch accuracy: 100.0%
Validation accuracy: 98.0%
Test accuracy: 98.0%

Same network, but with dropout and L2 regularization


In [ ]:
batch_size = 128
beta = 0.001  # weight of the L2 penalty in the loss
hidden_units = 1024  # width of the single hidden layer

graph = tf.Graph()
with graph.as_default():

    # Input data. The training minibatch is fed at run time through
    # placeholders; the validation and test sets are embedded as constants.
    with tf.name_scope('input'):
        tf_train_dataset = tf.placeholder(tf.float32,
                                          shape=(batch_size, image_size * image_size))
        tf_train_labels = tf.placeholder(tf.float32, shape=(batch_size, num_labels))
        tf_valid_dataset = tf.constant(valid_dataset)
        tf_test_dataset = tf.constant(test_dataset)

    # Keep probability for the dropout ops below; fed at run time.
    keep_prob = tf.placeholder(tf.float32)

    # Variables.
    # NOTE(review): dropout is applied to the weight matrices of both layers,
    # not to the layer activations -- confirm that this is the intended variant.
    with tf.name_scope('hidden'):
        weights_hidden = tf.Variable(
            tf.truncated_normal([image_size * image_size, hidden_units], stddev=0.1),
            name='weights')
        weights_hidden_dropped = tf.nn.dropout(weights_hidden, keep_prob)
        biases_hidden = tf.Variable(tf.constant(0.1, shape=[hidden_units]), name='biases')
        relu_output = tf.nn.relu(tf.matmul(tf_train_dataset, weights_hidden_dropped) + biases_hidden)

    with tf.name_scope('output'):
        weights_output = tf.Variable(
            tf.truncated_normal([hidden_units, num_labels], stddev=0.1), name='weights')
        weights_output_dropped = tf.nn.dropout(weights_output, keep_prob)
        biases_output = tf.Variable(tf.constant(0.1, shape=[num_labels]), name='biases')
        logits = tf.matmul(relu_output, weights_output_dropped) + biases_output

    # Pass logits/labels by keyword: the positional (logits, labels) order is
    # rejected by newer TensorFlow releases, while keywords work on old and new.
    loss = tf.reduce_mean(
        tf.nn.softmax_cross_entropy_with_logits(logits=logits, labels=tf_train_labels))
    # The L2 penalty is taken on the output weight variable itself. Penalizing
    # the dropped-out tensor would make the penalty depend on the random mask
    # and on the 1/keep_prob rescaling instead of on the parameters.
    loss = loss + beta * tf.nn.l2_loss(weights_output)

    # Optimizer.
    optimizer = tf.train.GradientDescentOptimizer(0.5).minimize(loss)

    def predict(data):
        """Return softmax class probabilities for `data` using the full weights.

        No dropout is applied here, so evaluation does not depend on keep_prob.
        """
        hidden = tf.nn.relu(tf.matmul(data, weights_hidden) + biases_hidden)
        return tf.nn.softmax(tf.matmul(hidden, weights_output) + biases_output)

    # Predictions for the training, validation, and test data.
    train_prediction = tf.nn.softmax(logits)
    valid_prediction = predict(tf_valid_dataset)
    test_prediction = predict(tf_test_dataset)

num_steps = 3001

# Test accuracy of each run, in the order the keep probabilities are visited.
# Defined here so the cell also works on a fresh kernel.
accuracy_val_nn_l2 = []

# Train one model from scratch per keep probability (0.5, 0.6, ..., 0.9).
for kp in np.arange(0.5, 1, 0.1):
    with tf.Session(graph=graph) as session:
        tf.initialize_all_variables().run()
        print("Initialized")
        for step in range(num_steps):
            # NOTE(review): the trailing `% 10` confines the offset to 0..9, so
            # every minibatch is drawn from the first 137 training rows only.
            # Presumably this is deliberate, to provoke overfitting -- confirm.
            offset = (step * batch_size) % (train_labels.shape[0] - batch_size) % 10

            # Generate a minibatch.
            batch_data = train_dataset[offset:(offset + batch_size), :]
            batch_labels = train_labels[offset:(offset + batch_size), :]

            # Feed the minibatch together with the keep probability of this run.
            feed_dict = {
                tf_train_dataset : batch_data,
                tf_train_labels : batch_labels,
                keep_prob: kp}

            _, l, predictions = session.run([optimizer, loss, train_prediction],
                                            feed_dict=feed_dict)

        # Evaluate the test set once and reuse the value for printing and logging.
        test_accuracy = accuracy(test_prediction.eval(), test_labels)
        print("Keep prob: %s Test accuracy: %.1f%%" % (kp, test_accuracy))
        accuracy_val_nn_l2.append(test_accuracy)

The accuracy measures with different dropout tactics:

Dropout on both layers results:

Initialized
Keep prob: 0.5 Test accuracy: 75.3%
Initialized
Keep prob: 0.6 Test accuracy: 76.7%
Initialized
Keep prob: 0.7 Test accuracy: 76.8%
Initialized
Keep prob: 0.8 Test accuracy: 76.4%
Initialized
Keep prob: 0.9 Test accuracy: 74.1%

Dropout on both layers plus L2 regularization with a 0.0001 beta:

Initialized
Keep prob: 0.5 Test accuracy: 75.9%
Initialized
Keep prob: 0.6 Test accuracy: 76.0%
Initialized
Keep prob: 0.7 Test accuracy: 75.8%
Initialized
Keep prob: 0.8 Test accuracy: 75.3%
Initialized
Keep prob: 0.9 Test accuracy: 75.5%

Convolutional Part

Prepare data and variables for convolutions


In [15]:
# Hyper-parameters for the convolutional models below.
num_channels = 1  # single-channel images
batch_size = 16
patch_size = 5    # side length of the square convolution kernels
depth = 32
num_hidden = 64

# Reshape the flat image rows into 4-D arrays for the convolutional layers.
train_dataset_conv = reformat(train_dataset)
valid_dataset_conv = reformat(valid_dataset)
test_dataset_conv = reformat(test_dataset)

# Sanity check: one (images, labels) shape pair per split.
print(train_dataset_conv.shape, train_labels.shape)
print(valid_dataset_conv.shape, valid_labels.shape)
print(test_dataset_conv.shape, test_labels.shape)


(55000, 28, 28, 1) (55000, 10)
(1000, 28, 28, 1) (1000, 10)
(9000, 28, 28, 1) (9000, 10)

Simple convolutional network, stride=2


In [16]:
depth = 16
graph = tf.Graph()

with graph.as_default():

    # Input data.
    tf_train_dataset = tf.placeholder(tf.float32, shape=(batch_size, image_size, image_size, num_channels))
    tf_train_labels = tf.placeholder(tf.float32, shape=(batch_size, num_labels))

    tf_valid_dataset = tf.constant(valid_dataset_conv)
    tf_test_dataset = tf.constant(test_dataset_conv)

    # Variables.
    layer1_weights = tf.Variable(tf.truncated_normal([patch_size, patch_size, num_channels, depth], stddev=0.1))
    layer1_biases = tf.Variable(tf.zeros([depth]))

    layer2_weights = tf.Variable(tf.truncated_normal([patch_size, patch_size, depth, depth], stddev=0.1))
    layer2_biases = tf.Variable(tf.constant(1.0, shape=[depth]))

    # Two stride-2 convolutions reduce each spatial side by a factor of 4
    # (28 -> 14 -> 7). The side length is computed once and squared explicitly;
    # the unparenthesized `image_size // 4 * image_size // 4` is evaluated left
    # to right and is only correct when image_size is a multiple of 4.
    conv_image_size = image_size // 4
    layer3_weights = tf.Variable(tf.truncated_normal([conv_image_size * conv_image_size * depth, num_hidden],
                                                     stddev=0.1))
    layer3_biases = tf.Variable(tf.constant(1.0, shape=[num_hidden]))

    layer4_weights = tf.Variable(tf.truncated_normal([num_hidden, num_labels], stddev=0.1))
    layer4_biases = tf.Variable(tf.constant(1.0, shape=[num_labels]))

    # Model.
    def model(data):
        """Return the logits of the conv -> conv -> dense -> dense network for `data`.

        `data` must have a statically known batch dimension because the
        flattening step reads it from the tensor's static shape.
        """
        conv = tf.nn.conv2d(data, layer1_weights, [1, 2, 2, 1], padding='SAME')
        hidden = tf.nn.relu(conv + layer1_biases)

        conv = tf.nn.conv2d(hidden, layer2_weights, [1, 2, 2, 1], padding='SAME')
        hidden = tf.nn.relu(conv + layer2_biases)

        # Flatten the feature maps into one row per example.
        shape = hidden.get_shape().as_list()
        reshape = tf.reshape(hidden, [shape[0], shape[1] * shape[2] * shape[3]])

        hidden = tf.nn.relu(tf.matmul(reshape, layer3_weights) + layer3_biases)
        return tf.matmul(hidden, layer4_weights) + layer4_biases

    # Training computation.
    # Pass logits/labels by keyword: the positional (logits, labels) order is
    # rejected by newer TensorFlow releases, while keywords work on old and new.
    logits = model(tf_train_dataset)
    loss = tf.reduce_mean(
        tf.nn.softmax_cross_entropy_with_logits(logits=logits, labels=tf_train_labels))

    # Optimizer.
    optimizer = tf.train.GradientDescentOptimizer(0.05).minimize(loss)

    # Predictions for the training, validation, and test data.
    train_prediction = tf.nn.softmax(logits)
    valid_prediction = tf.nn.softmax(model(tf_valid_dataset))
    test_prediction = tf.nn.softmax(model(tf_test_dataset))

In [17]:
num_steps = 1001

with tf.Session(graph=graph) as session:
    session.run(tf.initialize_all_variables())
    print('Initialized')

    # Offsets wrap modulo this value so that every slice holds a full minibatch.
    offset_limit = train_labels.shape[0] - batch_size

    for step in range(num_steps):
        # Take the next window of `batch_size` consecutive training examples.
        offset = (step * batch_size) % offset_limit
        batch_end = offset + batch_size
        batch_data = train_dataset_conv[offset:batch_end]
        batch_labels = train_labels[offset:batch_end]

        feed_dict = {tf_train_dataset: batch_data, tf_train_labels: batch_labels}

        # One optimization step; also fetch the loss and the minibatch predictions.
        _, l, predictions = session.run(
            [optimizer, loss, train_prediction],
            feed_dict=feed_dict)

        # Only report every 50th step.
        if step % 50 != 0:
            continue
        print('Minibatch loss at step %d: %f' % (step, l))
        print('Minibatch accuracy: %.1f%%' % accuracy(predictions, batch_labels))
        print('Validation accuracy: %.1f%%' % accuracy(valid_prediction.eval(), valid_labels))

    print('Test accuracy: %.1f%%' % accuracy(test_prediction.eval(), test_labels))


Initialized
Minibatch loss at step 0: 4.475591
Minibatch accuracy: 6.2%
Validation accuracy: 12.7%
Minibatch loss at step 50: 2.185037
Minibatch accuracy: 12.5%
Validation accuracy: 23.2%
Minibatch loss at step 100: 0.858086
Minibatch accuracy: 68.8%
Validation accuracy: 62.9%
Minibatch loss at step 150: 0.733255
Minibatch accuracy: 75.0%
Validation accuracy: 74.9%
Minibatch loss at step 200: 1.191509
Minibatch accuracy: 75.0%
Validation accuracy: 80.9%
Minibatch loss at step 250: 0.284247
Minibatch accuracy: 87.5%
Validation accuracy: 80.0%
Minibatch loss at step 300: 0.651432
Minibatch accuracy: 75.0%
Validation accuracy: 78.5%
Minibatch loss at step 350: 0.048483
Minibatch accuracy: 100.0%
Validation accuracy: 86.4%
Minibatch loss at step 400: 0.254464
Minibatch accuracy: 87.5%
Validation accuracy: 86.8%
Minibatch loss at step 450: 0.304775
Minibatch accuracy: 87.5%
Validation accuracy: 82.8%
Minibatch loss at step 500: 0.541844
Minibatch accuracy: 75.0%
Validation accuracy: 84.8%
Minibatch loss at step 550: 0.112686
Minibatch accuracy: 93.8%
Validation accuracy: 87.1%
Minibatch loss at step 600: 0.279609
Minibatch accuracy: 93.8%
Validation accuracy: 89.8%
Minibatch loss at step 650: 0.304930
Minibatch accuracy: 87.5%
Validation accuracy: 91.3%
Minibatch loss at step 700: 0.304760
Minibatch accuracy: 87.5%
Validation accuracy: 89.3%
Minibatch loss at step 750: 0.513473
Minibatch accuracy: 81.2%
Validation accuracy: 90.5%
Minibatch loss at step 800: 0.563449
Minibatch accuracy: 93.8%
Validation accuracy: 89.6%
Minibatch loss at step 850: 0.105676
Minibatch accuracy: 93.8%
Validation accuracy: 92.1%
Minibatch loss at step 900: 0.188333
Minibatch accuracy: 93.8%
Validation accuracy: 90.5%
Minibatch loss at step 950: 0.826272
Minibatch accuracy: 87.5%
Validation accuracy: 90.6%
Minibatch loss at step 1000: 0.213254
Minibatch accuracy: 93.8%
Validation accuracy: 90.6%
Test accuracy: 92.1%

Buffed up convolutional network

  • max pooling
  • dropouts

Some helper functions


In [18]:
def weight_variable(shape):
    """Create a variable of `shape` initialized from a truncated normal (stddev 0.1)."""
    return tf.Variable(tf.truncated_normal(shape, stddev=0.1))

def bias_variable(shape):
    """Create a variable of `shape` with every entry initialized to 0.1."""
    return tf.Variable(tf.constant(0.1, shape=shape))

def conv2d(x, W):
    """Convolve `x` with the filter `W` using stride 1 and SAME padding."""
    unit_strides = [1, 1, 1, 1]
    return tf.nn.conv2d(x, W, strides=unit_strides, padding='SAME')

def max_pool_2x2(x):
    """Max-pool `x` over 2x2 windows with stride 2 and SAME padding."""
    # The same [1, 2, 2, 1] spec serves as both window size and stride.
    window = [1, 2, 2, 1]
    return tf.nn.max_pool(x, ksize=window, strides=window, padding='SAME')

In [19]:
depth = 32
graph = tf.Graph()


with graph.as_default():

    # Dropout keep probability; supplied through feed_dict on every run.
    keep_prob = tf.placeholder(tf.float32)

    # Input data. The leading None leaves the batch dimension undefined, so the
    # same placeholders accept minibatches as well as whole datasets.
    tf_train_batch = tf.placeholder(tf.float32, shape=(None, image_size, image_size, num_channels))
    tf_train_labels = tf.placeholder(tf.float32, shape=(None, num_labels))

    # Constants
    tf_valid_dataset = tf.constant(valid_dataset_conv)
    tf_test_dataset = tf.constant(test_dataset_conv)

    # Variables.
    # Convolutional layers: `depth` feature maps first, then twice as many.
    h_conv1_weights = weight_variable([patch_size, patch_size, num_channels, depth])
    h_conv1_biases = bias_variable([depth])

    h_conv2_weights = weight_variable([patch_size, patch_size, depth, depth * 2])
    h_conv2_biases = bias_variable([depth * 2])

    # Two 2x2 poolings shrink each spatial side by a factor of 4.
    conv_image_size = image_size // 4
    flat_size = conv_image_size * conv_image_size * depth * 2
    fc1_weights = weight_variable([flat_size, num_hidden])
    fc1_biases = bias_variable([num_hidden])

    output_softmax_weights = weight_variable([num_hidden, num_labels])
    output_softmax_biases = bias_variable([num_labels])

    # Define the model:
    # First layer: patch_size x patch_size convolution, ReLU, 2x2 max pooling.
    conv1_preact = conv2d(tf_train_batch, h_conv1_weights) + h_conv1_biases
    h_conv1 = tf.nn.relu(conv1_preact)
    h_pool1 = max_pool_2x2(h_conv1)

    # Second layer: same structure on top of the pooled first-layer features.
    conv2_preact = conv2d(h_pool1, h_conv2_weights) + h_conv2_biases
    h_conv2 = tf.nn.relu(conv2_preact)
    h_pool2 = max_pool_2x2(h_conv2)

    # Flatten the feature maps into one row per example for the dense layer.
    h_pool2_flat = tf.reshape(h_pool2, [-1, flat_size])

    # Fully connected layer followed by dropout on its activations.
    fc1_preact = tf.matmul(h_pool2_flat, fc1_weights) + fc1_biases
    h_fc1 = tf.nn.relu(fc1_preact)
    h_fc1_drop = tf.nn.dropout(h_fc1, keep_prob)

    # Readout layer: class probabilities.
    readout = tf.matmul(h_fc1_drop, output_softmax_weights) + output_softmax_biases
    y_conv = tf.nn.softmax(readout)

In [20]:
with tf.Session(graph=graph) as sess:
    # Training computation: mean cross-entropy between the one-hot labels and
    # the softmax output. The probabilities are clipped away from zero before
    # the log so that a 0 in y_conv cannot turn the loss into inf/NaN.
    clipped_probs = tf.clip_by_value(y_conv, 1e-10, 1.0)
    cross_entropy = tf.reduce_mean(
        -tf.reduce_sum(tf_train_labels * tf.log(clipped_probs), reduction_indices=[1]))

    # Optimizer
    train_step = tf.train.AdamOptimizer(1e-4).minimize(cross_entropy)

    # These two ops measure the accuracy of the model.
    # y_conv is a softmax output; its highest entry is the most probable class
    # according to the model (e.g.: [0.1, 0.1, 0.1, 0.7]).
    # tf_train_labels holds the one-hot labels fed with the same batch
    # (e.g.: [0, 0, 0, 1]).
    # The tensor is named `accuracy_op` so that it does not overwrite the
    # `accuracy()` helper function that the earlier training cells call.
    correct_prediction = tf.equal(tf.argmax(y_conv, 1), tf.argmax(tf_train_labels, 1))
    accuracy_op = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))

    # Initialize the session variables.
    sess.run(tf.initialize_all_variables())

    for step in range(3001):

        offset = (step * batch_size) % (train_labels.shape[0] - batch_size)

        # I should randomize this part a bit more to reduce the possibility of recurring batches.
        batch_data = train_dataset_conv[offset:(offset + batch_size), :, :, :]
        batch_labels = train_labels[offset:(offset + batch_size), :]

        # Every 100 steps report the accuracy on the current minibatch, with
        # dropout disabled (keep_prob = 1.0), before training on that batch.
        if step % 100 == 0:
            train_accuracy = accuracy_op.eval(feed_dict={tf_train_batch:  batch_data,
                                                         tf_train_labels: batch_labels,
                                                         keep_prob: 1.0})
            print("step %d, training accuracy %g" % (step, train_accuracy))

        # One training step with half of the dense-layer activations dropped.
        train_step.run(feed_dict={tf_train_batch:  batch_data,
                                  tf_train_labels: batch_labels,
                                  keep_prob: 0.5})

    # Final evaluation on the whole test set, again without dropout.
    print("test accuracy %g" % accuracy_op.eval(feed_dict={tf_train_batch: test_dataset_conv,
                                                           tf_train_labels: test_labels,
                                                           keep_prob: 1.0}))


step 0, training accuracy 0.1875
step 100, training accuracy 0.6875
step 200, training accuracy 0.4375
step 300, training accuracy 0.75
step 400, training accuracy 0.875
step 500, training accuracy 0.8125
step 600, training accuracy 1
step 700, training accuracy 0.5625
step 800, training accuracy 0.6875
step 900, training accuracy 0.9375
step 1000, training accuracy 0.8125
step 1100, training accuracy 0.9375
step 1200, training accuracy 0.8125
step 1300, training accuracy 0.875
step 1400, training accuracy 1
step 1500, training accuracy 0.9375
step 1600, training accuracy 0.8125
step 1700, training accuracy 0.9375
step 1800, training accuracy 0.9375
step 1900, training accuracy 0.875
step 2000, training accuracy 1
step 2100, training accuracy 0.8125
step 2200, training accuracy 1
step 2300, training accuracy 1
step 2400, training accuracy 1
step 2500, training accuracy 0.9375
step 2600, training accuracy 1
step 2700, training accuracy 1
step 2800, training accuracy 0.9375
step 2900, training accuracy 1
step 3000, training accuracy 1
test accuracy 0.952111

After 20,000 iterations it achieved a test accuracy of 0.986...


In [ ]: