This notebook contains code to train a neural network with a single hidden layer on MNIST. At the end is a short exercise to add a second hidden layer, transforming it into a deep neural network.


In [ ]:
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import tensorflow as tf
from tensorflow.examples.tutorials.mnist import input_data

import numpy as np

import math
import os

%matplotlib inline
import matplotlib.pyplot as plt

In [ ]:
LOGDIR = './graphs'

In [ ]:
tf.reset_default_graph()
sess = tf.Session()

In [ ]:
mnist = input_data.read_data_sets('/tmp/data', one_hot=True)

In [ ]:
NUM_CLASSES = 10
NUM_PIXELS = 28 * 28
TRAIN_STEPS = 1000
BATCH_SIZE = 100

HIDDEN1_UNITS = 128

LEARNING_RATE = 0.5

In [ ]:
x = tf.placeholder(tf.float32, [None, NUM_PIXELS], name="pixels")
y_ = tf.placeholder(tf.float32, [None, NUM_CLASSES], name="labels")

# helper functions to create weight and bias variables
# with proper initialization
def weight_variable(inputs, outputs, name):
    # why do we initialize weights this way?
    # see http://cs231n.github.io/neural-networks-2/ for more details
    initial = tf.truncated_normal(shape=[inputs, outputs], stddev=1.0 / math.sqrt(float(inputs)))
    return tf.Variable(initial, name=name)

def bias_variable(shape, name):
    initial = tf.constant(0.1, shape=[shape])
    return tf.Variable(initial, name=name)

with tf.name_scope("hidden_layer_1"):

    # weights and biases for the first layer
    weights1 = weight_variable(NUM_PIXELS, HIDDEN1_UNITS, "weights1")
    biases1 = bias_variable(HIDDEN1_UNITS, "biases1")

    # activations for the first hidden layer
    hidden1 = tf.nn.relu(tf.matmul(x, weights1) + biases1, name="hidden1")

with tf.name_scope("output_layer"):
    
    # weights and biases for the second layer
    weights2 = weight_variable(HIDDEN1_UNITS, NUM_CLASSES, "weights2")
    biases2 = bias_variable(NUM_CLASSES, "biases2")

    # logits - you can think of these (roughly)
    # as unnormalized probabilities, or the amount of
    # evidence we have that the input image corresponds to
    # each digit
    y = tf.matmul(hidden1, weights2) + biases2

with tf.name_scope("loss"):
    loss = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(logits=y, labels=y_))
    tf.summary.scalar('loss', loss)

with tf.name_scope("optimizer"):
    train = tf.train.GradientDescentOptimizer(LEARNING_RATE).minimize(loss)
    

# summary writers for TensorBoard: one for training metrics (plus the graph)
# and one for test metrics
train_writer = tf.summary.FileWriter(os.path.join(LOGDIR, "train"))
train_writer.add_graph(sess.graph)

test_writer = tf.summary.FileWriter(os.path.join(LOGDIR, "test"))

# a prediction is correct when the most likely class (the argmax of the logits)
# matches the true label; accuracy is the mean over the batch
correct_prediction = tf.equal(tf.argmax(y, 1), tf.argmax(y_, 1))
accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))
tf.summary.scalar('accuracy', accuracy)

# merge the loss and accuracy summaries into a single op
summary_op = tf.summary.merge_all()

sess.run(tf.global_variables_initializer())

for step in range(TRAIN_STEPS):
    # fetch a mini-batch and run one optimization step,
    # collecting the merged summaries for TensorBoard
    batch_xs, batch_ys = mnist.train.next_batch(BATCH_SIZE)
    summary_result, _ = sess.run([summary_op, train],
                                 feed_dict={x: batch_xs, y_: batch_ys})

    train_writer.add_summary(summary_result, step)
    train_writer.add_run_metadata(tf.RunMetadata(), 'step%03d' % step)
    
    # calculate accuracy on the test set
    if step % 100 == 0:
        summary_result, acc = sess.run([summary_op, accuracy], 
                                       feed_dict={x: mnist.test.images, 
                                                  y_: mnist.test.labels})
        test_writer.add_summary(summary_result, step)
        test_writer.add_run_metadata(tf.RunMetadata(), 'step%03d' % step)
        print ("test accuracy: %f at step %d" % (acc, step))


print("Accuracy %f" % sess.run(accuracy, 
                               feed_dict={x: mnist.test.images,
                                          y_: mnist.test.labels}))

train_writer.close()
test_writer.close()

Exercise

Add a second hidden layer, with 64 units, to the code above. Then experiment with the hyperparameters (batch size, number of training steps, learning rate, units per layer) to see whether you can reach higher accuracy than the single-hidden-layer model. Keep in mind that there is some randomness between runs, so compare results over a few runs. If you get stuck, one possible sketch follows the empty cell below.


In [ ]:
# Put your solution here or modify the above code.
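
The cell below is one possible sketch of the layer definitions for a two-hidden-layer network, not the only solution. It reuses the weight_variable and bias_variable helpers and the placeholders x and y_ defined above, introduces HIDDEN2_UNITS = 64 as a new constant for illustration, and is meant to replace the layer-definition portion of the large cell above; the loss, optimizer, and training loop can stay the same.


In [ ]:
# One possible sketch (assumes the helpers, placeholders, and constants
# defined earlier in this notebook; HIDDEN2_UNITS is introduced here).
HIDDEN2_UNITS = 64

with tf.name_scope("hidden_layer_1"):
    # first hidden layer: pixels -> HIDDEN1_UNITS, with ReLU activation
    weights1 = weight_variable(NUM_PIXELS, HIDDEN1_UNITS, "weights1")
    biases1 = bias_variable(HIDDEN1_UNITS, "biases1")
    hidden1 = tf.nn.relu(tf.matmul(x, weights1) + biases1, name="hidden1")

with tf.name_scope("hidden_layer_2"):
    # second hidden layer: takes the first layer's activations as input
    weights2 = weight_variable(HIDDEN1_UNITS, HIDDEN2_UNITS, "weights2")
    biases2 = bias_variable(HIDDEN2_UNITS, "biases2")
    hidden2 = tf.nn.relu(tf.matmul(hidden1, weights2) + biases2, name="hidden2")

with tf.name_scope("output_layer"):
    # output layer: maps the second hidden layer to the class logits
    weights3 = weight_variable(HIDDEN2_UNITS, NUM_CLASSES, "weights3")
    biases3 = bias_variable(NUM_CLASSES, "biases3")
    y = tf.matmul(hidden2, weights3) + biases3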