In [1]:
# These are all the modules we'll be using later. Make sure you can import them
# before proceeding further.
import cPickle as pickle
import numpy as np
import tensorflow as tf
First reload the data we generated in notmnist.ipynb.
In [2]:
# Pickle holding the train / validation / test splits produced by the
# data-preparation notebook.
pickle_file = 'notMNIST.pickle'

# NOTE(review): unpickling can execute arbitrary code -- only load a file that
# you generated yourself.
with open(pickle_file, 'rb') as pickle_handle:
    splits = pickle.load(pickle_handle)
    # Bind each split to the notebook-level names used by every later cell.
    train_dataset, train_labels = splits['train_dataset'], splits['train_labels']
    valid_dataset, valid_labels = splits['valid_dataset'], splits['valid_labels']
    test_dataset, test_labels = splits['test_dataset'], splits['test_labels']
    del splits  # hint to help gc free up memory

# Single-argument print(...) produces the same text on Python 2 and 3.
print("%s %s %s" % ('Training set', train_dataset.shape, train_labels.shape))
print("%s %s %s" % ('Validation set', valid_dataset.shape, valid_labels.shape))
print("%s %s %s" % ('Test set', test_dataset.shape, test_labels.shape))
Reformat into a shape that's more adapted to the models we're going to train:
In [3]:
image_size = 28  # images are image_size x image_size pixels
num_labels = 10  # number of target classes


def reformat(dataset, labels):
    """Flatten the images and one-hot encode the labels.

    Args:
      dataset: array that can be reshaped to rows of image_size * image_size
        values.
      labels: 1-D array of integer class ids.

    Returns:
      A (dataset, labels) pair of float32 arrays: one flattened image per row,
      and one one-hot row of num_labels entries per label.
    """
    pixels_per_image = image_size * image_size
    flat_images = dataset.reshape((-1, pixels_per_image)).astype(np.float32)
    # Map 0 to [1.0, 0.0, 0.0 ...], 1 to [0.0, 1.0, 0.0 ...]: comparing a
    # column of labels against the row of class ids broadcasts to a one-hot
    # matrix.
    class_ids = np.arange(num_labels)
    one_hot = (labels[:, None] == class_ids).astype(np.float32)
    return flat_images, one_hot
# Flatten every split and one-hot encode its labels. The notebook-level names
# are rebound, and reformat() indexes labels as a 1-D array, so this cell is
# meant to run once per data load.
train_dataset, train_labels = reformat(train_dataset, train_labels)
valid_dataset, valid_labels = reformat(valid_dataset, valid_labels)
test_dataset, test_labels = reformat(test_dataset, test_labels)

# Single-argument print(...) produces the same text on Python 2 and 3.
print("%s %s %s" % ('Training set', train_dataset.shape, train_labels.shape))
print("%s %s %s" % ('Validation set', valid_dataset.shape, valid_labels.shape))
print("%s %s %s" % ('Test set', test_dataset.shape, test_labels.shape))
In [4]:
def accuracy(predictions, labels):
    """Percentage of rows whose top-scoring class matches the label.

    Args:
      predictions: 2-D array of per-class scores, one row per example.
      labels: 2-D array with one row per example (one-hot rows in this
        notebook); the position of the row maximum is taken as the class.

    Returns:
      Accuracy as a percentage of predictions.shape[0].
    """
    predicted_classes = np.argmax(predictions, 1)
    true_classes = np.argmax(labels, 1)
    num_correct = np.sum(predicted_classes == true_classes)
    return 100.0 * num_correct / predictions.shape[0]
Introduce and tune L2 regularization for both logistic and neural network models. Remember that L2 amounts to adding a penalty on the norm of the weights to the loss. In TensorFlow, you can compute the L2 loss for a tensor `t` using `nn.l2_loss(t)`. The right amount of regularization should improve your validation / test accuracy.
In [95]:
n_hidden = 1024     # number of units in the hidden layer
L2_weight = 0.5e-3  # multiplier applied to the L2 penalty in get_loss()


def forward(tf_X):
    """Build a one-hidden-layer ReLU network and its L2 penalty.

    Args:
      tf_X: input tensor that is matrix-multiplied with an
        (image_size * image_size, n_hidden) weight matrix.

    Returns:
      A (logits, l2_penalty) pair: the output-layer pre-activations and the
      sum of tf.nn.l2_loss over both weight matrices (biases are not
      penalised).
    """
    n_inputs = image_size * image_size

    with tf.name_scope('hidden1'):
        hidden_w = tf.Variable(tf.truncated_normal([n_inputs, n_hidden]), name="weights")
        hidden_b = tf.Variable(tf.zeros([n_hidden]), name="biases")
        hidden_pre = tf.matmul(tf_X, hidden_w) + hidden_b
        hidden_act = tf.nn.relu(hidden_pre)
        hidden_penalty = tf.nn.l2_loss(hidden_w)

    with tf.name_scope('z12'):
        out_w = tf.Variable(tf.truncated_normal([n_hidden, num_labels]), name="weights")
        out_b = tf.Variable(tf.zeros([num_labels]), name="biases")
        logits = tf.matmul(hidden_act, out_w) + out_b
        out_penalty = tf.nn.l2_loss(out_w)

    l2_penalty = hidden_penalty + out_penalty
    return logits, l2_penalty
# Define loss
def get_loss(z12, l2_loss, tf_Y):
    """Mean softmax cross-entropy plus the weighted L2 penalty.

    Args:
      z12: logits tensor produced by the network.
      l2_loss: L2 penalty term to be scaled by the notebook-level L2_weight.
      tf_Y: label tensor matching z12 (one-hot rows in this notebook).

    Returns:
      tf.reduce_mean of the per-example cross-entropy, plus
      L2_weight * l2_loss.
    """
    # Use the labels that were passed in rather than a notebook-global
    # placeholder, and name both arguments so their order cannot be confused
    # (newer TensorFlow releases reject the positional form).
    cross_entropy = tf.nn.softmax_cross_entropy_with_logits(logits=z12, labels=tf_Y)
    loss = tf.reduce_mean(cross_entropy)
    total_loss = loss + L2_weight * l2_loss
    return total_loss
# Define the network graph
graph = tf.Graph()
with graph.as_default():
    # The batch dimension is left as None so the same placeholders accept
    # mini-batches as well as the full validation / test sets, while the
    # per-example width is checked whenever data is fed.
    tf_training_dataset = tf.placeholder(tf.float32, shape=(None, image_size * image_size))
    tf_training_labels = tf.placeholder(tf.float32, shape=(None, num_labels))

    z12, l2_loss = forward(tf_training_dataset)
    total_loss = get_loss(z12, l2_loss, tf_training_labels)
    # Plain gradient descent with a fixed learning rate of 0.5.
    optimizer = tf.train.GradientDescentOptimizer(0.5).minimize(total_loss)
In [96]:
# train the model
num_steps = 3001
batch_size = 128

with tf.Session(graph=graph) as session:
    tf.initialize_all_variables().run()
    print("Initialized, using batch size: %s" % batch_size)

    for step in xrange(num_steps):
        # Generate a minibatch from randomly drawn row indices
        # (np.random.randint samples with replacement).
        batch_rows = np.random.randint(train_dataset.shape[0], size=batch_size)
        batch_data = train_dataset[batch_rows]
        batch_labels = train_labels[batch_rows]

        # One optimisation step; the loss and logits come from the same run.
        _, batch_loss, predictions = session.run(
            [optimizer, total_loss, z12],
            feed_dict={tf_training_dataset: batch_data,
                       tf_training_labels: batch_labels})

        if step % 500 != 0:
            continue

        # Progress report every 500 steps.
        print("Updated batch size: %s" % batch_size)
        print("%s %s %s %s" % ("Minibatch loss at step", step, ":", batch_loss))
        print("Minibatch accuracy: %.1f%%" % accuracy(predictions, batch_labels))
        predictions = session.run(z12, feed_dict={tf_training_dataset: valid_dataset})
        print("Validation accuracy: %.1f%%" % accuracy(predictions, valid_labels))

    # Final evaluation on the held-out test set.
    predictions = session.run(z12, feed_dict={tf_training_dataset: test_dataset})
    print("Test accuracy: %.1f%%" % accuracy(predictions, test_labels))
In [58]:
# Quick inspection: show the dimensions of train_dataset as the cell output.
train_dataset.shape
Out[58]:
In [97]:
# Overfitting using very small subset of data
num_steps = 3001
batch_size = 100
# Only the first 1% of the training rows are ever sampled. Floor division (//)
# keeps this bound an integer whatever the interpreter's division rules are,
# and computing it once avoids repeating it on every step.
overfit_rows = train_dataset.shape[0] // 100

with tf.Session(graph=graph) as session:
    tf.initialize_all_variables().run()
    print("Initialized, using batch size: %s" % batch_size)
    for step in xrange(num_steps):
        # Generate a minibatch from the restricted index range
        # (np.random.randint samples with replacement).
        idx = np.random.randint(overfit_rows, size=batch_size)
        batch_data = train_dataset[idx]
        batch_labels = train_labels[idx]
        feed_dict = {tf_training_dataset: batch_data, tf_training_labels: batch_labels}
        _, l, predictions = session.run([optimizer, total_loss, z12], feed_dict=feed_dict)
        if step % 500 == 0:
            # Progress report every 500 steps.
            print("Updated batch size: %s" % batch_size)
            print("%s %s %s %s" % ("Minibatch loss at step", step, ":", l))
            print("Minibatch accuracy: %.1f%%" % accuracy(predictions, batch_labels))
            predictions = session.run(z12, feed_dict={tf_training_dataset: valid_dataset})
            print("Validation accuracy: %.1f%%" % accuracy(predictions, valid_labels))
    # Final evaluation on the held-out test set.
    predictions = session.run(z12, feed_dict={tf_training_dataset: test_dataset})
    print("Test accuracy: %.1f%%" % accuracy(predictions, test_labels))
Introduce Dropout on the hidden layer of the neural network. Remember: Dropout should only be introduced during training, not evaluation, otherwise your evaluation results would be stochastic as well. TensorFlow provides `nn.dropout()` for that, but you have to make sure it's only inserted during training.
What happens to our extreme overfitting case?
In [105]:
batch_size = 128
n_hidden = 1024     # number of units in the hidden layer
L2_weight = 0.5e-3  # multiplier applied to the L2 penalty in get_loss()


def forward(tf_X, dropout_p):
    """Build a one-hidden-layer ReLU network with dropout on the hidden layer.

    Args:
      tf_X: input tensor that is matrix-multiplied with an
        (image_size * image_size, n_hidden) weight matrix.
      dropout_p: passed as the second argument of tf.nn.dropout, i.e. the
        probability of KEEPING a hidden unit (despite the name); feeding 1
        disables dropout.

    Returns:
      A (logits, l2_penalty) pair: the output-layer pre-activations and the
      sum of tf.nn.l2_loss over both weight matrices (biases are not
      penalised).
    """
    n_inputs = image_size * image_size

    with tf.name_scope('hidden1'):
        hidden_w = tf.Variable(tf.truncated_normal([n_inputs, n_hidden]), name="weights")
        hidden_b = tf.Variable(tf.zeros([n_hidden]), name="biases")
        hidden_pre = tf.matmul(tf_X, hidden_w) + hidden_b
        hidden_act = tf.nn.relu(hidden_pre)
        hidden_out = tf.nn.dropout(hidden_act, dropout_p)  # Added dropout
        hidden_penalty = tf.nn.l2_loss(hidden_w)

    with tf.name_scope('z12'):
        out_w = tf.Variable(tf.truncated_normal([n_hidden, num_labels]), name="weights")
        out_b = tf.Variable(tf.zeros([num_labels]), name="biases")
        logits = tf.matmul(hidden_out, out_w) + out_b
        out_penalty = tf.nn.l2_loss(out_w)

    l2_penalty = hidden_penalty + out_penalty
    return logits, l2_penalty
# Define loss
def get_loss(z12, l2_loss, tf_Y):
    """Mean softmax cross-entropy plus the weighted L2 penalty.

    Args:
      z12: logits tensor produced by the network.
      l2_loss: L2 penalty term to be scaled by the notebook-level L2_weight.
      tf_Y: label tensor matching z12 (one-hot rows in this notebook).

    Returns:
      tf.reduce_mean of the per-example cross-entropy, plus
      L2_weight * l2_loss.
    """
    # Use the labels that were passed in rather than a notebook-global
    # placeholder, and name both arguments so their order cannot be confused
    # (newer TensorFlow releases reject the positional form).
    cross_entropy = tf.nn.softmax_cross_entropy_with_logits(logits=z12, labels=tf_Y)
    loss = tf.reduce_mean(cross_entropy)
    total_loss = loss + L2_weight * l2_loss
    return total_loss
# Define the network graph
# NOTE(review): this reaches into a private module path; TensorFlow releases
# that expose tf.reset_default_graph() should use that instead -- confirm
# against the installed version. The model below lives in its own tf.Graph.
tf.python.framework.ops.reset_default_graph()
graph = tf.Graph()
with graph.as_default():
    # The batch dimension is left as None so the same placeholders accept
    # mini-batches as well as the full validation / test sets, while the
    # per-example width is checked whenever data is fed.
    tf_training_dataset = tf.placeholder(tf.float32, shape=(None, image_size * image_size))
    tf_training_labels = tf.placeholder(tf.float32, shape=(None, num_labels))
    # Keep probability handed to tf.nn.dropout inside forward(); feed 1 to
    # switch dropout off for evaluation.
    dropout_p = tf.placeholder(tf.float32)

    z12, l2_loss = forward(tf_training_dataset, dropout_p)
    total_loss = get_loss(z12, l2_loss, tf_training_labels)
    # Plain gradient descent with a fixed learning rate of 0.5.
    optimizer = tf.train.GradientDescentOptimizer(0.5).minimize(total_loss)
In [109]:
# train the model
num_steps = 3001
batch_size = 128

with tf.Session(graph=graph) as session:
    tf.initialize_all_variables().run()
    print("Initialized, using batch size: %s" % batch_size)
    for step in xrange(num_steps):
        # Generate a minibatch from randomly drawn row indices
        # (np.random.randint samples with replacement).
        idx = np.random.randint(train_dataset.shape[0], size=batch_size)
        batch_data = train_dataset[idx]
        batch_labels = train_labels[idx]

        # Training step: dropout_p is the keep probability, so 0.5 drops about
        # half of the hidden activations.
        feed_dict = {tf_training_dataset: batch_data, tf_training_labels: batch_labels, dropout_p: 0.5}
        _, l = session.run([optimizer, total_loss], feed_dict=feed_dict)

        if step % 500 == 0:
            # The dropout-free forward pass over the minibatch is only needed
            # for this report, so it runs here instead of on every step.
            predictions = session.run(z12, feed_dict={tf_training_dataset: batch_data, dropout_p: 1})
            print("Updated batch size: %s" % batch_size)
            print("%s %s %s %s" % ("Minibatch loss at step", step, ":", l))
            print("Minibatch accuracy: %.1f%%" % accuracy(predictions, batch_labels))
            predictions = session.run(z12, feed_dict={tf_training_dataset: valid_dataset, dropout_p: 1})
            print("Validation accuracy: %.1f%%" % accuracy(predictions, valid_labels))

    # Final evaluation on the held-out test set, dropout disabled.
    predictions = session.run(z12, feed_dict={tf_training_dataset: test_dataset, dropout_p: 1})
    print("Test accuracy: %.1f%%" % accuracy(predictions, test_labels))
In [108]:
# train the model using smaller sample resulting in overfitting
num_steps = 3001
batch_size = 100
# Restrict training to the first 2000 rows; slicing once up front avoids
# repeating the same slices on every step.
subset_data = train_dataset[:2000]
subset_labels = train_labels[:2000]

with tf.Session(graph=graph) as session:
    tf.initialize_all_variables().run()
    print("Initialized, using batch size: %s" % batch_size)
    for step in xrange(num_steps):
        # Generate a minibatch from the subset
        # (np.random.randint samples with replacement).
        idx = np.random.randint(subset_data.shape[0], size=batch_size)
        batch_data = subset_data[idx]
        batch_labels = subset_labels[idx]

        # Training step: dropout_p is the keep probability, so 0.5 drops about
        # half of the hidden activations.
        feed_dict = {tf_training_dataset: batch_data, tf_training_labels: batch_labels, dropout_p: 0.5}
        _, l = session.run([optimizer, total_loss], feed_dict=feed_dict)

        if step % 500 == 0:
            # The dropout-free forward pass over the minibatch is only needed
            # for this report, so it runs here instead of on every step.
            predictions = session.run(z12, feed_dict={tf_training_dataset: batch_data, dropout_p: 1})
            print("Updated batch size: %s" % batch_size)
            print("%s %s %s %s" % ("Minibatch loss at step", step, ":", l))
            print("Minibatch accuracy: %.1f%%" % accuracy(predictions, batch_labels))
            predictions = session.run(z12, feed_dict={tf_training_dataset: valid_dataset, dropout_p: 1})
            print("Validation accuracy: %.1f%%" % accuracy(predictions, valid_labels))

    # Final evaluation on the held-out test set, dropout disabled.
    predictions = session.run(z12, feed_dict={tf_training_dataset: test_dataset, dropout_p: 1})
    print("Test accuracy: %.1f%%" % accuracy(predictions, test_labels))
Try to get the best performance you can using a multi-layer model! The best reported test accuracy using a deep network is 97.1%.
One avenue you can explore is to add multiple layers.
Another one is to use learning rate decay:
global_step = tf.Variable(0) # count the number of steps taken.
learning_rate = tf.train.exponential_decay(0.5, global_step, ...)
optimizer = tf.train.GradientDescentOptimizer(learning_rate).minimize(loss, global_step=global_step)
In [186]:
## BEST MODEL
# Results noted alongside this configuration:
#   [Step: 5000] Minibatch loss 12.6376, accuracy: 89.5%
#   [Step: 5000] Validation loss 12.6891, accuracy: 86.9%
#   Test loss 12.4793, accuracy: 93.0%
batch_size = 128
n_hidden = 1024     # number of units in the hidden layer
L2_weight = 0.5e-3  # multiplier applied to the L2 penalty in get_loss()


def forward(tf_X, dropout_p, input_keep_prob=0.9):
    """Build a one-hidden-layer ReLU network with input and hidden dropout.

    Args:
      tf_X: input tensor that is matrix-multiplied with an
        (image_size * image_size, n_hidden) weight matrix.
      dropout_p: passed as the second argument of tf.nn.dropout on the hidden
        activations, i.e. the probability of KEEPING a unit (despite the
        name); feeding 1 disables hidden dropout.
      input_keep_prob: keep probability for dropout applied to the inputs.
        Defaults to 0.9, the value that used to be hard-coded. With a
        constant here, input dropout is active whenever the graph is run,
        evaluation included; pass a placeholder (or 1.0) to control it.

    Returns:
      A (outputs, l2_penalty) pair: the output-layer pre-activations and the
      sum of tf.nn.l2_loss over both weight matrices (biases are not
      penalised).
    """
    l2_terms = []

    with tf.name_scope('hidden1'):
        weights = tf.Variable(tf.truncated_normal([image_size*image_size, n_hidden]), name="weights")
        biases = tf.Variable(tf.zeros([n_hidden]), name="biases")
        # Dropout on the raw inputs, then the affine map.
        z01 = tf.matmul(tf.nn.dropout(tf_X, input_keep_prob), weights) + biases
        hidden1 = tf.nn.dropout(tf.nn.relu(z01), dropout_p)  # Added dropout
        l2_terms.append(tf.nn.l2_loss(weights))

    # A second hidden layer (n_hidden x n_hidden, tanh + dropout) was tried at
    # this point and is currently disabled; the output layer reads hidden1.

    with tf.name_scope('outputs'):
        weights = tf.Variable(tf.truncated_normal([n_hidden, num_labels]), name="weights")
        biases = tf.Variable(tf.zeros([num_labels]), name="biases")
        # Raw logits; the softmax is applied inside the loss.
        outputs = tf.matmul(hidden1, weights) + biases
        l2_terms.append(tf.nn.l2_loss(weights))

    # Built-in sum starts from 0 and adds the terms left to right.
    return outputs, sum(l2_terms)
# Define loss
def get_loss(outputs, l2_loss, tf_Y):
    """Mean softmax cross-entropy plus the weighted L2 penalty.

    Args:
      outputs: logits tensor produced by the network.
      l2_loss: L2 penalty term to be scaled by the notebook-level L2_weight.
      tf_Y: label tensor matching outputs (one-hot rows in this notebook).

    Returns:
      tf.reduce_mean of the per-example cross-entropy, plus
      L2_weight * l2_loss.
    """
    # Use the labels that were passed in rather than a notebook-global
    # placeholder, and name both arguments so their order cannot be confused
    # (newer TensorFlow releases reject the positional form).
    cross_entropy = tf.nn.softmax_cross_entropy_with_logits(logits=outputs, labels=tf_Y)
    loss = tf.reduce_mean(cross_entropy)
    total_loss = loss + L2_weight * l2_loss
    return total_loss
# Define the network graph
# NOTE(review): this reaches into a private module path; TensorFlow releases
# that expose tf.reset_default_graph() should use that instead -- confirm
# against the installed version. The model below lives in its own tf.Graph.
tf.python.framework.ops.reset_default_graph()
graph = tf.Graph()
with graph.as_default():
    # The batch dimension is left as None so the same placeholders accept
    # mini-batches as well as the full validation / test sets, while the
    # per-example width is checked whenever data is fed.
    tf_training_dataset = tf.placeholder(tf.float32, shape=(None, image_size * image_size))
    tf_training_labels = tf.placeholder(tf.float32, shape=(None, num_labels))
    # Keep probability handed to tf.nn.dropout on the hidden layer inside
    # forward(); feeding 1 switches that dropout off.
    dropout_p = tf.placeholder(tf.float32)

    outputs, l2_loss = forward(tf_training_dataset, dropout_p)
    total_loss = get_loss(outputs, l2_loss, tf_training_labels)

    global_step = tf.Variable(0, trainable=False)  # count the number of steps taken.
    # Learning-rate decay is currently switched off in favour of a constant rate:
    #learning_rate = tf.train.exponential_decay(0.5, global_step, 10000, 0.96)
    learning_rate = 0.5
    optimizer = tf.train.GradientDescentOptimizer(learning_rate).minimize(total_loss, global_step=global_step)
In [188]:
# train the model
num_steps = 5001
batch_size = 128

with tf.Session(graph=graph) as session:
    tf.initialize_all_variables().run()
    print("Initialized, using batch size: %s" % batch_size)
    for step in xrange(num_steps):
        # Generate a minibatch from randomly drawn row indices
        # (np.random.randint samples with replacement).
        idx = np.random.randint(train_dataset.shape[0], size=batch_size)
        batch_data = train_dataset[idx]
        batch_labels = train_labels[idx]

        # NOTE(review): dropout_p is fed as 1 during training too, so the
        # hidden-layer dropout is effectively off for this run -- confirm this
        # is intended.
        feed_dict = {tf_training_dataset: batch_data, tf_training_labels: batch_labels, dropout_p: 1}
        _, l = session.run([optimizer, total_loss], feed_dict=feed_dict)

        if step % 500 == 0:
            # The extra forward pass over the minibatch is only needed for
            # this report, so it runs here instead of on every step.
            predictions = session.run(outputs, feed_dict={tf_training_dataset: batch_data, dropout_p: 1})
            # Grow the minibatch by 100 rows at every report (step 0 included).
            batch_size += 100
            print("Updated batch size: %s" % batch_size)
            print("[Step: %s] Minibatch loss %s, accuracy: %.1f%%" % (step, l, accuracy(predictions, batch_labels)))
            predictions, l = session.run(
                [outputs, total_loss],
                feed_dict={tf_training_dataset: valid_dataset, tf_training_labels: valid_labels, dropout_p: 1})
            print("[Step: %s] Validation loss %s, accuracy: %.1f%%" % (step, l, accuracy(predictions, valid_labels)))

    # Final evaluation on the held-out test set.
    predictions, l = session.run(
        [outputs, total_loss],
        feed_dict={tf_training_dataset: test_dataset, tf_training_labels: test_labels, dropout_p: 1})
    print("Test loss %s, accuracy: %.1f%%" % (l, accuracy(predictions, test_labels)))
In [ ]: