In [28]:
# These are all the modules we'll be using later. Make sure you can import them
# before proceeding further.
from __future__ import print_function
import numpy as np
import tensorflow as tf
import pandas as pd
from six.moves import cPickle as pickle
First reload the data we generated in notMNIST_nonTensorFlow_comparisons.ipynb.
In [2]:
pickle_file = 'notMNIST.pickle'
with open(pickle_file, 'rb') as f:
save = pickle.load(f)
train_dataset = save['train_dataset']
train_labels = save['train_labels']
valid_dataset = save['valid_dataset']
valid_labels = save['valid_labels']
test_dataset = save['test_dataset']
test_labels = save['test_labels']
del save # hint to help gc free up memory
print('Training set', train_dataset.shape, train_labels.shape)
print('Validation set', valid_dataset.shape, valid_labels.shape)
print('Test set', test_dataset.shape, test_labels.shape)
Reformat into a shape that's more adapted to the models we're going to train:
In [3]:
image_size = 28
num_labels = 10
def reformat(dataset, labels):
dataset = dataset.reshape((-1, image_size * image_size)).astype(np.float32)
# Map 1 to [0.0, 1.0, 0.0 ...], 2 to [0.0, 0.0, 1.0 ...]
labels = (np.arange(num_labels) == labels[:,None]).astype(np.float32)
return dataset, labels
train_dataset, train_labels = reformat(train_dataset, train_labels)
valid_dataset, valid_labels = reformat(valid_dataset, valid_labels)
test_dataset, test_labels = reformat(test_dataset, test_labels)
print('Training set', train_dataset.shape, train_labels.shape)
print('Validation set', valid_dataset.shape, valid_labels.shape)
print('Test set', test_dataset.shape, test_labels.shape)
In [4]:
def accuracy(predictions, labels):
return (100.0 * np.sum(np.argmax(predictions, 1) == np.argmax(labels, 1))
/ predictions.shape[0])
Let's introduce Dropout on the hidden layers of the neural networks. Remember: Dropout should only be introduced during training, not evaluation, otherwise your evaluation results would be stochastic as well. TensorFlow provides nn.dropout() for that, but we have to make sure it's only inserted during training.
tf.nn.dropout(x, keep_prob, noise_shape=None, seed=None, name=None)
{#dropout}Computes dropout.
With probability keep_prob
, outputs the input element scaled up by
1 / keep_prob
, otherwise outputs 0
. The scaling is so that the expected
sum is unchanged.
By default, each element is kept or dropped independently. If noise_shape
is specified, it must be
broadcastable
to the shape of x
, and only dimensions with noise_shape[i] == shape(x)[i]
will make independent decisions. For example, if shape(x) = [k, l, m, n]
and noise_shape = [k, 1, 1, n]
, each batch and channel component will be
kept independently and each row and column will be kept or not kept together.
Tensor
with the same type as x. The probability
that each element is kept.Tensor
of type int32
, representing the
shape for randomly generated keep/drop flags.set_random_seed
for behavior.A Tensor of the same shape of x
.
keep_prob
is not in (0, 1]
.We create a placeholder for the probability that a neuron's output is kept during dropout. This allows us to turn dropout on during training, and turn it off during testing. TensorFlow's tf.nn.dropout op automatically handles scaling neuron outputs in addition to masking them, so dropout just works without any additional scaling
Further details: https://www.tensorflow.org/versions/r0.11/tutorials/mnist/pros/
In [16]:
import math
def create_nn1_model_dropout_and_run(graph,
train_dataset,
train_labels,
valid_dataset,
valid_labels,
test_dataset,
test_labels,
dropout,
num_steps,
hidden_size = 1024,
num_labels=10,batch_size = 128):
uniMax = 1/math.sqrt(hidden_size)
with graph.as_default():
# Input data. For the training data, we use a placeholder that will be fed
# at run time with a training minibatch.
tf_train_dataset = tf.placeholder(tf.float32,shape=(batch_size, image_size * image_size))
tf_train_labels = tf.placeholder(tf.float32, shape=(batch_size, num_labels))
tf_valid_dataset = tf.constant(valid_dataset)
tf_test_dataset = tf.constant(test_dataset)
# Hidden 1
weights_1 = tf.Variable(tf.random_uniform([image_size * image_size, hidden_size], minval=-uniMax, maxval=uniMax),
name='weights_1')
biases_1 = tf.Variable(tf.random_uniform([hidden_size],minval=-uniMax, maxval=uniMax),name='biases_1')
hidden_1 = tf.nn.relu(tf.matmul(tf_train_dataset, weights_1) + biases_1)
if dropout>0:
dropped = tf.nn.dropout(hidden_1, dropout)
else:
dropped = hidden_1
# Softmax
weights_2 = tf.Variable(tf.random_uniform([hidden_size, num_labels],minval=-uniMax, maxval=uniMax), name='weights_2')
biases_2 = tf.Variable(tf.random_uniform([num_labels],minval=-uniMax, maxval=uniMax),name='biases_2')
logits = tf.matmul(dropped, weights_2) + biases_2
#
loss = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(logits, tf_train_labels))
# Optimizer.
global_step = tf.Variable(0) # count the number of steps taken.
learning_rate = tf.train.exponential_decay(0.5, global_step, 100000, 0.96, staircase=True)
optimizer = tf.train.GradientDescentOptimizer(learning_rate).minimize(loss, global_step=global_step)
#optimizer = tf.train.GradientDescentOptimizer(0.5).minimize(loss)
# Predictions for the training, validation, and test data.
train_prediction = tf.nn.softmax(logits)
valid_prediction = tf.nn.softmax(
tf.matmul(tf.nn.relu(tf.matmul(tf_valid_dataset, weights_1) + biases_1), weights_2) + biases_2)
test_prediction = tf.nn.softmax(
tf.matmul(tf.nn.relu(tf.matmul(tf_test_dataset, weights_1) + biases_1), weights_2) + biases_2)
test_accuracy = 0
with tf.Session(graph=graph) as session:
tf.global_variables_initializer().run()
print("Initialized")
for step in range(num_steps):
offset = (step * batch_size) % (train_labels.shape[0] - batch_size)
batch_data = train_dataset[offset:(offset + batch_size), :]
batch_labels = train_labels[offset:(offset + batch_size), :]
feed_dict = {tf_train_dataset : batch_data, tf_train_labels : batch_labels}
_, l, predictions = session.run([optimizer, loss, train_prediction], feed_dict=feed_dict)
if (step % 500 == 0):
print("Minibatch loss at step %d: %f" % (step, l))
print("Minibatch accuracy: %.1f%%" % accuracy(predictions, batch_labels))
print("Validation accuracy: %.1f%%" % accuracy(
valid_prediction.eval(), valid_labels))
test_accuracy = accuracy(test_prediction.eval(), test_labels)
print("Test accuracy: %.1f%%" % test_accuracy)
return test_accuracy
In [17]:
num_steps = 3001
keep_probs = [0, 0.3,0.4, 0.5, 0.6,0.7]
test_accuracy = np.zeros(len(keep_probs))
i = 0
for keep_prob in keep_probs:
print("\n>>>>>>>>>> keep_prob: %f" % keep_prob)
graph = tf.Graph()
test_accuracy[i] = create_nn1_model_dropout_and_run(graph,
train_dataset,
train_labels,
valid_dataset,
valid_labels,
test_dataset,
test_labels,
keep_prob,
num_steps)
i = i +1
In [19]:
print("*** Best keep_prob:"+str(keep_probs[np.argmax(test_accuracy)])+ " -- accuracy:" + str(test_accuracy[np.argmax(test_accuracy)]))
We did not get an improvement in test accuracy by using dropout as the best accuracy occours for keep_prob=0 that means no dropout.
In [66]:
def create_nn2_model_dropout_and_run(graph,
train_dataset,
train_labels,
valid_dataset,
valid_labels,
test_dataset,
test_labels,
dropout_vect,
num_steps,
hidden_size = 1024,
num_labels=10,batch_size = 128):
assert dropout_vect.shape == (2,)
uniMax = 1/math.sqrt(hidden_size)
with graph.as_default():
# Input data. For the training data, we use a placeholder that will be fed
# at run time with a training minibatch.
tf_train_dataset = tf.placeholder(tf.float32,shape=(batch_size, image_size * image_size))
tf_train_labels = tf.placeholder(tf.float32, shape=(batch_size, num_labels))
tf_valid_dataset = tf.constant(valid_dataset)
tf_test_dataset = tf.constant(test_dataset)
# Hidden 1
weights_1 = tf.Variable(tf.random_uniform([image_size * image_size, hidden_size], minval=-uniMax, maxval=uniMax),
name='weights_1')
biases_1 = tf.Variable(tf.random_uniform([hidden_size],minval=-uniMax, maxval=uniMax),name='biases_1')
hidden_1 = tf.nn.relu(tf.matmul(tf_train_dataset, weights_1) + biases_1)
if dropout_vect[0]>0:
dropped_1 = tf.nn.dropout(hidden_1, dropout_vect[0])
else:
dropped_1 = hidden_1
# Hidden 2
weights_2 = tf.Variable(tf.random_uniform([hidden_size, hidden_size], minval=-uniMax, maxval=uniMax),name='weights_2')
biases_2 = tf.Variable(tf.random_uniform([hidden_size],minval=-uniMax, maxval=uniMax),name='biases_2')
hidden_2 = tf.nn.relu(tf.matmul(dropped_1, weights_2) + biases_2)
if dropout_vect[1]>0:
dropped_2 = tf.nn.dropout(hidden_2, dropout_vect[1])
else:
dropped_2 = hidden_2
# Softmax
weights_3 = tf.Variable(tf.random_uniform([hidden_size, num_labels],minval=-uniMax, maxval=uniMax), name='weights_3')
biases_3 = tf.Variable(tf.random_uniform([num_labels],minval=-uniMax, maxval=uniMax),name='biases_3')
logits = tf.matmul(dropped_2, weights_3) + biases_3
#
loss = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(logits, tf_train_labels))
# Optimizer.
global_step = tf.Variable(0) # count the number of steps taken.
learning_rate = tf.train.exponential_decay(0.5, global_step, 100000, 0.96, staircase=True)
optimizer = tf.train.GradientDescentOptimizer(learning_rate).minimize(loss, global_step=global_step)
#optimizer = tf.train.GradientDescentOptimizer(0.5).minimize(loss)
# Predictions for the training, validation, and test data.
train_prediction = tf.nn.softmax(logits)
valid_prediction = tf.nn.softmax(
tf.matmul(tf.nn.relu(tf.matmul(tf.nn.relu(tf.matmul(tf_valid_dataset, weights_1) + biases_1), weights_2) + biases_2),
weights_3) + biases_3)
test_prediction = tf.nn.softmax(
tf.matmul(tf.nn.relu(tf.matmul(tf.nn.relu(tf.matmul(tf_test_dataset, weights_1) + biases_1), weights_2) + biases_2),
weights_3) + biases_3)
test_accuracy = 0
with tf.Session(graph=graph) as session:
tf.global_variables_initializer().run()
print("Initialized")
for step in range(num_steps):
offset = (step * batch_size) % (train_labels.shape[0] - batch_size)
batch_data = train_dataset[offset:(offset + batch_size), :]
batch_labels = train_labels[offset:(offset + batch_size), :]
feed_dict = {tf_train_dataset : batch_data, tf_train_labels : batch_labels}
_, l, predictions = session.run([optimizer, loss, train_prediction], feed_dict=feed_dict)
if (step % 500 == 0):
print("Minibatch loss at step %d: %f" % (step, l))
print("Minibatch accuracy: %.1f%%" % accuracy(predictions, batch_labels))
print("Validation accuracy: %.1f%%" % accuracy(
valid_prediction.eval(), valid_labels))
test_accuracy = accuracy(test_prediction.eval(), test_labels)
print("Test accuracy: %.1f%%" % test_accuracy)
return test_accuracy
In [67]:
keep_probs = [0, 0.3,0.4, 0.5, 0.6,0.7]
tuneGrid = pd.DataFrame.from_records([(kp1,kp2,0) for kp1 in keep_probs for kp2 in keep_probs],
columns=['drop_1','drop_2','test_accuracy'])
#tuneGrid.head()
for i in range(0,tuneGrid.shape[0]):
drop_1 , drop_2 = tuneGrid.iloc[i,0] , tuneGrid.iloc[i,1]
print("\n>>>>>>>>>> keep_prob_1: %f ---- keep_prob_2: %f" % (drop_1 , drop_2))
graph = tf.Graph()
tuneGrid.iloc[i,2] = create_nn2_model_dropout_and_run(graph,
train_dataset,
train_labels,
valid_dataset,
valid_labels,
test_dataset,
test_labels,
np.array([drop_1,drop_2]),
num_steps)
In [70]:
tuneGrid.sort_values(by=['test_accuracy'],ascending=[False]).head(10)
Out[70]:
We did not get an improvement in test accuracy by using dropout as the best accuracy occours for keep_prob=0 both for hidden layer 1 and hidden layer 2 that means no dropout.
In [75]:
def create_nn3_model_dropout_and_run(graph,
train_dataset,
train_labels,
valid_dataset,
valid_labels,
test_dataset,
test_labels,
dropout_vect,
num_steps,
hidden_size = 1024,
num_labels=10,batch_size = 128):
assert dropout_vect.shape == (3,)
uniMax = 1/math.sqrt(hidden_size)
with graph.as_default():
# Input data. For the training data, we use a placeholder that will be fed
# at run time with a training minibatch.
tf_train_dataset = tf.placeholder(tf.float32,shape=(batch_size, image_size * image_size))
tf_train_labels = tf.placeholder(tf.float32, shape=(batch_size, num_labels))
tf_valid_dataset = tf.constant(valid_dataset)
tf_test_dataset = tf.constant(test_dataset)
# Hidden 1
weights_1 = tf.Variable(tf.random_uniform([image_size * image_size, hidden_size], minval=-uniMax, maxval=uniMax),
name='weights_1')
biases_1 = tf.Variable(tf.random_uniform([hidden_size],minval=-uniMax, maxval=uniMax),name='biases_1')
hidden_1 = tf.nn.relu(tf.matmul(tf_train_dataset, weights_1) + biases_1)
if dropout_vect[0]>0:
dropped_1 = tf.nn.dropout(hidden_1, dropout_vect[0])
else:
dropped_1 = hidden_1
# Hidden 2
weights_2 = tf.Variable(tf.random_uniform([hidden_size, hidden_size], minval=-uniMax, maxval=uniMax),name='weights_2')
biases_2 = tf.Variable(tf.random_uniform([hidden_size],minval=-uniMax, maxval=uniMax),name='biases_2')
hidden_2 = tf.nn.relu(tf.matmul(dropped_1, weights_2) + biases_2)
if dropout_vect[1]>0:
dropped_2 = tf.nn.dropout(hidden_2, dropout_vect[1])
else:
dropped_2 = hidden_2
# Hidden 3
weights_3 = tf.Variable(tf.random_uniform([hidden_size, hidden_size], minval=-uniMax, maxval=uniMax),name='weights_3')
biases_3 = tf.Variable(tf.random_uniform([hidden_size],minval=-uniMax, maxval=uniMax),name='biases_3')
hidden_3 = tf.nn.relu(tf.matmul(dropped_2, weights_3) + biases_3)
if dropout_vect[2]>0:
dropped_3 = tf.nn.dropout(hidden_3, dropout_vect[2])
else:
dropped_3 = hidden_3
# Softmax
weights_4 = tf.Variable(tf.random_uniform([hidden_size, num_labels],minval=-uniMax, maxval=uniMax), name='weights_4')
biases_4 = tf.Variable(tf.random_uniform([num_labels],minval=-uniMax, maxval=uniMax),name='biases_4')
logits = tf.matmul(dropped_3, weights_4) + biases_4
#
loss = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(logits, tf_train_labels))
# Optimizer.
global_step = tf.Variable(0) # count the number of steps taken.
learning_rate = tf.train.exponential_decay(0.5, global_step, 100000, 0.96, staircase=True)
optimizer = tf.train.GradientDescentOptimizer(learning_rate).minimize(loss, global_step=global_step)
#optimizer = tf.train.GradientDescentOptimizer(0.5).minimize(loss)
# Predictions for the training, validation, and test data.
train_prediction = tf.nn.softmax(logits)
valid_prediction = tf.nn.softmax(
tf.matmul(tf.nn.relu(tf.matmul(tf.nn.relu(tf.matmul(tf.nn.relu(tf.matmul(tf_valid_dataset, weights_1) + biases_1), weights_2) + biases_2),
weights_3) + biases_3), weights_3) + biases_3)
test_prediction = tf.nn.softmax(
tf.matmul(tf.nn.relu(tf.matmul(tf.nn.relu(tf.matmul(tf.nn.relu(tf.matmul(tf_test_dataset, weights_1) + biases_1), weights_2) + biases_2),
weights_3) + biases_3), weights_3) + biases_3)
test_accuracy = 0
with tf.Session(graph=graph) as session:
tf.global_variables_initializer().run()
print("Initialized")
for step in range(num_steps):
offset = (step * batch_size) % (train_labels.shape[0] - batch_size)
batch_data = train_dataset[offset:(offset + batch_size), :]
batch_labels = train_labels[offset:(offset + batch_size), :]
feed_dict = {tf_train_dataset : batch_data, tf_train_labels : batch_labels}
_, l, predictions = session.run([optimizer, loss, train_prediction], feed_dict=feed_dict)
if (step % 500 == 0):
print("Minibatch loss at step %d: %f" % (step, l))
print("Minibatch accuracy: %.1f%%" % accuracy(predictions, batch_labels))
print("Validation accuracy: %.1f%%" % accuracy(
valid_prediction.eval(), valid_labels))
test_accuracy = accuracy(test_prediction.eval(), test_labels)
print("Test accuracy: %.1f%%" % test_accuracy)
return test_accuracy
In [79]:
keep_probs = [0, 0.5, 0.7]
tuneGrid = pd.DataFrame.from_records([(kp1,kp2,kp3,0) for kp1 in keep_probs for kp2 in keep_probs for kp3 in keep_probs],
columns=['drop_1','drop_2','drop_3','test_accuracy'])
#tuneGrid.head()
for i in range(0,tuneGrid.shape[0]):
drop_1 , drop_2 , drop_3 = tuneGrid.iloc[i,0] , tuneGrid.iloc[i,1] , tuneGrid.iloc[i,2]
print("\n>>>>>>>>>> keep_prob_1: %f ---- keep_prob_2: %f ---- keep_prob_3: %f" % (drop_1 , drop_2, drop_3))
graph = tf.Graph()
tuneGrid.iloc[i,3] = create_nn2_model_dropout_and_run(graph,
train_dataset,
train_labels,
valid_dataset,
valid_labels,
test_dataset,
test_labels,
np.array([drop_1,drop_2]),
num_steps)
In [80]:
tuneGrid.sort_values(by=['test_accuracy'],ascending=[False]).head(10)
Out[80]:
We did not get an improvement in test accuracy by using dropout as although the best accuracy occours for keep_prob = 0 for hidden layers 1 and 2 and keep_prob = 0.5 for layers 3 the difference with the second best combination that is the no dropout option for all hidden layers is very tiny.
Also, considering at most 3 hidden layers, the best model comes with 3 hidden layers but without L2 regularization and/or dropout with a test accuracy ~95.1%