In [1]:
# These are all the modules we'll be using later. Make sure you can import them
# before proceeding further.
from __future__ import print_function
import numpy as np
import tensorflow as tf
from six.moves import cPickle as pickle
In [2]:
# Some personal imports
import matplotlib.pyplot as plt
%matplotlib inline
First reload the data we generated in notmnist.ipynb.
In [3]:
pickle_file = 'notMNIST.pickle'
with open(pickle_file, 'rb') as f:
    save = pickle.load(f)
    train_dataset = save['train_dataset']
    train_labels = save['train_labels']
    valid_dataset = save['valid_dataset']
    valid_labels = save['valid_labels']
    test_dataset = save['test_dataset']
    test_labels = save['test_labels']
    del save  # hint to help gc free up memory
    print('Training set', train_dataset.shape, train_labels.shape)
    print('Validation set', valid_dataset.shape, valid_labels.shape)
    print('Test set', test_dataset.shape, test_labels.shape)
Reformat into a shape that's more adapted to the models we're going to train:
In [4]:
image_size = 28
num_labels = 10

def reformat(dataset, labels):
    dataset = dataset.reshape((-1, image_size * image_size)).astype(np.float32)
    # One-hot encoding: map 1 to [0.0, 1.0, 0.0, ...], 2 to [0.0, 0.0, 1.0, ...]
    labels = (np.arange(num_labels) == labels[:, None]).astype(np.float32)
    return dataset, labels

train_dataset, train_labels = reformat(train_dataset, train_labels)
valid_dataset, valid_labels = reformat(valid_dataset, valid_labels)
test_dataset, test_labels = reformat(test_dataset, test_labels)
print('Training set', train_dataset.shape, train_labels.shape)
print('Validation set', valid_dataset.shape, valid_labels.shape)
print('Test set', test_dataset.shape, test_labels.shape)
In [5]:
def accuracy(predictions, labels):
    return (100.0 * np.sum(np.argmax(predictions, 1) == np.argmax(labels, 1))
            / predictions.shape[0])
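As a quick sanity check, here is a tiny, made-up example of what this helper computes: with one-hot labels, np.argmax recovers the class index on both sides, so the function returns the percentage of rows where the predicted class matches the label.
# Hypothetical values: two predictions, one correct -> 50.0
demo_predictions = np.array([[0.1, 0.7, 0.2], [0.8, 0.1, 0.1]])
demo_labels = np.array([[0.0, 1.0, 0.0], [0.0, 0.0, 1.0]])
print(accuracy(demo_predictions, demo_labels))  # 50.0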
Introduce and tune L2 regularization for both logistic and neural network models. Remember that L2 amounts to adding a penalty on the norm of the weights to the loss. In TensorFlow, you can compute the L2 loss for a tensor t using nn.l2_loss(t). The right amount of regularization should improve your validation / test accuracy.
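For reference, tf.nn.l2_loss(t) returns half the sum of the squared entries of t, so adding beta * tf.nn.l2_loss(weights) to the loss penalizes large weights. A minimal sketch of the equivalence on a throwaway NumPy array (not part of the models below):
# Check that tf.nn.l2_loss(t) == np.sum(t ** 2) / 2
t = np.array([[1.0, -2.0], [3.0, 0.5]], dtype=np.float32)
with tf.Session() as s:
    print(s.run(tf.nn.l2_loss(t)))  # 7.125
print(np.sum(t ** 2) / 2.0)         # 7.125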
Let's start with the logistic model:
In [6]:
batch_size = 128

graph = tf.Graph()
with graph.as_default():
    # Input data. For the training data, we use a placeholder that will be fed
    # at run time with a training minibatch.
    tf_train_dataset = tf.placeholder(tf.float32,
                                      shape=(batch_size, image_size * image_size))
    tf_train_labels = tf.placeholder(tf.float32, shape=(batch_size, num_labels))
    tf_valid_dataset = tf.constant(valid_dataset)
    tf_test_dataset = tf.constant(test_dataset)
    beta_regul = tf.placeholder(tf.float32)

    # Variables.
    weights = tf.Variable(
        tf.truncated_normal([image_size * image_size, num_labels]))
    biases = tf.Variable(tf.zeros([num_labels]))

    # Training computation.
    logits = tf.matmul(tf_train_dataset, weights) + biases
    loss = tf.reduce_mean(
        tf.nn.softmax_cross_entropy_with_logits(logits, tf_train_labels)) + \
        beta_regul * tf.nn.l2_loss(weights)

    # Optimizer.
    optimizer = tf.train.GradientDescentOptimizer(0.5).minimize(loss)

    # Predictions for the training, validation, and test data.
    train_prediction = tf.nn.softmax(logits)
    valid_prediction = tf.nn.softmax(
        tf.matmul(tf_valid_dataset, weights) + biases)
    test_prediction = tf.nn.softmax(tf.matmul(tf_test_dataset, weights) + biases)
In [7]:
num_steps = 3001

with tf.Session(graph=graph) as session:
    tf.initialize_all_variables().run()
    print("Initialized")
    for step in range(num_steps):
        # Pick an offset within the training data, which has been randomized.
        # Note: we could use better randomization across epochs.
        offset = (step * batch_size) % (train_labels.shape[0] - batch_size)
        # Generate a minibatch.
        batch_data = train_dataset[offset:(offset + batch_size), :]
        batch_labels = train_labels[offset:(offset + batch_size), :]
        # Prepare a dictionary telling the session where to feed the minibatch.
        # The key of the dictionary is the placeholder node of the graph to be fed,
        # and the value is the numpy array to feed to it.
        feed_dict = {tf_train_dataset : batch_data, tf_train_labels : batch_labels, beta_regul : 1e-3}
        _, l, predictions = session.run(
            [optimizer, loss, train_prediction], feed_dict=feed_dict)
        if (step % 500 == 0):
            print("Minibatch loss at step %d: %f" % (step, l))
            print("Minibatch accuracy: %.1f%%" % accuracy(predictions, batch_labels))
            print("Validation accuracy: %.1f%%" % accuracy(
                valid_prediction.eval(), valid_labels))
    print("Test accuracy: %.1f%%" % accuracy(test_prediction.eval(), test_labels))
L2 regularization introduces a new hyperparameter that has to be tuned. Since I have no idea what the right value is, I will plot the accuracy against the value of this hyperparameter (on a logarithmic scale).
In [8]:
num_steps = 3001
regul_val = [pow(10, i) for i in np.arange(-4, -2, 0.1)]
accuracy_val = []

for regul in regul_val:
    with tf.Session(graph=graph) as session:
        tf.initialize_all_variables().run()
        for step in range(num_steps):
            # Pick an offset within the training data, which has been randomized.
            # Note: we could use better randomization across epochs.
            offset = (step * batch_size) % (train_labels.shape[0] - batch_size)
            # Generate a minibatch.
            batch_data = train_dataset[offset:(offset + batch_size), :]
            batch_labels = train_labels[offset:(offset + batch_size), :]
            # Prepare a dictionary telling the session where to feed the minibatch.
            # The key of the dictionary is the placeholder node of the graph to be fed,
            # and the value is the numpy array to feed to it.
            feed_dict = {tf_train_dataset : batch_data, tf_train_labels : batch_labels, beta_regul : regul}
            _, l, predictions = session.run(
                [optimizer, loss, train_prediction], feed_dict=feed_dict)
        accuracy_val.append(accuracy(test_prediction.eval(), test_labels))
In [9]:
plt.semilogx(regul_val, accuracy_val)
plt.grid(True)
plt.title('Test accuracy by regularization (logistic)')
plt.show()
Let's see if the same technique improves the predictions of the one-hidden-layer neural network:
In [10]:
batch_size = 128
num_hidden_nodes = 1024

graph = tf.Graph()
with graph.as_default():
    # Input data. For the training data, we use a placeholder that will be fed
    # at run time with a training minibatch.
    tf_train_dataset = tf.placeholder(tf.float32,
                                      shape=(batch_size, image_size * image_size))
    tf_train_labels = tf.placeholder(tf.float32, shape=(batch_size, num_labels))
    tf_valid_dataset = tf.constant(valid_dataset)
    tf_test_dataset = tf.constant(test_dataset)
    beta_regul = tf.placeholder(tf.float32)

    # Variables.
    weights1 = tf.Variable(
        tf.truncated_normal([image_size * image_size, num_hidden_nodes]))
    biases1 = tf.Variable(tf.zeros([num_hidden_nodes]))
    weights2 = tf.Variable(
        tf.truncated_normal([num_hidden_nodes, num_labels]))
    biases2 = tf.Variable(tf.zeros([num_labels]))

    # Training computation.
    lay1_train = tf.nn.relu(tf.matmul(tf_train_dataset, weights1) + biases1)
    logits = tf.matmul(lay1_train, weights2) + biases2
    loss = tf.reduce_mean(
        tf.nn.softmax_cross_entropy_with_logits(logits, tf_train_labels)) + \
        beta_regul * (tf.nn.l2_loss(weights1) + tf.nn.l2_loss(weights2))

    # Optimizer.
    optimizer = tf.train.GradientDescentOptimizer(0.5).minimize(loss)

    # Predictions for the training, validation, and test data.
    train_prediction = tf.nn.softmax(logits)
    lay1_valid = tf.nn.relu(tf.matmul(tf_valid_dataset, weights1) + biases1)
    valid_prediction = tf.nn.softmax(tf.matmul(lay1_valid, weights2) + biases2)
    lay1_test = tf.nn.relu(tf.matmul(tf_test_dataset, weights1) + biases1)
    test_prediction = tf.nn.softmax(tf.matmul(lay1_test, weights2) + biases2)
In [11]:
num_steps = 3001

with tf.Session(graph=graph) as session:
    tf.initialize_all_variables().run()
    print("Initialized")
    for step in range(num_steps):
        # Pick an offset within the training data, which has been randomized.
        # Note: we could use better randomization across epochs.
        offset = (step * batch_size) % (train_labels.shape[0] - batch_size)
        # Generate a minibatch.
        batch_data = train_dataset[offset:(offset + batch_size), :]
        batch_labels = train_labels[offset:(offset + batch_size), :]
        # Prepare a dictionary telling the session where to feed the minibatch.
        # The key of the dictionary is the placeholder node of the graph to be fed,
        # and the value is the numpy array to feed to it.
        feed_dict = {tf_train_dataset : batch_data, tf_train_labels : batch_labels, beta_regul : 1e-3}
        _, l, predictions = session.run(
            [optimizer, loss, train_prediction], feed_dict=feed_dict)
        if (step % 500 == 0):
            print("Minibatch loss at step %d: %f" % (step, l))
            print("Minibatch accuracy: %.1f%%" % accuracy(predictions, batch_labels))
            print("Validation accuracy: %.1f%%" % accuracy(
                valid_prediction.eval(), valid_labels))
    print("Test accuracy: %.1f%%" % accuracy(test_prediction.eval(), test_labels))
Finally, something above 90%! I will also plot the final accuracy against the L2 parameter to find the best value.
In [12]:
num_steps = 3001
regul_val = [pow(10, i) for i in np.arange(-4, -2, 0.1)]
accuracy_val = []

for regul in regul_val:
    with tf.Session(graph=graph) as session:
        tf.initialize_all_variables().run()
        for step in range(num_steps):
            # Pick an offset within the training data, which has been randomized.
            # Note: we could use better randomization across epochs.
            offset = (step * batch_size) % (train_labels.shape[0] - batch_size)
            # Generate a minibatch.
            batch_data = train_dataset[offset:(offset + batch_size), :]
            batch_labels = train_labels[offset:(offset + batch_size), :]
            # Prepare a dictionary telling the session where to feed the minibatch.
            # The key of the dictionary is the placeholder node of the graph to be fed,
            # and the value is the numpy array to feed to it.
            feed_dict = {tf_train_dataset : batch_data, tf_train_labels : batch_labels, beta_regul : regul}
            _, l, predictions = session.run(
                [optimizer, loss, train_prediction], feed_dict=feed_dict)
        accuracy_val.append(accuracy(test_prediction.eval(), test_labels))
In [13]:
plt.semilogx(regul_val, accuracy_val)
plt.grid(True)
plt.title('Test accuracy by regularization (1-layer net)')
plt.show()
In [14]:
batch_size = 128
num_hidden_nodes = 1024

graph = tf.Graph()
with graph.as_default():
    # Input data. For the training data, we use a placeholder that will be fed
    # at run time with a training minibatch.
    tf_train_dataset = tf.placeholder(tf.float32,
                                      shape=(batch_size, image_size * image_size))
    tf_train_labels = tf.placeholder(tf.float32, shape=(batch_size, num_labels))
    tf_valid_dataset = tf.constant(valid_dataset)
    tf_test_dataset = tf.constant(test_dataset)
    beta_regul = tf.placeholder(tf.float32)  # still fed below, but unused: this run is deliberately unregularized

    # Variables.
    weights1 = tf.Variable(
        tf.truncated_normal([image_size * image_size, num_hidden_nodes]))
    biases1 = tf.Variable(tf.zeros([num_hidden_nodes]))
    weights2 = tf.Variable(
        tf.truncated_normal([num_hidden_nodes, num_labels]))
    biases2 = tf.Variable(tf.zeros([num_labels]))

    # Training computation.
    lay1_train = tf.nn.relu(tf.matmul(tf_train_dataset, weights1) + biases1)
    logits = tf.matmul(lay1_train, weights2) + biases2
    loss = tf.reduce_mean(
        tf.nn.softmax_cross_entropy_with_logits(logits, tf_train_labels))

    # Optimizer.
    optimizer = tf.train.GradientDescentOptimizer(0.5).minimize(loss)

    # Predictions for the training, validation, and test data.
    train_prediction = tf.nn.softmax(logits)
    lay1_valid = tf.nn.relu(tf.matmul(tf_valid_dataset, weights1) + biases1)
    valid_prediction = tf.nn.softmax(tf.matmul(lay1_valid, weights2) + biases2)
    lay1_test = tf.nn.relu(tf.matmul(tf_test_dataset, weights1) + biases1)
    test_prediction = tf.nn.softmax(tf.matmul(lay1_test, weights2) + biases2)
In [15]:
num_steps = 101
num_batches = 3

with tf.Session(graph=graph) as session:
    tf.initialize_all_variables().run()
    print("Initialized")
    for step in range(num_steps):
        # Pick an offset within the training data, which has been randomized.
        # Here we cycle over only a few batches to force overfitting.
        # offset = (step * batch_size) % (train_labels.shape[0] - batch_size)
        offset = ((step % num_batches) * batch_size) % (train_labels.shape[0] - batch_size)
        # Generate a minibatch.
        batch_data = train_dataset[offset:(offset + batch_size), :]
        batch_labels = train_labels[offset:(offset + batch_size), :]
        # Prepare a dictionary telling the session where to feed the minibatch.
        # The key of the dictionary is the placeholder node of the graph to be fed,
        # and the value is the numpy array to feed to it.
        feed_dict = {tf_train_dataset : batch_data, tf_train_labels : batch_labels, beta_regul : 1e-3}
        _, l, predictions = session.run(
            [optimizer, loss, train_prediction], feed_dict=feed_dict)
        if (step % 2 == 0):
            print("Minibatch loss at step %d: %f" % (step, l))
            print("Minibatch accuracy: %.1f%%" % accuracy(predictions, batch_labels))
            print("Validation accuracy: %.1f%%" % accuracy(
                valid_prediction.eval(), valid_labels))
    print("Test accuracy: %.1f%%" % accuracy(test_prediction.eval(), test_labels))
Since there are far too many parameters and no regularization, the accuracy on the training batches reaches 100%. The generalization is poor, as shown by the validation and test accuracy.
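To make "far too many parameters" concrete, here is a rough count for the 784-1024-10 network above, compared with the number of training examples it actually sees (both values follow from the constants already defined in the cells above):
# Rough parameter count vs. number of distinct training examples used
n_params = ((image_size * image_size) * num_hidden_nodes + num_hidden_nodes
            + num_hidden_nodes * num_labels + num_labels)
print(n_params)                  # 814090 weights and biases...
print(num_batches * batch_size)  # ...fit on only 384 training examples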
Introduce Dropout on the hidden layer of the neural network. Remember: Dropout should only be introduced during training, not evaluation, otherwise your evaluation results would be stochastic as well. TensorFlow provides nn.dropout() for that, but you have to make sure it's only inserted during training.
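The solution below satisfies the "training only" constraint by applying dropout on the training path and building separate, dropout-free prediction ops for validation and test. A common alternative (a minimal sketch with hypothetical names, not the graph actually built below) is to feed the keep probability through a placeholder and set it to 1.0 at evaluation time:
# Sketch only: keep_prob_ph is fed as 0.5 while training and 1.0 when evaluating
keep_prob_ph = tf.placeholder(tf.float32)
x_ph = tf.placeholder(tf.float32, shape=(None, image_size * image_size))
w = tf.Variable(tf.truncated_normal([image_size * image_size, num_hidden_nodes]))
b = tf.Variable(tf.zeros([num_hidden_nodes]))
hidden = tf.nn.dropout(tf.nn.relu(tf.matmul(x_ph, w) + b), keep_prob_ph)
# training step:   session.run(..., feed_dict={x_ph: batch_data, keep_prob_ph: 0.5})
# evaluation step: session.run(..., feed_dict={x_ph: valid_dataset, keep_prob_ph: 1.0})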
What happens to our extreme overfitting case?
In [16]:
batch_size = 128
num_hidden_nodes = 1024

graph = tf.Graph()
with graph.as_default():
    # Input data. For the training data, we use a placeholder that will be fed
    # at run time with a training minibatch.
    tf_train_dataset = tf.placeholder(tf.float32,
                                      shape=(batch_size, image_size * image_size))
    tf_train_labels = tf.placeholder(tf.float32, shape=(batch_size, num_labels))
    tf_valid_dataset = tf.constant(valid_dataset)
    tf_test_dataset = tf.constant(test_dataset)

    # Variables.
    weights1 = tf.Variable(
        tf.truncated_normal([image_size * image_size, num_hidden_nodes]))
    biases1 = tf.Variable(tf.zeros([num_hidden_nodes]))
    weights2 = tf.Variable(
        tf.truncated_normal([num_hidden_nodes, num_labels]))
    biases2 = tf.Variable(tf.zeros([num_labels]))

    # Training computation: dropout is applied to the hidden layer on the
    # training path only; the evaluation paths below do not use it.
    lay1_train = tf.nn.relu(tf.matmul(tf_train_dataset, weights1) + biases1)
    drop1 = tf.nn.dropout(lay1_train, 0.5)
    logits = tf.matmul(drop1, weights2) + biases2
    loss = tf.reduce_mean(
        tf.nn.softmax_cross_entropy_with_logits(logits, tf_train_labels))

    # Optimizer.
    optimizer = tf.train.GradientDescentOptimizer(0.5).minimize(loss)

    # Predictions for the training, validation, and test data.
    train_prediction = tf.nn.softmax(logits)
    lay1_valid = tf.nn.relu(tf.matmul(tf_valid_dataset, weights1) + biases1)
    valid_prediction = tf.nn.softmax(tf.matmul(lay1_valid, weights2) + biases2)
    lay1_test = tf.nn.relu(tf.matmul(tf_test_dataset, weights1) + biases1)
    test_prediction = tf.nn.softmax(tf.matmul(lay1_test, weights2) + biases2)
In [17]:
num_steps = 101
num_batches = 3

with tf.Session(graph=graph) as session:
    tf.initialize_all_variables().run()
    print("Initialized")
    for step in range(num_steps):
        # Cycle over the same few batches as in the previous overfitting run.
        # offset = (step * batch_size) % (train_labels.shape[0] - batch_size)
        offset = ((step % num_batches) * batch_size) % (train_labels.shape[0] - batch_size)
        # Generate a minibatch.
        batch_data = train_dataset[offset:(offset + batch_size), :]
        batch_labels = train_labels[offset:(offset + batch_size), :]
        # Prepare a dictionary telling the session where to feed the minibatch.
        # The key of the dictionary is the placeholder node of the graph to be fed,
        # and the value is the numpy array to feed to it.
        feed_dict = {tf_train_dataset : batch_data, tf_train_labels : batch_labels}
        _, l, predictions = session.run(
            [optimizer, loss, train_prediction], feed_dict=feed_dict)
        if (step % 2 == 0):
            print("Minibatch loss at step %d: %f" % (step, l))
            print("Minibatch accuracy: %.1f%%" % accuracy(predictions, batch_labels))
            print("Validation accuracy: %.1f%%" % accuracy(
                valid_prediction.eval(), valid_labels))
    print("Test accuracy: %.1f%%" % accuracy(test_prediction.eval(), test_labels))
The first conclusion is that 100% accuracy on the minibatches is now harder to reach and to hold. As a result, the test accuracy improves by about 6%: the final net generalizes better.
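Note that tf.nn.dropout uses "inverted dropout": the kept activations are scaled up by 1 / keep_prob at training time, so the dropout-free evaluation paths above need no extra rescaling. A quick sketch of that expectation (the exact printed value varies from run to run because dropout is random):
# With keep_prob = 0.5, surviving units are multiplied by 2, so the mean is preserved
x = tf.ones([1000])
dropped = tf.nn.dropout(x, 0.5)
with tf.Session() as s:
    print(s.run(tf.reduce_mean(dropped)))  # approximately 1.0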
Try to get the best performance you can using a multi-layer model! The best reported test accuracy using a deep network is 97.1%.
One avenue you can explore is to add multiple layers.
Another one is to use learning rate decay:
global_step = tf.Variable(0) # count the number of steps taken.
learning_rate = tf.train.exponential_decay(0.5, global_step, ...)
optimizer = tf.train.GradientDescentOptimizer(learning_rate).minimize(loss, global_step=global_step)
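For reference (this is my gloss, not part of the assignment text), with staircase=True the decayed rate is simply learning_rate * decay_rate ** (global_step // decay_steps). A small sketch of the schedule used in the next cell (0.5, decayed by 0.65 every 1000 steps):
# Sketch of the staircase schedule used below
for k in (0, 1000, 2000, 9000):
    print(k, 0.5 * 0.65 ** (k // 1000))  # 0.5, 0.325, 0.21125, ..., ~0.0104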
Let's make a first attempt with two hidden layers. Note how the weights are initialized, compared to the previous cases.
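The stddev=np.sqrt(2.0 / fan_in) used below is the He-style initialization commonly recommended for ReLU layers: it keeps the scale of the activations roughly constant from layer to layer, so a deeper network can still train with a fairly large learning rate. For the first layer this gives a much smaller spread than the default stddev of 1.0 used earlier:
# He-style scale for the first layer (fan_in = 28 * 28 = 784)
print(np.sqrt(2.0 / (image_size * image_size)))  # ~0.0505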
In [18]:
batch_size = 128
num_hidden_nodes1 = 1024
num_hidden_nodes2 = 100
beta_regul = 1e-3

graph = tf.Graph()
with graph.as_default():
    # Input data. For the training data, we use a placeholder that will be fed
    # at run time with a training minibatch.
    tf_train_dataset = tf.placeholder(tf.float32,
                                      shape=(batch_size, image_size * image_size))
    tf_train_labels = tf.placeholder(tf.float32, shape=(batch_size, num_labels))
    tf_valid_dataset = tf.constant(valid_dataset)
    tf_test_dataset = tf.constant(test_dataset)
    global_step = tf.Variable(0)

    # Variables.
    weights1 = tf.Variable(
        tf.truncated_normal(
            [image_size * image_size, num_hidden_nodes1],
            stddev=np.sqrt(2.0 / (image_size * image_size)))
    )
    biases1 = tf.Variable(tf.zeros([num_hidden_nodes1]))
    weights2 = tf.Variable(
        tf.truncated_normal([num_hidden_nodes1, num_hidden_nodes2], stddev=np.sqrt(2.0 / num_hidden_nodes1)))
    biases2 = tf.Variable(tf.zeros([num_hidden_nodes2]))
    weights3 = tf.Variable(
        tf.truncated_normal([num_hidden_nodes2, num_labels], stddev=np.sqrt(2.0 / num_hidden_nodes2)))
    biases3 = tf.Variable(tf.zeros([num_labels]))

    # Training computation.
    lay1_train = tf.nn.relu(tf.matmul(tf_train_dataset, weights1) + biases1)
    lay2_train = tf.nn.relu(tf.matmul(lay1_train, weights2) + biases2)
    logits = tf.matmul(lay2_train, weights3) + biases3
    loss = tf.reduce_mean(
        tf.nn.softmax_cross_entropy_with_logits(logits, tf_train_labels)) + \
        beta_regul * (tf.nn.l2_loss(weights1) + tf.nn.l2_loss(weights2) + tf.nn.l2_loss(weights3))

    # Optimizer.
    learning_rate = tf.train.exponential_decay(0.5, global_step, 1000, 0.65, staircase=True)
    optimizer = tf.train.GradientDescentOptimizer(learning_rate).minimize(loss, global_step=global_step)

    # Predictions for the training, validation, and test data.
    train_prediction = tf.nn.softmax(logits)
    lay1_valid = tf.nn.relu(tf.matmul(tf_valid_dataset, weights1) + biases1)
    lay2_valid = tf.nn.relu(tf.matmul(lay1_valid, weights2) + biases2)
    valid_prediction = tf.nn.softmax(tf.matmul(lay2_valid, weights3) + biases3)
    lay1_test = tf.nn.relu(tf.matmul(tf_test_dataset, weights1) + biases1)
    lay2_test = tf.nn.relu(tf.matmul(lay1_test, weights2) + biases2)
    test_prediction = tf.nn.softmax(tf.matmul(lay2_test, weights3) + biases3)
In [19]:
num_steps = 9001

with tf.Session(graph=graph) as session:
    tf.initialize_all_variables().run()
    print("Initialized")
    for step in range(num_steps):
        # Pick an offset within the training data, which has been randomized.
        # Note: we could use better randomization across epochs.
        offset = (step * batch_size) % (train_labels.shape[0] - batch_size)
        # Generate a minibatch.
        batch_data = train_dataset[offset:(offset + batch_size), :]
        batch_labels = train_labels[offset:(offset + batch_size), :]
        # Prepare a dictionary telling the session where to feed the minibatch.
        # The key of the dictionary is the placeholder node of the graph to be fed,
        # and the value is the numpy array to feed to it.
        feed_dict = {tf_train_dataset : batch_data, tf_train_labels : batch_labels}
        _, l, predictions = session.run(
            [optimizer, loss, train_prediction], feed_dict=feed_dict)
        if (step % 500 == 0):
            print("Minibatch loss at step %d: %f" % (step, l))
            print("Minibatch accuracy: %.1f%%" % accuracy(predictions, batch_labels))
            print("Validation accuracy: %.1f%%" % accuracy(
                valid_prediction.eval(), valid_labels))
    print("Test accuracy: %.1f%%" % accuracy(test_prediction.eval(), test_labels))
This is getting really good. Let's go one hidden layer deeper. (Note: keep_prob is defined in the next cell but dropout is not actually applied there; dropout is used in the larger network further down.)
In [20]:
batch_size = 128
num_hidden_nodes1 = 1024
num_hidden_nodes2 = 256
num_hidden_nodes3 = 128
keep_prob = 0.5  # defined here but dropout is not applied in this graph

graph = tf.Graph()
with graph.as_default():
    # Input data. For the training data, we use a placeholder that will be fed
    # at run time with a training minibatch.
    tf_train_dataset = tf.placeholder(tf.float32,
                                      shape=(batch_size, image_size * image_size))
    tf_train_labels = tf.placeholder(tf.float32, shape=(batch_size, num_labels))
    tf_valid_dataset = tf.constant(valid_dataset)
    tf_test_dataset = tf.constant(test_dataset)
    global_step = tf.Variable(0)

    # Variables.
    weights1 = tf.Variable(
        tf.truncated_normal(
            [image_size * image_size, num_hidden_nodes1],
            stddev=np.sqrt(2.0 / (image_size * image_size)))
    )
    biases1 = tf.Variable(tf.zeros([num_hidden_nodes1]))
    weights2 = tf.Variable(
        tf.truncated_normal([num_hidden_nodes1, num_hidden_nodes2], stddev=np.sqrt(2.0 / num_hidden_nodes1)))
    biases2 = tf.Variable(tf.zeros([num_hidden_nodes2]))
    weights3 = tf.Variable(
        tf.truncated_normal([num_hidden_nodes2, num_hidden_nodes3], stddev=np.sqrt(2.0 / num_hidden_nodes2)))
    biases3 = tf.Variable(tf.zeros([num_hidden_nodes3]))
    weights4 = tf.Variable(
        tf.truncated_normal([num_hidden_nodes3, num_labels], stddev=np.sqrt(2.0 / num_hidden_nodes3)))
    biases4 = tf.Variable(tf.zeros([num_labels]))

    # Training computation.
    lay1_train = tf.nn.relu(tf.matmul(tf_train_dataset, weights1) + biases1)
    lay2_train = tf.nn.relu(tf.matmul(lay1_train, weights2) + biases2)
    lay3_train = tf.nn.relu(tf.matmul(lay2_train, weights3) + biases3)
    logits = tf.matmul(lay3_train, weights4) + biases4
    loss = tf.reduce_mean(
        tf.nn.softmax_cross_entropy_with_logits(logits, tf_train_labels))

    # Optimizer.
    learning_rate = tf.train.exponential_decay(0.5, global_step, 4000, 0.65, staircase=True)
    optimizer = tf.train.GradientDescentOptimizer(learning_rate).minimize(loss, global_step=global_step)

    # Predictions for the training, validation, and test data.
    train_prediction = tf.nn.softmax(logits)
    lay1_valid = tf.nn.relu(tf.matmul(tf_valid_dataset, weights1) + biases1)
    lay2_valid = tf.nn.relu(tf.matmul(lay1_valid, weights2) + biases2)
    lay3_valid = tf.nn.relu(tf.matmul(lay2_valid, weights3) + biases3)
    valid_prediction = tf.nn.softmax(tf.matmul(lay3_valid, weights4) + biases4)
    lay1_test = tf.nn.relu(tf.matmul(tf_test_dataset, weights1) + biases1)
    lay2_test = tf.nn.relu(tf.matmul(lay1_test, weights2) + biases2)
    lay3_test = tf.nn.relu(tf.matmul(lay2_test, weights3) + biases3)
    test_prediction = tf.nn.softmax(tf.matmul(lay3_test, weights4) + biases4)
In [21]:
num_steps = 18001

with tf.Session(graph=graph) as session:
    tf.initialize_all_variables().run()
    print("Initialized")
    for step in range(num_steps):
        # Pick an offset within the training data, which has been randomized.
        # Note: we could use better randomization across epochs.
        offset = (step * batch_size) % (train_labels.shape[0] - batch_size)
        # Generate a minibatch.
        batch_data = train_dataset[offset:(offset + batch_size), :]
        batch_labels = train_labels[offset:(offset + batch_size), :]
        # Prepare a dictionary telling the session where to feed the minibatch.
        # The key of the dictionary is the placeholder node of the graph to be fed,
        # and the value is the numpy array to feed to it.
        feed_dict = {tf_train_dataset : batch_data, tf_train_labels : batch_labels}
        _, l, predictions = session.run(
            [optimizer, loss, train_prediction], feed_dict=feed_dict)
        if (step % 500 == 0):
            print("Minibatch loss at step %d: %f" % (step, l))
            print("Minibatch accuracy: %.1f%%" % accuracy(predictions, batch_labels))
            print("Validation accuracy: %.1f%%" % accuracy(
                valid_prediction.eval(), valid_labels))
    print("Test accuracy: %.1f%%" % accuracy(test_prediction.eval(), test_labels))
Huge! That's my best score on this dataset. I also tried a bigger network with dropout on every hidden layer, but it does not help:
In [22]:
batch_size = 128
num_hidden_nodes1 = 1024
num_hidden_nodes2 = 512
num_hidden_nodes3 = 256
keep_prob = 0.5

graph = tf.Graph()
with graph.as_default():
    # Input data. For the training data, we use a placeholder that will be fed
    # at run time with a training minibatch.
    tf_train_dataset = tf.placeholder(tf.float32,
                                      shape=(batch_size, image_size * image_size))
    tf_train_labels = tf.placeholder(tf.float32, shape=(batch_size, num_labels))
    tf_valid_dataset = tf.constant(valid_dataset)
    tf_test_dataset = tf.constant(test_dataset)
    global_step = tf.Variable(0)

    # Variables.
    weights1 = tf.Variable(
        tf.truncated_normal(
            [image_size * image_size, num_hidden_nodes1],
            stddev=np.sqrt(2.0 / (image_size * image_size)))
    )
    biases1 = tf.Variable(tf.zeros([num_hidden_nodes1]))
    weights2 = tf.Variable(
        tf.truncated_normal([num_hidden_nodes1, num_hidden_nodes2], stddev=np.sqrt(2.0 / num_hidden_nodes1)))
    biases2 = tf.Variable(tf.zeros([num_hidden_nodes2]))
    weights3 = tf.Variable(
        tf.truncated_normal([num_hidden_nodes2, num_hidden_nodes3], stddev=np.sqrt(2.0 / num_hidden_nodes2)))
    biases3 = tf.Variable(tf.zeros([num_hidden_nodes3]))
    weights4 = tf.Variable(
        tf.truncated_normal([num_hidden_nodes3, num_labels], stddev=np.sqrt(2.0 / num_hidden_nodes3)))
    biases4 = tf.Variable(tf.zeros([num_labels]))

    # Training computation: dropout after each hidden layer, training path only.
    lay1_train = tf.nn.relu(tf.matmul(tf_train_dataset, weights1) + biases1)
    drop1 = tf.nn.dropout(lay1_train, keep_prob)
    lay2_train = tf.nn.relu(tf.matmul(drop1, weights2) + biases2)
    drop2 = tf.nn.dropout(lay2_train, keep_prob)
    lay3_train = tf.nn.relu(tf.matmul(drop2, weights3) + biases3)
    drop3 = tf.nn.dropout(lay3_train, keep_prob)
    logits = tf.matmul(drop3, weights4) + biases4
    loss = tf.reduce_mean(
        tf.nn.softmax_cross_entropy_with_logits(logits, tf_train_labels))

    # Optimizer.
    learning_rate = tf.train.exponential_decay(0.5, global_step, 5000, 0.80, staircase=True)
    optimizer = tf.train.GradientDescentOptimizer(learning_rate).minimize(loss, global_step=global_step)

    # Predictions for the training, validation, and test data.
    train_prediction = tf.nn.softmax(logits)
    lay1_valid = tf.nn.relu(tf.matmul(tf_valid_dataset, weights1) + biases1)
    lay2_valid = tf.nn.relu(tf.matmul(lay1_valid, weights2) + biases2)
    lay3_valid = tf.nn.relu(tf.matmul(lay2_valid, weights3) + biases3)
    valid_prediction = tf.nn.softmax(tf.matmul(lay3_valid, weights4) + biases4)
    lay1_test = tf.nn.relu(tf.matmul(tf_test_dataset, weights1) + biases1)
    lay2_test = tf.nn.relu(tf.matmul(lay1_test, weights2) + biases2)
    lay3_test = tf.nn.relu(tf.matmul(lay2_test, weights3) + biases3)
    test_prediction = tf.nn.softmax(tf.matmul(lay3_test, weights4) + biases4)
In [23]:
num_steps = 20001

with tf.Session(graph=graph) as session:
    tf.initialize_all_variables().run()
    print("Initialized")
    for step in range(num_steps):
        # Pick an offset within the training data, which has been randomized.
        # Note: we could use better randomization across epochs.
        offset = (step * batch_size) % (train_labels.shape[0] - batch_size)
        # Generate a minibatch.
        batch_data = train_dataset[offset:(offset + batch_size), :]
        batch_labels = train_labels[offset:(offset + batch_size), :]
        # Prepare a dictionary telling the session where to feed the minibatch.
        # The key of the dictionary is the placeholder node of the graph to be fed,
        # and the value is the numpy array to feed to it.
        feed_dict = {tf_train_dataset : batch_data, tf_train_labels : batch_labels}
        _, l, predictions = session.run(
            [optimizer, loss, train_prediction], feed_dict=feed_dict)
        if (step % 500 == 0):
            print("Minibatch loss at step %d: %f" % (step, l))
            print("Minibatch accuracy: %.1f%%" % accuracy(predictions, batch_labels))
            print("Validation accuracy: %.1f%%" % accuracy(
                valid_prediction.eval(), valid_labels))
    print("Test accuracy: %.1f%%" % accuracy(test_prediction.eval(), test_labels))