This is a quick illustration of training a single-layer network on the MNIST data.
(Credit for the original workbook: Eben Olson, https://github.com/ebenolson/pydata2015)
In [ ]:
import numpy as np
import theano
import theano.tensor as T
import lasagne
import matplotlib.pyplot as plt
%matplotlib inline
import gzip
import pickle
In [ ]:
# Seed for reproducibility
np.random.seed(42)
In [ ]:
# Download the MNIST digits dataset (only if not present locally)
import os
import urllib.request
mnist_data = './data/MNIST'
mnist_pkl_gz = mnist_data+'/mnist.pkl.gz'
if not os.path.isfile(mnist_pkl_gz):
    if not os.path.exists(mnist_data):
        os.makedirs(mnist_data)
    print("Downloading MNIST data file")
    urllib.request.urlretrieve(
        'http://deeplearning.net/data/mnist/mnist.pkl.gz',
        mnist_pkl_gz)
print("MNIST data file available locally")
In [ ]:
# Load training and test splits as numpy arrays
train, val, test = pickle.load(gzip.open(mnist_pkl_gz), encoding='iso-8859-1')
X_train, y_train = train
X_val, y_val = val
In [ ]:
# The original 28x28 pixel images are flattened into 784-dimensional feature vectors
X_train.shape
In [ ]:
# Plot the first few examples
plt.figure(figsize=(12,3))
for i in range(10):
    plt.subplot(1, 10, i+1)
    plt.imshow(X_train[i].reshape((28, 28)), cmap='gray', interpolation='nearest')
    plt.axis('off')
In [ ]:
# For training, we want to sample examples at random in small batches
def batch_gen(X, y, N):
    # Sample N indices with replacement and yield the corresponding examples
    while True:
        idx = np.random.choice(len(y), N)
        yield X[idx].astype('float32'), y[idx].astype('int32')
In [ ]:
# A very simple network, a single layer with one neuron per target class.
# Using the softmax activation function gives us a probability distribution at the output.
l_in = lasagne.layers.InputLayer((None, 784))
l_out = lasagne.layers.DenseLayer(
    l_in,
    num_units=10,
    nonlinearity=lasagne.nonlinearities.softmax)
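The layer sizes fix the parameter count: 784x10 weights plus 10 biases, i.e. 7,850 parameters in total. As an optional sanity check, `lasagne.layers.count_params` should report the same number:
In [ ]:
# Optional sanity check: 784*10 weights + 10 biases = 7850 parameters
print(lasagne.layers.count_params(l_out))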
In [ ]:
# Symbolic variables for our input features and targets
X_sym = T.matrix()
y_sym = T.ivector()
In [ ]:
# Theano expressions for the output distribution and predicted class
output = lasagne.layers.get_output(l_out, X_sym)
pred = output.argmax(-1)
In [ ]:
# The loss function is cross-entropy averaged over a minibatch; we also compute accuracy as an evaluation metric
loss = T.mean(lasagne.objectives.categorical_crossentropy(output, y_sym))
acc = T.mean(T.eq(pred, y_sym))
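For intuition only, the same two quantities can be written out in plain numpy on a toy batch with hypothetical values (an illustration, not part of the Theano graph):
In [ ]:
# Illustration only: cross-entropy is the mean negative log-probability of the true class,
# and accuracy is the fraction of argmax predictions matching the targets (hypothetical values)
toy_output = np.array([[0.7, 0.2, 0.1],
                       [0.1, 0.8, 0.1]])   # pretend softmax outputs for 2 examples, 3 classes
toy_targets = np.array([0, 2])             # pretend true classes
print(np.mean(-np.log(toy_output[np.arange(2), toy_targets])))   # cross-entropy
print(np.mean(toy_output.argmax(-1) == toy_targets))             # accuracy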
In [ ]:
# We retrieve all the trainable parameters in our network: a single weight matrix and a bias vector
params = lasagne.layers.get_all_params(l_out)
print(params)
In [ ]:
# Compute the gradient of the loss function with respect to the parameters.
# The stochastic gradient descent algorithm produces an update expression for each parameter.
grad = T.grad(loss, params)
updates = lasagne.updates.sgd(grad, params, learning_rate=0.05)
print(updates)
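The `updates` object is an ordered dictionary mapping each parameter to the expression for its new value after one SGD step. Conceptually, each entry is just the rule sketched below in plain numpy (illustration only, with a hypothetical parameter and gradient):
In [ ]:
# Illustration only: one plain-SGD step on a hypothetical parameter
learning_rate = 0.05
w = np.zeros(3)                   # hypothetical parameter value
g = np.array([0.2, -0.1, 0.4])    # hypothetical gradient of the loss w.r.t. w
print(w - learning_rate * g)      # the new value that each entry in `updates` encodes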
Create the functions that compute the training loss, validation loss, and predicted class for any given inputs (the loss functions also take the targets).
In [ ]:
# We define a training function that will compute the loss and accuracy, and take a single optimization step
f_train = theano.function([X_sym, y_sym], [loss, acc], updates=updates)
In [ ]:
# The validation function is similar, but does not update the parameters
f_val = theano.function([X_sym, y_sym], [loss, acc])
In [ ]:
# The prediction function doesn't require targets, and outputs only the predicted class values
f_predict = theano.function([X_sym], pred)
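As a quick usage check, `f_predict` can already be called on a few validation images cast to float32; since the network has not been trained yet, the predictions will be essentially arbitrary:
In [ ]:
# Quick check of the prediction function on the untrained network
print(f_predict(X_val[:5].astype('float32')))
print(y_val[:5])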
In [ ]:
# We'll choose a batch size, and calculate the number of batches in an "epoch"
# (approximately one pass through the data).
BATCH_SIZE = 64
N_BATCHES = len(X_train) // BATCH_SIZE
N_VAL_BATCHES = len(X_val) // BATCH_SIZE
In [ ]:
# Minibatch generators for the training and validation sets
train_batches = batch_gen(X_train, y_train, BATCH_SIZE)
val_batches = batch_gen(X_val, y_val, BATCH_SIZE)
In [ ]:
# Try sampling from the batch generator.
# Plot an image and corresponding label to verify they match.
X, y = next(train_batches)
plt.imshow(X[0].reshape((28, 28)), cmap='gray', interpolation='nearest')
print(y[0])
For each epoch, we call the training function N_BATCHES times, accumulating an estimate of the training loss and accuracy.
Then we do the same thing for the validation set.
We print out the ratio of loss in the validation set vs the training set to help recognize overfitting.
In [ ]:
for epoch in range(10):
    train_loss = 0
    train_acc = 0
    for _ in range(N_BATCHES):
        X, y = next(train_batches)
        batch_loss, batch_acc = f_train(X, y)   # renamed so the symbolic `loss`/`acc` aren't shadowed
        train_loss += batch_loss
        train_acc += batch_acc
    train_loss /= N_BATCHES
    train_acc /= N_BATCHES

    val_loss = 0
    val_acc = 0
    for _ in range(N_VAL_BATCHES):
        X, y = next(val_batches)
        batch_loss, batch_acc = f_val(X, y)
        val_loss += batch_loss
        val_acc += batch_acc
    val_loss /= N_VAL_BATCHES
    val_acc /= N_VAL_BATCHES

    print('Epoch {:2d}, Train loss {:.03f} (validation : {:.03f}) ratio {:.03f}'.format(
        epoch, train_loss, val_loss, val_loss/train_loss))
    print('          Train accuracy {:.03f} (validation : {:.03f})'.format(train_acc, val_acc))
print("DONE")
In [ ]:
weights = l_out.W.get_value()
print(weights.shape)
In [ ]:
# Each column of the weight matrix can be viewed as a 28x28 "template" for one digit class
plt.figure(figsize=(12,3))
for i in range(10):
    plt.subplot(1, 10, i+1)
    plt.imshow(weights[:, i].reshape((28, 28)), cmap='gray', interpolation='nearest')
    plt.axis('off')
In [ ]:
# Uncomment and execute this cell for an example solution
# %load ../models/spoilers/logreg.py
In [ ]:
# Uncomment and execute this cell for an example solution
# %load ../models/spoilers/hiddenlayer.py
Try one of the other algorithms available in lasagne.updates. You may also want to adjust the learning rate.
Visualize and compare the trained weights. Different optimization trajectories may lead to very different results, even if the performance is similar. This can be important when training more complicated networks.
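Not the spoiler solution below, just a minimal sketch of the kind of change involved: swap `lasagne.updates.sgd` for another rule such as `lasagne.updates.adam` (its default learning rate of 0.001 is only a starting point worth tuning) and recompile the training function so the new updates take effect.
In [ ]:
# Minimal sketch (not the spoiler solution): use the Adam update rule instead of plain SGD,
# then recompile the training function before re-running the training loop above.
# (For a fair comparison you would also re-initialize the network's parameters.)
updates = lasagne.updates.adam(grad, params, learning_rate=0.001)
f_train = theano.function([X_sym, y_sym], [loss, acc], updates=updates)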
In [ ]:
# Uncomment and execute this cell for an example solution
# %load ../models/spoilers/optimizer.py