In [14]:
from __future__ import print_function
import gzip
import itertools
import pickle
import os
import sys
import numpy as np
import lasagne
from lasagne.layers import cuda_convnet  # GPU-only: needs CUDA and pylearn2's cuda-convnet wrappers
import theano
import theano.tensor as T
import time
In [8]:
DATA_FILENAME = 'mnist.pkl.gz'
NUM_EPOCHS = 50
BATCH_SIZE = 600
LEARNING_RATE = 0.01
MOMENTUM = 0.9
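`DATA_FILENAME` refers to the pickled train/valid/test split distributed with the classic Theano tutorials. If the file is not already present, something like the following fetches it; the deeplearning.net URL is that tutorial's historical mirror and is an assumption here (it may no longer resolve):

# Hypothetical download helper; the deeplearning.net mirror is the one the
# classic Theano tutorials used and may no longer be online.
import os
try:
    from urllib.request import urlretrieve  # Python 3
except ImportError:
    from urllib import urlretrieve  # Python 2

MNIST_URL = 'http://deeplearning.net/data/mnist/mnist.pkl.gz'
if not os.path.exists(DATA_FILENAME):
    print("Downloading {} ...".format(DATA_FILENAME))
    urlretrieve(MNIST_URL, DATA_FILENAME)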
In [9]:
def load_data(data):
    X_train, y_train = data[0]
    X_valid, y_valid = data[1]
    X_test, y_test = data[2]

    # Reshape to (examples, channels, rows, cols) for the convolutional layers.
    X_train = X_train.reshape((X_train.shape[0], 1, 28, 28))
    X_valid = X_valid.reshape((X_valid.shape[0], 1, 28, 28))
    X_test = X_test.reshape((X_test.shape[0], 1, 28, 28))

    # Store everything in shared variables so the data can live on the GPU;
    # labels are cast to int32 for use as classification targets.
    return dict(
        X_train=theano.shared(lasagne.utils.floatX(X_train)),
        y_train=T.cast(theano.shared(y_train), 'int32'),
        X_valid=theano.shared(lasagne.utils.floatX(X_valid)),
        y_valid=T.cast(theano.shared(y_valid), 'int32'),
        X_test=theano.shared(lasagne.utils.floatX(X_test)),
        y_test=T.cast(theano.shared(y_test), 'int32'),
        num_examples_train=X_train.shape[0],
        num_examples_valid=X_valid.shape[0],
        num_examples_test=X_test.shape[0],
        input_height=X_train.shape[2],
        input_width=X_train.shape[3],
        output_dim=10,
    )
In [10]:
def build_model(input_width, input_height, output_dim,
                batch_size=BATCH_SIZE, dimshuffle=True):
    l_in = lasagne.layers.InputLayer(
        # bc01 ordering: (batch, channels, rows, cols)
        shape=(batch_size, 1, input_height, input_width),
    )
    if not dimshuffle:
        l_in = cuda_convnet.bc01_to_c01b(l_in)

    l_conv1 = cuda_convnet.Conv2DCCLayer(
        l_in,
        num_filters=32,
        filter_size=(5, 5),
        nonlinearity=lasagne.nonlinearities.rectify,
        dimshuffle=dimshuffle,
    )
    l_pool1 = cuda_convnet.MaxPool2DCCLayer(
        l_conv1,
        pool_size=(2, 2),
        dimshuffle=dimshuffle,
    )
    l_conv2 = cuda_convnet.Conv2DCCLayer(
        l_pool1,
        num_filters=32,
        filter_size=(5, 5),
        nonlinearity=lasagne.nonlinearities.rectify,
        dimshuffle=dimshuffle,
    )
    l_pool2 = cuda_convnet.MaxPool2DCCLayer(
        l_conv2,
        pool_size=(2, 2),
        dimshuffle=dimshuffle,
    )
    if not dimshuffle:
        l_pool2 = cuda_convnet.c01b_to_bc01(l_pool2)

    l_hidden1 = lasagne.layers.DenseLayer(
        l_pool2,
        num_units=256,
        nonlinearity=lasagne.nonlinearities.rectify,
    )
    l_hidden1_dropout = lasagne.layers.DropoutLayer(l_hidden1, p=0.5)

    # A second hidden layer can be enabled for extra capacity:
    # l_hidden2 = lasagne.layers.DenseLayer(
    #     l_hidden1_dropout,
    #     num_units=256,
    #     nonlinearity=lasagne.nonlinearities.rectify,
    # )
    # l_hidden2_dropout = lasagne.layers.DropoutLayer(l_hidden2, p=0.5)

    l_out = lasagne.layers.DenseLayer(
        l_hidden1_dropout,
        num_units=output_dim,
        nonlinearity=lasagne.nonlinearities.softmax,
    )
    return l_out
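The cuda_convnet layers above require a CUDA GPU plus pylearn2's cuda-convnet wrappers. On a machine without that stack, the same architecture can be assembled from Lasagne's portable layers; a minimal sketch (the function name is made up, and no dimshuffle handling is needed since these layers use bc01 ordering natively):

# Sketch of a CPU-friendly equivalent built from standard Lasagne layers.
def build_model_cpu(input_width, input_height, output_dim,
                    batch_size=BATCH_SIZE):
    l_in = lasagne.layers.InputLayer(
        shape=(batch_size, 1, input_height, input_width))
    l_conv1 = lasagne.layers.Conv2DLayer(
        l_in, num_filters=32, filter_size=(5, 5),
        nonlinearity=lasagne.nonlinearities.rectify)
    l_pool1 = lasagne.layers.MaxPool2DLayer(l_conv1, pool_size=(2, 2))
    l_conv2 = lasagne.layers.Conv2DLayer(
        l_pool1, num_filters=32, filter_size=(5, 5),
        nonlinearity=lasagne.nonlinearities.rectify)
    l_pool2 = lasagne.layers.MaxPool2DLayer(l_conv2, pool_size=(2, 2))
    l_hidden1 = lasagne.layers.DenseLayer(
        l_pool2, num_units=256,
        nonlinearity=lasagne.nonlinearities.rectify)
    l_hidden1_dropout = lasagne.layers.DropoutLayer(l_hidden1, p=0.5)
    return lasagne.layers.DenseLayer(
        l_hidden1_dropout, num_units=output_dim,
        nonlinearity=lasagne.nonlinearities.softmax)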
In [11]:
def create_iter_functions(dataset, output_layer,
                          X_tensor_type=T.matrix,
                          batch_size=BATCH_SIZE,
                          learning_rate=LEARNING_RATE, momentum=MOMENTUM):
    """Create functions for training, validation and testing to iterate one
    epoch.
    """
    batch_index = T.iscalar('batch_index')
    X_batch = X_tensor_type('x')
    y_batch = T.ivector('y')
    batch_slice = slice(batch_index * batch_size,
                        (batch_index + 1) * batch_size)

    objective = lasagne.objectives.Objective(
        output_layer,
        loss_function=lasagne.objectives.categorical_crossentropy)
    loss_train = objective.get_loss(X_batch, target=y_batch)
    # deterministic=True disables dropout for evaluation.
    loss_eval = objective.get_loss(X_batch, target=y_batch,
                                   deterministic=True)

    pred = T.argmax(
        lasagne.layers.get_output(output_layer, X_batch, deterministic=True),
        axis=1)
    accuracy = T.mean(T.eq(pred, y_batch), dtype=theano.config.floatX)

    all_params = lasagne.layers.get_all_params(output_layer)
    updates = lasagne.updates.nesterov_momentum(
        loss_train, all_params, learning_rate, momentum)

    # Each function takes only a batch index; `givens` swaps the symbolic
    # batch for a slice of the shared dataset, so no data is copied per call.
    iter_train = theano.function(
        [batch_index], loss_train,
        updates=updates,
        givens={
            X_batch: dataset['X_train'][batch_slice],
            y_batch: dataset['y_train'][batch_slice],
        },
    )
    iter_valid = theano.function(
        [batch_index], [loss_eval, accuracy],
        givens={
            X_batch: dataset['X_valid'][batch_slice],
            y_batch: dataset['y_valid'][batch_slice],
        },
    )
    iter_test = theano.function(
        [batch_index], [loss_eval, accuracy],
        givens={
            X_batch: dataset['X_test'][batch_slice],
            y_batch: dataset['y_test'][batch_slice],
        },
    )
    return dict(
        train=iter_train,
        valid=iter_valid,
        test=iter_test,
    )
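The `givens` substitution is what keeps the whole dataset resident in shared variables (on the GPU, when one is used) and indexes into it by batch number, so only a scalar crosses the Python/Theano boundary per call. A stripped-down illustration of the same pattern, with made-up names:

# Toy example of the givens/shared-variable batching pattern used above.
data = theano.shared(np.arange(12, dtype=theano.config.floatX))
i = T.iscalar('i')
x = T.vector('x')
f = theano.function([i], x.sum(),
                    givens={x: data[i * 4:(i + 1) * 4]})
print(f(0))  # sums elements 0..3  -> 6.0
print(f(2))  # sums elements 8..11 -> 38.0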
In [12]:
def train(iter_funcs, dataset, batch_size=BATCH_SIZE):
    """Train the model on `dataset` with mini-batch training. Each
    mini-batch has `batch_size` examples.
    """
    num_batches_train = dataset['num_examples_train'] // batch_size
    num_batches_valid = dataset['num_examples_valid'] // batch_size

    for epoch in itertools.count(1):
        batch_train_losses = []
        for b in range(num_batches_train):
            batch_train_loss = iter_funcs['train'](b)
            batch_train_losses.append(batch_train_loss)
        avg_train_loss = np.mean(batch_train_losses)

        batch_valid_losses = []
        batch_valid_accuracies = []
        for b in range(num_batches_valid):
            batch_valid_loss, batch_valid_accuracy = iter_funcs['valid'](b)
            batch_valid_losses.append(batch_valid_loss)
            batch_valid_accuracies.append(batch_valid_accuracy)
        avg_valid_loss = np.mean(batch_valid_losses)
        avg_valid_accuracy = np.mean(batch_valid_accuracies)

        yield {
            'number': epoch,
            'train_loss': avg_train_loss,
            'valid_loss': avg_valid_loss,
            'valid_accuracy': avg_valid_accuracy,
        }
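Because `itertools.count(1)` makes this generator infinite, the caller decides when to stop. The cell below uses a fixed epoch budget; a patience-based early stop is just as easy to layer on top. A sketch, with an arbitrary patience value:

# Sketch: stop when validation loss has not improved for `patience` epochs.
best_valid = np.inf
patience = 5  # arbitrary choice for illustration
bad_epochs = 0
for epoch in train(iter_funcs, dataset):
    if epoch['valid_loss'] < best_valid:
        best_valid = epoch['valid_loss']
        bad_epochs = 0
    else:
        bad_epochs += 1
    if bad_epochs >= patience:
        break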
In [13]:
print("Loading data...")
with gzip.open(DATA_FILENAME, 'rb') as f:
data = pickle.load(f)
dataset = load_data(data)
print("Building model and compiling functions...")
output_layer = build_model(
input_height=dataset['input_height'],
input_width=dataset['input_width'],
output_dim=dataset['output_dim'],
)
iter_funcs = create_iter_functions(
dataset,
output_layer,
X_tensor_type=T.tensor4,
)
num_epochs = NUM_EPOCHS
print("Starting training...")
now = time.time()
try:
for epoch in train(iter_funcs, dataset):
print("Epoch {} of {} took {:.3f}s".format(
epoch['number'], num_epochs, time.time() - now))
now = time.time()
print(" training loss:\t\t{:.6f}".format(epoch['train_loss']))
print(" validation loss:\t\t{:.6f}".format(epoch['valid_loss']))
print(" validation accuracy:\t\t{:.2f} %%".format(
epoch['valid_accuracy'] * 100))
if epoch['number'] >= num_epochs:
break
except KeyboardInterrupt:
pass
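The compiled `test` function is never exercised above. A short sketch of running it over the test split after training, using the same batch-indexing convention as the validation loop:

# Evaluate the trained network on the held-out test set.
num_batches_test = dataset['num_examples_test'] // BATCH_SIZE
test_losses = []
test_accuracies = []
for b in range(num_batches_test):
    batch_loss, batch_accuracy = iter_funcs['test'](b)
    test_losses.append(batch_loss)
    test_accuracies.append(batch_accuracy)
print("  test loss:\t\t\t{:.6f}".format(np.mean(test_losses)))
print("  test accuracy:\t\t{:.2f} %".format(np.mean(test_accuracies) * 100))

To persist the learned weights, `lasagne.layers.get_all_param_values(output_layer)` returns plain numpy arrays that can be pickled, and `lasagne.layers.set_all_param_values` restores them into a freshly built network of the same shape.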