We use data from the standard MNIST set
In [1]:
require 'torch'
require 'nn'
require 'optim'
mnist = require 'mnist'
In [2]:
fullset = mnist.traindataset()
testset = mnist.testdataset()
In [3]:
fullset
Out[3]:
We inspect the data just to get an idea of the content
In [4]:
itorch.image(fullset.data[1])
Out[4]:
In [5]:
fullset.label[1]
Out[5]:
We can split the full dataset into a training component and a validation component, which will be used to tune hyperparameters and decide when to stop training.
While doing so, we convert the data to double precision
In [6]:
trainset = {
    size = 50000,
    data = fullset.data[{{1,50000}}]:double(),
    label = fullset.label[{{1,50000}}]
}
In [7]:
validationset = {
    size = 10000,
    data = fullset.data[{{50001,60000}}]:double(),
    label = fullset.label[{{50001,60000}}]
}
We use a model with a single hidden layer and a hyperbolic tangent activation, and we ask the output to be the same as the input (an autoencoder). Through this compression, we hope to learn meaningful features
In [8]:
layer_size = 49
In [9]:
model = nn.Sequential()
In [10]:
model:add(nn.Reshape(28*28))
model:add(nn.Linear(28*28, layer_size))
model:add(nn.Tanh())
model:add(nn.Linear(layer_size, 28*28))
model:add(nn.Reshape(28, 28))
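As a quick sanity check (not in the original notebook), a forward pass through the untrained model should map a 28×28 image back to a 28×28 reconstruction:

-- sanity check: the autoencoder preserves the 28x28 image shape
sample = trainset.data[1]            -- a 28x28 DoubleTensor
print(model:forward(sample):size())  -- 28x28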
We also define a loss function: the mean squared error between the reconstruction and the input
In [11]:
criterion = nn.MSECriterion()
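As a quick check of what this criterion computes, here is an illustrative example (the tensors a and b below are not part of the notebook); nn.MSECriterion averages the squared element-wise differences:

-- MSECriterion averages the squared element-wise differences
a = torch.Tensor{1, 2, 3}
b = torch.Tensor{1, 0, 0}
print(criterion:forward(a, b))  -- (0 + 4 + 9) / 3 ≈ 4.3333
print((a - b):pow(2):mean())    -- the same value, computed by hand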
We will make use of the optim package to train the network. optim contains several optimization algorithms. All of these algorithms assume the same interface: a closure feval that, given a parameter vector x, returns the loss at x and the gradient of the loss with respect to x; the current parameter vector x itself; and a table of algorithm-specific options (learning rate, momentum, and so on).
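To make this interface concrete, here is a minimal, self-contained sketch that minimizes a one-dimensional quadratic with optim.sgd (the toy_* names are purely illustrative and not part of the notebook):

-- toy example of the optim interface: minimize f(v) = (v - 3)^2
toy_x = torch.Tensor{0}            -- parameter vector, starting at 0
toy_config = {learningRate = 0.1}  -- algorithm-specific options

toy_feval = function(v)
    local loss = (v[1] - 3)^2                  -- f(v)
    local grad = torch.Tensor{2 * (v[1] - 3)}  -- df/dv
    return loss, grad
end

for i = 1,100 do
    optim.sgd(toy_feval, toy_x, toy_config)    -- updates toy_x in place
end
print(toy_x[1])                    -- close to 3

In our case, feval will compute the loss and gradient of the autoencoder on a mini-batch, and x will be the flattened parameter vector returned by model:getParameters().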
We first set up the optimization hyperparameters and then define a step function that performs training for a single epoch and returns the loss accumulated over all mini-batches
In [12]:
sgd_params = {
    learningRate = 1e-2,       -- base step size
    learningRateDecay = 1e-4,  -- learning rate decay per evaluation
    weightDecay = 1e-3,        -- L2 regularization
    momentum = 1e-4            -- momentum coefficient
}
In [13]:
x, dl_dx = model:getParameters() -- flattened parameters and their gradients
In [14]:
step = function(batch_size)
    local current_loss = 0
    local shuffle = torch.randperm(trainset.size)
    batch_size = batch_size or 200
    for t = 1,trainset.size,batch_size do
        -- set up the inputs for this mini-batch
        -- no need to set up targets, since they are the same as the inputs
        local size = math.min(t + batch_size - 1, trainset.size) - t + 1
        local inputs = torch.Tensor(size, 28, 28)
        for i = 1,size do
            inputs[i] = trainset.data[shuffle[t + i - 1]]
        end
        local feval = function(x_new)
            -- reset the flattened parameters and zero the gradients
            if x ~= x_new then x:copy(x_new) end
            dl_dx:zero()
            -- forward and backward pass over the mini-batch
            local loss = criterion:forward(model:forward(inputs), inputs)
            model:backward(inputs, criterion:backward(model.output, inputs))
            return loss, dl_dx
        end
        local _, fs = optim.sgd(feval, x, sgd_params)
        -- fs is a table containing the value of the loss function
        -- (just one value for SGD)
        current_loss = current_loss + fs[1]
    end
    return current_loss
end
Before starting the training, we also need to be able to evaluate the loss on a separate dataset, in order to decide when to stop
In [15]:
eval = function(dataset, batch_size)
    local loss = 0
    batch_size = batch_size or 200
    for i = 1,dataset.size,batch_size do
        local size = math.min(i + batch_size - 1, dataset.size) - i + 1
        local inputs = dataset.data[{{i,i+size-1}}]
        -- a single forward pass is enough to compute the reconstruction loss
        local outputs = model:forward(inputs)
        loss = loss + criterion:forward(outputs, inputs)
    end
    return loss
end
We are now ready to perform the actual training. After each epoch, we evaluate the loss on the validation dataset, in order to decide whether to stop
In [16]:
max_iters = 30
In [17]:
do
    local last_loss = math.huge  -- so the first epoch is never counted as increasing
    local increasing = 0
    local threshold = 1 -- how many consecutive epochs of increasing validation loss we allow
    for i = 1,max_iters do
        local loss = step()
        print(string.format('Epoch: %d Current loss: %.4f', i, loss))
        local validation_loss = eval(validationset)
        print(string.format('Loss on the validation set: %.4f', validation_loss))
        if last_loss < validation_loss then
            increasing = increasing + 1
            if increasing > threshold then break end
        else
            increasing = 0
        end
        last_loss = validation_loss
    end
end
Out[17]:
Let us test the model loss on the test set
In [18]:
testset.data = testset.data:double()
In [19]:
eval(testset)
Out[19]:
We can try to see which features we have actually learned. To do so, we can take a basis vector in the feature space and decode it back to the image space using the decoder part of the model
In [20]:
linear = model.modules[4] -- the decoding layer, Linear(layer_size, 28*28)
In [21]:
vec = torch.zeros(layer_size)
vec[1] = 1
In [22]:
translate = nn.Sequential()
translate:add(linear)
translate:add(nn.Reshape(28, 28))
In [23]:
itorch.image(translate:forward(vec))
Out[23]:
We can do the same for all basis vectors at once, by feeding the identity matrix as a batch
In [24]:
basis = torch.eye(layer_size)
In [25]:
itorch.image(translate:forward(basis))
Out[25]: