We use data from the standard MNIST dataset of handwritten digits.
In [1]:
require 'torch'
require 'nn'
require 'optim'
mnist = require 'mnist'
In [2]:
fullset = mnist.traindataset()
testset = mnist.testdataset()
In [3]:
fullset
Out[3]:
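As a quick orientation (an illustrative check; it relies on the size, data and label fields of the mnist package, which we also use below), we can print the shape of the training set:

print(fullset.size)          -- 60000 examples
print(fullset.data:size())   -- 60000x28x28 tensor of pixel values
print(fullset.label:size())  -- 60000 labels, one digit (0-9) per image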
We inspect the data just to get an idea of its content.
In [4]:
itorch.image(fullset.data[1])
Out[4]:
In [5]:
fullset.label[1]
Out[5]:
We split the full dataset into a training component and a validation component; the validation set will be used to tune hyperparameters and to decide when to stop training. While doing so, we convert the data to double precision.
In [6]:
trainset = {
    size = 50000,
    data = fullset.data[{{1,50000}}]:double(),
    label = fullset.label[{{1,50000}}]
}
In [7]:
validationset = {
    size = 10000,
    data = fullset.data[{{50001,60000}}]:double(),
    label = fullset.label[{{50001,60000}}]
}
We use a model with a single hidden layer with a hyperbolic tangent activation and a log-softmax output. A first Reshape layer flattens the input - a 28x28 square of pixels - so that it fits into the linear layer.
In [8]:
model = nn.Sequential()
In [9]:
model:add(nn.Reshape(28*28))
model:add(nn.Linear(28*28, 30))
model:add(nn.Tanh())
model:add(nn.Linear(30, 10))
model:add(nn.LogSoftMax())
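As a quick sanity check (purely illustrative; the network is still untrained at this point), we can push a single training image through the model and verify that we get 10 log-probabilities which exponentiate to a probability distribution:

out = model:forward(trainset.data[1])
print(out:size())            -- 10 log-probabilities, one per digit class
print(torch.exp(out):sum())  -- approximately 1, since exp of a log-softmax sums to one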
We also define a loss function, using the negative log-likelihood criterion.
In [10]:
criterion = nn.ClassNLLCriterion()
As explained in the documentation, the NLL criterion requires the output of the neural network to contain the log-probabilities of each class, which is why we added the LogSoftMax layer above.
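As an aside (not used in this notebook), the same effect can be obtained by ending the network with the plain linear layer and letting the criterion handle the log-softmax: nn.CrossEntropyCriterion combines LogSoftMax and ClassNLLCriterion internally.

-- equivalent alternative: drop the nn.LogSoftMax() layer above and use
-- criterion = nn.CrossEntropyCriterion()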
We will make use of the optim package to train the network. optim contains several optimization algorithms, and they all share the same interface: they take a closure feval that, given a parameter vector x, returns the value of the loss at x and its gradient with respect to x; the current parameter vector itself; and a table of algorithm-specific settings such as the learning rate and momentum. A toy illustration of this interface is sketched below; after that, we set the SGD hyperparameters, flatten the model parameters, and define a step function that performs training for a single epoch and returns the average loss.
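Here is a minimal, made-up example of this interface (not part of the training code): we use optim.sgd to minimize the one-dimensional function f(x) = (x - 3)^2.

toy_x = torch.Tensor{0}                        -- starting point
toy_state = {learningRate = 0.1}
toy_feval = function(p)
    local loss = (p[1] - 3)^2                  -- value of f at p
    local grad = torch.Tensor{2 * (p[1] - 3)}  -- gradient of f at p
    return loss, grad
end
for i = 1,100 do
    optim.sgd(toy_feval, toy_x, toy_state)     -- one SGD step: toy_x is updated in place
end
print(toy_x[1])                                -- close to 3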
In [11]:
sgd_params = {
    learningRate = 1e-2,
    learningRateDecay = 1e-4,
    weightDecay = 1e-3,
    momentum = 1e-4
}
In [12]:
x, dl_dx = model:getParameters()
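getParameters flattens all the model's weights and biases into the single vector x, with dl_dx the corresponding flat gradient. As a quick check (illustrative), the parameter count should be 28*28*30 + 30 + 30*10 + 10 = 23860:

print(x:size(1))      -- 23860 parameters
print(dl_dx:size(1))  -- gradient vector of the same size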
In [13]:
step = function(batch_size)
    local current_loss = 0
    local count = 0
    local shuffle = torch.randperm(trainset.size)
    batch_size = batch_size or 200
    for t = 1,trainset.size,batch_size do
        -- set up inputs and targets for this mini-batch
        local size = math.min(t + batch_size - 1, trainset.size) - t + 1
        local inputs = torch.Tensor(size, 28, 28)
        local targets = torch.Tensor(size)
        for i = 1,size do
            local input = trainset.data[shuffle[t + i - 1]]
            local target = trainset.label[shuffle[t + i - 1]]
            -- if target == 0 then target = 10 end
            inputs[i] = input
            targets[i] = target
        end
        -- ClassNLLCriterion expects class indices in 1..10, while MNIST labels are 0..9
        targets:add(1)
        local feval = function(x_new)
            -- reset data
            if x ~= x_new then x:copy(x_new) end
            dl_dx:zero()
            -- perform mini-batch gradient descent
            local loss = criterion:forward(model:forward(inputs), targets)
            model:backward(inputs, criterion:backward(model.output, targets))
            return loss, dl_dx
        end
        _, fs = optim.sgd(feval, x, sgd_params)
        -- fs is a table containing the value of the loss function
        -- (just 1 value for the SGD optimization)
        count = count + 1
        current_loss = current_loss + fs[1]
    end
    -- normalize loss over the number of mini-batches
    return current_loss / count
end
Before starting the training, we also need to be able to evaluate accuracy on a separate dataset, in order to decide when to stop.
In [14]:
eval = function(dataset, batch_size)
    local count = 0
    batch_size = batch_size or 200
    for i = 1,dataset.size,batch_size do
        local size = math.min(i + batch_size - 1, dataset.size) - i + 1
        local inputs = dataset.data[{{i,i+size-1}}]
        local targets = dataset.label[{{i,i+size-1}}]:long()
        local outputs = model:forward(inputs)
        local _, indices = torch.max(outputs, 2)
        -- predicted classes are in 1..10; shift back to the 0..9 digit range
        indices:add(-1)
        local guessed_right = indices:eq(targets):sum()
        count = count + guessed_right
    end
    return count / dataset.size
end
We are now ready to perform the actual training. After each epoch, we evaluate the accuracy on the validation dataset, in order to decide whether to stop
In [15]:
max_iters = 30
In [16]:
do
    local last_accuracy = 0
    local decreasing = 0
    local threshold = 1 -- how many decreasing epochs we allow
    for i = 1,max_iters do
        local loss = step()
        print(string.format('Epoch: %d Current loss: %4f', i, loss))
        local accuracy = eval(validationset)
        print(string.format('Accuracy on the validation set: %4f', accuracy))
        if accuracy < last_accuracy then
            if decreasing > threshold then break end
            decreasing = decreasing + 1
        else
            decreasing = 0
        end
        last_accuracy = accuracy
    end
end
Out[16]:
Let us test the model accuracy on the test set. First we convert the test images to double precision, since that is what the model expects.
In [17]:
testset.data = testset.data:double()
In [18]:
eval(testset)
Out[18]:
The paths module can be used to manipulate filesystem paths
In [19]:
paths = require 'paths'
In [20]:
filename = paths.concat(paths.cwd(), 'model.net')
In [21]:
filename
Out[21]:
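Besides concat and cwd, the paths package provides a few other helpers worth knowing (a quick illustration, not needed for the rest of the notebook):

print(paths.basename(filename))  -- 'model.net'
print(paths.dirname(filename))   -- the current working directory
print(paths.filep(filename))     -- false until we actually save the model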
We can then save our model to file like this
In [22]:
help(torch.save)
Out[22]:
In [23]:
torch.save(filename, model)
Out[23]:
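torch.save serializes the whole model object, including its parameters, in a binary format by default. An ASCII format is also available if a human-readable (but larger) file is preferred; for example, with a hypothetical second filename:

-- torch.save(paths.concat(paths.cwd(), 'model.ascii.net'), model, 'ascii')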
Let us check that restoring from file works as expected
In [24]:
help(torch.load)
Out[24]:
In [25]:
model1 = torch.load(filename)
We define a second evaluation function that uses the loaded model, processing one image at a time.
In [26]:
eval1 = function(dataset)
    local count = 0
    for i = 1,dataset.size do
        local output = model1:forward(dataset.data[i])
        local _, index = torch.max(output, 1) -- index of the largest log-probability
        -- class indices run from 1 to 10, so the predicted digit is index - 1
        local digit = index[1] - 1
        if digit == dataset.label[i] then count = count + 1 end
    end
    return count / dataset.size
end
In [27]:
eval1(testset)
Out[27]: