Background reading: http://karpathy.github.io/2015/05/21/rnn-effectiveness/
This file trains a character-level multi-layer RNN on text data.
The code is based on the implementation in https://github.com/oxford-cs-ml-2015/practical6, modified to add multi-layer support, GPU support, and many other common model/optimization bells and whistles. The practical6 code is in turn based on https://github.com/wojciechz/learning_to_execute, which is in turn based on other Torch code, etc. (long lineage).
In [1]:
require 'crnn'
-- standard Torch packages used below
require 'nn'
require 'nngraph'
require 'optim'
require 'lfs'
CharSplitLMMinibatchLoader = require 'crnn.util.CharSplitLMMinibatchLoader'
model_utils = require 'crnn.util.model_utils'
LSTM = require 'crnn.model.LSTM'
In [2]:
opt = {
    -- data directory. Should contain the file input.txt with the input data
    data_dir = 'crnn_data/tinyshakespeare',
    -- model parameters
    -- size of LSTM internal state
    rnn_size = 128,
    -- number of layers in the LSTM
    num_layers = 2,
    -- for now only lstm is supported. keep fixed
    model = 'lstm',
    -- optimization parameters
    -- learning rate
    learning_rate = 2e-3,
    -- learning rate decay
    learning_rate_decay = 0.97,
    -- in number of epochs, when to start decaying the learning rate
    learning_rate_decay_after = 10,
    -- decay rate for rmsprop
    decay_rate = 0.95,
    -- dropout to use just before the classifier. 0 = no dropout
    dropout = 0,
    -- number of timesteps to unroll for
    seq_length = 50,
    -- number of sequences to train on in parallel
    batch_size = 50,
    -- number of full passes through the training data
    max_epochs = 30,
    -- clip gradients at this value
    grad_clip = 5,
    -- fraction of data that goes into the train set
    train_frac = 0.95,
    -- fraction of data that goes into the validation set
    val_frac = 0.05,
    -- bookkeeping
    -- torch manual random number generator seed
    seed = 123,
    -- how many steps/minibatches between printing out the loss
    print_every = 1,
    -- how many iterations between evaluations on the validation data
    eval_val_every = 1000,
    -- output directory where checkpoints get written
    checkpoint_dir = 'cv',
    -- filename to autosave the checkpoint to. Will be inside checkpoint_dir/
    savefile = 'lstm',
    -- GPU/CPU
    -- which gpu to use. -1 = use CPU
    gpuid = -1,
}
torch.setnumthreads(2)
torch.manualSeed(opt.seed)
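-- note: the defaults above run on the CPU; if a GPU were selected via
-- opt.gpuid >= 0, the CUDA packages would have to be loaded first,
-- along these lines (a no-op with gpuid = -1):
if opt.gpuid >= 0 then
    require 'cutorch'
    require 'cunn'
    cutorch.setDevice(opt.gpuid + 1) -- torch indexes GPUs from 1
    cutorch.manualSeed(opt.seed)
end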
-- train / val / test split for data, in fractions
test_frac = math.max(0, 1 - opt.train_frac - opt.val_frac)
split_sizes = {opt.train_frac, opt.val_frac, test_frac}
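-- with the defaults above, test_frac = math.max(0, 1 - 0.95 - 0.05) = 0,
-- i.e. no test split is held out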
The data will be split as follows:
In [3]:
print(string.format('Train Split = %g\nValidation Split = %g\nTest Split = %g\n',unpack(split_sizes)))
Out[3]:
In [4]:
-- create the data loader class
loader = CharSplitLMMinibatchLoader.create(opt.data_dir, opt.batch_size, opt.seq_length, split_sizes)
-- the number of distinct characters
vocab_size = loader.vocab_size
print('vocab size: ' .. vocab_size)
-- make sure output directory exists
if not path.exists(opt.checkpoint_dir) then
    lfs.mkdir(opt.checkpoint_dir)
end
In [ ]:
-- define the model: prototypes for one timestep, then clone them in time
protos = {}
print('creating an LSTM with ' .. opt.num_layers .. ' layers')
protos.rnn = LSTM.lstm(vocab_size, opt.rnn_size, opt.num_layers, opt.dropout)
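-- the resulting nngraph module maps {x_t, c_1, h_1, ..., c_L, h_L} (the current
-- input characters plus the previous cell/hidden state of every layer) to
-- {c_1', h_1', ..., c_L', h_L', log-probabilities over the vocabulary}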
-- the initial state of the cell/hidden states
init_state = {}
for L=1,opt.num_layers do
    local h_init = torch.zeros(opt.batch_size, opt.rnn_size)
    if opt.gpuid >= 0 then h_init = h_init:cuda() end
    table.insert(init_state, h_init:clone())
    table.insert(init_state, h_init:clone())
end
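-- with num_layers = 2, init_state now holds 4 zero tensors of size
-- batch_size x rnn_size: the cell state and hidden state of each layer at t = 0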
-- training criterion (negative log likelihood)
protos.criterion = nn.ClassNLLCriterion()
In [ ]:
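-- render the forward graph of the single-timestep prototype to files starting
-- with 'CharLstm_color4' (handy for inspecting how the LSTM is wired up)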
graph.dot(protos.rnn.fg, 'CharLstm_color', 'CharLstm_color4')
In [ ]:
-- put the above things into one flattened parameters tensor
params, grad_params = protos.rnn:getParameters()
In [ ]:
-- initialization
params:uniform(-0.08, 0.08) -- small numbers uniform
print('number of parameters in the model: ' .. params:nElement())
In [ ]:
clones = {}
for name,proto in pairs(protos) do
    print('cloning ' .. name)
    clones[name] = crnn.clone_many_times(proto, opt.seq_length)
end
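-- the seq_length clones above all reuse the single flattened parameter tensor;
-- only the activations are duplicated. As a rough sketch of the idea (a
-- hypothetical helper, not the actual crnn.clone_many_times implementation):
function clone_sharing_params(net, T)
    local clones = {}
    for t = 1, T do
        -- clone(...) with share arguments copies the module structure but
        -- points the copy's weight/gradient tensors at the original's storage
        clones[t] = net:clone('weight', 'bias', 'gradWeight', 'gradBias')
    end
    return clones
end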
In [ ]:
-- evaluate the loss over an entire split
function eval_split(split_index, max_batches)
    print('evaluating loss over split index ' .. split_index)
    local n = loader.split_sizes[split_index]
    if max_batches ~= nil then n = math.min(max_batches, n) end

    loader:reset_batch_pointer(split_index) -- move batch iteration pointer for this split to front
    local loss = 0
    local rnn_state = {[0] = init_state}
    for i = 1,n do -- iterate over batches in the split
        -- fetch a batch
        local x, y = loader:next_batch(split_index)
        if opt.gpuid >= 0 then -- ship the input arrays to GPU
            -- have to convert to float because integers can't be cuda()'d
            x = x:float():cuda()
            y = y:float():cuda()
        end
        -- forward pass
        for t=1,opt.seq_length do
            clones.rnn[t]:evaluate() -- for dropout proper functioning
            local lst = clones.rnn[t]:forward{x[{{}, t}], unpack(rnn_state[t-1])}
            rnn_state[t] = {}
            for i=1,#init_state do table.insert(rnn_state[t], lst[i]) end
            prediction = lst[#lst]
            loss = loss + clones.criterion[t]:forward(prediction, y[{{}, t}])
        end
        -- carry over lstm state
        rnn_state[0] = rnn_state[#rnn_state]
        print(i .. '/' .. n .. '...')
    end

    loss = loss / opt.seq_length / n
    return loss
end
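-- example usage: evaluate only the first two validation minibatches
-- (split index 2 is the validation split; the optional second argument
-- caps the number of batches)
print('quick validation check: ' .. eval_split(2, 2))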
In [ ]:
-- do fwd/bwd and return loss, grad_params
local init_state_global = crnn.clone_list(init_state)
function feval(x)
    if x ~= params then
        params:copy(x)
    end
    grad_params:zero()

    ------------------ get minibatch -------------------
    local x, y = loader:next_batch(1)
    if opt.gpuid >= 0 then -- ship the input arrays to GPU
        -- have to convert to float because integers can't be cuda()'d
        x = x:float():cuda()
        y = y:float():cuda()
    end

    ------------------- forward pass -------------------
    local rnn_state = {[0] = init_state_global}
    local predictions = {} -- softmax outputs
    local loss = 0
    for t=1,opt.seq_length do
        clones.rnn[t]:training() -- make sure we are in correct mode (this is cheap, sets flag)
        local lst = clones.rnn[t]:forward{x[{{}, t}], unpack(rnn_state[t-1])}
        rnn_state[t] = {}
        for i=1,#init_state do table.insert(rnn_state[t], lst[i]) end -- extract the state, without output
        predictions[t] = lst[#lst] -- last element is the prediction
        loss = loss + clones.criterion[t]:forward(predictions[t], y[{{}, t}])
    end
    loss = loss / opt.seq_length

    ------------------ backward pass -------------------
    -- initialize gradient at time t to be zeros (there's no influence from future)
    local drnn_state = {[opt.seq_length] = crnn.clone_list(init_state, true)} -- true also zeros the clones
    for t=opt.seq_length,1,-1 do
        -- backprop through loss, and softmax/linear
        local doutput_t = clones.criterion[t]:backward(predictions[t], y[{{}, t}])
        table.insert(drnn_state[t], doutput_t)
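        -- drnn_state[t] now holds the gradients flowing back into the c/h state
        -- of each layer from the future, plus, as its last entry, the gradient of
        -- the loss w.r.t. the log-probabilities, matching the clone's output order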
        local dlst = clones.rnn[t]:backward({x[{{}, t}], unpack(rnn_state[t-1])}, drnn_state[t])
        drnn_state[t-1] = {}
        for k,v in pairs(dlst) do
            if k > 1 then -- k == 1 is gradient on x, which we don't need
                -- note we do k-1 because first item is dembeddings, and then follow the
                -- derivatives of the state, starting at index 2. I know...
                drnn_state[t-1][k-1] = v
            end
        end
    end

    ------------------------ misc ----------------------
    -- transfer final state to initial state (BPTT)
    init_state_global = rnn_state[#rnn_state] -- NOTE: I don't think this needs to be a clone, right?
    -- clip gradient element-wise
    grad_params:clamp(-opt.grad_clip, opt.grad_clip)
    return loss, grad_params
end
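-- optional sanity check: with the fresh uniform initialization the model predicts
-- a near-uniform distribution over characters, so a single feval call should
-- return a per-timestep loss close to math.log(vocab_size)
local check_loss = feval(params)
print(string.format('initial loss %.4f vs. log(vocab_size) %.4f', check_loss, math.log(vocab_size)))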
In [ ]:
-- start optimization here
train_losses = {}
val_losses = {}
local optim_state = {learningRate = opt.learning_rate, alpha = opt.decay_rate}
local iterations = opt.max_epochs * loader.ntrain
local iterations_per_epoch = loader.ntrain
local loss0 = nil
for i = 1, iterations do
    local epoch = i / iterations_per_epoch

    local timer = torch.Timer()
    local _, loss = optim.rmsprop(feval, params, optim_state)
    local time = timer:time().real

    local train_loss = loss[1] -- the loss is inside a list, pop it
    train_losses[i] = train_loss

    -- exponential learning rate decay
    if i % loader.ntrain == 0 and opt.learning_rate_decay < 1 then
        if epoch >= opt.learning_rate_decay_after then
            local decay_factor = opt.learning_rate_decay
            optim_state.learningRate = optim_state.learningRate * decay_factor -- decay it
            print('decayed learning rate by a factor ' .. decay_factor .. ' to ' .. optim_state.learningRate)
        end
    end

    -- every now and then or on last iteration
    if i % opt.eval_val_every == 0 or i == iterations then
        -- evaluate loss on validation data
        local val_loss = eval_split(2) -- 2 = validation
        val_losses[i] = val_loss

        local savefile = string.format('%s/lm_%s_epoch%.2f_%.4f.t7', opt.checkpoint_dir, opt.savefile, epoch, val_loss)
        print('saving checkpoint to ' .. savefile)
        local checkpoint = {}
        checkpoint.protos = protos
        checkpoint.opt = opt
        checkpoint.train_losses = train_losses
        checkpoint.val_loss = val_loss
        checkpoint.val_losses = val_losses
        checkpoint.i = i
        checkpoint.epoch = epoch
        checkpoint.vocab = loader.vocab_mapping
        torch.save(savefile, checkpoint)
    end

    if i % opt.print_every == 0 then
        print(string.format("%d/%d (epoch %.3f), train_loss = %6.8f, grad/param norm = %6.4e, time/batch = %.2fs", i, iterations, epoch, train_loss, grad_params:norm() / params:norm(), time))
    end

    if i % 10 == 0 then collectgarbage() end

    -- handle early stopping if things are going really bad
    if loss0 == nil then loss0 = loss[1] end
    if loss[1] > loss0 * 3 then
        print('loss is exploding, aborting.')
        break -- halt
    end
end
In [ ]:
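-- afterwards, the collected losses can be inspected, e.g. by plotting the
-- training curve (assumes the standard torch gnuplot package is available)
require 'gnuplot'
gnuplot.plot(torch.Tensor(train_losses))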