In [1]:
%reload_ext autoreload
%autoreload 2
%matplotlib inline
In [2]:
from fastai.io import *
from fastai.conv_learner import *
from fastai.column_data import *
We're going to download the collected works of Nietzsche to use as our data for this class.
In [165]:
# PATH = Path('data/nietzsche/')
PATH = 'data/nietzsche/'
get_data("https://s3.amazonaws.com/text-datasets/nietzsche.txt", f'{PATH}nietzsche.txt')
text = open(f'{PATH}nietzsche.txt').read()
print('corpus length:', len(text))
In [4]:
text[:400]
Out[4]:
In [5]:
chars = sorted(list(set(text)))
vocab_size = len(chars) + 1
print('total chars', vocab_size)
Sometimes it's useful to have a zero value in the dataset, e.g. for padding.
In [6]:
chars.insert(0, '\0')
''.join(chars[1:-5])
Out[6]:
Map from chars to indices and back again:
In [7]:
char_indices = {c: i for i, c in enumerate(chars)}
indices_char = {i: c for i, c in enumerate(chars)}
idx will be the data we use from now on – it simply converts all characters to their index (based on the mapping above).
In [8]:
idx = [char_indices[c] for c in text]
idx[:10]
Out[8]:
In [9]:
''.join(indices_char[i] for i in idx[:70])
Out[9]:
Create four lists, each taking every 3rd character but starting at the 0th, 1st, 2nd, and 3rd characters respectively.
In [10]:
cs = 3
c1_dat = [idx[i] for i in range(0, len(idx)-cs, cs)] # 1st char of each group of 3
c2_dat = [idx[i+1] for i in range(0, len(idx)-cs, cs)] # 2nd char
c3_dat = [idx[i+2] for i in range(0, len(idx)-cs, cs)] # 3rd char
c4_dat = [idx[i+3] for i in range(0, len(idx)-cs, cs)] # 4th char (the one we'll predict)
Our inputs:
In [11]:
x1 = np.stack(c1_dat)
x2 = np.stack(c2_dat)
x3 = np.stack(c3_dat)
Our outputs:
In [12]:
y = np.stack(c4_dat)
The first 4 inputs and outputs:
In [13]:
x1[:4], x2[:4], x3[:4]
Out[13]:
In [14]:
y[:4]
Out[14]:
In [15]:
x1.shape, y.shape
Out[15]:
Pick a size for our hidden state:
In [16]:
n_hidden = 256
The number of latent factors to create (i.e. the size of the embedding matrix):
In [17]:
n_fac = 42 # about half the number of our characters
In [19]:
'0.3' in torch.__version__
Out[19]:
In [20]:
class Char3Model(nn.Module):
def __init__(self, vocab_size, n_fac):
super().__init__()
self.e = nn.Embedding(vocab_size, n_fac) # embedding
# the 'green arrow' from our diagram – the layer operation from input to hidden
self.l_in = nn.Linear(n_fac, n_hidden)
# the 'orange arrow' from our diagram – the layer operation from hidden to hidden
self.l_hidden = nn.Linear(n_hidden, n_hidden)
# the 'blue arrow' from our diagram – the layer operation from hidden to output
self.l_out = nn.Linear(n_hidden, vocab_size)
def forward(self, c1, c2, c3):
in1 = F.relu(self.l_in(self.e(c1)))
in2 = F.relu(self.l_in(self.e(c2)))
in3 = F.relu(self.l_in(self.e(c3)))
if '0.3' in torch.__version__:
h = V(torch.zeros(in1.size()).cuda())
h = F.tanh(self.l_hidden(h+in1))
h = F.tanh(self.l_hidden(h+in2))
h = F.tanh(self.l_hidden(h+in3))
else:
h = torch.zeros(in1.size()).cuda() # I don't think I have to wrap as Variable since this is pytorch 0.4, no?
h = torch.tanh(self.l_hidden(h + in1))
h = torch.tanh(self.l_hidden(h + in2))
h = torch.tanh(self.l_hidden(h + in3))
return F.log_softmax(self.l_out(h), dim=-1)
In [37]:
mdata = ColumnarModelData.from_arrays('.', [-1], np.stack([x1,x2,x3], axis=1), y, bs=512)
model = Char3Model(vocab_size, n_fac).cuda()
In [38]:
it = iter(mdata.trn_dl)
*xs,yt = next(it)
# tensor = model(*xs)
tensor = model(*V(xs))
In [39]:
optimizer = optim.Adam(model.parameters(), 1e-2)
In [40]:
set_lrs(optimizer, 1e-3)
fit(model, mdata, 1, optimizer, F.nll_loss)
Out[40]:
In [41]:
set_lrs(optimizer, 1e-3)
fit(model, mdata, 1, optimizer, F.nll_loss)
Out[41]:
In [35]:
def get_next(inp):
"""
Takes a 3-char string.
Turns it into a tensor of the characters' indices.
Passes that tensor to the model.
Does an argmax to get the predicted char index, then converts it back to a char.
"""
idxs = T(np.array([char_indices[c] for c in inp]))
# pred = model(*idxs)
pred = model(*VV(idxs))
i = np.argmax(to_np(pred))
return chars[i]
In [25]:
get_next('y. '), get_next('ppl'), get_next(' th'), get_next('and')
Out[25]:
This is the size of our unrolled RNN:
In [43]:
cs = 8
For every position in the text, take the sequence of the 8 characters starting there (so the sequences overlap). These will be the 8 inputs to our model.
In [44]:
c_in_dat = [[idx[i + j] for i in range(cs)] for j in range(len(idx) - cs)]
In [45]:
c_out_dat = [idx[j + cs] for j in range(len(idx) - cs)]
In [46]:
xs = np.stack(c_in_dat, axis=0); xs.shape
Out[46]:
In [47]:
y = np.stack(c_out_dat); y.shape
Out[47]:
So each column below is one series of 8 characters from the text.
In [48]:
xs[:cs, :cs]
Out[48]:
They're overlapping. So after '[42, 29, 30, 25, 27, 29, 1, 1]' comes '1', and after '[29, 30, 25, 27, 29, 1, 1, 1]' comes '43', and so on. The nth row is the same as the nth column.
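A quick sanity check of that claim (not in the original notebook, just using the xs and cs defined above): the top-left cs-by-cs block equals its own transpose, since xs[j, i] = idx[i + j].

# sanity check: row n equals column n in this block, because xs[j, i] = idx[i + j]
assert np.array_equal(xs[:cs, :cs], xs[:cs, :cs].T)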
...and this is the next character after each sequence:
In [49]:
y[:cs]
Out[49]:
In [50]:
val_idx = get_cv_idxs(len(idx) - cs - 1)
In [51]:
mdata = ColumnarModelData.from_arrays('.', val_idx, xs, y, bs=512)
In [52]:
class CharLoopModel(nn.Module):
"""This is an RNN."""
def __init__(self, vocab_size, n_fac):
super().__init__()
self.e = nn.Embedding(vocab_size, n_fac)
self.l_in = nn.Linear(n_fac, n_hidden)
self.l_hidden = nn.Linear(n_hidden, n_hidden)
self.l_out = nn.Linear(n_hidden, vocab_size)
def forward(self, *cs):
bs = cs[0].size(0)
# h = torch.zeros(bs, n_hidden).cuda()
h = V(torch.zeros(bs, n_hidden).cuda())
for c in cs:
# inp = torch.tanh(self.l_in(self.e(c))) # the torch.tanh vs F.tanh warning didn't pop
# h = torch.tanh(self.l_hidden(h + inp)) # up on Mac, but did on Linux-gpu. Odd.
inp = F.relu(self.l_in(self.e(c)))
h = F.tanh(self.l_hidden(h+inp))
return F.log_softmax(self.l_out(h), dim=-1)
In [53]:
model = CharLoopModel(vocab_size, n_fac).cuda()
optimizer = optim.Adam(model.parameters(), 1e-2)
In [54]:
fit(model, mdata, 1, optimizer, F.nll_loss)
Out[54]:
In [55]:
set_lrs(optimizer, 1e-3)
fit(model, mdata, 1, optimizer, F.nll_loss)
Out[55]:
The input and hidden states represent qualitatively different types of information, so adding them together can potentially lose information. Instead we can concatenate them together.
In [56]:
class CharLoopConcatModel(nn.Module):
def __init__(self, vocab_size, n_fac):
super().__init__()
self.e = nn.Embedding(vocab_size, n_fac)
self.l_in = nn.Linear(n_fac + n_hidden, n_hidden)
self.l_hidden = nn.Linear(n_hidden, n_hidden)
self.l_out = nn.Linear(n_hidden, vocab_size)
def forward(self, *cs):
bs = cs[0].size(0)
# h = torch.zeros(bs, n_hidden).cuda()
h = V(torch.zeros(bs, n_hidden).cuda())
for c in cs:
inp = torch.cat((h, self.e(c)), 1)
inp = F.relu(self.l_in(inp))
# h = torch.tanh(self.l_hidden(inp))
h = F.tanh(self.l_hidden(inp))
return F.log_softmax(self.l_out(h), dim=-1)
In [57]:
model = CharLoopConcatModel(vocab_size, n_fac).cuda()
optimizer = optim.Adam(model.parameters(), 1e-3)
In [58]:
it = iter(mdata.trn_dl)
*xs,yt = next(it)
# t = model(*xs)
t = model(*V(xs))
In [59]:
xs[0].size(0)
Out[59]:
In [60]:
t
Out[60]:
In [61]:
fit(model, mdata, 1, optimizer, F.nll_loss)
Out[61]:
In [62]:
set_lrs(optimizer, 1e-4)
fit(model, mdata, 1, optimizer, F.nll_loss)
Out[62]:
In [67]:
if '0.3' in torch.__version__:
def get_next(inp):
idxs = T(np.array([char_indices[c] for c in inp]))
p = model(*VV(idxs))
i = np.argmax(to_np(p))
return chars[i]
else:
def get_next(inp):
# idxs = [T(np.array([char_indices[c] for c in inp]))]
idxs = [T(np.array([char_indices[c]])) for c in inp]
p = model(*idxs)
i = np.argmax(to_np(p))
# pdb.set_trace()
return chars[i]
In [68]:
get_next('for thos')
Out[68]:
In [69]:
get_next('part of ')
Out[69]:
In [70]:
get_next('queens a')
Out[70]:
In [92]:
class CharRNN(nn.Module):
def __init__(self, vocab_size, n_fac):
super().__init__()
self.e = nn.Embedding(vocab_size, n_fac)
self.rnn = nn.RNN(n_fac, n_hidden)
self.l_out = nn.Linear(n_hidden, vocab_size)
def forward(self, *cs):
bs = cs[0].size(0)
# h = torch.zeros(1, bs, n_hidden)
h = V(torch.zeros(1, bs, n_hidden))
inp = self.e(torch.stack(cs))
outp,h = self.rnn(inp, h)
return F.log_softmax(self.l_out(outp[-1]), dim=-1) # outp[-1] to get last hidden state
In [93]:
model = CharRNN(vocab_size, n_fac).cuda()
optimizer = optim.Adam(model.parameters(), 1e-3)
In [94]:
it = iter(mdata.trn_dl)
*xs,yt = next(it)
In [95]:
# tensor = model.e(V(torch.stack(xs))) # works w/o V(.). but takes longer when switching btwn w/wo V(.)?
# tensor = model.e(torch.stack(xs)) # these are ints so cannot require gradients
# tensor = model.e(T(torch.stack(xs)))
tensor = model.e(V(torch.stack(xs)))
tensor.size()
Out[95]:
In [96]:
# htensor = V(torch.zeros(1, 512, n_hidden)) # V(.) required here, else: RuntimeError: CuDNN error: CUDNN_STATUS_EXECUTION_FAILED
# NOTE: does not work: htensor = torch.zeros(1, 512, n_hidden, requires_grad=True) # requires_grad=True accomplishes what V(.) did in 0.3.1 for 0.4.
# htensor = T(torch.zeros(1, 512, n_hidden))
htensor = V(torch.zeros(1, 512, n_hidden))
In [97]:
outp, hn = model.rnn(tensor, htensor)
outp.size(), hn.size()
Out[97]:
I'm able to get this far in pytorch 0.4, using T instead of V. The problem is that the next line keeps giving me an error.
As per here, I'm going to use pytorch 0.3 from here to the end.
In [57]:
# the error when using pytorch 0.4:
tensor = model(*V(xs)); tensor.size()
In [98]:
tensor = model(*V(xs)); tensor.size()
Out[98]:
In [99]:
fit(model, mdata, 4, optimizer, F.nll_loss)
Out[99]:
In [100]:
set_lrs(optimizer, 1e-4)
fit(model, mdata, 2, optimizer, F.nll_loss)
Out[100]:
In [101]:
def get_next(inp):
idxs = T(np.array([char_indices[c] for c in inp]))
p = model(*VV(idxs))
i = np.argmax(to_np(p))
return chars[i]
In [102]:
get_next('for thos')
Out[102]:
In [103]:
def get_next_n(inp, n):
res = inp
for i in range(n):
c = get_next(inp)
res += c
inp = inp[1:] + c
return res
In [104]:
get_next_n('for thos', 40)
Out[104]:
Let's take non-overlapping sets of characters this time.
In [107]:
c_in_dat = [[idx[i+j] for i in range(cs)] for j in range(0, len(idx) - cs - 1, cs)]
Then create the exact same thing, offset by 1, as our labels.
In [108]:
c_out_dat = [[idx[i+j] for i in range(cs)] for j in range(1, len(idx) - cs, cs)]
In [109]:
xs = np.stack(c_in_dat)
xs.shape
Out[109]:
In [110]:
ys = np.stack(c_out_dat)
ys.shape
Out[110]:
In [111]:
xs[:cs, :cs]
Out[111]:
In [112]:
ys[:cs, :cs]
Out[112]:
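A quick check (illustrative, using the xs and ys just built) that the labels really are the inputs offset by one character: each column of ys except the last matches the next column of xs.

# labels are the inputs shifted one character to the left
assert np.array_equal(xs[:, 1:], ys[:, :-1])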
In [147]:
val_idx = get_cv_idxs(len(xs) - cs - 1)
In [148]:
mdata = ColumnarModelData.from_arrays('.', val_idx, xs, ys, bs=512)
In [149]:
class CharSeqRNN(nn.Module):
def __init__(self, vocab_size, n_fac):
super().__init__()
self.e = nn.Embedding(vocab_size, n_fac)
self.rnn = nn.RNN(n_fac, n_hidden)
self.l_out = nn.Linear(n_hidden, vocab_size)
def forward(self, *cs):
bs = cs[0].size(0)
h = V(torch.zeros(1, bs, n_hidden))
inp = self.e(torch.stack(cs))
outp,h = self.rnn(inp, h)
return F.log_softmax(self.l_out(outp), dim=-1)
In [150]:
model = CharSeqRNN(vocab_size, n_fac).cuda()
optimizer = optim.Adam(model.parameters(), 1e-3)
In [151]:
it = iter(mdata.trn_dl)
*xst, yt = next(it)
In [152]:
def nll_loss_seq(inp, targ):
sl,bs,nh = inp.size() # 8 x 512 x vocab_size
targ = targ.transpose(0,1).contiguous().view(-1)
return F.nll_loss(inp.view(-1, nh), targ)
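To see why the transpose is needed (a small illustrative check, using the sl=8, bs=512 shapes of the batch above and the vocab_size defined earlier): the model output comes back time-major as (seq, batch, vocab), while the targets come back batch-major as (batch, seq), so both have to be flattened in the same time-major order before F.nll_loss can pair prediction i with target i.

# illustrative shape check (dummy values; shapes match the sl=8, bs=512 batch above)
fake_inp = torch.randn(8, 512, vocab_size)            # model output: (seq, batch, vocab)
fake_targ = torch.zeros(512, 8).long()                # loader targets: (batch, seq)
flat_inp = fake_inp.view(-1, vocab_size)              # (8*512, vocab), time-major order
flat_targ = fake_targ.transpose(0, 1).contiguous().view(-1)  # (8*512,), same order
assert flat_inp.size(0) == flat_targ.size(0) == 8 * 512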
In [153]:
fit(model, mdata, 4, optimizer, nll_loss_seq)
Out[153]:
In [154]:
set_lrs(optimizer, 1e-4)
In [155]:
fit(model, mdata, 1, optimizer, nll_loss_seq)
Out[155]:
In [156]:
model = CharSeqRNN(vocab_size, n_fac).cuda()
optimizer = optim.Adam(model.parameters(), 1e-2)
In [157]:
model.rnn.weight_hh_l0.data.copy_(torch.eye(n_hidden))
Out[157]:
In [158]:
fit(model, mdata, 4, optimizer, nll_loss_seq)
Out[158]:
In [159]:
set_lrs(optimizer, 1e-3)
In [160]:
fit(model, mdata, 4, optimizer, nll_loss_seq)
Out[160]:
In [161]:
set_lrs(optimizer, 1e-4)
In [162]:
fit(model, mdata, 4, optimizer, nll_loss_seq)
Out[162]:
In [168]:
from torchtext import vocab, data
from fastai.nlp import *
from fastai.lm_rnn import *
PATH = 'data/nietzsche/'
TRN_PATH = 'trn/'
VAL_PATH = 'val/'
TRN = f'{PATH}{TRN_PATH}'
VAL = f'{PATH}{VAL_PATH}'
## line counting: https://stackoverflow.com/a/3137099
# $ wc -l nietzsche/nietzsche.txt
## splitting: https://stackoverflow.com/a/2016918
# $ split -l 7947 nietzsche/nietzsche.txt
# $ mv xaa nietzsche/trn.txt
# $ mv xab nietzsche/val.txt
%ls {PATH}
In [169]:
%ls {PATH}trn
In [170]:
TEXT = data.Field(lower=True, tokenize=list) # torchtext
bs = 64; bptt = 8; n_fac = 42; n_hidden = 256
FILES = dict(train=TRN_PATH, validation=VAL_PATH, test=VAL_PATH)
mdata = LanguageModelData.from_text_files(PATH, TEXT, **FILES, bs=bs, bptt=bptt, min_freq=3)
len(mdata.trn_dl), mdata.nt, len(mdata.trn_ds), len(mdata.trn_ds[0].text)
Out[170]:
In [175]:
class CharSeqStatefulRNN(nn.Module):
def __init__(self, vocab_size, n_fac, bs):
super().__init__()
self.vocab_size = vocab_size
self.e = nn.Embedding(vocab_size, n_fac)
self.rnn = nn.RNN(n_fac, n_hidden)
self.l_out = nn.Linear(n_hidden, vocab_size)
self.init_hidden(bs)
def forward(self, cs):
bs = cs[0].size(0)
if self.h.size(1) != bs: self.init_hidden(bs)
outp,h = self.rnn(self.e(cs), self.h)
self.h = repackage_var(h) # bptt here; throw away hidden state's history
return F.log_softmax(self.l_out(outp), dim=-1).view(-1, self.vocab_size)
def init_hidden(self, bs): self.h = V(torch.zeros(1, bs, n_hidden))
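repackage_var is the fastai helper that makes this model "stateful" without blowing up backprop: it keeps the hidden state's values but detaches them from the computation graph, so gradients only flow back through the current bptt-length chunk. Roughly (a sketch of the idea, not the library's exact source):

# rough sketch of what repackage_var does (not the exact fastai source)
def repackage_var_sketch(h):
    if isinstance(h, (tuple, list)):                  # LSTMs carry an (h, c) tuple
        return tuple(repackage_var_sketch(v) for v in h)
    return V(h.data)  # re-wrap the same values with no gradient history attached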
In [172]:
m = CharSeqStatefulRNN(mdata.nt, n_fac, 512).cuda()
opt = optim.Adam(m.parameters(), 1e-3)
In [173]:
fit(m, mdata, 4, opt, F.nll_loss)
Out[173]:
In [174]:
set_lrs(opt, 1e-4)
fit(m, mdata, 4, opt, F.nll_loss)
Out[174]:
In [179]:
# # From pytorch source:
# def RNNCell(input, hidden, w_ih, w_hh, b_ih, b_hh):
# return F.tanh(F.linear(input, w_ih, b_ih) + F.linear(hidden, w_hh, b_hh))
In [195]:
class CharSeqStatefulRNN2(nn.Module):
def __init__(self, vocab_size, n_fac, bs):
super().__init__()
self.vocab_size = vocab_size
self.e = nn.Embedding(vocab_size, n_fac)
self.rnn = nn.RNNCell(n_fac, n_hidden)
self.l_out = nn.Linear(n_hidden, vocab_size)
self.init_hidden(bs)
def forward(self, cs):
bs = cs[0].size(0)
if self.h.size(1) != bs: self.init_hidden(bs)
outp = []
o = self.h
for c in cs:
o = self.rnn(self.e(c), o)
outp.append(o)
outp = self.l_out(torch.stack(outp))
self.h = repackage_var(o)
return F.log_softmax(outp, dim=-1).view(-1, self.vocab_size)
def init_hidden(self, bs): self.h = V(torch.zeros(1, bs, n_hidden))
In [196]:
m = CharSeqStatefulRNN2(mdata.nt, n_fac, 512).cuda()
opt = optim.Adam(m.parameters(), 1e-3)
In [197]:
fit(m, mdata, 4, opt, F.nll_loss)
Out[197]:
In [199]:
class CharSeqStatefulGRU(nn.Module):
def __init__(self, vocab_size, n_fac, bs):
super().__init__()
self.vocab_size = vocab_size
self.e = nn.Embedding(vocab_size, n_fac)
self.rnn = nn.GRU(n_fac, n_hidden)
self.l_out = nn.Linear(n_hidden, vocab_size)
self.init_hidden(bs)
def forward(self, cs):
bs = cs[0].size(0)
if self.h.size(1) != bs: self.init_hidden(bs)
outp,h = self.rnn(self.e(cs), self.h)
self.h = repackage_var(h)
return F.log_softmax(self.l_out(outp), dim=-1).view(-1, self.vocab_size)
def init_hidden(self, bs): self.h = V(torch.zeros(1, bs, n_hidden))
In [201]:
# # From pytorch source code – for reference
# def GRUCell(input, hidden, w_ih, w_hh, b_ih, b_hh):
# gi = F.linear(input, w_ih, b_ih)
# gh = F.linear(hidden, w_hh, b_hh)
# i_r, i_i, i_n = gi.chunk(3, 1)
# h_r, h_i, h_n = gh.chunk(3, 1)
# resetgate = F.sigmoid(i_r + h_r)
# inputgate = F.sigmoid(i_i + h_i)
# newgate = F.tanh(i_n + resetgate * h_n)
# return newgate + inputgate * (hidden - newgate)
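That last line is just a linear interpolation: newgate + inputgate*(hidden - newgate) equals (1 - inputgate)*newgate + inputgate*hidden, so an inputgate near 1 keeps the old hidden state and one near 0 replaces it with the candidate. A tiny numeric check with made-up values:

# check that the GRU update interpolates between the candidate and the old state
newgate, inputgate, hidden = 0.2, 0.75, 1.0
out = newgate + inputgate * (hidden - newgate)                  # 0.8
assert abs(out - ((1 - inputgate) * newgate + inputgate * hidden)) < 1e-9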
In [202]:
m = CharSeqStatefulGRU(mdata.nt, n_fac, 512).cuda()
opt = optim.Adam(m.parameters(), 1e-3)
In [203]:
fit(m, mdata, 6, opt, F.nll_loss)
Out[203]:
In [204]:
set_lrs(opt, 1e-4)
In [205]:
fit(m, mdata, 3, opt, F.nll_loss)
Out[205]:
In [206]:
from fastai import sgdr
n_hidden = 512
In [207]:
class CharSeqStatefulLSTM(nn.Module):
def __init__(self, vocab_size, n_fac, bs, nl):
super().__init__()
self.vocab_size,self.nl = vocab_size,nl
self.e = nn.Embedding(vocab_size, n_fac)
self.rnn = nn.LSTM(n_fac, n_hidden, nl, dropout=0.5)
self.l_out = nn.Linear(n_hidden, vocab_size)
self.init_hidden(bs)
def forward(self, cs):
bs = cs[0].size(0)
if self.h[0].size(1) != bs: self.init_hidden(bs)
outp,h = self.rnn(self.e(cs), self.h)
self.h = repackage_var(h)
return F.log_softmax(self.l_out(outp), dim=-1).view(-1, self.vocab_size)
def init_hidden(self, bs):
self.h = (V(torch.zeros(self.nl, bs, n_hidden)),
V(torch.zeros(self.nl, bs, n_hidden)))
In [208]:
m = CharSeqStatefulLSTM(mdata.nt, n_fac, 512, 2).cuda()
lo = LayerOptimizer(optim.Adam, m, 1e-2, 1e-5)
In [210]:
os.makedirs(f'{PATH}models', exist_ok=True)
In [211]:
fit(m, mdata, 2, lo.opt, F.nll_loss)
Out[211]:
In [213]:
on_end = lambda sched, cycle: save_model(m, f'{PATH}models/cyc_{cycle}')
cb = [CosAnneal(lo, len(mdata.trn_dl), cycle_mult=2, on_cycle_end=on_end)]
fit(m, mdata, 2**4-1, lo.opt, F.nll_loss, callbacks=cb)
Out[213]:
In [215]:
on_end = lambda sched, cycle: save_model(m, f'{PATH}models/cyc_{cycle}')
cb = [CosAnneal(lo, len(mdata.trn_dl), cycle_mult=2, on_cycle_end=on_end)]
fit(m, mdata, 2**6-1, lo.opt, F.nll_loss, callbacks=cb)
Out[215]:
In [216]:
def get_next(inp):
idxs = TEXT.numericalize(inp)
p = m(VV(idxs.transpose(0,1)))
r = torch.multinomial(p[-1].exp(), 1)
return TEXT.vocab.itos[to_np(r)[0]]
In [217]:
get_next('for thos')
Out[217]:
In [218]:
def get_next_n(inp, n):
res = inp
for i in range(n):
c = get_next(inp)
res += c
inp = inp[1:] + c
return res
In [219]:
print(get_next_n('for thos', 400))
In [220]:
print(get_next_n('the reason', 400))
I made a mistake somewhere; the loss should be around 1.25, not 1.35. Anyway, it basically works.