In [4]:
import theano
%matplotlib inline
import sys, os
sys.path.insert(1, os.path.join('../utils'))
import utils; reload(utils)
from utils import *
from __future__ import division, print_function
In [5]:
path = get_file('nietzsche.txt', origin="https://s3.amazonaws.com/text-datasets/nietzsche.txt")
text = open(path).read()
print('corpus length:', len(text))
In [6]:
chars = sorted(list(set(text)))
vocab_size = len(chars) + 1
print('total chars:', vocab_size)
Sometimes it's useful to have a zero value in the dataset, e.g. for padding.
In [7]:
chars.insert(0, "\0")
In [8]:
''.join(chars[1:-6])
Out[8]:
Map from chars to indices and back again
In [9]:
char_indices = dict((c, i) for i, c in enumerate(chars))
indices_char = dict((i, c) for i, c in enumerate(chars))
idx will be the data we use from now on -- it simply converts all the characters to their index (based on the mapping above)
In [10]:
idx = [char_indices[c] for c in text]
# the 1st 10 characters:
idx[:10]
Out[10]:
In [11]:
''.join(indices_char[i] for i in idx[:70])
Out[11]:
Create a list of every 4th character, starting at the 0th, 1st, 2nd, then 3rd characters.
We're going to build a model that attempts to predict the 4th character from the previous 3. To do that we're going to go through our whole list of indexes from 0 to the end minus 3, and we'll create a whole list of the 0th, 4th, 8th, 12th, etc. characters; the 1st, 5th, 9th, etc.; and the 2nd, 6th, 10th, and so forth.
In [12]:
cs = 3
c1_dat = [idx[i] for i in xrange(0, len(idx)-1-cs, cs)]
c2_dat = [idx[i+1] for i in xrange(0, len(idx)-1-cs, cs)]
c3_dat = [idx[i+2] for i in xrange(0, len(idx)-1-cs, cs)]
c4_dat = [idx[i+3] for i in xrange(0, len(idx)-1-cs, cs)] # <-- gonna predict this
Our inputs
In [13]:
# we can turn these into Numpy arrays just by stacking them up together
x1 = np.stack(c1_dat[:-2]) # 1st chars
x2 = np.stack(c2_dat[:-2]) # 2nd chars
x3 = np.stack(c3_dat[:-2]) # 3rd chars
# one row for every 4-character piece of the collected works
Our output
In [14]:
# labels will just be the 4th characters
y = np.stack(c4_dat[:-2])
The first 4 inputs and outputs
In [15]:
# 1st, 2nd, 3rd chars of text
x1[:4], x2[:4], x3[:4]
Out[15]:
In [16]:
# 4th char of text
y[:3]
Out[16]:
We will try to predict 30 from 40, 42, 29; then 29 from 30, 25, 27; and so on. That's our data format.
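To make that concrete, here is a small sanity-check sketch (not part of the original notebook; the exact characters depend on your copy of the corpus) that maps the first few input triples and their targets back to characters:
# Hedged sketch: decode the first few (c1, c2, c3) -> c4 examples.
for a, b, c, d in zip(x1[:3], x2[:3], x3[:3], y[:3]):
    print(repr(indices_char[a]), repr(indices_char[b]), repr(indices_char[c]), '->', repr(indices_char[d]))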
In [17]:
x1.shape, y.shape
Out[17]:
The number of latent factors to create (i.e. the size of the embedding for each character input)
In [18]:
# we're going to turn these into embeddings
n_fac = 42
In [19]:
# by creating an embedding matrix
def embedding_input(name, n_in, n_out):
inp = Input(shape=(1,), dtype='int64', name=name)
emb = Embedding(n_in, n_out, input_length=1)(inp)
return inp, Flatten()(emb)
In [20]:
c1_in, c1 = embedding_input('c1', vocab_size, n_fac)
c2_in, c2 = embedding_input('c2', vocab_size, n_fac)
c3_in, c3 = embedding_input('c3', vocab_size, n_fac)
# c1, c2, c3 represent the result of putting each char through the embedding and
# getting out 42 latent factors -- these are the inputs to the green arrow.
In [22]:
n_hidden = 256
This is the 'green arrow' from our diagram - the layer operation from input to hidden.
In [23]:
dense_in = Dense(n_hidden, activation='relu')
Our first hidden activation is simply this function applied to the result of the embedding of the first character.
In [24]:
c1_hidden = dense_in(c1)
This is the 'orange arrow' from our diagram - the layer operation from hidden to hidden.
In [25]:
dense_hidden = Dense(n_hidden, activation='tanh')
Our second and third hidden activations sum up the previous hidden state (after applying dense_hidden) to the new input state.
In [26]:
c2_dense = dense_in(c2) # char-2 embedding through the green arrow
hidden_2 = dense_hidden(c1_hidden) # char-1's hidden state through the orange arrow
c2_hidden = merge([c2_dense, hidden_2]) # merge the two together (default: sum)
In [27]:
c3_dense = dense_in(c3)
hidden_3 = dense_hidden(c2_hidden)
c3_hidden = merge([c3_dense, hidden_3])
This is the 'blue arrow' from our diagram - the layer operation from hidden to output.
In [28]:
dense_out = Dense(vocab_size, activation='softmax') #output size: 86 <-- vocab_size
The third hidden state is the input to our output layer.
In [29]:
c4_out = dense_out(c3_hidden)
In [30]:
# passing in our 3 inputs & 1 output
model = Model([c1_in, c2_in, c3_in], c4_out)
In [ ]:
model.summary()
In [31]:
model.compile(loss='sparse_categorical_crossentropy', optimizer=Adam())
model.optimizer.lr=0.001
In [32]:
model.fit([x1, x2, x3], y, batch_size=64, nb_epoch=10)
Out[32]:
We test it by creating a function that we pass 3 letters into. It turns those letters into character indices (by looking them up in char_indices), turns each of those into a Numpy array, and calls model.predict on those 3 arrays -- that gives us 86 outputs. We then do an argmax to find which index of those 86 is the highest: that's the character number we want to return.
So basically: we give it 3 letters, it gives us back the letter it thinks is most likely next.
In [33]:
def get_next(inp):
idxs = [char_indices[c] for c in inp]
arrs = [np.array(i)[np.newaxis] for i in idxs]
p = model.predict(arrs)
i = np.argmax(p)
return chars[i]
In [34]:
get_next('phi')
Out[34]:
In [36]:
get_next(' th')
Out[36]:
In [37]:
get_next(' an')
Out[37]:
In [38]:
cs = 8 # use 8 characters to predict the 9th
For each 0 thru 7, create a list of every 8th character with that starting point. These will be the 8 inputs to our model.
In [39]:
c_in_dat = [[idx[i+n] for i in xrange(0, len(idx)-1-cs, cs)] for n in range(cs)]
This creates a list with 8 elements; each element contains a list of the 0th, 8th, 16th, 24th, ... characters, then the 1st, 9th, 17th, 25th, ... characters, etc., just as before: a sequence of inputs where each one is offset by 1 from the previous one.
Then create a list of the next character in each of these series. These will be the labels for our model -- the output is built in exactly the same way, except we index across by cs (i.e. 8), so each label is the character that follows an 8-character sequence, to be predicted from the characters before it.
In [40]:
c_out_dat = [idx[i+cs] for i in xrange(0, len(idx)-1-cs,cs)]
In [41]:
# go thru every one of those input lists and turn into Numpy array:
xs = [np.stack(c[:-2]) for c in c_in_dat]
In [64]:
len(xs), xs[0].shape
Out[64]:
In [65]:
y = np.stack(c_out_dat[:-2])
So each column below is one series of 8 characters from the text.
In [66]:
# visualizing xs:
[xs[n][:cs] for n in range(cs)]
Out[66]:
Reading down the first column gives the first 8 characters of our text.
...and this is the next character after each sequence:
In [67]:
y[:cs]
Out[67]:
NOTE: these labels are almost the same as elements 1-7 of the first row of xs. The character that follows each sequence is also the first character of the next sequence. It's essentially the same as our previous data, just constructed in a more flexible way.
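A quick sanity-check sketch of that offset (an added check, not from the original notebook):
# Each label y[i] should equal the first character of the following sequence, xs[0][i+1].
(y[:-1] == xs[0][1:]).all()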
In [70]:
n_fac = 42
def embedding_input(name, n_in, n_out):
inp = Input(shape=(1,), dtype='int64', name=name+'_in')
emb = Embedding(n_in, n_out, input_length=1, name=name+'_emb')(inp)
return inp, Flatten()(emb)
In [71]:
c_ins = [embedding_input('c'+str(n), vocab_size, n_fac) for n in range(cs)]
n_hidden = 256
In [72]:
dense_in = Dense(n_hidden, activation='relu')
dense_hidden = Dense(n_hidden, activation='relu', init='identity')
dense_out = Dense(vocab_size, activation='softmax')
The first character of each sequence goes through dense_in(), to create our first hidden activations.
In [73]:
hidden = dense_in(c_ins[0][1])
Then for each successive layer we combine the output of dense_in() on the next character with the output of dense_hidden() on the current hidden state, to create the new hidden state.
In [77]:
for i in range(1,cs):
c_dense = dense_in(c_ins[i][1]) #green arrow
hidden = dense_hidden(hidden) #orange arrow
hidden = merge([c_dense, hidden]) #merge the two together
Putting the final hidden state through dense_out() gives us our output:
In [78]:
c_out = dense_out(hidden)
So now we can create our model
In [80]:
model = Model([c[0] for c in c_ins], c_out)
model.compile(loss='sparse_categorical_crossentropy', optimizer=Adam())
In [81]:
model.fit(xs, y, batch_size=64, nb_epoch=12)
Out[81]:
In [100]:
def get_next(inp):
idxs = [np.array(char_indices[c])[np.newaxis] for c in inp]
p = model.predict(idxs)
return chars[np.argmax(p)]
In [101]:
get_next('for thos')
Out[101]:
In [86]:
get_next('part of ')
Out[86]:
In [87]:
get_next('queens a')
Out[87]:
Here, c_out_dat is identical to c_in_dat, but moved across by 1 character. So now, in each sequence, the 1st char will be used to predict the 2nd, the 1st & 2nd to predict the 3rd, and so on. A lot more predictions going on means a lot more opportunity for the model to learn.
In [102]:
# c_in_dat = [[idx[i+n] for i in xrange(0, len(idx)-1-cs, cs)] for n in range(cs)]
c_out_dat = [[idx[i+n] for i in xrange(1, len(idx)-cs, cs)] for n in range(cs)]
In [103]:
ys = [np.stack(c[:-2]) for c in c_out_dat]
In [104]:
[xs[n][:cs] for n in range(cs)]
Out[104]:
In [105]:
[ys[n][:cs] for n in range(cs)]
Out[105]:
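A quick sanity-check sketch (again, an added check rather than part of the original notebook) that the labels really are the inputs shifted across by one character:
# For the first 7 positions, ys[n] should be identical to xs[n+1].
all((ys[n] == xs[n+1]).all() for n in range(cs-1))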
In [106]:
dense_in = Dense(n_hidden, activation='relu')
dense_out = Dense(vocab_size, activation='softmax', name='output')
We're going to pass a vector of all zeros as our starting point - here's our input layer for that:
In [107]:
# our char-1 input has moved inside the diagram's loop box, so we now need an
# initialized input (zeros)
inp1 = Input(shape=(n_fac,), name='zeros')
hidden = dense_in(inp1)
In [108]:
outs = []
for i in range(cs):
c_dense = dense_in(c_ins[i][1])
hidden = dense_hidden(hidden)
hidden = merge([c_dense, hidden], mode='sum')
# every layer now has an output
outs.append(dense_out(hidden))
# our loop is identical to before, except at the end of every loop,
# we're going to append this output; so now we're going to have
# 8 outputs for every sequence instead of just 1.
In [109]:
# model now has vector of 0s: [inp1], and array of outputs: outs
model = Model([inp1] + [c[0] for c in c_ins], outs)
model.compile(loss='sparse_categorical_crossentropy', optimizer=Adam())
In [110]:
zeros = np.tile(np.zeros(n_fac), (len(xs[0]),1))
zeros.shape
Out[110]:
Now when we fit, we add the array of zeros to the start of our inputs; our outputs are going to be those lists of 8, offset by 1. We get 8 losses instead of 1 because each of those 8 outputs has its own loss. You'll see that the model's ability to predict the 1st character from a bunch of zeros is very limited and flattens out; but predicting the 8th character with the context of the previous 7 is much better and keeps improving.
In [112]:
model.fit([zeros]+xs, ys, batch_size=64, nb_epoch=12)
Out[112]:
In [115]:
def get_nexts(inp):
idxs = [char_indices[c] for c in inp]
arrs = [np.array(i)[np.newaxis] for i in idxs]
p = model.predict([np.zeros(n_fac)[np.newaxis,:]] + arrs)
print(list(inp))
return [chars[np.argmax(o)] for o in p]
In [116]:
get_nexts(' this is')
Out[116]:
In [118]:
get_nexts(' part of')
Out[118]:
In [120]:
n_hidden, n_fac, cs, vocab_size
Out[120]:
To convert our previous Keras model into a sequence model, simply add the return_sequences=True parameter, and wrap TimeDistributed() around our dense layer.
In [121]:
model = Sequential([
Embedding(vocab_size, n_fac, input_length=cs),
SimpleRNN(n_hidden, return_sequences=True, activation='relu', inner_init='identity'),
TimeDistributed(Dense(vocab_size, activation='softmax')),
])
In [122]:
model.summary()
Note the 8 outputs. What TimeDistributed does is apply the same dense layer (with shared weights) to each of the 8 time steps, giving one output per step. NOTE: in Keras, any time you specify return_sequences=True, any dense layers after that must have TimeDistributed wrapped around them - because, in this case, we want to apply the dense layer not once, but at each of the 8 steps.
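As a quick check of that weight sharing (a sketch, not from the original notebook), the final layer holds just one weight matrix and one bias, whatever the sequence length:
# The TimeDistributed(Dense) layer has a single (n_hidden, vocab_size) kernel and a
# (vocab_size,) bias, shared across all 8 time steps -- not 8 separate copies.
[w.shape for w in model.layers[-1].get_weights()]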
In [123]:
model.compile(loss='sparse_categorical_crossentropy', optimizer=Adam())
In [124]:
# just some dimensionality changes required; otherwise same
x_rnn = np.stack(np.squeeze(xs), axis=1)
y_rnn = np.stack(ys, axis=1)
In [125]:
x_rnn.shape, y_rnn.shape
Out[125]:
In [126]:
model.fit(x_rnn, y_rnn, batch_size=64, nb_epoch=8)
Out[126]:
In [128]:
def get_nexts_keras(inp):
idxs = [char_indices[c] for c in inp]
arr = np.array(idxs)[np.newaxis,:]
p = model.predict(arr)[0]
print(list(inp))
return [chars[np.argmax(o)] for o in p]
In [129]:
get_nexts_keras(' this is')
Out[129]:
A stateful model is easy to create (just add stateful=True) but harder to train. We had to add batchnorm and use an LSTM to get reasonable results.
When using stateful in Keras, you also have to add batch_input_shape to the first layer, and fix the batch size there.
You need shuffle=False and stateful=True in order for the model to have memory. Setting stateful=True tells Keras not to reset the hidden activations to zero after each sequence, but to leave them as they are, allowing the model to build up as much state as it wants. If you do this, then shuffle must be False, so that it passes in the 1st 8 chars, then the 2nd 8, and so on, in order, leaving the hidden state untouched between each one.
Training these stateful models is a lot harder than other models due to exploding gradients (exploding activations). Models of long-term dependencies like this were thought impossible to train until the '90s, when researchers invented the LSTM.
In the LSTM, the simple recurrent weight-matrix loop is replaced with a loop containing a little neural network that decides how much of the state matrix to keep and how much to use at each activation. The model can therefore learn how to avoid gradient explosions, and can actually learn how to create an effective sequence.
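To make that loop concrete, here is a minimal NumPy sketch of the standard LSTM update equations (the textbook formulation, not Keras's exact implementation; the parameter names W, U, b are hypothetical dictionaries of arrays):
import numpy as np

def sigmoid(z): return 1. / (1. + np.exp(-z))

def lstm_step(x, h, c, W, U, b):
    # Each gate is a small dense layer of its own, squashed to [0, 1].
    i = sigmoid(np.dot(x, W['i']) + np.dot(h, U['i']) + b['i'])  # input gate
    f = sigmoid(np.dot(x, W['f']) + np.dot(h, U['f']) + b['f'])  # forget gate
    o = sigmoid(np.dot(x, W['o']) + np.dot(h, U['o']) + b['o'])  # output gate
    g = np.tanh(np.dot(x, W['g']) + np.dot(h, U['g']) + b['g'])  # candidate update
    c = f * c + i * g          # keep some of the old cell state, add some new
    h = o * np.tanh(c)         # expose a gated view of the cell state
    return h, c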
Below, an LSTM and batch-normalized inputs are used because J.H. had no luck with pure RNNs and ReLUs.
In [130]:
bs = 64
In [132]:
model = Sequential([
Embedding(vocab_size, n_fac, input_length=cs, batch_input_shape=(bs,8)),
BatchNormalization(),
LSTM(n_hidden, return_sequences=True, stateful=True),
TimeDistributed(Dense(vocab_size, activation='softmax')),
])
In [ ]:
# don't forget to compile (accidentally hit `M` in the notebook)
model.compile(loss='sparse_categorical_crossentropy', optimizer=Adam())
Since we're using a fixed batch shape, we have to ensure our inputs and outputs are an even multiple of the batch size.
In [135]:
mx = len(x_rnn)//bs*bs
The LSTM model takes much longer to run than the regular RNN because the operations can't be run in parallel: each one has to be run in order.
In [138]:
model.fit(x_rnn[:mx], y_rnn[:mx], batch_size=bs, nb_epoch=4, shuffle=False)
Out[138]:
In [139]:
model.optimizer.lr=1e-4
model.fit(x_rnn[:mx], y_rnn[:mx], batch_size=bs, nb_epoch=4, shuffle=False)
Out[139]:
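If you ever want to clear that accumulated state by hand (for example before feeding in an unrelated document), Keras provides reset_states() on stateful models - a minimal sketch:
# Hedged sketch: wipe the LSTM's hidden/cell state so the next batch starts from zeros again.
model.reset_states()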
In [141]:
model = Sequential([
SimpleRNN(n_hidden, return_sequences=True, input_shape=(cs, vocab_size),
activation='relu', inner_init='identity'),
TimeDistributed(Dense(vocab_size, activation='softmax')),
])
model.compile(loss='categorical_crossentropy', optimizer=Adam())
# no embedding layer, so inputs must be one-hot encoded too
In [142]:
oh_ys = [to_categorical(o, vocab_size) for o in ys]
oh_y_rnn = np.stack(oh_ys, axis=1)
oh_xs = [to_categorical(o, vocab_size) for o in xs]
oh_x_rnn = np.stack(oh_xs, axis=1)
oh_x_rnn.shape, oh_y_rnn.shape
Out[142]:
The 86 is the one-hot encoded dimension: the number of character classes.
In [144]:
model.fit(oh_x_rnn, oh_y_rnn, batch_size=64, nb_epoch=8)
Out[144]:
In [145]:
def get_nexts_oh(inp):
idxs = np.array([char_indices[c] for c in inp])
arr = to_categorical(idxs, vocab_size)
p = model.predict(arr[np.newaxis,:])[0]
print(list(inp))
return [chars[np.argmax(o)] for o in p]
In [146]:
get_nexts_oh(' this is')
Out[146]:
In [152]:
n_input = vocab_size
n_output = vocab_size
Using raw Theano, we have to create our weight matrices and bias vectors ourselves - here are the functions we'll use to do so (using Glorot initialization).
The return values are wrapped in shared(), which is how we tell Theano that it can manage this data (copying it to and from the GPU as necessary).
In [166]:
def init_wgts(rows, cols):
scale = math.sqrt(2/rows) # 1st calc Glorot number to scale weights
return shared(normal(scale=scale, size=(rows, cols)).astype(np.float32))
def init_bias(rows):
return shared(np.zeros(rows, dtype=np.float32))
We return the weights and biases together as a tuple. For the hidden weights, we'll use an identity initialization (as recommended by Hinton.)
In [167]:
def wgts_and_bias(n_in, n_out):
return init_wgts(n_in, n_out), init_bias(n_out)
def id_and_bias(n):
return shared(np.eye(n, dtype=np.float32)), init_bias(n)
Unlike ordinary Python, Theano requires us to build up a computation graph first. shared(..) basically tells Theano to keep track of something so it can be sent to the GPU later; once you wrap something in shared, it effectively belongs to Theano.
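As a tiny illustrative sketch (not from the original notebook) of what that means in practice: you read and write a shared variable from Python with get_value() / set_value(), while Theano decides where the data actually lives.
# Hypothetical example: a small shared vector we can inspect and update from Python.
v = theano.shared(np.zeros(3, dtype=np.float32))
print(v.get_value())   # all zeros initially
v.set_value(np.ones(3, dtype=np.float32))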
Theano doesn't actually do any computations until we explicitly compile and evaluate the function (at which point it'll be turned into CUDA code and sent off to the GPU). So our job is to describe the computations that we'll want Theano to do - the first step is to tell Theano what inputs we'll be providing to our computation:
In [168]:
# Theano variables
t_inp = T.matrix('inp')
t_outp = T.matrix('outp')
t_h0 = T.vector('h0')
lr = T.scalar('lr')
all_args = [t_h0, t_inp, t_outp, lr]
Now we're ready to create our initial weight matrices.
In [169]:
W_h = id_and_bias(n_hidden)
W_x = wgts_and_bias(n_input, n_hidden)
W_y = wgts_and_bias(n_hidden, n_output)
w_all = list(chain.from_iterable([W_h, W_x, W_y]))
We now need to tell Theano what happens each time we take a single step of this RNN.
Theano handles looping by using the GPU scan operation. We have to tell theano what to do at each step through the scan - this is the function we'll use, which does a single forward pass for one character.
In [170]:
def step(x, h, W_h, b_h, W_x, b_x, W_y, b_y):
# Calculate the hidden activations
h = nnet.relu(T.dot(x, W_x) + b_x + T.dot(h, W_h) + b_h)
# Calculate the output activations
y = nnet.softmax(T.dot(h, W_y) + b_y)
# Return both (the 'Flatten()' is to work around a theano bug)
return h, T.flatten(y, 1)
Now we can provide everything necessary for the scan operation, so we can set that up - we have to pass in the function to call at each step, the sequence to step through, the initial values of the outputs, and any other arguments to pass to the step function.
In [171]:
[v_h, v_y], _ = theano.scan(step, sequences=t_inp,
outputs_info=[t_h0, None], non_sequences=w_all)
You get an error if you accidentally define step with the arguments in the wrong order (they must match the order of w_all), e.g.:
def step(x, h, W_h, W_x, b_h, b_x, W_y, b_y):
In [164]:
[v_h, v_y], _ = theano.scan(step, sequences=t_inp,
outputs_info=[t_h0, None], non_sequences=w_all)
We can now calculate our loss function, and all of our gradients, with just a couple lines of code!
In [172]:
error = nnet.categorical_crossentropy(v_y, t_outp).sum()
g_all = T.grad(error, w_all)
We even have to show Theano how to do SGD - so we set up this dictionary of updates to apply after every forward pass, which applies the standard SGD update rule to every weight.
In [173]:
def upd_dict(wgts, grads, lr):
return OrderedDict({w: w-g*lr for (w,g) in zip(wgts,grads)})
In [176]:
upd = upd_dict(w_all, g_all, lr)
# we're finally ready to compile the function!:
fn = theano.function(all_args, error, updates=upd, allow_input_downcast=True)
In [177]:
X = oh_x_rnn
Y = oh_y_rnn
X.shape, Y.shape
Out[177]:
To use it, we simply loop through our input data, calling the function compiled above, and printing our progress from time to time.
We have to manually define our loop because Theano doesn't have it built-in.
In [178]:
err=0.0; l_rate=0.01
for i in xrange(len(X)):
err += fn(np.zeros(n_hidden), X[i], Y[i], l_rate)
if i % 1000 == 999:
print ("Error:{:.3f}".format(err/1000))
err=0.0
In [179]:
f_y = theano.function([t_h0, t_inp], v_y, allow_input_downcast=True)
In [180]:
pred = np.argmax(f_y(np.zeros(n_hidden), X[6]), axis=1)
In [181]:
act = np.argmax(X[6], axis=1)
In [182]:
[indices_char[o] for o in act]
Out[182]:
In [183]:
[indices_char[o] for o in pred]
Out[183]:
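As a rough added check (a sketch; results will vary from run to run), we can compare the predictions against the true next characters for that same sequence:
# Fraction of the 8 positions where the prediction matches the actual next character.
np.mean(pred == np.argmax(Y[6], axis=1))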
In [4]:
# looking at how to use the Python debugger
import numpy as np
import pdb
err=0.; lrate=0.01
for i in range(len(np.zeros(10))):
err += np.sin(lrate+np.e**i)
pdb.set_trace()
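Once the trace triggers, the usual pdb commands apply: n (next line), s (step into), c (continue), p <expr> (print an expression), l (list source around the current line), u / d (move up / down the stack), and q (quit).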
In [ ]: