In [8]:
from theano.sandbox import cuda
cuda.use('gpu1')
In [9]:
%matplotlib inline
import utils;
from utils import *
from keras.layers import TimeDistributed, Activation
from numpy.random import choice
In [10]:
path = get_file('nietzsche.txt', origin="https://s3.amazonaws.com/text-datasets/nietzsche.txt")
text = open(path).read()
print('corpus length:', len(text))
In [11]:
!tail {path} -n10
In [12]:
chars = sorted(list(set(text)))
vocab_size = len(chars)+1
print('total chars: ', vocab_size)
Sometimes it's useful to have a zero value in the dataset, e.g. for padding
In [13]:
chars.insert(0, "\0")
''.join(chars[:-6])
Out[13]:
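This notebook doesn't pad anything itself, but as a quick illustration of why it's handy to keep index 0 free: Keras's pad_sequences (not used below) fills short sequences with 0s, which then can't collide with a real character.

from keras.preprocessing.sequence import pad_sequences
# toy sequences of character indices, purely for illustration
pad_sequences([[5, 3, 9], [7]], maxlen=4)
# -> array([[0, 5, 3, 9],
#           [0, 0, 0, 7]], dtype=int32)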
In [14]:
char_indices = dict((c, i) for i,c in enumerate(chars))
indices_char = dict((i, c) for i,c in enumerate(chars))
idx = [char_indices[c] for c in text]
In [15]:
idx[:10]
Out[15]:
In [16]:
''.join(indices_char[i] for i in idx[:20])
Out[16]:
In [17]:
cs=3
c1_dat = [idx[i] for i in range(0, len(idx)-1-cs, cs)]
c2_dat = [idx[i+1] for i in range(0, len(idx)-1-cs, cs)]
c3_dat = [idx[i+2] for i in range(0, len(idx)-1-cs, cs)]
c4_dat = [idx[i+3] for i in range(0, len(idx)-1-cs, cs)]
In [18]:
c1_dat[:5]
?np.stack
Our inputs
In [19]:
# Turn them into numpy arrays
x1 = np.stack(c1_dat[:-2])
x2 = np.stack(c2_dat[:-2])
x3 = np.stack(c3_dat[:-2])
In [20]:
print(x1.shape)
x1[:5]
Out[20]:
Our output
In [21]:
y = np.stack(c4_dat[:-2])
The number of latent factors to create (i.e. the size of the embedding for each character)
In [22]:
n_fac = 42
Create inputs and embedding outputs for each of our 3 character inputs
In [23]:
def embedding_input(name, n_in, n_out):
    """Create an embedding by first creating an input layer,
    then applying an embedding layer to it."""
    inp = Input(shape=(1,), dtype='int64', name=name)
    emb = Embedding(n_in, n_out, input_length=1)(inp)
    return inp, Flatten()(emb)
Of course, you could always use one-hot encoding for each character. But with an embedding we are able to capture the similarity between 'A' and 'a', for example, whereas with one-hot encoding 'A' and 'a' are treated no differently than 'A' and 'Z'.
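For example, once an embedding has been learned, nearby vectors indicate characters the model treats as similar - something one-hot vectors can never express. A minimal sketch, assuming emb is the learned (vocab_size, n_fac) weight matrix pulled out of the embedding layer:

def cosine_sim(u, v):
    # cosine similarity between two embedding vectors
    return np.dot(u, v) / (np.linalg.norm(u) * np.linalg.norm(v))

def char_similarity(emb, a, b):
    return cosine_sim(emb[char_indices[a]], emb[char_indices[b]])

# with a trained embedding we'd expect, e.g.
# char_similarity(emb, 'A', 'a') > char_similarity(emb, 'A', 'Z')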
In [24]:
c1_in, c1 = embedding_input('c1', vocab_size, n_fac)
c2_in, c2 = embedding_input('c2', vocab_size, n_fac)
c3_in, c3 = embedding_input('c3', vocab_size, n_fac)
In [30]:
n_hidden = 256
Now create the 'green arrow' from our diagram - the layer operation from input to hidden
In [19]:
dense_in = Dense(n_hidden, activation='relu')
Our first hidden activation is simply this function applied to the result of the embedding of the first character(s)
In [20]:
c1_hidden = dense_in(c1)
Now create the 'orange arrow' from our diagram - the layer operation from hidden to hidden
In [21]:
dense_hidden = Dense(n_hidden, activation='tanh')
Our 2nd and 3rd hidden activations add the previous hidden state to the new input state
In [22]:
c2_dense = dense_in(c2)
hidden_2 = dense_hidden(c1_hidden)
c2_hidden = merge([c2_dense, hidden_2])
# merge defaults to an element-wise sum (mode='sum')
In [23]:
c3_dense = dense_in(c3)
hidden_3 = dense_hidden(c2_hidden)
c3_hidden = merge([c3_dense, hidden_3])
Now create the 'blue arrow' from our diagram - the layer operation from hidden to output
In [24]:
dense_out = Dense(vocab_size, activation='softmax')
In [25]:
c4_out = dense_out(c3_hidden)
At this point, c4_out contains the entire computation graph of the model
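Unrolled, that graph computes roughly the following (a minimal numpy sketch; W_x, W_h, W_y and the biases are stand-ins for the weights that live inside the Keras layers, and e1-e3 are the three character embeddings):

import numpy as np

def relu(x): return np.maximum(x, 0.)
def softmax(x): return np.exp(x) / np.exp(x).sum()

def three_char_forward(e1, e2, e3, W_x, b_x, W_h, b_h, W_y, b_y):
    h1 = relu(np.dot(e1, W_x) + b_x)                                   # dense_in(c1)
    h2 = relu(np.dot(e2, W_x) + b_x) + np.tanh(np.dot(h1, W_h) + b_h)  # merge([dense_in(c2), dense_hidden(h1)])
    h3 = relu(np.dot(e3, W_x) + b_x) + np.tanh(np.dot(h2, W_h) + b_h)  # merge([dense_in(c3), dense_hidden(h2)])
    return softmax(np.dot(h3, W_y) + b_y)                              # dense_out -> distribution over chars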
In [26]:
c4_out
Out[26]:
In [27]:
model = Model([c1_in, c2_in, c3_in], c4_out)
In [28]:
model.compile(loss='sparse_categorical_crossentropy', optimizer=Adam())
In [29]:
model.optimizer.lr=0.001
In [30]:
model.fit([x1, x2, x3], y, batch_size=64, nb_epoch=5)
Out[30]:
In [31]:
def get_next(inp):
    idxs = [char_indices[c] for c in inp]
    arrs = [np.array(i)[np.newaxis] for i in idxs]
    p = model.predict(arrs)
    i = np.argmax(p)
    return chars[i]
In [32]:
get_next('phi')
Out[32]:
In [33]:
get_next(' th')
Out[33]:
In [34]:
get_next(' an')
Out[34]:
In [15]:
cs=8
c_in_dat = [[idx[i+n] for i in range(0, len(idx)-1-cs, cs)]
for n in range(cs)]
Then create the labels for our model (i.e. the 9th char)
In [16]:
c_out_dat = [idx[i+cs] for i in range(0, len(idx)-1-cs, cs)]
In [17]:
xs = [np.stack(c[:-2]) for c in c_in_dat]
len(xs), xs[0].shape
Out[17]:
In [18]:
y = np.stack(c_out_dat[:-2])
So each column below is one series of 8 chars from the text
In [19]:
[xs[n][:cs] for n in range(4)]
Out[19]:
And this is the next char after each sequence
In [26]:
y[:4]
Out[26]:
In [27]:
n_fac=42
In [28]:
def embedding_input(name, n_in, n_out):
    inp = Input(shape=(1,), dtype='int64', name=name+'_in')
    emb = Embedding(n_in, n_out, input_length=1, name=name+'_emb')(inp)
    return inp, Flatten()(emb)
In [29]:
c_ins = [embedding_input('c'+str(n), vocab_size, n_fac) for n in range(cs)]
In [30]:
n_hidden = 256
In [31]:
dense_in = Dense(n_hidden, activation='relu')
dense_hidden = Dense(n_hidden, activation='relu', init='identity')
dense_out = Dense(vocab_size, activation='softmax')
The first char of each sequence goes through dense_in(), to create our first hidden activations.
In [32]:
# c_ins[i] is an (input layer, flattened embedding) tuple; [1] selects the embedding output
hidden = dense_in(c_ins[0][1])
Then for each successive layer we combine the output of dense_in() on the next character with the output of dense_hidden() on the current hidden state, to create the new hidden state.
In [33]:
for i in range(1, cs):
    c_dense = dense_in(c_ins[i][1])
    hidden = dense_hidden(hidden)
    hidden = merge([c_dense, hidden])
Putting the final hidden state through dense_out() gives us our output
In [34]:
c_out = dense_out(hidden)
So now we can create our model.
In [37]:
model = Model([c[0] for c in c_ins], c_out)
model.compile(loss='sparse_categorical_crossentropy', optimizer=Adam())
model.fit(xs, y, batch_size=64, nb_epoch=10)
Out[37]:
In [38]:
def get_next(inp):
    idxs = [np.array(char_indices[c])[np.newaxis] for c in inp]
    p = model.predict(idxs)
    return chars[np.argmax(p)]
In [39]:
get_next('for thos')
Out[39]:
In [40]:
get_next('part of ')
Out[40]:
In [41]:
model.fit(xs, y, batch_size=64, nb_epoch=5)
Out[41]:
In [42]:
get_next('for thos')
Out[42]:
In [43]:
n_hidden, n_fac, cs, vocab_size = (256, 42, 8, 86)
This is nearly exactly equivalent to the RNN we built ourselves in the previous section
In [44]:
model = Sequential([
    Embedding(vocab_size, n_fac, input_length=cs),
    # rather than initializing the hidden-to-hidden weights randomly,
    # we init them as an identity matrix, which tends to work well with relu
    SimpleRNN(n_hidden, activation='relu', inner_init='identity'),
    Dense(vocab_size, activation='softmax')
])
In [45]:
model.summary()
'sparse_categorical_crossentropy' works with integer class labels. It is equivalent to 'categorical_crossentropy' applied to one-hot encoded targets.
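To make that concrete, integer targets used with 'sparse_categorical_crossentropy' carry the same information as the one-hot targets you'd feed to 'categorical_crossentropy' (a quick sketch, not part of the original model):

from keras.utils.np_utils import to_categorical
y_sparse = y[:4]                                 # integer class labels, shape (4,)
y_onehot = to_categorical(y_sparse, vocab_size)  # one-hot labels, shape (4, vocab_size)
y_sparse.shape, y_onehot.shape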
In [46]:
model.compile(loss='sparse_categorical_crossentropy', optimizer=Adam())
In [47]:
model.fit(np.concatenate(xs, axis=1), y, batch_size=64, nb_epoch=8)
Out[47]:
In [49]:
def get_next_keras(inp):
    idxs = [char_indices[c] for c in inp]
    # np.newaxis adds a leading batch dimension of 1
    arrs = np.array(idxs)[np.newaxis, :]
    p = model.predict(arrs)[0]
    return chars[np.argmax(p)]
In [50]:
get_next_keras('this is ')
Out[50]:
In [51]:
get_next_keras('part of ')
Out[51]:
In [52]:
get_next_keras('queens a')
Out[52]:
In [58]:
# c_in_dat = [[idx[i+n] for i in range(0, len(idx)-1-cs, cs)]
# for n in range(cs)]
c_out_dat = [[idx[i+n] for i in range(1, len(idx)-cs, cs)]
for n in range(cs)]
xs = [np.stack(c[:-2]) for c in c_in_dat]
ys = [np.stack(c[:-2]) for c in c_out_dat]
Reading down each column shows one set of inputs and outputs.
In [59]:
[xs[n][:cs] for n in range(cs)]
Out[59]:
In [60]:
[ys[n][:cs] for n in range(cs)]
Out[60]:
In [62]:
dense_in = Dense(n_hidden, activation='relu')
dense_hidden = Dense(n_hidden, activation='relu')
dense_out = Dense(vocab_size, activation='softmax')
In [63]:
inp1 = Input(shape=(n_fac,), name='zeros')
hidden = dense_in(inp1)
In [64]:
outs = []
for i in range(cs):
    c_dense = dense_in(c_ins[i][1])
    hidden = dense_hidden(hidden)
    hidden = merge([c_dense, hidden], mode='sum')
    outs.append(dense_out(hidden))
In [65]:
model = Model([inp1] + [c[0] for c in c_ins], outs)
model.compile(loss='sparse_categorical_crossentropy', optimizer=Adam())
In [66]:
# add an array of 0s to our input
zeros = np.tile(np.zeros(n_fac), (len(xs[0]),1))
zeros.shape
Out[66]:
In [71]:
model.fit([zeros]+xs, ys, batch_size=64, nb_epoch=8)
Out[71]:
In [69]:
def get_nexts(inp):
    idxs = [char_indices[c] for c in inp]
    arrs = [np.array(i)[np.newaxis] for i in idxs]
    p = model.predict([np.zeros(n_fac)[np.newaxis,:]] + arrs)
    print(list(inp))
    return [chars[np.argmax(o)] for o in p]
In [72]:
get_nexts(' this is')
Out[72]:
In [73]:
get_nexts(' part of')
Out[73]:
In [80]:
model = Sequential([
    Embedding(vocab_size, n_fac, input_length=cs),
    SimpleRNN(n_hidden, return_sequences=True, activation='relu', inner_init='identity'),
    TimeDistributed(Dense(vocab_size, activation='softmax')),
])
In [81]:
model.summary()
In [82]:
model.compile(loss='sparse_categorical_crossentropy', optimizer=Adam())
In [83]:
xs[0].shape
Out[83]:
In [90]:
x_rnn=np.stack(xs, axis=1)
c_out_dat = [[idx[i+n] for i in range(1, len(idx)-cs, cs)]
for n in range(cs)]
ys = [np.stack(c[:-2]) for c in c_out_dat]
y_rnn=np.expand_dims(np.stack(ys, axis=1),-1)
x_rnn.shape, y_rnn.shape
Out[90]:
In [91]:
model.fit(x_rnn, y_rnn, batch_size=64, nb_epoch=8)
Out[91]:
stateful=True means that at the end of each sequence the hidden activations are not reset to 0, but are carried over to the next batch. You also need to pass shuffle=False when you train the model.
A stateful model is easy to create (just add stateful=True), but harder to train. We had to add batchnorm and use an LSTM to get reasonable results.
When using stateful in Keras, you also have to add batch_input_shape to the first layer, and fix the batch size there.
In [92]:
bs=64
In [93]:
model = Sequential([
    Embedding(vocab_size, n_fac, input_length=cs, batch_input_shape=(bs, 8)),
    BatchNormalization(),
    LSTM(n_hidden, return_sequences=True, stateful=True),
    TimeDistributed(Dense(vocab_size, activation='softmax')),
])
In [94]:
model.compile(loss='sparse_categorical_crossentropy', optimizer=Adam())
Since we're using a fixed batch shape, we have to ensure the number of inputs and outputs is a multiple of the batch size.
In [95]:
mx = len(x_rnn)//bs*bs
In [96]:
model.fit(x_rnn[:mx], y_rnn[:mx], batch_size=bs, nb_epoch=4, shuffle=False)
Out[96]:
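If we wanted to keep training while respecting the sequence order, one common pattern (a sketch, not from the original run) is to fit one epoch at a time and reset the hidden state in between:

# continue training the stateful model, one unshuffled pass at a time
for epoch in range(4):
    model.fit(x_rnn[:mx], y_rnn[:mx], batch_size=bs, nb_epoch=1, shuffle=False)
    model.reset_states()   # start the next pass with a fresh hidden state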
As we start to add more and more functionality on top of (or into) Keras, you'll increasingly find yourself wanting to use Theano, because Theano is the language that Keras uses behind the scenes.
In the process of doing it in Theano, we're going to force ourselves to think through many more of the details than before, because Theano has none of Keras's conveniences. There is no such thing as a layer; we have to handle all of the weight matrices and activation functions ourselves.
In Theano there is a concept of a variable: rather than starting off by giving it data, we start off by describing the types of data that we will give it.
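For example, a theano program first declares symbolic variables, then describes a computation over them; numbers only appear once a compiled function is called (a tiny sketch, separate from the RNN below):

import theano
import theano.tensor as T

a = T.scalar('a')                        # declare the *type* of the data: a scalar
b = T.scalar('b')
f = theano.function([a, b], a * b + 1)   # compile the symbolic expression
f(3, 4)                                  # only now does any arithmetic happen -> 13.0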
In [44]:
# import theano
# from theano import shared, tensor as T
n_input = vocab_size
n_output = vocab_size
n_hidden = 256
cs=8
Using raw theano, we have to create our weight matrices and bias vectors - here are the functions we'll use to do so (using He initialization, i.e. scale = sqrt(2/fan_in), which suits the relu activations). The return values are wrapped in shared(), which is how we tell theano that it can manage this data (copying it to and from the GPU as necessary).
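A shared variable can be read back or overwritten from Python at any time; a tiny sketch:

v = shared(np.zeros(3, dtype=np.float32))   # theano now manages this buffer
v.get_value()                               # copy the current contents back as a numpy array
v.set_value(np.ones(3, dtype=np.float32))   # overwrite it in place (e.g. to reset weights)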
In [26]:
def init_wgts(rows, cols):
    # He scaling: sqrt(2/fan_in); the float literal avoids integer division
    scale = math.sqrt(2./rows)
    return shared(normal(scale=scale, size=(rows, cols)).astype(np.float32))

def init_bias(rows):
    return shared(np.zeros(rows, dtype=np.float32))
We return the weights and biases together as a tuple. For the hidden weights, we'll use an identity initialization (as recommended by Hinton.)
In [27]:
def wgts_and_bias(n_in, n_out):
    return init_wgts(n_in, n_out), init_bias(n_out)

def id_and_bias(n):
    return shared(np.eye(n, dtype=np.float32)), init_bias(n)
Theano doesn't actually do any computations until we explicitly compile and evaluate the function (at which point it'll be turned into CUDA code and sent off to the GPU). So our job is to describe the computations that we'll want theano to do - the first step is to tell theano what inputs we'll be providing to our computation:
In [28]:
t_inp = T.matrix('inp')
t_outp = T.matrix('outp')
t_h0 = T.vector('h0')
lr = T.scalar('lr')
all_args = [t_h0, t_inp, t_outp, lr]
Now, we're ready to create our initial weight matrices
In [34]:
W_h = id_and_bias(n_hidden)
W_x = wgts_and_bias(n_input, n_hidden)
W_y = wgts_and_bias(n_hidden, n_output)
w_all = list(chain.from_iterable([W_h, W_x, W_y]))
Theano handles looping with the scan operation (which will be compiled to run on the GPU). We have to tell theano what to do at each step through the scan - this is the function we'll use, which does a single forward pass for one char:
In [36]:
def step(x, h, W_h, b_h, W_x, b_x, W_y, b_y):
    # calculate the hidden activations
    h = nnet.relu(T.dot(x, W_x) + b_x + T.dot(h, W_h) + b_h)
    # calculate the output activations
    y = nnet.softmax(T.dot(h, W_y) + b_y)
    # return both (T.flatten() works around a theano bug)
    return h, T.flatten(y, 1)
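Before plugging step into the full RNN, it may help to see scan on its own; a minimal sketch that threads a running total through a vector (unrelated to the model itself):

xs_sym = T.vector('xs')
# at each step, add the current element to the accumulated total
totals, _ = theano.scan(lambda x, acc: acc + x,
                        sequences=xs_sym,
                        outputs_info=T.zeros_like(xs_sym[0]))
cumsum_fn = theano.function([xs_sym], totals, allow_input_downcast=True)
cumsum_fn([1., 2., 3.])   # -> [1., 3., 6.]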
Now we can provide everything necessary for the scan operation, so we can set that up - we have to pass in the function to call at each step, the sequence to step through, the initial values of the outputs, and any other arguments to pass to the step function.
In [37]:
[v_h, v_y], _ = theano.scan(step, sequences=t_inp,
outputs_info=[t_h0, None], non_sequences=w_all)
We can now calculate our loss function, and all of our gradients, with just a couple of lines of code!
In [38]:
error = nnet.categorical_crossentropy(v_y, t_outp).sum()
g_all = T.grad(error, w_all)
We even have to show theano how to do SGD - so we set up this dictionary of updates to apply after every forward pass, which applies the standard SGD update rule to every weight.
In [39]:
def upd_dict(wgts, grads, lr):
    return OrderedDict({w: w-g*lr for (w,g) in zip(wgts, grads)})

upd = upd_dict(w_all, g_all, lr)
We're finally ready to compile the function!
In [40]:
fn = theano.function(all_args, error, updates=upd, allow_input_downcast=True)
In [45]:
c_in_dat = [[idx[i+n] for i in range(0, len(idx)-1-cs, cs)]
for n in range(cs)]
xs = [np.stack(c[:-2]) for c in c_in_dat]
c_out_dat = [[idx[i+n] for i in range(1, len(idx)-cs, cs)]
for n in range(cs)]
ys = [np.stack(c[:-2]) for c in c_out_dat]
oh_ys = [to_categorical(o, vocab_size) for o in ys]
oh_y_rnn=np.stack(oh_ys, axis=1)
oh_xs = [to_categorical(o, vocab_size) for o in xs]
oh_x_rnn=np.stack(oh_xs, axis=1)
X = oh_x_rnn
Y = oh_y_rnn
X.shape, Y.shape
Out[45]:
To use it, we simply loop through our input data, calling the function compiled above, and printing our progress from time to time.
In [ ]:
err=0.0; l_rate=0.01
for i in range(len(X)):
    err += fn(np.zeros(n_hidden), X[i], Y[i], l_rate)
    if i % 1000 == 999:
        print("Error:{:.3f}".format(err/1000))
        err = 0.0
In [ ]: