In [1]:
from __future__ import division, print_function
%matplotlib inline
from importlib import reload  # Python 3
import utils; reload(utils)
from utils import *


Using cuDNN version 6021 on context None
Mapped name None to device cuda0: GeForce GTX TITAN X (0000:04:00.0)
Using Theano backend.

Setup

We're going to download the collected works of Nietzsche to use as our data for this class.


In [2]:
path = get_file('nietzsche.txt', origin="https://s3.amazonaws.com/text-datasets/nietzsche.txt")
text = open(path).read()
print('corpus length:', len(text))


corpus length: 600893

In [3]:
chars = sorted(list(set(text)))
vocab_size = len(chars)+1
print('total chars:', vocab_size)


total chars: 85

Sometimes it's useful to have a zero value in the dataset, e.g. for padding


In [4]:
chars.insert(0, "\0")

In [5]:
''.join(chars[1:-6])


Out[5]:
'\n !"\'(),-.0123456789:;=?ABCDEFGHIJKLMNOPQRSTUVWXYZ[]_abcdefghijklmnopqrstuvwxy'

Map from chars to indices and back again


In [6]:
char_indices = dict((c, i) for i, c in enumerate(chars))
indices_char = dict((i, c) for i, c in enumerate(chars))

idx will be the data we use from now on - it simply converts all the characters to their index (based on the mapping above)


In [7]:
idx = [char_indices[c] for c in text]

In [8]:
idx[:10]


Out[8]:
[40, 42, 29, 30, 25, 27, 29, 1, 1, 1]

In [9]:
''.join(indices_char[i] for i in idx[:70])


Out[9]:
'PREFACE\n\n\nSUPPOSING that Truth is a woman--what then? Is there not gro'

3 char model

Create inputs

Create four lists, each containing every 3rd character, starting at the 0th, 1st, 2nd, and 3rd characters respectively


In [10]:
cs=3
c1_dat = [idx[i] for i in range(0, len(idx)-1-cs, cs)]
c2_dat = [idx[i+1] for i in range(0, len(idx)-1-cs, cs)]
c3_dat = [idx[i+2] for i in range(0, len(idx)-1-cs, cs)]
c4_dat = [idx[i+3] for i in range(0, len(idx)-1-cs, cs)]

Our inputs


In [11]:
x1 = np.stack(c1_dat[:-2])
x2 = np.stack(c2_dat[:-2])
x3 = np.stack(c3_dat[:-2])

Our output


In [12]:
y = np.stack(c4_dat[:-2])

The first 4 inputs and outputs


In [13]:
x1[:4], x2[:4], x3[:4]


Out[13]:
(array([40, 30, 29,  1]), array([42, 25,  1, 43]), array([29, 27,  1, 45]))

In [14]:
y[:4]


Out[14]:
array([30, 29,  1, 40])

In [15]:
x1.shape, y.shape


Out[15]:
((200295,), (200295,))

The number of latent factors to create (i.e. the size of each character's embedding vector)


In [16]:
n_fac = 42

Create inputs and embedding outputs for each of our 3 character inputs


In [17]:
def embedding_input(name, n_in, n_out):
    inp = Input(shape=(1,), dtype='int64', name=name)
    emb = Embedding(n_in, n_out, input_length=1)(inp)
    return inp, Flatten()(emb)

In [18]:
c1_in, c1 = embedding_input('c1', vocab_size, n_fac)
c2_in, c2 = embedding_input('c2', vocab_size, n_fac)
c3_in, c3 = embedding_input('c3', vocab_size, n_fac)

Create and train model

Pick a size for our hidden state


In [19]:
n_hidden = 256

This is the 'green arrow' from our diagram - the layer operation from input to hidden.


In [20]:
dense_in = Dense(n_hidden, activation='relu')

Our first hidden activation is simply this function applied to the result of the embedding of the first character.


In [21]:
c1_hidden = dense_in(c1)

This is the 'orange arrow' from our diagram - the layer operation from hidden to hidden.


In [22]:
dense_hidden = Dense(n_hidden, activation='tanh')

Our second and third hidden activations add the previous hidden state (after applying dense_hidden) to the new input state (after applying dense_in).


In [23]:
c2_dense = dense_in(c2)
hidden_2 = dense_hidden(c1_hidden)
c2_hidden = add([c2_dense, hidden_2])

In [24]:
c3_dense = dense_in(c3)
hidden_3 = dense_hidden(c2_hidden)
c3_hidden = add([c3_dense, hidden_3])
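
Written out in plain numpy, each new hidden state is just the new character's dense_in activation added to the previous hidden state's dense_hidden activation. A minimal sketch (W_in, b_in, W_h, b_h and e_t are illustrative names standing in for the dense_in weights, the dense_hidden weights and the character embedding - they aren't defined in this notebook):

import numpy as np

def np_relu(x): return np.maximum(0., x)

def next_hidden(h_prev, e_t, W_in, b_in, W_h, b_h):
    c_dense = np_relu(e_t @ W_in + b_in)   # dense_in applied to the new character's embedding
    h_recur = np.tanh(h_prev @ W_h + b_h)  # dense_hidden applied to the previous hidden state
    return c_dense + h_recur               # the add() above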

This is the 'blue arrow' from our diagram - the layer operation from hidden to output.


In [25]:
dense_out = Dense(vocab_size, activation='softmax')

The third hidden state is the input to our output layer.


In [26]:
c4_out = dense_out(c3_hidden)

In [27]:
model = Model([c1_in, c2_in, c3_in], c4_out)

In [28]:
model.compile(loss='sparse_categorical_crossentropy', optimizer=Adam())

In [29]:
model.optimizer.lr=0.000001

In [30]:
model.fit([x1, x2, x3], y, batch_size=64, epochs=4)


Epoch 1/4
200295/200295 [==============================] - 5s 23us/step - loss: 4.4038
Epoch 2/4
200295/200295 [==============================] - 4s 21us/step - loss: 4.2744
Epoch 3/4
200295/200295 [==============================] - 4s 21us/step - loss: 4.0004
Epoch 4/4
200295/200295 [==============================] - 4s 22us/step - loss: 3.6064
Out[30]:
<keras.callbacks.History at 0x7fd4128176a0>

In [31]:
model.optimizer.lr=0.01

In [32]:
model.fit([x1, x2, x3], y, batch_size=64, epochs=4)


Epoch 1/4
200295/200295 [==============================] - 4s 22us/step - loss: 3.3180
Epoch 2/4
200295/200295 [==============================] - 4s 21us/step - loss: 3.1918
Epoch 3/4
200295/200295 [==============================] - 4s 21us/step - loss: 3.1390
Epoch 4/4
200295/200295 [==============================] - 4s 21us/step - loss: 3.1158
Out[32]:
<keras.callbacks.History at 0x7fd4128174a8>

In [33]:
model.optimizer.lr=0.000001

In [34]:
model.fit([x1, x2, x3], y, batch_size=64, epochs=4)


Epoch 1/4
200295/200295 [==============================] - 4s 21us/step - loss: 3.1023
Epoch 2/4
200295/200295 [==============================] - 4s 21us/step - loss: 3.0925
Epoch 3/4
200295/200295 [==============================] - 4s 21us/step - loss: 3.0845
Epoch 4/4
200295/200295 [==============================] - 4s 21us/step - loss: 3.0774
Out[34]:
<keras.callbacks.History at 0x7fd4110512e8>

In [35]:
model.optimizer.lr=0.01

In [36]:
model.fit([x1, x2, x3], y, batch_size=64, epochs=4)


Epoch 1/4
200295/200295 [==============================] - 4s 22us/step - loss: 3.0708
Epoch 2/4
200295/200295 [==============================] - 4s 21us/step - loss: 3.0643
Epoch 3/4
200295/200295 [==============================] - 4s 22us/step - loss: 3.0579
Epoch 4/4
200295/200295 [==============================] - 4s 22us/step - loss: 3.0515
Out[36]:
<keras.callbacks.History at 0x7fd4128172e8>

Test model


In [37]:
def get_next(inp):
    idxs = [char_indices[c] for c in inp]
    arrs = [np.array(i)[np.newaxis] for i in idxs]
    p = model.predict(arrs)
    i = np.argmax(p)
    return chars[i]

In [38]:
get_next('phi')


Out[38]:
' '

In [39]:
get_next(' th')


Out[39]:
' '

In [40]:
get_next(' an')


Out[40]:
' '
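
We can chain calls to get_next() to generate longer strings, feeding each prediction back in as part of the next 3-character input. A minimal sketch (gen_text is a hypothetical helper, not part of the lesson):

def gen_text(seed, n):
    # repeatedly predict the next character and slide the 3-character window along
    s = seed
    for _ in range(n):
        s += get_next(s[-3:])
    return s

# e.g. gen_text(' th', 20)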

Our first RNN!

Create inputs

This is the size of our unrolled RNN.


In [41]:
cs=8

For each of 0 through 7, create a list of every 8th character with that starting point. These will be the 8 inputs to our model.


In [42]:
c_in_dat = [[idx[i+n] for i in range(0, len(idx)-1-cs, cs)]
            for n in range(cs)]

Then create a list of the next character in each of these series. These will be the labels for our model.


In [43]:
c_out_dat = [idx[i+cs] for i in range(0, len(idx)-1-cs, cs)]

In [44]:
xs = [np.stack(c[:-2]) for c in c_in_dat]

In [45]:
len(xs), xs[0].shape


Out[45]:
(8, (75109,))

In [46]:
y = np.stack(c_out_dat[:-2])

So each column below is one series of 8 characters from the text.


In [47]:
[xs[n][:cs] for n in range(cs)]


Out[47]:
[array([40,  1, 33,  2, 72, 67, 73,  2]),
 array([42,  1, 38, 44,  2,  9, 61, 73]),
 array([29, 43, 31, 71, 54,  9, 58, 61]),
 array([30, 45,  2, 74,  2, 76, 67, 58]),
 array([25, 40, 73, 73, 76, 61, 24, 71]),
 array([27, 40, 61, 61, 68, 54,  2, 58]),
 array([29, 39, 54,  2, 66, 73, 33,  2]),
 array([ 1, 43, 73, 62, 54,  2, 72, 67])]

...and this is the next character after each sequence.


In [48]:
y[:cs]


Out[48]:
array([ 1, 33,  2, 72, 67, 73,  2, 68])

In [49]:
n_fac = 42

Create and train model


In [50]:
def embedding_input(name, n_in, n_out):
    inp = Input(shape=(1,), dtype='int64', name=name+'_in')
    emb = Embedding(n_in, n_out, input_length=1, name=name+'_emb')(inp)
    return inp, Flatten()(emb)

In [51]:
c_ins = [embedding_input('c'+str(n), vocab_size, n_fac) for n in range(cs)]

In [52]:
n_hidden = 256

In [53]:
dense_in = Dense(n_hidden, activation='relu')
dense_hidden = Dense(n_hidden, activation='relu', kernel_initializer='identity')
dense_out = Dense(vocab_size, activation='softmax')

The first character of each sequence goes through dense_in(), to create our first hidden activations.


In [54]:
hidden = dense_in(c_ins[0][1])  # [0] picks up the first char, [1] picks up the emb (see embedding_input)

Then for each successive layer we combine the output of dense_in() on the next character with the output of dense_hidden() on the current hidden state, to create the new hidden state.


In [55]:
for i in range(1,cs):
    c_dense = dense_in(c_ins[i][1])
    hidden = dense_hidden(hidden)
    hidden = add([c_dense, hidden])

Putting the final hidden state through dense_out() gives us our output.


In [56]:
c_out = dense_out(hidden)

So now we can create our model.


In [57]:
model = Model([c[0] for c in c_ins], c_out)
model.compile(loss='sparse_categorical_crossentropy', optimizer=Adam())

In [58]:
model.fit(xs, y, batch_size=64, epochs=12)


Epoch 1/12
75109/75109 [==============================] - 3s 36us/step - loss: 2.5237
Epoch 2/12
75109/75109 [==============================] - 3s 36us/step - loss: 2.2447
Epoch 3/12
75109/75109 [==============================] - 3s 36us/step - loss: 2.1471
Epoch 4/12
75109/75109 [==============================] - 3s 36us/step - loss: 2.0762
Epoch 5/12
75109/75109 [==============================] - 3s 36us/step - loss: 2.0227
Epoch 6/12
75109/75109 [==============================] - 3s 36us/step - loss: 1.9778
Epoch 7/12
75109/75109 [==============================] - 3s 36us/step - loss: 1.9381
Epoch 8/12
75109/75109 [==============================] - 3s 36us/step - loss: 1.9040
Epoch 9/12
75109/75109 [==============================] - 3s 36us/step - loss: 1.8735
Epoch 10/12
75109/75109 [==============================] - 3s 36us/step - loss: 1.8447
Epoch 11/12
75109/75109 [==============================] - 3s 36us/step - loss: 1.8207
Epoch 12/12
75109/75109 [==============================] - 3s 36us/step - loss: 1.7953
Out[58]:
<keras.callbacks.History at 0x7fd40659c208>

Test model


In [59]:
def get_next(inp):
    idxs = [np.array(char_indices[c])[np.newaxis] for c in inp]
    p = model.predict(idxs)
    return chars[np.argmax(p)]

In [60]:
get_next('for thos')


Out[60]:
'e'

In [61]:
get_next('part of ')


Out[61]:
't'

In [62]:
get_next('queens a')


Out[62]:
'n'

Our first RNN with keras!


In [63]:
n_hidden, n_fac, cs, vocab_size = (256, 42, 8, 86)

This is nearly exactly equivalent to the RNN we built ourselves in the previous section.


In [64]:
model=Sequential([
        Embedding(vocab_size, n_fac, input_length=cs),
        SimpleRNN(n_hidden, activation='relu', recurrent_initializer='identity'),
        Dense(vocab_size, activation='softmax')
    ])

In [65]:
model.summary()


_________________________________________________________________
Layer (type)                 Output Shape              Param #   
=================================================================
embedding_4 (Embedding)      (None, 8, 42)             3612      
_________________________________________________________________
simple_rnn_1 (SimpleRNN)     (None, 256)               76544     
_________________________________________________________________
dense_7 (Dense)              (None, 86)                22102     
=================================================================
Total params: 102,258
Trainable params: 102,258
Non-trainable params: 0
_________________________________________________________________
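
As a quick sanity check, the parameter counts in the summary follow directly from the layer shapes (vocab_size=86, n_fac=42, n_hidden=256):

assert 86*42 == 3612                     # Embedding: one 42-dim vector per character
assert 42*256 + 256*256 + 256 == 76544   # SimpleRNN: input kernel + recurrent kernel + bias
assert 256*86 + 86 == 22102              # Dense: kernel + bias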

In [66]:
model.compile(loss='sparse_categorical_crossentropy', optimizer=Adam())

In [67]:
#model.fit(np.stack(xs,1), y, batch_size=64, epochs=8)    # doesn't work (it seems it doesn't like the extra 1 dim)
model.fit(np.stack(xs,1).squeeze(), y, batch_size=64, epochs=8)


Epoch 1/8
75109/75109 [==============================] - 3s 37us/step - loss: 2.8142
Epoch 2/8
75109/75109 [==============================] - 3s 37us/step - loss: 2.2926
Epoch 3/8
75109/75109 [==============================] - 3s 37us/step - loss: 2.0906
Epoch 4/8
75109/75109 [==============================] - 3s 37us/step - loss: 1.9567
Epoch 5/8
75109/75109 [==============================] - 3s 37us/step - loss: 1.8546
Epoch 6/8
75109/75109 [==============================] - 3s 37us/step - loss: 1.7730
Epoch 7/8
75109/75109 [==============================] - 3s 37us/step - loss: 1.7032
Epoch 8/8
75109/75109 [==============================] - 3s 37us/step - loss: 1.6463
Out[67]:
<keras.callbacks.History at 0x7fd40519d6a0>

In [68]:
def get_next_keras(inp):
    idxs = [char_indices[c] for c in inp]
    arrs = np.array(idxs)[np.newaxis,:]
    p = model.predict(arrs)[0]
    return chars[np.argmax(p)]

In [69]:
get_next_keras('this is ')


Out[69]:
't'

In [70]:
get_next_keras('part of ')


Out[70]:
't'

In [71]:
get_next_keras('queens a')


Out[71]:
'n'

Returning sequences

Create inputs

To use a sequence model, we can leave our input unchanged - but we have to change our output to a sequence (of course!)

Here, c_out_dat is identical to c_in_dat, but moved across 1 character.


In [72]:
#c_in_dat = [[idx[i+n] for i in range(0, len(idx)-1-cs, cs)]
#            for n in range(cs)]
c_out_dat = [[idx[i+n] for i in range(1, len(idx)-cs, cs)]
            for n in range(cs)]

In [73]:
xs = [np.stack(c[:-2]) for c in c_in_dat]  # repeated here to restore the original after the previous np.stack.squeeze
ys = [np.stack(c[:-2]) for c in c_out_dat]

Reading down each column shows one set of inputs and outputs.


In [74]:
[xs[n][:cs] for n in range(cs)]


Out[74]:
[array([40,  1, 33,  2, 72, 67, 73,  2]),
 array([42,  1, 38, 44,  2,  9, 61, 73]),
 array([29, 43, 31, 71, 54,  9, 58, 61]),
 array([30, 45,  2, 74,  2, 76, 67, 58]),
 array([25, 40, 73, 73, 76, 61, 24, 71]),
 array([27, 40, 61, 61, 68, 54,  2, 58]),
 array([29, 39, 54,  2, 66, 73, 33,  2]),
 array([ 1, 43, 73, 62, 54,  2, 72, 67])]

In [75]:
[ys[n][:cs] for n in range(cs)]


Out[75]:
[array([42,  1, 38, 44,  2,  9, 61, 73]),
 array([29, 43, 31, 71, 54,  9, 58, 61]),
 array([30, 45,  2, 74,  2, 76, 67, 58]),
 array([25, 40, 73, 73, 76, 61, 24, 71]),
 array([27, 40, 61, 61, 68, 54,  2, 58]),
 array([29, 39, 54,  2, 66, 73, 33,  2]),
 array([ 1, 43, 73, 62, 54,  2, 72, 67]),
 array([ 1, 33,  2, 72, 67, 73,  2, 68])]

Create and train model


In [76]:
dense_in = Dense(n_hidden, activation='relu')
dense_hidden = Dense(n_hidden, activation='relu', kernel_initializer='identity')
dense_out = Dense(vocab_size, activation='softmax', name='output')

We're going to pass a vector of all zeros as our starting point - here's our input layer for that:


In [77]:
inp1 = Input(shape=(n_fac,), name='zeros')
hidden = dense_in(inp1)

In [78]:
outs = []

for i in range(cs):
    c_dense = dense_in(c_ins[i][1])
    hidden = dense_hidden(hidden)
    hidden = add([c_dense, hidden])
    # every layer now has an output
    outs.append(dense_out(hidden))

In [79]:
model = Model([inp1] + [c[0] for c in c_ins], outs)
model.compile(loss='sparse_categorical_crossentropy', optimizer=Adam())

In [80]:
zeros = np.tile(np.zeros(n_fac), (len(xs[0]),1))
zeros.shape


Out[80]:
(75109, 42)

In [81]:
model.fit([zeros]+xs, ys, batch_size=64, epochs=12)


Epoch 1/12
75109/75109 [==============================] - 6s 77us/step - loss: 20.0653 - output_loss_1: 2.7128 - output_loss_2: 2.5656 - output_loss_3: 2.5041 - output_loss_4: 2.4754 - output_loss_5: 2.4604 - output_loss_6: 2.4547 - output_loss_7: 2.4629 - output_loss_8: 2.4294
Epoch 2/12
75109/75109 [==============================] - 6s 77us/step - loss: 17.8424 - output_loss_1: 2.5152 - output_loss_2: 2.3560 - output_loss_3: 2.2255 - output_loss_4: 2.1759 - output_loss_5: 2.1479 - output_loss_6: 2.1433 - output_loss_7: 2.1533 - output_loss_8: 2.1253
Epoch 3/12
75109/75109 [==============================] - 6s 77us/step - loss: 17.2275 - output_loss_1: 2.4985 - output_loss_2: 2.3331 - output_loss_3: 2.1633 - output_loss_4: 2.0852 - output_loss_5: 2.0472 - output_loss_6: 2.0368 - output_loss_7: 2.0440 - output_loss_8: 2.0194
Epoch 4/12
75109/75109 [==============================] - 6s 77us/step - loss: 16.8621 - output_loss_1: 2.4914 - output_loss_2: 2.3247 - output_loss_3: 2.1332 - output_loss_4: 2.0340 - output_loss_5: 1.9846 - output_loss_6: 1.9693 - output_loss_7: 1.9739 - output_loss_8: 1.9509
Epoch 5/12
75109/75109 [==============================] - 6s 77us/step - loss: 16.6054 - output_loss_1: 2.4867 - output_loss_2: 2.3195 - output_loss_3: 2.1153 - output_loss_4: 1.9996 - output_loss_5: 1.9405 - output_loss_6: 1.9195 - output_loss_7: 1.9236 - output_loss_8: 1.9007
Epoch 6/12
75109/75109 [==============================] - 6s 77us/step - loss: 16.4153 - output_loss_1: 2.4846 - output_loss_2: 2.3150 - output_loss_3: 2.1013 - output_loss_4: 1.9755 - output_loss_5: 1.9079 - output_loss_6: 1.8848 - output_loss_7: 1.8853 - output_loss_8: 1.8608
Epoch 7/12
75109/75109 [==============================] - 6s 77us/step - loss: 16.2691 - output_loss_1: 2.4827 - output_loss_2: 2.3120 - output_loss_3: 2.0941 - output_loss_4: 1.9565 - output_loss_5: 1.8831 - output_loss_6: 1.8561 - output_loss_7: 1.8543 - output_loss_8: 1.8304
Epoch 8/12
75109/75109 [==============================] - 6s 78us/step - loss: 16.1518 - output_loss_1: 2.4818 - output_loss_2: 2.3111 - output_loss_3: 2.0860 - output_loss_4: 1.9408 - output_loss_5: 1.8628 - output_loss_6: 1.8312 - output_loss_7: 1.8308 - output_loss_8: 1.8072
Epoch 9/12
75109/75109 [==============================] - 6s 79us/step - loss: 16.0596 - output_loss_1: 2.4804 - output_loss_2: 2.3082 - output_loss_3: 2.0824 - output_loss_4: 1.9302 - output_loss_5: 1.8464 - output_loss_6: 1.8138 - output_loss_7: 1.8115 - output_loss_8: 1.7868
Epoch 10/12
75109/75109 [==============================] - 6s 77us/step - loss: 15.9790 - output_loss_1: 2.4796 - output_loss_2: 2.3073 - output_loss_3: 2.0776 - output_loss_4: 1.9186 - output_loss_5: 1.8349 - output_loss_6: 1.7952 - output_loss_7: 1.7927 - output_loss_8: 1.7732
Epoch 11/12
75109/75109 [==============================] - 6s 78us/step - loss: 15.9102 - output_loss_1: 2.4786 - output_loss_2: 2.3050 - output_loss_3: 2.0731 - output_loss_4: 1.9110 - output_loss_5: 1.8225 - output_loss_6: 1.7828 - output_loss_7: 1.7794 - output_loss_8: 1.7578
Epoch 12/12
75109/75109 [==============================] - 6s 78us/step - loss: 15.8514 - output_loss_1: 2.4781 - output_loss_2: 2.3053 - output_loss_3: 2.0710 - output_loss_4: 1.9044 - output_loss_5: 1.8123 - output_loss_6: 1.7711 - output_loss_7: 1.7642 - output_loss_8: 1.7449
Out[81]:
<keras.callbacks.History at 0x7fd404ac12b0>

Test model


In [82]:
def get_nexts(inp):
    idxs = [char_indices[c] for c in inp]
    arrs = [np.array(i)[np.newaxis] for i in idxs]
    p = model.predict([np.zeros(n_fac)[np.newaxis,:]] + arrs)
    print(list(inp))
    return [chars[np.argmax(o)] for o in p]

In [83]:
get_nexts(' this is')


[' ', 't', 'h', 'i', 's', ' ', 'i', 's']
Out[83]:
['t', 'h', 'e', 't', ' ', 'c', 'n', ' ']

In [84]:
get_nexts(' part of')


[' ', 'p', 'a', 'r', 't', ' ', 'o', 'f']
Out[84]:
['t', 'o', 's', 't', 'i', 'o', 'f', ' ']

Sequence model with keras


In [85]:
n_hidden, n_fac, cs, vocab_size


Out[85]:
(256, 42, 8, 86)

To convert our previous keras model into a sequence model, simply add the 'return_sequences=True' parameter, and add TimeDistributed() around our dense layer.


In [86]:
model=Sequential([
        Embedding(vocab_size, n_fac, input_length=cs),
        SimpleRNN(n_hidden, return_sequences=True, activation='relu', recurrent_initializer='identity'),
        TimeDistributed(Dense(vocab_size, activation='softmax')),
    ])

In [87]:
model.summary()


_________________________________________________________________
Layer (type)                 Output Shape              Param #   
=================================================================
embedding_5 (Embedding)      (None, 8, 42)             3612      
_________________________________________________________________
simple_rnn_2 (SimpleRNN)     (None, 8, 256)            76544     
_________________________________________________________________
time_distributed_1 (TimeDist (None, 8, 86)             22102     
=================================================================
Total params: 102,258
Trainable params: 102,258
Non-trainable params: 0
_________________________________________________________________

In [88]:
model.compile(loss='sparse_categorical_crossentropy', optimizer=Adam())

In [89]:
x_rnn=np.stack(xs, axis=1)
y_rnn=np.expand_dims(np.stack(ys, axis=1), -1)  # trailing unit dim so y_rnn matches the (75109, 8, 1) target shape shown below

In [90]:
x_rnn.shape, y_rnn.shape


Out[90]:
((75109, 8), (75109, 8, 1))

In [91]:
model.fit(x_rnn, y_rnn, batch_size=64, epochs=8)


Epoch 1/8
75109/75109 [==============================] - 3s 37us/step - loss: 2.4339
Epoch 2/8
75109/75109 [==============================] - 3s 37us/step - loss: 2.0032
Epoch 3/8
75109/75109 [==============================] - 3s 37us/step - loss: 1.8870
Epoch 4/8
75109/75109 [==============================] - 3s 38us/step - loss: 1.8259
Epoch 5/8
75109/75109 [==============================] - 3s 38us/step - loss: 1.7872
Epoch 6/8
75109/75109 [==============================] - 3s 37us/step - loss: 1.7599
Epoch 7/8
75109/75109 [==============================] - 3s 38us/step - loss: 1.7394
Epoch 8/8
75109/75109 [==============================] - 3s 38us/step - loss: 1.7237
Out[91]:
<keras.callbacks.History at 0x7fd3f4dd6ac8>

In [92]:
def get_nexts_keras(inp):
    idxs = [char_indices[c] for c in inp]
    arr = np.array(idxs)[np.newaxis,:]
    p = model.predict(arr)[0]
    print(list(inp))
    return [chars[np.argmax(o)] for o in p]

In [93]:
get_nexts_keras(' this is')


[' ', 't', 'h', 'i', 's', ' ', 'i', 's']
Out[93]:
['t', 'h', 'e', 's', ' ', 's', 'n', ' ']

One-hot sequence model with keras

This is the keras version of the theano model that we're about to create.


In [94]:
model=Sequential([
        SimpleRNN(n_hidden, return_sequences=True, input_shape=(cs, vocab_size),
                  activation='relu', recurrent_initializer='identity'),
        TimeDistributed(Dense(vocab_size, activation='softmax')),
    ])
model.compile(loss='categorical_crossentropy', optimizer=Adam())

In [95]:
oh_ys = [to_categorical(o, vocab_size) for o in ys]
oh_y_rnn=np.stack(oh_ys, axis=1)

oh_xs = [to_categorical(o, vocab_size) for o in xs]
oh_x_rnn=np.stack(oh_xs, axis=1)

oh_x_rnn.shape, oh_y_rnn.shape


Out[95]:
((75109, 8, 86), (75109, 8, 86))

In [96]:
model.fit(oh_x_rnn, oh_y_rnn, batch_size=64, epochs=8)


Epoch 1/8
75109/75109 [==============================] - 3s 36us/step - loss: 2.4443
Epoch 2/8
75109/75109 [==============================] - 3s 36us/step - loss: 2.0369
Epoch 3/8
75109/75109 [==============================] - 3s 36us/step - loss: 1.9247
Epoch 4/8
75109/75109 [==============================] - 3s 36us/step - loss: 1.8595
Epoch 5/8
75109/75109 [==============================] - 3s 35us/step - loss: 1.8173
Epoch 6/8
75109/75109 [==============================] - 3s 37us/step - loss: 1.7867
Epoch 7/8
75109/75109 [==============================] - 3s 37us/step - loss: 1.7633
Epoch 8/8
75109/75109 [==============================] - 3s 35us/step - loss: 1.7447
Out[96]:
<keras.callbacks.History at 0x7fd3f21f1a20>

In [97]:
def get_nexts_oh(inp):
    idxs = np.array([char_indices[c] for c in inp])
    arr = to_categorical(idxs, vocab_size)
    p = model.predict(arr[np.newaxis,:])[0]
    print(list(inp))
    return [chars[np.argmax(o)] for o in p]

In [98]:
get_nexts_oh(' this is')


[' ', 't', 'h', 'i', 's', ' ', 'i', 's']
Out[98]:
['t', 'h', 'e', 'n', ' ', 't', 't', ' ']

Stateful model with keras


In [99]:
bs=64

A stateful model is easy to create (just add "stateful=True") but harder to train. We had to add batchnorm and use LSTM to get reasonable results.

When using stateful in keras, you also have to add 'batch_input_shape' to the first layer, and fix the batch size there.


In [100]:
model=Sequential([
        Embedding(vocab_size, n_fac, input_length=cs, batch_input_shape=(bs,8)),
        BatchNormalization(),
        LSTM(n_hidden, return_sequences=True, stateful=True),
        TimeDistributed(Dense(vocab_size, activation='softmax')),
    ])

In [101]:
model.compile(loss='sparse_categorical_crossentropy', optimizer=Adam())

Since we're using a fixed batch shape, we have to ensure our inputs and outputs are an exact multiple of the batch size.


In [102]:
mx = len(x_rnn)//bs*bs

In [103]:
model.fit(x_rnn[:mx], y_rnn[:mx], batch_size=bs, epochs=4, shuffle=False)


Epoch 1/4
75072/75072 [==============================] - 14s 187us/step - loss: 2.2532
Epoch 2/4
75072/75072 [==============================] - 14s 186us/step - loss: 1.9809
Epoch 3/4
75072/75072 [==============================] - 14s 186us/step - loss: 1.9014
Epoch 4/4
75072/75072 [==============================] - 14s 186us/step - loss: 1.8544
Out[103]:
<keras.callbacks.History at 0x7fd3883566a0>

In [104]:
model.optimizer.lr=1e-4

In [105]:
model.fit(x_rnn[:mx], y_rnn[:mx], batch_size=bs, epochs=4, shuffle=False)


Epoch 1/4
75072/75072 [==============================] - 14s 188us/step - loss: 1.8202
Epoch 2/4
75072/75072 [==============================] - 14s 186us/step - loss: 1.7933
Epoch 3/4
75072/75072 [==============================] - 14s 187us/step - loss: 1.7707
Epoch 4/4
75072/75072 [==============================] - 14s 189us/step - loss: 1.7509
Out[105]:
<keras.callbacks.History at 0x7fd3730f5e48>

In [106]:
model.fit(x_rnn[:mx], y_rnn[:mx], batch_size=bs, epochs=4, shuffle=False)


Epoch 1/4
75072/75072 [==============================] - 14s 187us/step - loss: 1.7334
Epoch 2/4
75072/75072 [==============================] - 14s 191us/step - loss: 1.7174
Epoch 3/4
75072/75072 [==============================] - 14s 184us/step - loss: 1.7026
Epoch 4/4
75072/75072 [==============================] - 14s 182us/step - loss: 1.6890
Out[106]:
<keras.callbacks.History at 0x7fd3730f5320>
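
Because the model is stateful, the LSTM's hidden state carries over between batches and across the successive fit() calls above. If we wanted a later pass to start from a clean slate, we could clear it explicitly - a minimal sketch using Keras's reset_states():

model.reset_states()  # zero the persistent hidden state before another pass over the corpus
model.fit(x_rnn[:mx], y_rnn[:mx], batch_size=bs, epochs=4, shuffle=False)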

Theano RNN


In [107]:
n_input = vocab_size
n_output = vocab_size

Using raw theano, we have to create our weight matrices and bias vectors ourselves - here are the functions we'll use to do so (scaling the random weights by sqrt(2/fan_in)).

The return values are wrapped in shared(), which is how we tell theano that it can manage this data (copying it to and from the GPU as necessary).


In [108]:
def init_wgts(rows, cols): 
    scale = math.sqrt(2/rows)
    return shared(normal(scale=scale, size=(rows, cols)).astype(np.float32))
def init_bias(rows): 
    return shared(np.zeros(rows, dtype=np.float32))

We return the weights and biases together as a tuple. For the hidden weights, we'll use an identity initialization (as recommended by Hinton).


In [109]:
def wgts_and_bias(n_in, n_out): 
    return init_wgts(n_in, n_out), init_bias(n_out)
def id_and_bias(n): 
    return shared(np.eye(n, dtype=np.float32)), init_bias(n)

Theano doesn't actually do any computations until we explicitly compile and evaluate the function (at which point it'll be turned into CUDA code and sent off to the GPU). So our job is to describe the computations that we'll want theano to do - the first step is to tell theano what inputs we'll be providing to our computation:


In [110]:
t_inp = T.matrix('inp')
t_outp = T.matrix('outp')
t_h0 = T.vector('h0')
lr = T.scalar('lr')

all_args = [t_h0, t_inp, t_outp, lr]

Now we're ready to create our initial weight matrices.


In [111]:
W_h = id_and_bias(n_hidden)
W_x = wgts_and_bias(n_input, n_hidden)
W_y = wgts_and_bias(n_hidden, n_output)
w_all = list(chain.from_iterable([W_h, W_x, W_y]))

Theano handles looping by using the GPU scan operation. We have to tell theano what to do at each step through the scan - this is the function we'll use, which does a single forward pass for one character:


In [112]:
def step(x, h, W_h, b_h, W_x, b_x, W_y, b_y):
    # Calculate the hidden activations
    h = nnet.relu(T.dot(x, W_x) + b_x + T.dot(h, W_h) + b_h)
    # Calculate the output activations
    y = nnet.softmax(T.dot(h, W_y) + b_y)
    # Return both (T.flatten() is to work around a theano bug)
    return h, T.flatten(y, 1)

Now we can provide everything necessary for the scan operation, so we can set that up - we have to pass in the function to call at each step, the sequence to step through, the initial values of the outputs, and any other arguments to pass to the step function.


In [113]:
[v_h, v_y], _ = theano.scan(step, sequences=t_inp, 
                            outputs_info=[t_h0, None], non_sequences=w_all)


/home/roebius/anaconda/envs/f1/lib/python3.5/site-packages/ipykernel_launcher.py:5: UserWarning: DEPRECATION: If x is a vector, Softmax will not automatically pad x anymore in next releases. If you need it, please do it manually. The vector case is gonna be supported soon and the output will be a vector.
  """

We can now calculate our loss function, and all of our gradients, with just a couple of lines of code!


In [114]:
error = nnet.categorical_crossentropy(v_y, t_outp).sum()
g_all = T.grad(error, w_all)

We even have to show theano how to do SGD - so we set up this dictionary of updates to apply after every forward pass, which applies the standard SGD update rule to every weight.


In [115]:
def upd_dict(wgts, grads, lr): 
    return OrderedDict({w: w-g*lr for (w,g) in zip(wgts,grads)})

upd = upd_dict(w_all, g_all, lr)

We're finally ready to compile the function!


In [116]:
fn = theano.function(all_args, error, updates=upd, allow_input_downcast=True)

In [117]:
X = oh_x_rnn
Y = oh_y_rnn
X.shape, Y.shape


Out[117]:
((75109, 8, 86), (75109, 8, 86))

To use it, we simply loop through our input data, calling the function compiled above, and printing our progress from time to time.


In [118]:
err=0.0; l_rate=0.01
for i in range(len(X)): 
    err+=fn(np.zeros(n_hidden), X[i], Y[i], l_rate)
    if i % 1000 == 999: 
        print ("Error:{:.3f}".format(err/1000))
        err=0.0


Error:25.113
Error:21.447
Error:20.902
Error:19.888
Error:18.840
Error:19.223
Error:19.110
Error:18.491
Error:17.950
Error:18.230
Error:17.472
Error:17.620
Error:18.440
Error:17.333
Error:16.850
Error:17.778
Error:17.389
Error:17.223
Error:16.817
Error:16.685
Error:16.588
Error:16.415
Error:16.789
Error:16.169
Error:16.767
Error:16.545
Error:16.044
Error:16.199
Error:16.329
Error:16.487
Error:16.728
Error:16.399
Error:16.717
Error:16.321
Error:16.063
Error:16.692
Error:15.950
Error:16.450
Error:16.049
Error:16.269
Error:15.423
Error:15.742
Error:15.855
Error:16.028
Error:15.993
Error:15.796
Error:15.599
Error:16.054
Error:15.954
Error:16.100
Error:15.301
Error:15.576
Error:15.082
Error:14.945
Error:15.606
Error:15.393
Error:14.820
Error:15.495
Error:15.153
Error:15.081
Error:15.058
Error:15.361
Error:15.395
Error:14.995
Error:14.705
Error:14.693
Error:14.237
Error:14.787
Error:15.183
Error:14.822
Error:15.242
Error:14.763
Error:14.491
Error:14.520
Error:14.525

In [119]:
f_y = theano.function([t_h0, t_inp], v_y, allow_input_downcast=True)

In [120]:
pred = np.argmax(f_y(np.zeros(n_hidden), X[6]), axis=1)

In [121]:
act = np.argmax(X[6], axis=1)

In [122]:
[indices_char[o] for o in act]


Out[122]:
['t', 'h', 'e', 'n', '?', ' ', 'I', 's']

In [123]:
[indices_char[o] for o in pred]


Out[123]:
['h', 'e', ' ', ' ', ' ', 'T', 't', ' ']

Pure python RNN!

Set up basic functions

Now we're going to try to repeat the above theano RNN, using just pure python (and numpy). That means we have to do everything ourselves, including defining the basic functions of a neural net! Below are all of the definitions, along with tests to check that they give the same answers as theano. The functions ending in _d are the derivatives of each function.


In [124]:
def sigmoid(x): return 1/(1+np.exp(-x))
def sigmoid_d(x): 
    output = sigmoid(x)
    return output*(1-output)

In [125]:
def relu(x): return np.maximum(0., x)
def relu_d(x): return (x > 0.)*1.

In [126]:
relu(np.array([3.,-3.])), relu_d(np.array([3.,-3.]))


Out[126]:
(array([ 3.,  0.]), array([ 1.,  0.]))

In [127]:
def dist(a,b): return pow(a-b,2)
def dist_d(a,b): return 2*(a-b)

In [128]:
import pdb

In [129]:
eps = 1e-7
def x_entropy(pred, actual): 
    return -np.sum(actual * np.log(np.clip(pred, eps, 1-eps)))
def x_entropy_d(pred, actual): return -actual/pred

In [130]:
def softmax(x): return np.exp(x)/np.exp(x).sum()
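
This direct form can overflow when the activations are large; a common numerically stable variant subtracts the max before exponentiating. A sketch (softmax_stable isn't used elsewhere in this notebook):

def softmax_stable(x):
    e = np.exp(x - x.max())  # subtracting the max doesn't change the result but avoids overflow
    return e/e.sum()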

In [131]:
def softmax_d(x):
    sm = softmax(x)
    res = np.expand_dims(-sm,-1)*sm
    res[np.diag_indices_from(res)] = sm*(1-sm)
    return res

In [132]:
test_preds = np.array([0.2,0.7,0.1])
test_actuals = np.array([0.,1.,0.])
nnet.categorical_crossentropy(test_preds, test_actuals).eval()


Out[132]:
array(0.35667494393873245)

In [133]:
x_entropy(test_preds, test_actuals)


Out[133]:
0.35667494393873245

In [134]:
test_inp = T.dvector()
test_out = nnet.categorical_crossentropy(test_inp, test_actuals)
test_grad = theano.function([test_inp], T.grad(test_out, test_inp))

In [135]:
test_grad(test_preds)


Out[135]:
array([-0.    , -1.4286, -0.    ])

In [136]:
x_entropy_d(test_preds, test_actuals)


Out[136]:
array([-0.    , -1.4286, -0.    ])

In [137]:
pre_pred = random(oh_x_rnn[0][0].shape)
preds = softmax(pre_pred)
actual = oh_x_rnn[0][0]

In [138]:
loss=x_entropy
loss_d=x_entropy_d

In [139]:
np.allclose(softmax_d(pre_pred).dot(loss_d(preds,actual)), preds-actual)


Out[139]:
True

In [140]:
softmax(test_preds)


Out[140]:
array([ 0.2814,  0.464 ,  0.2546])

In [141]:
nnet.softmax(test_preds).eval()


/home/roebius/anaconda/envs/f1/lib/python3.5/site-packages/ipykernel_launcher.py:1: UserWarning: DEPRECATION: If x is a vector, Softmax will not automatically pad x anymore in next releases. If you need it, please do it manually. The vector case is gonna be supported soon and the output will be a vector.
  """Entry point for launching an IPython kernel.
Out[141]:
array([[ 0.2814,  0.464 ,  0.2546]])

In [142]:
test_out = T.flatten(nnet.softmax(test_inp))


/home/roebius/anaconda/envs/f1/lib/python3.5/site-packages/ipykernel_launcher.py:1: UserWarning: DEPRECATION: If x is a vector, Softmax will not automatically pad x anymore in next releases. If you need it, please do it manually. The vector case is gonna be supported soon and the output will be a vector.
  """Entry point for launching an IPython kernel.

In [143]:
test_grad = theano.function([test_inp], theano.gradient.jacobian(test_out, test_inp))

In [144]:
test_grad(test_preds)


Out[144]:
array([[ 0.2022, -0.1306, -0.0717],
       [-0.1306,  0.2487, -0.1181],
       [-0.0717, -0.1181,  0.1898]])

In [145]:
softmax_d(test_preds)


Out[145]:
array([[ 0.2022, -0.1306, -0.0717],
       [-0.1306,  0.2487, -0.1181],
       [-0.0717, -0.1181,  0.1898]])

In [146]:
act=relu
act_d = relu_d

We also have to define our own scan function. Since we're not worrying about running things in parallel, it's very simple to implement:


In [147]:
def scan(fn, start, seq):
    res = []
    prev = start
    for s in seq:
        app = fn(prev, s)
        res.append(app)
        prev = app
    return res

...for instance, scan on + is the cumulative sum.


In [148]:
scan(lambda prev,curr: prev+curr, 0, range(5))


Out[148]:
[0, 1, 3, 6, 10]

Set up training

Let's now build the functions to do the forward and backward passes of our RNN. First, define our data and shape.


In [149]:
inp = oh_x_rnn
outp = oh_y_rnn
n_input = vocab_size
n_output = vocab_size

In [150]:
inp.shape, outp.shape


Out[150]:
((75109, 8, 86), (75109, 8, 86))

Here's the function to do a single forward pass of an RNN, for a single character.


In [151]:
def one_char(prev, item):
    # Previous state
    tot_loss, pre_hidden, pre_pred, hidden, ypred = prev
    # Current inputs and output
    x, y = item
    pre_hidden = np.dot(x,w_x) + np.dot(hidden,w_h)
    hidden = act(pre_hidden)
    pre_pred = np.dot(hidden,w_y)
    ypred = softmax(pre_pred)
    return (
        # Keep track of loss so we can report it
        tot_loss+loss(ypred, y),
        # Used in backprop
        pre_hidden, pre_pred, 
        # Used in next iteration
        hidden, 
        # To provide predictions
        ypred)

We use scan to apply the above to a whole sequence of characters.


In [152]:
def get_chars(n): return zip(inp[n], outp[n])
def one_fwd(n): return scan(one_char, (0,0,0,np.zeros(n_hidden),0), get_chars(n))

Now we can define the backward step. We use a loop to go through every element of the sequence. The derivatives are computed by applying the chain rule at each step and accumulating the gradients across the sequence.


In [153]:
# "Columnify" a vector
def col(x): return x[:,newaxis]

def one_bkwd(args, n):
    global w_x,w_y,w_h

    i=inp[n]  # 8x86
    o=outp[n] # 8x86
    d_pre_hidden = np.zeros(n_hidden) # 256
    for p in reversed(range(len(i))):
        totloss, pre_hidden, pre_pred, hidden, ypred = args[p]
        x=i[p] # 86
        y=o[p] # 86
        d_pre_pred = softmax_d(pre_pred).dot(loss_d(ypred,y))  # 86
        d_pre_hidden = (np.dot(d_pre_hidden, w_h.T) 
                        + np.dot(d_pre_pred,w_y.T)) * act_d(pre_hidden) # 256

        # d(loss)/d(w_y) = d(loss)/d(pre_pred) * d(pre_pred)/d(w_y)
        w_y -= col(hidden) * d_pre_pred * alpha
        # d(loss)/d(w_h) = d(loss)/d(pre_hidden[p-1]) * d(pre_hidden[p-1])/d(w_h)
        if (p>0): w_h -= args[p-1][3].dot(d_pre_hidden) * alpha
        w_x -= col(x)*d_pre_hidden * alpha
    return d_pre_hidden

Now we can set up our initial weight matrices. Note that we're not using bias at all in this example, in order to keep things simpler.


In [154]:
scale=math.sqrt(2./n_input)
w_x = normal(scale=scale, size=(n_input,n_hidden))
w_y = normal(scale=scale, size=(n_hidden, n_output))
w_h = np.eye(n_hidden, dtype=np.float32)

Our loop looks much like the theano loop in the previous section, except that we have to call the backwards step ourselves.


In [155]:
overallError=0
alpha=0.0001
for n in range(10000):
    res = one_fwd(n)
    overallError+=res[-1][0]
    deriv = one_bkwd(res, n)
    if(n % 1000 == 999):
        print ("Error:{:.4f}; Gradient:{:.5f}".format(
                overallError/1000, np.linalg.norm(deriv)))
        overallError=0


Error:35.6776; Gradient:2.08857
Error:35.4488; Gradient:1.92239
Error:34.9883; Gradient:3.64758
Error:32.9029; Gradient:3.86891
Error:30.9743; Gradient:3.65797
Error:30.0525; Gradient:3.72072
Error:28.9902; Gradient:4.33666
Error:28.5418; Gradient:3.55847
Error:28.0224; Gradient:3.44001
Error:28.0573; Gradient:3.18044
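
As with the theano version, we can read predictions straight out of the forward pass - a minimal sketch (the predicted distribution is the last element of each step's tuple returned by one_char):

res = one_fwd(6)
[chars[np.argmax(step[-1])] for step in res]  # predicted next character at each position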

Keras GRU

Identical to the last keras RNN, but using a GRU!


In [156]:
# No recurrent_initializer='identity' here: Keras raises
# "Identity matrix initializer can only be used for 2D square matrices" (the GRU's recurrent kernel isn't square)
model=Sequential([
        GRU(n_hidden, return_sequences=True, input_shape=(cs, vocab_size),
                  activation='relu'),
        TimeDistributed(Dense(vocab_size, activation='softmax')),
    ])
model.compile(loss='categorical_crossentropy', optimizer=Adam())

In [157]:
model.fit(oh_x_rnn, oh_y_rnn, batch_size=64, epochs=8)


Epoch 1/8
75109/75109 [==============================] - 9s 120us/step - loss: 2.4016
Epoch 2/8
75109/75109 [==============================] - 9s 119us/step - loss: 1.9825
Epoch 3/8
75109/75109 [==============================] - 9s 119us/step - loss: 1.8736
Epoch 4/8
75109/75109 [==============================] - 9s 119us/step - loss: 1.8110
Epoch 5/8
75109/75109 [==============================] - 9s 120us/step - loss: 1.7685
Epoch 6/8
75109/75109 [==============================] - 9s 120us/step - loss: 1.7381
Epoch 7/8
75109/75109 [==============================] - 9s 120us/step - loss: 1.7139
Epoch 8/8
75109/75109 [==============================] - 9s 120us/step - loss: 1.6941
Out[157]:
<keras.callbacks.History at 0x7fd36029a588>

In [158]:
get_nexts_oh(' this is')


[' ', 't', 'h', 'i', 's', ' ', 'i', 's']
Out[158]:
['t', 'h', 'e', 's', ' ', 's', 'n', ' ']

Theano GRU

Separate weights

The theano GRU looks just like the simple theano RNN, except for the use of the reset and update gates. Each of these gates requires its own hidden and input weights, so we add those to our weight matrices.


In [159]:
W_h = id_and_bias(n_hidden)
W_x = init_wgts(n_input, n_hidden)
W_y = wgts_and_bias(n_hidden, n_output)
rW_h = init_wgts(n_hidden, n_hidden)
rW_x = wgts_and_bias(n_input, n_hidden)
uW_h = init_wgts(n_hidden, n_hidden)
uW_x = wgts_and_bias(n_input, n_hidden)
w_all = list(chain.from_iterable([W_h, W_y, uW_x, rW_x]))
w_all.extend([W_x, uW_h, rW_h])

Here's the definition of a gate - it's just a sigmoid applied to the sum of the dot products of the input and hidden vectors with their respective weights.


In [160]:
def gate(x, h, W_h, W_x, b_x):
    return nnet.sigmoid(T.dot(x, W_x) + b_x + T.dot(h, W_h))

Our step is nearly identical to before, except that we multiply our hidden state by our reset gate, and we update our hidden state based on the update gate.


In [161]:
def step(x, h, W_h, b_h, W_y, b_y, uW_x, ub_x, rW_x, rb_x, W_x, uW_h, rW_h):
    reset = gate(x, h, rW_h, rW_x, rb_x)
    update = gate(x, h, uW_h, uW_x, ub_x)
    h_new = gate(x, h * reset, W_h, W_x, b_h)
    h = update*h + (1-update)*h_new
    y = nnet.softmax(T.dot(h, W_y) + b_y)
    return h, T.flatten(y, 1)
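
Note that h_new above reuses gate(), i.e. a sigmoid; a textbook GRU uses tanh for the candidate state. For reference, a sketch of that more common form (same arguments as step() above - it isn't used in this notebook):

def step_tanh(x, h, W_h, b_h, W_y, b_y, uW_x, ub_x, rW_x, rb_x, W_x, uW_h, rW_h):
    reset = gate(x, h, rW_h, rW_x, rb_x)
    update = gate(x, h, uW_h, uW_x, ub_x)
    # tanh candidate instead of a sigmoid gate
    h_new = T.tanh(T.dot(x, W_x) + b_h + T.dot(h*reset, W_h))
    h = update*h + (1-update)*h_new
    y = nnet.softmax(T.dot(h, W_y) + b_y)
    return h, T.flatten(y, 1)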

Everything from here on is identical to our simple RNN in theano.


In [162]:
[v_h, v_y], _ = theano.scan(step, sequences=t_inp, 
                            outputs_info=[t_h0, None], non_sequences=w_all)


/home/roebius/anaconda/envs/f1/lib/python3.5/site-packages/ipykernel_launcher.py:6: UserWarning: DEPRECATION: If x is a vector, Softmax will not automatically pad x anymore in next releases. If you need it, please do it manually. The vector case is gonna be supported soon and the output will be a vector.
  

In [163]:
error = nnet.categorical_crossentropy(v_y, t_outp).sum()
g_all = T.grad(error, w_all)

In [164]:
upd = upd_dict(w_all, g_all, lr)
fn = theano.function(all_args, error, updates=upd, allow_input_downcast=True)

In [165]:
err=0.0; l_rate=0.1
for i in range(len(X)): 
    err+=fn(np.zeros(n_hidden), X[i], Y[i], l_rate)
    if i % 1000 == 999: 
        l_rate *= 0.95
        print ("Error:{:.2f}".format(err/1000))
        err=0.0


Error:26.94
Error:22.56
Error:22.16
Error:21.27
Error:20.41
Error:20.75
Error:20.45
Error:19.82
Error:19.62
Error:19.88
Error:19.13
Error:19.17
Error:19.83
Error:19.03
Error:18.43
Error:19.53
Error:19.31
Error:19.00
Error:18.39
Error:18.19
Error:17.94
Error:17.93
Error:18.40
Error:17.94
Error:18.18
Error:18.03
Error:17.76
Error:17.86
Error:17.83
Error:18.10
Error:18.33
Error:17.94
Error:18.20
Error:17.78
Error:17.65
Error:18.24
Error:17.53
Error:18.09
Error:17.66
Error:17.75
Error:17.06
Error:17.50
Error:17.33
Error:17.72
Error:17.62
Error:17.75
Error:17.49
Error:18.65
Error:17.47
Error:17.72
Error:17.04
Error:17.40
Error:16.85
Error:16.91
Error:17.57
Error:17.33
Error:16.88
Error:17.34
Error:17.34
Error:17.22
Error:16.87
Error:17.33
Error:17.17
Error:17.11
Error:16.84
Error:16.80
Error:16.69
Error:16.79
Error:17.41
Error:16.73
Error:17.35
Error:16.77
Error:16.65
Error:16.62
Error:16.50

Combined weights

We can make the previous section simpler and faster by concatenating the hidden and input weight matrices (and the hidden and input vectors) together. We're not going to step through this cell by cell - you'll see it's identical to the previous section except for this concatenation.
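
Why the concatenation works: stacking the two weight matrices on top of each other and concatenating the hidden and input vectors gives exactly the same result as adding the two separate dot products. A small numpy check (h, x, Wh, Wx, m, Wc are throwaway illustrative arrays; n_hidden and n_input are the notebook's values):

h = np.random.randn(n_hidden)
x = np.random.randn(n_input)
Wh = np.random.randn(n_hidden, n_hidden)
Wx = np.random.randn(n_input, n_hidden)
m = np.concatenate([h, x])      # shape (n_hidden + n_input,)
Wc = np.concatenate([Wh, Wx])   # shape (n_hidden + n_input, n_hidden)
np.allclose(m.dot(Wc), h.dot(Wh) + x.dot(Wx))  # True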


In [166]:
W = (shared(np.concatenate([np.eye(n_hidden), normal(size=(n_input, n_hidden))])
            .astype(np.float32)), init_bias(n_hidden))

rW = wgts_and_bias(n_input+n_hidden, n_hidden)
uW = wgts_and_bias(n_input+n_hidden, n_hidden)
W_y = wgts_and_bias(n_hidden, n_output)
w_all = list(chain.from_iterable([W, W_y, uW, rW]))

In [167]:
def gate(m, W, b): return nnet.sigmoid(T.dot(m, W) + b)

In [168]:
def step(x, h, W, b, W_y, b_y, uW, ub, rW, rb):
    m = T.concatenate([h, x])
    reset = gate(m, rW, rb)
    update = gate(m, uW, ub)
    m = T.concatenate([h*reset, x])
    h_new = gate(m, W, b)
    h = update*h + (1-update)*h_new
    y = nnet.softmax(T.dot(h, W_y) + b_y)
    return h, T.flatten(y, 1)

In [169]:
[v_h, v_y], _ = theano.scan(step, sequences=t_inp, 
                            outputs_info=[t_h0, None], non_sequences=w_all)


/home/roebius/anaconda/envs/f1/lib/python3.5/site-packages/ipykernel_launcher.py:8: UserWarning: DEPRECATION: If x is a vector, Softmax will not automatically pad x anymore in next releases. If you need it, please do it manually. The vector case is gonna be supported soon and the output will be a vector.
  

In [170]:
def upd_dict(wgts, grads, lr): 
    return OrderedDict({w: w-g*lr for (w,g) in zip(wgts,grads)})

In [171]:
error = nnet.categorical_crossentropy(v_y, t_outp).sum()
g_all = T.grad(error, w_all)

In [172]:
upd = upd_dict(w_all, g_all, lr)
fn = theano.function(all_args, error, updates=upd, allow_input_downcast=True)

In [173]:
err=0.0; l_rate=0.01
for i in range(len(X)): 
    err+=fn(np.zeros(n_hidden), X[i], Y[i], l_rate)
    if i % 1000 == 999: 
        print ("Error:{:.2f}".format(err/1000))
        err=0.0


Error:24.70
Error:22.14
Error:22.01
Error:21.24
Error:20.48
Error:20.99
Error:20.70
Error:20.17
Error:19.95
Error:20.28
Error:19.54
Error:19.66
Error:20.27
Error:19.52
Error:18.99
Error:19.98
Error:19.74
Error:19.58
Error:18.95
Error:18.79
Error:18.45
Error:18.49
Error:19.02
Error:18.46
Error:18.70
Error:18.49
Error:18.28
Error:18.33
Error:18.28
Error:18.45
Error:18.79
Error:18.34
Error:18.59
Error:18.22
Error:17.94
Error:18.50
Error:17.82
Error:18.36
Error:17.88
Error:18.00
Error:17.27
Error:17.73
Error:17.55
Error:17.87
Error:17.82
Error:17.85
Error:17.65
Error:17.96
Error:17.65
Error:17.76
Error:17.16
Error:17.30
Error:16.71
Error:16.83
Error:17.31
Error:17.24
Error:16.65
Error:17.21
Error:16.98
Error:16.92
Error:16.67
Error:17.09
Error:16.94
Error:16.72
Error:16.52
Error:16.46
Error:16.24
Error:16.48
Error:17.00
Error:16.40
Error:16.84
Error:16.33
Error:16.20
Error:16.16
Error:16.07

End