In [ ]:
from theano.sandbox import cuda
#cuda.use('gpu1')

In [2]:
%matplotlib inline
import utils
from utils import *
from __future__ import division, print_function


Using TensorFlow backend.

Setup

We're going to download the collected works of Nietzsche to use as our data for this class.


In [3]:
path = get_file('nietzsche.txt', origin="https://s3.amazonaws.com/text-datasets/nietzsche.txt")
text = open(path).read()
print('corpus length:', len(text))


corpus length: 600893

In [4]:
chars = sorted(list(set(text)))
vocab_size = len(chars)+1
print('total chars:', vocab_size)


total chars: 85

Sometimes it's useful to have a zero value in the dataset, e.g. for padding


In [5]:
chars.insert(0, "\0")
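
For example, keras' pad_sequences pads with 0 by default, so with "\0" at index 0 any padded positions decode back to the null character. A minimal illustration (pad_sequences isn't used elsewhere in this notebook):


In [ ]:
from keras.preprocessing.sequence import pad_sequences
pad_sequences([[40, 42, 29]], maxlen=5)   # -> array([[ 0,  0, 40, 42, 29]])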

In [6]:
''.join(chars[1:-5])


Out[6]:
'\n !"\'(),-.0123456789:;=?ABCDEFGHIJKLMNOPQRSTUVWXYZ[]_abcdefghijklmnopqrstuvwxyz'

Map from chars to indices and back again


In [7]:
char_indices = dict((c, i) for i, c in enumerate(chars))
indices_char = dict((i, c) for i, c in enumerate(chars))
char_indices;

idx will be the data we use from now on - it simply converts all the characters to their index (based on the mapping above)


In [8]:
idx = [char_indices[c] for c in text]

In [9]:
idx[:10]


Out[9]:
[40, 42, 29, 30, 25, 27, 29, 1, 1, 1]

In [10]:
''.join(indices_char[i] for i in idx[:70])


Out[10]:
'PREFACE\n\n\nSUPPOSING that Truth is a woman--what then? Is there not gro'

3 char model

Create inputs

Create four lists, each containing every 3rd character of the text, starting at offsets 0, 1, 2 and 3 respectively. The first three will be our inputs; the fourth is the character we want the model to predict.


In [11]:
cs=3
c1_dat = [idx[i] for i in range(0, len(idx)-1-cs, cs)]
c2_dat = [idx[i+1] for i in range(0, len(idx)-1-cs, cs)]
c3_dat = [idx[i+2] for i in range(0, len(idx)-1-cs, cs)]
c4_dat = [idx[i+3] for i in range(0, len(idx)-1-cs, cs)]
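
To see the stride pattern, here's the same windowing applied to a toy string (illustrative only): each window of cs characters predicts the character immediately after it.


In [ ]:
toy = 'hello world'
[(toy[i], toy[i+1], toy[i+2], '->', toy[i+3]) for i in range(0, len(toy)-1-cs, cs)]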

In [12]:
c1_dat[:10]


Out[12]:
[40, 30, 29, 1, 40, 43, 31, 61, 2, 74]

Our inputs


In [13]:
# np.stack just turns the list into a numpy array - check its shape and first few values
xxx = np.stack(c1_dat[:-2])
xxx.shape
xxx[:10]


Out[13]:
(200295,)
Out[13]:
array([40, 30, 29,  1, 40, 43, 31, 61,  2, 74])

In [14]:
x1 = np.stack(c1_dat[:-2])
x2 = np.stack(c2_dat[:-2])
x3 = np.stack(c3_dat[:-2])

Our output


In [15]:
y = np.stack(c4_dat[:-2])

The first few inputs and outputs


In [16]:
x1[:5], x2[:4], x3[:4]


Out[16]:
(array([40, 30, 29,  1, 40]), array([42, 25,  1, 43]), array([29, 27,  1, 45]))

In [17]:
y[:4]


Out[17]:
array([30, 29,  1, 40])

In [ ]:
x1.shape, y.shape

The number of latent factors to create (i.e. the width of the embedding matrix - each character will be represented by a vector of this length)


In [ ]:
n_fac = 42

Create inputs and embedding outputs for each of our 3 character inputs


In [ ]:
def embedding_input(name, n_in, n_out):
    # a single integer (the character index) in, an n_out-dimensional embedding vector out
    inp = Input(shape=(1,), dtype='int64', name=name)
    emb = Embedding(n_in, n_out, input_length=1)(inp)
    return inp, Flatten()(emb)

In [ ]:
c1_in, c1 = embedding_input('c1', vocab_size, n_fac)
c2_in, c2 = embedding_input('c2', vocab_size, n_fac)
c3_in, c3 = embedding_input('c3', vocab_size, n_fac)

Create and train model

Pick a size for our hidden state


In [ ]:
n_hidden = 256

This is the 'green arrow' from our diagram - the layer operation from input to hidden.


In [ ]:
dense_in = Dense(n_hidden, activation='relu')

Our first hidden activation is simply this function applied to the result of the embedding of the first character.


In [ ]:
c1_hidden = dense_in(c1)

This is the 'orange arrow' from our diagram - the layer operation from hidden to hidden.


In [ ]:
dense_hidden = Dense(n_hidden, activation='tanh')

Our second and third hidden activations sum the previous hidden state (after applying dense_hidden) with the new input state.


In [ ]:
c2_dense = dense_in(c2)
hidden_2 = dense_hidden(c1_hidden)
# element-wise sum of the new input's activations and the transformed previous hidden state
c2_hidden = merge([c2_dense, hidden_2], mode='sum')

In [ ]:
c3_dense = dense_in(c3)
hidden_3 = dense_hidden(c2_hidden)
c3_hidden = merge([c3_dense, hidden_3], mode='sum')

This is the 'blue arrow' from our diagram - the layer operation from hidden to output.


In [ ]:
dense_out = Dense(vocab_size, activation='softmax')

The third hidden state is the input to our output layer.


In [ ]:
c4_out = dense_out(c3_hidden)

In [ ]:
model = Model([c1_in, c2_in, c3_in], c4_out)
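
To sanity-check how the three inputs are wired together, it can help to print the layer graph (illustrative - not needed for training):


In [ ]:
model.summary()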

In [ ]:
model.compile(loss='sparse_categorical_crossentropy', optimizer=Adam())

In [ ]:
model.optimizer.lr=0.000001
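
A note on these learning-rate tweaks: assigning a plain float to model.optimizer.lr (as in the cell above and the ones below) follows the original notebook, but depending on the keras version it may not update the learning rate once training has started. A more reliable sketch, assuming the keras backend is imported as K:


In [ ]:
from keras import backend as K
K.set_value(model.optimizer.lr, 0.000001)   # update the optimizer's learning-rate variable in place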

In [ ]:
model.fit([x1, x2, x3], y, batch_size=64, nb_epoch=4)

In [ ]:
model.optimizer.lr=0.01

In [ ]:
model.fit([x1, x2, x3], y, batch_size=64, nb_epoch=4)

In [ ]:
model.optimizer.lr=0.000001

In [ ]:
model.fit([x1, x2, x3], y, batch_size=64, nb_epoch=4)

In [ ]:
model.optimizer.lr=(0.01)

In [ ]:
model.fit([x1, x2, x3], y, batch_size=64, nb_epoch=4)

Test model


In [ ]:
def get_next(inp):
    # map the three input characters to their indices
    idxs = [char_indices[c] for c in inp]
    print(idxs)
    # the model expects a list of three arrays, each of shape (1,)
    arrs = [np.array(i)[np.newaxis] for i in idxs]
    print(arrs)
    p = model.predict(arrs)
    # return the most likely next character
    i = np.argmax(p)
    print(p, i)
    return chars[i]

In [ ]:
get_next('phi')

In [ ]:
get_next(' th')

In [ ]:
get_next(' an')

RNNs

Create inputs

This is the size of our unrolled RNN.


In [ ]:
cs=8

For each of 0 through 7, create a list of every 8th character with that starting point. These will be the 8 inputs to our model.


In [ ]:
l = len(idx)-cs-1  # stop early enough that idx[i+cs] (the label) always exists
c_in_dat = [[idx[i+n] for i in range(0, l, cs)]
            for n in range(cs)]

# dimensions
len(idx), l/cs
[len(c_in_dat[i]) for i in range(cs)],  len(c_in_dat)

# plain Python lists don't support multi-axis slicing, so index each sub-list separately
c_in_dat[0][0:10]
c_in_dat[7][0:10]

Then create a list of the next character in each of these series. These will be the labels for our model.


In [ ]:
# same as c_in_dat[0], shifted along by one
c_out_dat = [idx[i+cs] for i in range(0, l, cs)]
c_out_dat[0:10]
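
A quick check of the comment above (illustrative only) - the labels are just the first input series shifted along by one:


In [ ]:
c_in_dat[0][1:11]   # should match c_out_dat[0:10]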

In [ ]:
xs = [np.stack(c[:-2]) for c in c_in_dat]

In [ ]:
# xs is now a list of numpy arrays - check dimensions
len(xs), xs[0].shape
xs[0][0:10]

In [ ]:
y = np.stack(c_out_dat[:-2])

So each column below is one series of 8 characters from the text.


In [ ]:
len(xs), xs[0].shape
[xs[n][:cs+10] for n in range(cs)]

...and this is the next character after each sequence.


In [ ]:
y[:cs+10]

In [ ]:
n_fac = 42

Our first RNN!

Create and train model


In [ ]:
def embedding_input(name, n_in, n_out):
    inp = Input(shape=(1,), dtype='int64', name=name+'_in')
    emb = Embedding(n_in, n_out, input_length=1, name=name+'_emb')(inp)
    return inp, Flatten()(emb)

In [ ]:
# one (raw input, flattened embedding) pair per character position
c_ins = [embedding_input('c'+str(n), vocab_size, n_fac) for n in range(cs)]
c_ins

In [ ]:
n_hidden = 256

In [ ]:
dense_in = Dense(n_hidden, activation='relu') # green arrow

# init='identity': start the hidden-to-hidden weights as the identity matrix rather than random
# values, so early in training the hidden state passes through unchanged - a standard trick
# for RNNs that use relu activations
dense_hidden = Dense(n_hidden, activation='relu', init='identity') # orange arrow
dense_out = Dense(vocab_size, activation='softmax') # blue arrow

The first character of each sequence goes through dense_in(), to create our first hidden activations.


In [ ]:
# c_ins[0][1] is the first character's flattened embedding; dense_in turns it into the first hidden state
hidden = dense_in(c_ins[0][1])

Then for each successive layer we combine the output of dense_in() on the next character with the output of dense_hidden() on the current hidden state, to create the new hidden state.


In [ ]:
for i in range(1,cs):
    c_dense = dense_in(c_ins[i][1])
    hidden = dense_hidden(hidden)
    # combine the new input and the transformed previous state by element-wise sum
    hidden = merge([c_dense, hidden], mode='sum')
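
Written out, with e_t denoting the embedding of the t-th character, the loop above computes

    h_1 = dense_in(e_1)
    h_t = dense_in(e_t) + dense_hidden(h_{t-1})    for t = 2 .. 8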

Putting the final hidden state through dense_out() gives us our output.


In [ ]:
c_out = dense_out(hidden)

So now we can create our model.


In [ ]:
model = Model([c[0] for c in c_ins], c_out)
model.compile(loss='sparse_categorical_crossentropy', optimizer=Adam())
model.summary();

In [ ]:
model.fit(xs, y, batch_size=64, nb_epoch=12)

Test model


In [ ]:
def get_next(inp):
    idxs = [np.array(char_indices[c])[np.newaxis] for c in inp]
    p = model.predict(idxs)
    return chars[np.argmax(p)]

In [ ]:
get_next('for thos')

In [ ]:
get_next('part of ')

In [ ]:
get_next('queens a')

Our first RNN with keras!


In [ ]:
n_hidden, n_fac, cs, vocab_size = (256, 42, 8, 86)  # note: vocab_size is bumped to 86 here, one spare index beyond the 85 computed earlier

This is nearly exactly equivalent to the RNN we built ourselves in the previous section.


In [ ]:
model=Sequential([
        Embedding(vocab_size, n_fac, input_length=cs),
        SimpleRNN(n_hidden, activation='relu', inner_init='identity'),
        Dense(vocab_size, activation='softmax')
    ])
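
Unlike the hand-rolled version, this model takes a single array of shape (num_samples, cs) rather than a list of 8 separate arrays - which is why xs gets stacked along axis 1 before fitting below. A quick check (illustrative):


In [ ]:
model.input_shape   # expected: (None, 8)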

In [ ]:
model.summary()

In [ ]:
model.compile(loss='sparse_categorical_crossentropy', optimizer=Adam())

In [ ]:
np.stack(xs,1).squeeze().shape

In [ ]:
model.fit(np.stack(xs,1).squeeze(), y, batch_size=64, nb_epoch=8)
#model.fit(xs, y, batch_size=64, nb_epoch=8)

In [ ]:
def get_next_keras(inp):
    idxs = [char_indices[c] for c in inp]
    arrs = np.array(idxs)[np.newaxis,:]
    p = model.predict(arrs)[0]
    return chars[np.argmax(p)]

In [ ]:
get_next_keras('this is ')

In [ ]:
get_next_keras('part of ')

In [ ]:
get_next_keras('queens a')

Returning sequences

Create inputs

To use a sequence model, we can leave our input unchanged - but we have to change our output to a sequence (of course!)

Here, c_out_dat is identical to c_in_dat, but moved across 1 character.


In [ ]:
#c_in_dat = [[idx[i+n] for i in range(0, len(idx)-1-cs, cs)]
#            for n in range(cs)]
c_out_dat = [[idx[i+n] for i in range(1, len(idx)-cs, cs)]
            for n in range(cs)]

In [ ]:
ys = [np.stack(c[:-2]) for c in c_out_dat]

Reading down each column shows one set of inputs and outputs.


In [ ]:
[xs[n][:cs] for n in range(cs)]

In [ ]:
[ys[n][:cs] for n in range(cs)]

Create and train model


In [ ]:
dense_in = Dense(n_hidden, activation='relu')
dense_hidden = Dense(n_hidden, activation='relu', init='identity')
dense_out = Dense(vocab_size, activation='softmax', name='output')

We're going to pass a vector of all zeros as our starting hidden state, so that the first character is treated exactly like every later one - here's the input layer for that:


In [ ]:
inp1 = Input(shape=(n_fac,), name='zeros')
hidden = dense_in(inp1)

In [ ]:
outs = []

for i in range(cs):
    c_dense = dense_in(c_ins[i][1])
    hidden = dense_hidden(hidden)
    hidden = merge([c_dense, hidden], mode='sum')
    # every timestep now produces an output
    outs.append(dense_out(hidden))

In [ ]:
model = Model([inp1] + [c[0] for c in c_ins], outs)
model.compile(loss='sparse_categorical_crossentropy', optimizer=Adam())

In [ ]:
model.summary()

In [ ]:
# one all-zero "previous hidden state" row per training example
zeros = np.tile(np.zeros(n_fac), (len(xs[0]),1))
zeros.shape

In [ ]:
%%capture output
model.fit([zeros]+xs, ys, batch_size=64, nb_epoch=12)

In [ ]:
output.show()

Test model


In [ ]:
def get_nexts(inp):
    idxs = [char_indices[c] for c in inp]
    arrs = [np.array(i)[np.newaxis] for i in idxs]
    p = model.predict([np.zeros(n_fac)[np.newaxis,:]] + arrs)
    print(list(inp))
    return [chars[np.argmax(o)] for o in p]

In [ ]:
get_nexts(' this is')

In [ ]:
get_nexts(' part of')

Sequence model with keras


In [ ]:
n_hidden, n_fac, cs, vocab_size

To convert our previous keras model into a sequence model, simply add the return_sequences=True parameter and wrap our dense layer in TimeDistributed(). With return_sequences=True the RNN emits its hidden state at every one of the cs timesteps rather than just the last one, and TimeDistributed() applies the same dense layer to each of those timesteps.


In [ ]:
model=Sequential([
        Embedding(vocab_size, n_fac, input_length=cs),
        SimpleRNN(n_hidden, return_sequences=True, activation='relu', inner_init='identity'),
        TimeDistributed(Dense(vocab_size, activation='softmax')),
    ])
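
A quick shape check (illustrative): with return_sequences=True the model now emits one softmax over the vocabulary per timestep.


In [ ]:
model.output_shape   # expected: (None, 8, 86)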

In [ ]:
model.summary()

In [ ]:
model.compile(loss='sparse_categorical_crossentropy', optimizer=Adam())

In [ ]:
xs[0].shape
len(ys)

In [ ]:
x_rnn=np.stack(xs, axis=1)
#y_rnn=np.expand_dims(np.stack(ys, axis=1), -1)
y_rnn=np.stack(ys, axis=1)

In [ ]:
x_rnn.shape, y_rnn.shape
x_rnn[0]

In [ ]:
%%capture output
model.fit(x_rnn, y_rnn, batch_size=64, nb_epoch=8)

In [ ]:
output.show()

In [ ]:
def get_nexts_keras(inp):
    idxs = [char_indices[c] for c in inp]
    arr = np.array(idxs)[np.newaxis,:]
    p = model.predict(arr)[0]
    print(list(inp))
    return [chars[np.argmax(o)] for o in p]

In [ ]:
get_nexts_keras(' this is')

One-hot sequence model with keras

This is the keras version of the theano model that we're about to create. Since the targets are now one-hot vectors rather than integer indices, we use categorical_crossentropy instead of sparse_categorical_crossentropy.


In [ ]:
model=Sequential([
        SimpleRNN(n_hidden, return_sequences=True, input_shape=(cs, vocab_size),
                  activation='relu', inner_init='identity'),
        TimeDistributed(Dense(vocab_size, activation='softmax')),
    ])
model.compile(loss='categorical_crossentropy', optimizer=Adam())

In [ ]:
oh_ys = [to_categorical(o, vocab_size) for o in ys]
oh_y_rnn=np.stack(oh_ys, axis=1)

oh_xs = [to_categorical(o, vocab_size) for o in xs]
oh_x_rnn=np.stack(oh_xs, axis=1)

oh_x_rnn.shape, oh_y_rnn.shape
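
to_categorical simply turns each index into a one-hot row - a tiny check (illustrative only):


In [ ]:
to_categorical([0, 2], vocab_size)[:, :5]   # index 0 -> [1,0,0,0,0], index 2 -> [0,0,1,0,0]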

In [ ]:
%%capture output
model.fit(oh_x_rnn, oh_y_rnn, batch_size=64, nb_epoch=8)

In [ ]:
output.show()

In [ ]:
def get_nexts_oh(inp):
    idxs = np.array([char_indices[c] for c in inp])
    arr = to_categorical(idxs, vocab_size)
    p = model.predict(arr[np.newaxis,:])[0]
    print(list(inp))
    return [chars[np.argmax(o)] for o in p]

In [ ]:
get_nexts_oh(' this is')

Stateful model with keras


In [ ]:
bs=64

A stateful model is easy to create (just add stateful=True) but harder to train. We had to add batchnorm and use an LSTM to get reasonable results. Because the final hidden state of each batch is carried over as the initial state of the next batch, the data must not be shuffled during training (note shuffle=False in the fit calls below).

When using stateful models in keras, you also have to pass batch_input_shape to the first layer, which fixes the batch size.


In [ ]:
model=Sequential([
        Embedding(vocab_size, n_fac, input_length=cs, batch_input_shape=(bs,8)),
        BatchNormalization(),
        LSTM(n_hidden, return_sequences=True, stateful=True),
        TimeDistributed(Dense(vocab_size, activation='softmax')),
    ])

In [ ]:
model.compile(loss='sparse_categorical_crossentropy', optimizer=Adam())
model.summary()

In [ ]:
model.input_shape

Since we're using a fixed batch shape, we have to ensure our inputs and outputs are an exact multiple of the batch size.


In [ ]:
mx = len(x_rnn)//bs*bs
mx
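
Because the hidden state persists across batches, keras also provides model.reset_states() to clear it explicitly. A minimal sketch of an alternative training loop that resets the carried state between passes over the text (the cells below instead call fit with several epochs at once), assuming the model and arrays defined above:


In [ ]:
for i in range(4):
    model.fit(x_rnn[:mx], y_rnn[:mx], batch_size=bs, nb_epoch=1, shuffle=False)
    model.reset_states()   # clear the carried hidden state between passes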

In [ ]:
%%capture output
model.fit(x_rnn[:mx], y_rnn[:mx], batch_size=bs, nb_epoch=4, shuffle=False)

In [ ]:
output.show()

In [ ]:
model.optimizer.lr=1e-4

In [ ]:
%%capture output
model.fit(x_rnn[:mx], y_rnn[:mx], batch_size=bs, nb_epoch=4, shuffle=False)

In [ ]:
output.show()

In [ ]:
%%capture output
model.fit(x_rnn[:mx], y_rnn[:mx], batch_size=bs, nb_epoch=4, shuffle=False)

In [ ]:
output.show()

Keras GRU

Identical to the last one-hot keras RNN, but using a GRU - its update and reset gates let the network learn how much of the previous hidden state to keep at each step.


In [ ]:
model=Sequential([
        GRU(n_hidden, return_sequences=True, input_shape=(cs, vocab_size),
                  activation='relu', inner_init='identity'),
        TimeDistributed(Dense(vocab_size, activation='softmax')),
    ])
model.compile(loss='categorical_crossentropy', optimizer=Adam())

In [ ]:
model.fit(oh_x_rnn, oh_y_rnn, batch_size=64, nb_epoch=8)

In [ ]:
get_nexts_oh(' this is')