In [ ]:
from theano.sandbox import cuda
#cuda.use('gpu1')
In [2]:
%matplotlib inline
import utils
from utils import *
from __future__ import division, print_function
We're going to download the collected works of Nietzsche to use as our data for this class.
In [3]:
path = get_file('nietzsche.txt', origin="https://s3.amazonaws.com/text-datasets/nietzsche.txt")
text = open(path).read()
print('corpus length:', len(text))
In [4]:
chars = sorted(list(set(text)))
vocab_size = len(chars)+1
print('total chars:', vocab_size)
Sometimes it's useful to have a zero value in the dataset, e.g. for padding
In [5]:
chars.insert(0, "\0")
In [6]:
''.join(chars[1:-5])
Out[6]:
Map from chars to indices and back again
In [7]:
char_indices = dict((c, i) for i, c in enumerate(chars))
indices_char = dict((i, c) for i, c in enumerate(chars))
char_indices;
idx will be the data we use from now on - it simply converts all the characters to their index (based on the mapping above)
In [8]:
idx = [char_indices[c] for c in text]
In [9]:
idx[:10]
Out[9]:
In [10]:
''.join(indices_char[i] for i in idx[:70])
Out[10]:
Create four lists by taking every 3rd character (cs=3), starting at the 0th, 1st, 2nd, and 3rd characters respectively. The first three will be our inputs, and the fourth the character we want to predict.
In [11]:
cs=3
c1_dat = [idx[i] for i in range(0, len(idx)-1-cs, cs)]
c2_dat = [idx[i+1] for i in range(0, len(idx)-1-cs, cs)]
c3_dat = [idx[i+2] for i in range(0, len(idx)-1-cs, cs)]
c4_dat = [idx[i+3] for i in range(0, len(idx)-1-cs, cs)]
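As a quick sanity check on a made-up snippet (illustrative only, not the corpus): with cs=3 the text is cut into non-overlapping triples, and the character after each triple becomes the label.
In [ ]:
snippet = [char_indices[c] for c in 'the cat sat']
[(snippet[i], snippet[i+1], snippet[i+2], '->', snippet[i+3])
 for i in range(0, len(snippet)-1-cs, cs)]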
In [12]:
c1_dat[:10]
Out[12]:
Our inputs
In [13]:
# np.stack just creates an array out of the list
xxx = np.stack(c1_dat[:-2])
xxx.shape
xxx[:10]
Out[13]:
In [14]:
x1 = np.stack(c1_dat[:-2])
x2 = np.stack(c2_dat[:-2])
x3 = np.stack(c3_dat[:-2])
Our output
In [15]:
y = np.stack(c4_dat[:-2])
The first 4 inputs and outputs
In [16]:
x1[:4], x2[:4], x3[:4]
Out[16]:
In [17]:
y[:4]
Out[17]:
In [ ]:
x1.shape, y.shape
The number of latent factors to create (i.e. the size of each character's embedding vector, and hence the width of the embedding matrix)
In [ ]:
n_fac = 42
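As an aside, here's a minimal numpy sketch (not part of the model) of what the embedding layer is: just a vocab_size x n_fac matrix of learned weights, where "embedding a character" means picking out the row for that character's index.
In [ ]:
emb_matrix = np.random.randn(vocab_size, n_fac)   # stand-in for what Embedding(vocab_size, n_fac) learns
char_vector = emb_matrix[char_indices['a']]       # that character's n_fac latent factors
char_vector.shape                                  # (42,)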
Create inputs and embedding outputs for each of our 3 character inputs
In [ ]:
def embedding_input(name, n_in, n_out):
    inp = Input(shape=(1,), dtype='int64', name=name)
    emb = Embedding(n_in, n_out, input_length=1)(inp)
    return inp, Flatten()(emb)
In [ ]:
c1_in, c1 = embedding_input('c1', vocab_size, n_fac)
c2_in, c2 = embedding_input('c2', vocab_size, n_fac)
c3_in, c3 = embedding_input('c3', vocab_size, n_fac)
Pick a size for our hidden state
In [ ]:
n_hidden = 256
This is the 'green arrow' from our diagram - the layer operation from input to hidden.
In [ ]:
dense_in = Dense(n_hidden, activation='relu')
Our first hidden activation is simply this function applied to the result of the embedding of the first character.
In [ ]:
c1_hidden = dense_in(c1)
This is the 'orange arrow' from our diagram - the layer operation from hidden to hidden.
In [ ]:
dense_hidden = Dense(n_hidden, activation='tanh')
Our second and third hidden activations are the sum of the previous hidden state (after applying dense_hidden to it) and the new input state.
In [ ]:
c2_dense = dense_in(c2)
hidden_2 = dense_hidden(c1_hidden)
c2_hidden = merge([c2_dense, hidden_2])
In [ ]:
c3_dense = dense_in(c3)
hidden_3 = dense_hidden(c2_hidden)
c3_hidden = merge([c3_dense, hidden_3])
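Here's a purely illustrative numpy version of the recurrence the two cells above build (all weights here are made-up placeholders, not the model's). The new hidden state is the element-wise sum that merge() computes by default (mode='sum').
In [ ]:
e1, e2 = np.random.randn(n_fac), np.random.randn(n_fac)                  # two character embeddings
W_in, b_in = np.random.randn(n_fac, n_hidden) * 0.01, np.zeros(n_hidden)  # placeholder dense_in weights
W_hh, b_hh = np.random.randn(n_hidden, n_hidden) * 0.01, np.zeros(n_hidden)  # placeholder dense_hidden weights
h1 = np.maximum(0, np.dot(e1, W_in) + b_in)                               # relu(dense_in(c1))
h2 = np.maximum(0, np.dot(e2, W_in) + b_in) + np.tanh(np.dot(h1, W_hh) + b_hh)  # c2_hidden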
This is the 'blue arrow' from our diagram - the layer operation from hidden to output.
In [ ]:
dense_out = Dense(vocab_size, activation='softmax')
The third hidden state is the input to our output layer.
In [ ]:
c4_out = dense_out(c3_hidden)
In [ ]:
model = Model([c1_in, c2_in, c3_in], c4_out)
In [ ]:
model.compile(loss='sparse_categorical_crossentropy', optimizer=Adam())
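A quick note on the loss (my aside): the 'sparse' version of categorical crossentropy takes integer class labels directly, so y can stay as raw character indices; the plain categorical_crossentropy used further down needs one-hot targets instead.
In [ ]:
y.dtype, y[:4]   # integer character indices are fine as labels for the sparse loss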
In [ ]:
model.optimizer.lr=0.000001
In [ ]:
model.fit([x1, x2, x3], y, batch_size=64, nb_epoch=4)
In [ ]:
model.optimizer.lr=0.01
In [ ]:
model.fit([x1, x2, x3], y, batch_size=64, nb_epoch=4)
In [ ]:
model.optimizer.lr=0.000001
In [ ]:
model.fit([x1, x2, x3], y, batch_size=64, nb_epoch=4)
In [ ]:
model.optimizer.lr=0.01
In [ ]:
model.fit([x1, x2, x3], y, batch_size=64, nb_epoch=4)
In [ ]:
def get_next(inp):
    idxs = [char_indices[c] for c in inp]
    print(idxs)
    arrs = [np.array(i)[np.newaxis] for i in idxs]
    print(arrs)
    p = model.predict(arrs)
    i = np.argmax(p)
    print(p, i)
    return chars[i]
In [ ]:
get_next('phi')
In [ ]:
get_next(' th')
In [ ]:
get_next(' an')
This is the size of our unrolled RNN.
In [ ]:
cs=8
For each of 0 through 7, create a list of every 8th character with that starting point. These will be the 8 inputs to our model.
In [ ]:
l = len(idx)-cs-1   # stop early enough that the label idx[i+cs] always exists
c_in_dat = [[idx[i+n] for i in range(0, l, cs)]
            for n in range(cs)]
# check the dimensions
len(idx), l/cs
[len(c_in_dat[i]) for i in range(cs)], len(c_in_dat)
# plain Python lists don't support multi-axis slicing, so index each sub-list separately
c_in_dat[0][0:10]
c_in_dat[7][0:10]
Then create a list of the next character in each of these series. These will be the labels for our model.
In [ ]:
# the same as c_in_dat[0], shifted along by one element
c_out_dat = [idx[i+cs] for i in range(0, l, cs)]
c_out_dat[0:10]
In [ ]:
xs = [np.stack(c[:-2]) for c in c_in_dat]
In [ ]:
# xs is now a list of cs numpy arrays; check the shapes
len(xs), xs[0].shape
xs[0][0:10]
In [ ]:
y = np.stack(c_out_dat[:-2])
So each column below is one series of 8 characters from the text.
In [ ]:
len(xs), xs[0].shape
[xs[n][:cs+10] for n in range(cs)]
...and this is the next character after each sequence.
In [ ]:
y[:cs+10]
In [ ]:
n_fac = 42
In [ ]:
def embedding_input(name, n_in, n_out):
    inp = Input(shape=(1,), dtype='int64', name=name+'_in')
    emb = Embedding(n_in, n_out, input_length=1, name=name+'_emb')(inp)
    return inp, Flatten()(emb)
In [ ]:
c_ins = [embedding_input('c'+str(n), vocab_size, n_fac) for n in range(cs)]
c_ins
In [ ]:
n_hidden = 256
In [ ]:
dense_in = Dense(n_hidden, activation='relu')    # green arrow
# init='identity': initialise the hidden-to-hidden weights as the identity matrix rather
# than randomly, so at the start of training the hidden state passes through unchanged.
# Identity init with relu activations is a standard choice for RNN hidden weight matrices.
dense_hidden = Dense(n_hidden, activation='relu', init='identity')   # orange arrow
dense_out = Dense(vocab_size, activation='softmax')                  # blue arrow
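An illustrative numpy check (my addition) of why identity init plays well with relu: with W equal to the identity, the hidden-to-hidden step leaves a non-negative (post-relu) hidden state unchanged, so at the start of training the recurrence is a clean pass-through.
In [ ]:
h = np.random.rand(n_hidden)                   # non-negative, like a post-relu hidden state
W = np.eye(n_hidden)                           # identity init
np.allclose(np.maximum(0, np.dot(h, W)), h)    # True: relu(h.I) == h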
The first character of each sequence goes through dense_in(), to create our first hidden activations.
In [ ]:
# pass the first character's embedding (c_ins[0][1]) through dense_in to get the first hidden state
hidden = dense_in(c_ins[0][1])
Then for each successive layer we combine the output of dense_in() on the next character with the output of dense_hidden() on the current hidden state, to create the new hidden state.
In [ ]:
for i in range(1,cs):
    c_dense = dense_in(c_ins[i][1])
    hidden = dense_hidden(hidden)
    hidden = merge([c_dense, hidden])
Putting the final hidden state through dense_out() gives us our output.
In [ ]:
c_out = dense_out(hidden)
So now we can create our model.
In [ ]:
model = Model([c[0] for c in c_ins], c_out)
model.compile(loss='sparse_categorical_crossentropy', optimizer=Adam())
model.summary();
In [ ]:
model.fit(xs, y, batch_size=64, nb_epoch=12)
In [ ]:
def get_next(inp):
    idxs = [np.array(char_indices[c])[np.newaxis] for c in inp]
    p = model.predict(idxs)
    return chars[np.argmax(p)]
In [ ]:
get_next('for thos')
In [ ]:
get_next('part of ')
In [ ]:
get_next('queens a')
In [ ]:
n_hidden, n_fac, cs, vocab_size = (256, 42, 8, 86)
This is nearly exactly equivalent to the RNN we built ourselves in the previous section.
In [ ]:
model=Sequential([
        Embedding(vocab_size, n_fac, input_length=cs),
        SimpleRNN(n_hidden, activation='relu', inner_init='identity'),
        Dense(vocab_size, activation='softmax')
    ])
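To spell out the equivalence, here's a numpy sketch of the per-timestep update SimpleRNN performs (the weights are made-up placeholders, not the trained ones). The one small difference from our hand-built loop is that the relu is applied once, after the sum, rather than separately to each part; hence "nearly exactly equivalent".
In [ ]:
# h_t = relu( x_t . W_x  +  h_{t-1} . W_h  +  b )
W_x = np.random.randn(n_fac, n_hidden) * 0.01
W_h = np.eye(n_hidden)                              # inner_init='identity'
b = np.zeros(n_hidden)
h = np.zeros(n_hidden)
for x_t in np.random.randn(cs, n_fac):              # a made-up sequence of cs embeddings
    h = np.maximum(0, np.dot(x_t, W_x) + np.dot(h, W_h) + b)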
In [ ]:
model.summary()
In [ ]:
model.compile(loss='sparse_categorical_crossentropy', optimizer=Adam())
In [ ]:
np.stack(xs,1).squeeze().shape
In [ ]:
model.fit(np.stack(xs,1).squeeze(), y, batch_size=64, nb_epoch=8)
#model.fit(xs, y, batch_size=64, nb_epoch=8)
In [ ]:
def get_next_keras(inp):
    idxs = [char_indices[c] for c in inp]
    arrs = np.array(idxs)[np.newaxis,:]
    p = model.predict(arrs)[0]
    return chars[np.argmax(p)]
In [ ]:
get_next_keras('this is ')
In [ ]:
get_next_keras('part of ')
In [ ]:
get_next_keras('queens a')
To use a sequence model, we can leave our input unchanged - but we have to change our output to a sequence (of course!)
Here, c_out_dat is identical to c_in_dat, but shifted along by one character.
In [ ]:
#c_in_dat = [[idx[i+n] for i in range(0, len(idx)-1-cs, cs)]
#            for n in range(cs)]
c_out_dat = [[idx[i+n] for i in range(1, len(idx)-cs, cs)]
             for n in range(cs)]
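As a quick illustration (my addition): the target sequence is just the input sequence shifted along by one character.
In [ ]:
''.join(indices_char[i] for i in idx[:8]), ''.join(indices_char[i] for i in idx[1:9])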
In [ ]:
ys = [np.stack(c[:-2]) for c in c_out_dat]
Reading down each column shows one set of inputs and outputs.
In [ ]:
[xs[n][:cs] for n in range(cs)]
In [ ]:
[ys[n][:cs] for n in range(cs)]
In [ ]:
dense_in = Dense(n_hidden, activation='relu')
dense_hidden = Dense(n_hidden, activation='relu', init='identity')
dense_out = Dense(vocab_size, activation='softmax', name='output')
We're going to pass a vector of all zeros as our starting point - here's our input layer for that:
In [ ]:
inp1 = Input(shape=(n_fac,), name='zeros')
hidden = dense_in(inp1)
In [ ]:
outs = []
for i in range(cs):
    c_dense = dense_in(c_ins[i][1])
    hidden = dense_hidden(hidden)
    hidden = merge([c_dense, hidden], mode='sum')
    # every timestep now has an output
    outs.append(dense_out(hidden))
In [ ]:
model = Model([inp1] + [c[0] for c in c_ins], outs)
model.compile(loss='sparse_categorical_crossentropy', optimizer=Adam())
In [ ]:
model.summary()
In [ ]:
zeros = np.tile(np.zeros(n_fac), (len(xs[0]),1))
zeros.shape
In [ ]:
%%capture output
model.fit([zeros]+xs, ys, batch_size=64, nb_epoch=12)
In [ ]:
output.show()
In [ ]:
def get_nexts(inp):
    idxs = [char_indices[c] for c in inp]
    arrs = [np.array(i)[np.newaxis] for i in idxs]
    p = model.predict([np.zeros(n_fac)[np.newaxis,:]] + arrs)
    print(list(inp))
    return [chars[np.argmax(o)] for o in p]
In [ ]:
get_nexts(' this is')
In [ ]:
get_nexts(' part of')
In [ ]:
n_hidden, n_fac, cs, vocab_size
To convert our previous keras model into a sequence model, simply add the 'return_sequences=True' parameter, and add TimeDistributed() around our dense layer.
In [ ]:
model=Sequential([
        Embedding(vocab_size, n_fac, input_length=cs),
        SimpleRNN(n_hidden, return_sequences=True, activation='relu', inner_init='identity'),
        TimeDistributed(Dense(vocab_size, activation='softmax')),
    ])
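A quick shape check (my addition): with return_sequences=True the RNN emits a hidden vector at every one of the cs timesteps, and TimeDistributed applies the same Dense to each of them, so the model now predicts a character at every position in the sequence.
In [ ]:
model.output_shape   # (None, 8, 86), i.e. (batch, cs, vocab_size)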
In [ ]:
model.summary()
In [ ]:
model.compile(loss='sparse_categorical_crossentropy', optimizer=Adam())
In [ ]:
xs[0].shape
len(ys)
In [ ]:
x_rnn=np.stack(xs, axis=1)
#y_rnn=np.expand_dims(np.stack(ys, axis=1), -1)
y_rnn=np.stack(ys, axis=1)
In [ ]:
x_rnn.shape, y_rnn.shape
x_rnn[0]
In [ ]:
%%capture output
model.fit(x_rnn, y_rnn, batch_size=64, nb_epoch=8)
In [ ]:
output.show()
In [ ]:
def get_nexts_keras(inp):
    idxs = [char_indices[c] for c in inp]
    arr = np.array(idxs)[np.newaxis,:]
    p = model.predict(arr)[0]
    print(list(inp))
    return [chars[np.argmax(o)] for o in p]
In [ ]:
get_nexts_keras(' this is')
This is the keras version of the theano model that we're about to create.
In [ ]:
model=Sequential([
        SimpleRNN(n_hidden, return_sequences=True, input_shape=(cs, vocab_size),
                  activation='relu', inner_init='identity'),
        TimeDistributed(Dense(vocab_size, activation='softmax')),
    ])
model.compile(loss='categorical_crossentropy', optimizer=Adam())
In [ ]:
oh_ys = [to_categorical(o, vocab_size) for o in ys]
oh_y_rnn=np.stack(oh_ys, axis=1)
oh_xs = [to_categorical(o, vocab_size) for o in xs]
oh_x_rnn=np.stack(oh_xs, axis=1)
oh_x_rnn.shape, oh_y_rnn.shape
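An illustration of what the one-hot encoding does (my addition): to_categorical turns a character index into a one-hot row of length vocab_size, which is why this model can take the characters directly without an Embedding layer.
In [ ]:
to_categorical([char_indices['a']], vocab_size)   # a single one-hot row of length 86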
In [ ]:
%%capture output
model.fit(oh_x_rnn, oh_y_rnn, batch_size=64, nb_epoch=8)
In [ ]:
output.show()
In [ ]:
def get_nexts_oh(inp):
    idxs = np.array([char_indices[c] for c in inp])
    arr = to_categorical(idxs, vocab_size)
    p = model.predict(arr[np.newaxis,:])[0]
    print(list(inp))
    return [chars[np.argmax(o)] for o in p]
In [ ]:
get_nexts_oh(' this is')
In [ ]:
bs=64
A stateful model is easy to create (just add "stateful=True") but harder to train. We had to add batchnorm and use LSTM to get reasonable results.
When using stateful in keras, you have to also add 'batch_input_shape' to the first layer, and fix the batch size there.
In [ ]:
model=Sequential([
        Embedding(vocab_size, n_fac, input_length=cs, batch_input_shape=(bs,8)),
        BatchNormalization(),
        LSTM(n_hidden, return_sequences=True, stateful=True),
        TimeDistributed(Dense(vocab_size, activation='softmax')),
    ])
In [ ]:
model.compile(loss='sparse_categorical_crossentropy', optimizer=Adam())
model.summary()
In [ ]:
model.input_shape
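One practical aside (not from the original notes): because stateful=True carries the final hidden state of each batch over as the initial state of the next, batches must stay in text order (hence shuffle=False below), and the carried state can be cleared explicitly whenever we want to start from scratch.
In [ ]:
# Clear the carried-over LSTM state (e.g. before generating from a fresh piece of text).
model.reset_states()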
Since we're using a fixed batch shape, we have to ensure the number of training examples is an exact multiple of the batch size.
In [ ]:
mx = len(x_rnn)//bs*bs
mx
In [ ]:
%%capture output
model.fit(x_rnn[:mx], y_rnn[:mx], batch_size=bs, nb_epoch=4, shuffle=False)
In [ ]:
output.show()
In [ ]:
model.optimizer.lr=1e-4
In [ ]:
%%capture output
model.fit(x_rnn[:mx], y_rnn[:mx], batch_size=bs, nb_epoch=4, shuffle=False)
In [ ]:
output.show()
In [ ]:
%%capture output
model.fit(x_rnn[:mx], y_rnn[:mx], batch_size=bs, nb_epoch=4, shuffle=False)
In [ ]:
output.show()
Identical to the last keras RNN, but with a GRU!
In [ ]:
model=Sequential([
        GRU(n_hidden, return_sequences=True, input_shape=(cs, vocab_size),
            activation='relu', inner_init='identity'),
        TimeDistributed(Dense(vocab_size, activation='softmax')),
    ])
model.compile(loss='categorical_crossentropy', optimizer=Adam())
In [ ]:
model.fit(oh_x_rnn, oh_y_rnn, batch_size=64, nb_epoch=8)
In [ ]:
get_nexts_oh(' this is')