Wayne Nixalo - 25 Jun 2017
RNN practice in Theano -- 3rd attempt
In [1]:
import theano
%matplotlib inline
import sys, os
sys.path.insert(1, os.path.join('../utils'))
import utils; reload(utils)
from utils import *
from __future__ import division, print_function
In [2]:
path = get_file('nietzsche.txt', origin="https://s3.amazonaws.com/text-datasets/nietzsche.txt")
text = open(path).read()
print('corpus length:', len(text))
In [3]:
chars = sorted(list(set(text)))
vocab_size = len(chars) + 1
print('total chars:', vocab_size)
In [4]:
chars.insert(0, "\0")
''.join(chars[1:-6])
Out[4]:
In [5]:
char_indices = dict((c, i) for i, c in enumerate(chars))
indices_char = dict((i, c) for i, c in enumerate(chars))
In [6]:
idx = [char_indices[c] for c in text]
# the 1st 10 characters:
idx[:10]
Out[6]:
In [7]:
''.join(indices_char[i] for i in idx[:70])
Out[7]:
In [8]:
cs = 3
c1_dat = [idx[i] for i in xrange(0, len(idx)-1-cs, cs)]
c2_dat = [idx[i+1] for i in xrange(0, len(idx)-1-cs, cs)]
c3_dat = [idx[i+2] for i in xrange(0, len(idx)-1-cs, cs)]
c4_dat = [idx[i+3] for i in xrange(0, len(idx)-1-cs, cs)] # <-- gonna predict this
In [9]:
# we can turn these into Numpy arrays just by stacking them up together
x1 = np.stack(c1_dat[:-2]) # 1st chars
x2 = np.stack(c2_dat[:-2]) # 2nd chars
x3 = np.stack(c3_dat[:-2]) # 3rd chars
# for every 4-character piece of the collected works
In [10]:
# labels will just be the 4th characters
y = np.stack(c4_dat[:-2])
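To make the slicing above concrete, here is a tiny toy illustration (hypothetical numbers, not the Nietzsche indices) of how the stride of cs=3 carves the index list into non-overlapping (c1, c2, c3) -> c4 examples:
toy = [10, 11, 12, 13, 14, 15, 16, 17, 18, 19]   # stand-in for idx
ins  = [(toy[i], toy[i+1], toy[i+2]) for i in xrange(0, len(toy)-1-3, 3)]
outs = [toy[i+3] for i in xrange(0, len(toy)-1-3, 3)]
print(ins)    # [(10, 11, 12), (13, 14, 15)]
print(outs)   # [13, 16]  <-- each label is the char right after its 3-char window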
In [11]:
# 1st, 2nd, 3rd chars of text
x1[:4], x2[:4], x3[:4]
Out[11]:
In [12]:
# 4th char of text
y[:3]
Out[12]:
In [13]:
x1.shape, y.shape
Out[13]:
In [14]:
# we're going to turn these into embeddings
n_fac = 42
# by creating an embedding matrix
def embedding_input(name, n_in, n_out):
    inp = Input(shape=(1,), dtype='int64', name=name)
    emb = Embedding(n_in, n_out, input_length=1)(inp)
    return inp, Flatten()(emb)
c1_in, c1 = embedding_input('c1', vocab_size, n_fac)
c2_in, c2 = embedding_input('c2', vocab_size, n_fac)
c3_in, c3 = embedding_input('c3', vocab_size, n_fac)
# c1, c2, c3 represent the result of putting each char through the embedding &
# getting out 42 latent factors. <-- these are the inputs to the green arrow.
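As a mental model, the Embedding layer is just a trainable lookup table: row i of a (vocab_size x n_fac) matrix holds the 42 latent factors for character i. A rough sketch with random stand-in weights (emb_matrix is hypothetical, not the layer's actual weights):
emb_matrix = np.random.normal(size=(vocab_size, n_fac))  # stand-in for the learned embedding weights
latent = emb_matrix[idx[0]]   # the 42 latent factors for the 1st character
latent.shape                  # (42,)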
In [15]:
n_hidden = 256
dense_in = Dense(n_hidden, activation='relu')
c1_hidden = dense_in(c1)
dense_hidden = Dense(n_hidden, activation='tanh')
c2_dense = dense_in(c2)                 # char-2 embedding through the green arrow
hidden_2 = dense_hidden(c1_hidden)      # char-1's hidden state through the orange arrow
c2_hidden = merge([c2_dense, hidden_2]) # merge the two together (default mode: sum)
c3_dense = dense_in(c3)
hidden_3 = dense_hidden(c2_hidden)
c3_hidden = merge([c3_dense, hidden_3])
dense_out = Dense(vocab_size, activation='softmax') #output size: 86 <-- vocab_size
c4_out = dense_out(c3_hidden)
# passing in our 3 inputs & 1 output
model = Model([c1_in, c2_in, c3_in], c4_out)
model.compile(loss='sparse_categorical_crossentropy', optimizer=Adam())
model.optimizer.lr=0.001
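Written out in plain NumPy, the forward pass this model defines looks roughly like the sketch below. W_in/b_in, W_h/b_h, W_out/b_out are hypothetical stand-ins for the weights of dense_in, dense_hidden, and dense_out, and e1, e2, e3 stand for the 42-dim embedding outputs (c1, c2, c3):
def relu(x): return np.maximum(x, 0.)
def softmax(x): ex = np.exp(x - x.max()); return ex / ex.sum()

def forward_sketch(e1, e2, e3, W_in, b_in, W_h, b_h, W_out, b_out):
    h = relu(np.dot(e1, W_in) + b_in)                                   # char 1 thru the green arrow
    h = relu(np.dot(e2, W_in) + b_in) + np.tanh(np.dot(h, W_h) + b_h)   # char 2 (green) + hidden (orange), merged by sum
    h = relu(np.dot(e3, W_in) + b_in) + np.tanh(np.dot(h, W_h) + b_h)   # char 3 (green) + hidden (orange)
    return softmax(np.dot(h, W_out) + b_out)                            # distribution over the 86 chars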
In [16]:
model.fit([x1, x2, x3], y, batch_size=64, nb_epoch=1)
Out[16]:
In [17]:
def get_next(inp):
    idxs = [char_indices[c] for c in inp]
    arrs = [np.array(i)[np.newaxis] for i in idxs]
    p = model.predict(arrs)
    i = np.argmax(p)
    return chars[i]
In [18]:
get_next('phi')
Out[18]:
In [19]:
get_next(' th')
Out[19]:
In [20]:
get_next(' an')
Out[20]:
In [21]:
cs = 8 # use 8 characters to predict the 9th
c_in_dat = [[idx[i+n] for i in xrange(0, len(idx)-1-cs, cs)] for n in range(cs)]
c_out_dat = [idx[i+cs] for i in xrange(0, len(idx)-1-cs,cs)]
# go thru every one of those input lists and turn into Numpy array:
xs = [np.stack(c[:-2]) for c in c_in_dat]
len(xs), xs[0].shape
Out[21]:
In [22]:
y = np.stack(c_out_dat[:-2])
# visualizing xs:
[xs[n][:cs] for n in range(cs)]
Out[22]:
In [23]:
y[:cs]
Out[23]:
In [24]:
n_fac = 42
def embedding_input(name, n_in, n_out):
    inp = Input(shape=(1,), dtype='int64', name=name+'_in')
    emb = Embedding(n_in, n_out, input_length=1, name=name+'_emb')(inp)
    return inp, Flatten()(emb)
c_ins = [embedding_input('c'+str(n), vocab_size, n_fac) for n in range(cs)]
n_hidden = 256
dense_in = Dense(n_hidden, activation='relu')
dense_hidden = Dense(n_hidden, activation='relu', init='identity')
dense_out = Dense(vocab_size, activation='softmax')
hidden = dense_in(c_ins[0][1])
for i in range(1,cs):
    c_dense = dense_in(c_ins[i][1])    # green arrow
    hidden = dense_hidden(hidden)      # orange arrow
    hidden = merge([c_dense, hidden])  # merge the two together
c_out = dense_out(hidden)
In [25]:
xs, xs[0].shape, len(xs)
Out[25]:
In [26]:
model = Model([c[0] for c in c_ins], c_out)
model.compile(loss='sparse_categorical_crossentropy', optimizer=Adam())
model.fit(xs, y, batch_size=64, nb_epoch=1)
Out[26]:
In [27]:
xs, xs[0].shape, len(xs)
Out[27]:
In [28]:
def get_next(inp):
    idxs = [np.array(char_indices[c])[np.newaxis] for c in inp]
    p = model.predict(idxs)
    return chars[np.argmax(p)]
In [29]:
get_next('for thos')
Out[29]:
In [30]:
get_next('part of ')
Out[30]:
In [31]:
get_next('queens a')
Out[31]:
In [32]:
# c_in_dat = [[idx[i+n] for i in xrange(0, len(idx)-1-cs, cs)] for n in range(cs)]
c_out_dat = [[idx[i+n] for i in xrange(1, len(idx)-cs, cs)] for n in range(cs)]
ys = [np.stack(c[:-2]) for c in c_out_dat]
In [33]:
[xs[n][:cs] for n in range(cs)]
Out[33]:
In [34]:
[ys[n][:cs] for n in range(cs)]
Out[34]:
In [35]:
dense_in = Dense(n_hidden, activation='relu')
dense_out = Dense(vocab_size, activation='softmax', name='output')
In [36]:
# our char-1 input has moved inside the diagram's loop box, so we now need an
# initial input of zeros
inp1 = Input(shape=(n_fac,), name='zeros')
hidden = dense_in(inp1)
outs = []
for i in range(cs):
    c_dense = dense_in(c_ins[i][1])
    hidden = dense_hidden(hidden)
    hidden = merge([c_dense, hidden], mode='sum')
    # every layer now has an output
    outs.append(dense_out(hidden))
# our loop is identical to before, except at the end of every loop,
# we're going to append this output; so now we're going to have
# 8 outputs for every sequence instead of just 1.
# model now has vector of 0s: [inp1], and array of outputs: outs
model = Model([inp1] + [c[0] for c in c_ins], outs)
model.compile(loss='sparse_categorical_crossentropy', optimizer=Adam())
zeros = np.tile(np.zeros(n_fac), (len(xs[0]),1))
zeros.shape
Out[36]:
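Side note: np.tile(np.zeros(n_fac), (len(xs[0]), 1)) is just a roundabout way to build an all-zero matrix with one row per training example; the equivalent below gives the same shape and contents:
zeros_alt = np.zeros((len(xs[0]), n_fac))
(zeros_alt == zeros).all()   # True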
In [37]:
model.fit([zeros]+xs, ys, batch_size=64, nb_epoch=1)
Out[37]:
In [38]:
def get_nexts(inp):
    idxs = [char_indices[c] for c in inp]
    arrs = [np.array(i)[np.newaxis] for i in idxs]
    p = model.predict([np.zeros(n_fac)[np.newaxis,:]] + arrs)
    print(list(inp))
    return [chars[np.argmax(o)] for o in p]
In [39]:
get_nexts(' this is')
Out[39]:
In [40]:
get_nexts(' part of')
Out[40]:
In [41]:
# SEQUENCE MODEL WITH KERAS
In [42]:
n_hidden, n_fac, cs, vocab_size
Out[42]:
In [43]:
model = Sequential([
    Embedding(vocab_size, n_fac, input_length=cs),
    SimpleRNN(n_hidden, return_sequences=True, activation='relu', inner_init='identity'),
    TimeDistributed(Dense(vocab_size, activation='softmax')),
])
model.compile(loss='sparse_categorical_crossentropy', optimizer=Adam())
# just some dimensionality changes required; otherwise same
x_rnn = np.stack(np.squeeze(xs), axis=1)
y_rnn = np.stack(ys, axis=1)
x_rnn.shape, y_rnn.shape
Out[43]:
In [44]:
model.fit(x_rnn, y_rnn, batch_size=64, nb_epoch=1)
Out[44]:
In [45]:
def get_nexts_keras(inp):
    idxs = [char_indices[c] for c in inp]
    arr = np.array(idxs)[np.newaxis,:]
    p = model.predict(arr)[0]
    print(list(inp))
    return [chars[np.argmax(o)] for o in p]
get_nexts_keras(' this is')
Out[45]:
In [46]:
bs = 64
model = Sequential([
    Embedding(vocab_size, n_fac, input_length=cs, batch_input_shape=(bs,8)),
    BatchNormalization(),
    LSTM(n_hidden, return_sequences=True, stateful=True),
    TimeDistributed(Dense(vocab_size, activation='softmax')),
])
model.compile(loss='sparse_categorical_crossentropy', optimizer=Adam())
mx = len(x_rnn)//bs*bs
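Because stateful=True fixes the batch size at bs=64, every batch must contain exactly 64 sequences, so the line above truncates the data to the largest multiple of bs. A quick sanity check (assuming len(x_rnn) is 75110, the length seen in the xs shapes further down):
75110 // 64 * 64   # 75072 -> the last 38 rows get dropped so every batch is a full 64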
In [47]:
model.fit(x_rnn[:mx], y_rnn[:mx], batch_size=bs, nb_epoch=1, shuffle=False)
Out[47]:
In [48]:
model.optimizer.lr=1e-4
model.fit(x_rnn[:mx], y_rnn[:mx], batch_size=bs, nb_epoch=1, shuffle=False)
Out[48]:
In [49]:
model = Sequential([
    SimpleRNN(n_hidden, return_sequences=True, input_shape=(cs, vocab_size),
              activation='relu', inner_init='identity'),
    TimeDistributed(Dense(vocab_size, activation='softmax')),
])
model.compile(loss='categorical_crossentropy', optimizer=Adam())
# no embedding layer this time, so the inputs must be one-hot encoded too.
oh_ys = [to_categorical(o, vocab_size) for o in ys]
oh_y_rnn = np.stack(oh_ys, axis=1)
oh_xs = [to_categorical(o, vocab_size) for o in xs]
oh_x_rnn = np.stack(oh_xs, axis=1)
oh_x_rnn.shape, oh_y_rnn.shape
Out[49]:
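A quick illustration of what to_categorical does here (toy indices; vocab_size = 86 as above): each integer index becomes a row of length 86 with a single 1 in it.
demo = to_categorical(np.array([40, 42, 29]), vocab_size)
demo.shape          # (3, 86)
demo[0].argmax()    # 40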
In [50]:
model.fit(oh_x_rnn, oh_y_rnn, batch_size=64, nb_epoch=1)
Out[50]:
In [51]:
def get_nexts_oh(inp):
    idxs = np.array([char_indices[c] for c in inp])
    arr = to_categorical(idxs, vocab_size)
    p = model.predict(arr[np.newaxis,:])[0]
    print(list(inp))
    return [chars[np.argmax(o)] for o in p]
In [52]:
get_nexts_oh(' this is')
Out[52]:
In [54]:
print(x_rnn.shape, y_rnn.shape)
[xs[n][:cs] for n in xrange(cs)]
Out[54]:
In [55]:
# THEANO RNN
In [56]:
n_input = vocab_size
n_output = vocab_size
def init_wgts(rows, cols):
    scale = math.sqrt(2/rows)  # first calculate the Glorot-style scale factor for the weights
    return shared(normal(scale=scale, size=(rows, cols)).astype(np.float32))
def init_bias(rows):
    return shared(np.zeros(rows, dtype=np.float32))
def wgts_and_bias(n_in, n_out):
    return init_wgts(n_in, n_out), init_bias(n_out)
def id_and_bias(n):
    return shared(np.eye(n, dtype=np.float32)), init_bias(n)
# Theano variables
t_inp = T.matrix('inp')
t_outp = T.matrix('outp')
t_h0 = T.vector('h0')
lr = T.scalar('lr')
all_args = [t_h0, t_inp, t_outp, lr]
W_h = id_and_bias(n_hidden)
W_x = wgts_and_bias(n_input, n_hidden)
W_y = wgts_and_bias(n_hidden, n_output)
w_all = list(chain.from_iterable([W_h, W_x, W_y]))
def step(x, h, W_h, b_h, W_x, b_x, W_y, b_y):
    # Calculate the hidden activations
    h = nnet.relu(T.dot(x, W_x) + b_x + T.dot(h, W_h) + b_h)
    # Calculate the output activations
    y = nnet.softmax(T.dot(h, W_y) + b_y)
    # Return both (the 'Flatten()' is to work around a theano bug)
    return h, T.flatten(y, 1)
In [57]:
[v_h, v_y], _ = theano.scan(step, sequences=t_inp,
outputs_info=[t_h0, None], non_sequences=w_all)
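theano.scan is essentially a symbolic for-loop: it applies step to each row of t_inp (one one-hot character per timestep), threading the hidden state through, and collects the outputs. As a plain-Python/NumPy sketch of what the compiled graph computes (the weight arguments are stand-ins for the shared variables):
def scan_sketch(inp, h0, W_h, b_h, W_x, b_x, W_y, b_y):
    h, ys = h0, []
    for x in inp:                                                   # one timestep per row
        h = np.maximum(0, np.dot(x, W_x) + b_x + np.dot(h, W_h) + b_h)
        e = np.exp(np.dot(h, W_y) + b_y)
        ys.append(e / e.sum())                                      # softmax over the vocab
    return h, np.stack(ys)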
In [58]:
error = nnet.categorical_crossentropy(v_y, t_outp).sum()
g_all = T.grad(error, w_all)
In [59]:
def upd_dict(wgts, grads, lr):
    # plain SGD: each weight takes a step of size lr down its gradient
    return OrderedDict({w: w-g*lr for (w,g) in zip(wgts,grads)})
In [60]:
upd = upd_dict(w_all, g_all, lr)
# we're finally ready to compile the function!:
fn = theano.function(all_args, error, updates=upd, allow_input_downcast=True)
In [61]:
X = oh_x_rnn
Y = oh_y_rnn
X.shape, Y.shape
Out[61]:
In [62]:
err=0.0; l_rate=0.01
for i in xrange(len(X)):
    err += fn(np.zeros(n_hidden), X[i], Y[i], l_rate)
    if i % 1000 == 999:
        print("Error:{:.3f}".format(err/1000))
        err=0.0
NOTE: it looks like you have to run a model.fit() on the xs data before trying to build a straight-Theano RNN. When I go straight to building a Theano RNN without compiling/fitting any model beforehand, I get an error starting around 30 which levels off at 25, and the model predicts ' ' for every case. When a model is fitted before manually building a Theano RNN, you get the above behavior: error starting at 25 and dropping to around 14, and the model actually does its job of predicting letters.
What I noticed changed was the structure of the data in xs. Taking a look at xs via xs, xs[0].shape, len(xs) before running a model.fit() gives:
Out[22]:
[array([40, 1, 33, 2, 72, 67, 73, 2]),
array([42, 1, 38, 44, 2, 9, 61, 73]),
array([29, 43, 31, 71, 54, 9, 58, 61]),
array([30, 45, 2, 74, 2, 76, 67, 58]),
array([25, 40, 73, 73, 76, 61, 24, 71]),
array([27, 40, 61, 61, 68, 54, 2, 58]),
array([29, 39, 54, 2, 66, 73, 33, 2]),
array([ 1, 43, 73, 62, 54, 2, 72, 67])]
Whereas after running model.fit(), the same code will show you:
[array([[40],
[ 1],
[33],
[ 2],
[72],
[67],
[73],
[ 2]]), array([[42],
[ 1],
[38],
[44],
(..truncated..)
So it looks like model.fit() is altering the structure of the input data xs in such a way that it's 'useful' for use in a manually-built RNN. May take a look again at Lecture 6, or just try a regular Theano RNN tutorial online to see if I can get better results without having to use a Keras model before I build one in Theano.
-- WNx
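One thing worth trying (an untested assumption on my part, not a confirmed fix): if the only relevant change really is the (75110,) vs (75110, 1) shape, that column shape can be reproduced directly on a freshly built xs, with no Keras model involved:
xs = [o[:, np.newaxis] for o in xs]   # each array goes from shape (75110,) to (75110, 1)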
In [63]:
f_y = theano.function([t_h0, t_inp], v_y, allow_input_downcast=True)
pred = np.argmax(f_y(np.zeros(n_hidden), X[6]), axis=1)
act = np.argmax(X[6], axis=1)
In [64]:
[indices_char[o] for o in act]
Out[64]:
In [65]:
[indices_char[o] for o in pred]
Out[65]:
Note - 1 July 2017:
Ahhhhhh.... what it looks like is a dimensionality issue -- I remember J.Howard talking about this in the lecture.. So, originally, when xs is a list of flat arrays (shape = (75110,)), that's the case where the model is trying to predict the next letter with only the 1st or no input? Something like that..
But the 2nd case, where xs is a list of 8 column arrays (each of shape (75110, 1)), is where the next letter is predicted using the context of the previous 8 letters...
I'll have to rewatch those parts of Lecture 6, but that's what it seems like. Will make a 4th JNB to test it out fresh.