Neural Turing Machine with Keras

Definitions:
$h$: the controller RNN state. $M$: the memory, a matrix whose rows are the different "memory locations". $r$ and $w$: the read and write vectors. Each is the output of a softmax, thus a probability distribution, and is used to take weighted averages over the rows of $M$ (differentiable attention seems to be the newer name for this).

Internal algorithm:
1) Using $h_{t-1}$, update the read vector: $r_t = f_r(r_{t-1}, h_{t-1})$
2) Read from memory: $m_t = read(r_t, M_{t-1})$
3) Using the input and the read vector, update the controller RNN state: $h_t = RNN(x_t, m_t, h_{t-1})$
4) Using $h_t$, update the write vector: $w_t = f_w(w_{t-1}, h_t)$
5) Write to memory: $M_t = write(M_{t-1}, h_t, w_t)$

For details see Graves et al. Next we are going to run the Copy experiment in Keras with the Theano backend (#ThankYouTheano).
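
To make steps 2 and 5 concrete, here is a minimal NumPy sketch of the read and write operations. The erase and add vectors (named e_t and a_t here, following Graves et al.) are extra controller outputs that the $write$ function above folds in; the names and shapes are illustrative, not the exact seya implementation.

import numpy as np

n_slots, m_length = 50, 20
M = np.random.randn(n_slots, m_length)       # memory M_{t-1}: one row per location
r = np.random.rand(n_slots); r /= r.sum()    # read weights (softmax output over locations)
w = np.random.rand(n_slots); w /= w.sum()    # write weights (softmax output over locations)

# step 2: reading is a weighted average over the rows of M
m_t = r.dot(M)                               # shape (m_length,)

# step 5: writing first erases, then adds, at the locations selected by w
e_t = np.random.rand(m_length)               # erase vector, entries in [0, 1]
a_t = np.random.randn(m_length)              # add vector
M_t = M * (1 - np.outer(w, e_t)) + np.outer(w, a_t)

Every operation here is a smooth function of $r$ and $w$, which is what lets the whole read/write machinery be trained end to end with backpropagation.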

Note

current version:
pip install git+https://github.com/fchollet/keras.git@7a3122c1546da
pip install git+https://github.com/edersantana/seya.git@26d0cef422f0

v.1:
pip install git+https://github.com/fchollet/keras.git@b5f65df
pip install git+https://github.com/edersantana/seya.git@b044283

Copy Problem


In [1]:
%matplotlib inline
from __future__ import absolute_import
from __future__ import print_function
import logging
import numpy as np
np.random.seed(124)
import matplotlib.pyplot as plt
import cPickle

from theano import tensor, function

from keras.datasets import mnist
from keras.models import Graph, Sequential
from keras.layers.core import TimeDistributedDense, Dropout, Activation, Flatten, Masking
from keras.layers.recurrent import LSTM
from keras.utils import np_utils, generic_utils
from keras.optimizers import Adam, SGD
from keras import backend as K

from seya.layers.ntm import NeuralTuringMachine as NTM
#from seya.models import Sequential  # this is just good old Sequential, from before TensorFlow support

from IPython import display


Using gpu device 0: GeForce GTX 680 (CNMeM is disabled)
Using Theano backend.

In [2]:
batch_size = 100

h_dim = 64
n_slots = 50
m_length = 20
input_dim = 8
lr = 1e-3
clipnorm = 10

In [3]:
# Neural Turing Machine

ntm = NTM(h_dim, n_slots=n_slots, m_length=m_length, shift_range=3,
          inner_rnn='lstm', return_sequences=True, input_dim=input_dim)
model = Sequential()
# model.add(Masking(input_shape=(None, input_dim)))
model.add(ntm)
model.add(TimeDistributedDense(input_dim))
model.add(Activation('sigmoid'))

In [4]:
sgd = Adam(lr=lr, clipnorm=clipnorm)
model.compile(loss='binary_crossentropy', optimizer=sgd, sample_weight_mode="temporal")

In [5]:
# LSTM - Run this for comparison

#sgd = Adam(lr=lr, clipnorm=clipnorm)

#model4 = Sequential()
#model4.add(LSTM(input_dim=input_dim, output_dim=h_dim*2, return_sequences=True))
#model4.add(LSTM(output_dim=h_dim*2, return_sequences=True))
#model4.add(LSTM(output_dim=h_dim*2, return_sequences=True))
#model4.add(TimeDistributedDense(input_dim))
#model4.add(Activation('sigmoid'))

#model4.compile(loss='binary_crossentropy', optimizer=sgd)

Dataset


In [6]:
def get_sample(batch_size=128, n_bits=8, max_size=20, min_size=1):
    # generate samples with random length
    inp = np.zeros((batch_size, 2*max_size-1, n_bits))
    out = np.zeros((batch_size, 2*max_size-1, n_bits))
    sw = np.zeros((batch_size, 2*max_size-1, 1))
    for i in range(batch_size):
        t = np.random.randint(low=min_size, high=max_size)
        x = np.random.uniform(size=(t, n_bits)) > .5
        # zero out any row that happens to be all ones, so it cannot be
        # mistaken for the delimiter flag
        for j, f in enumerate(x.sum(axis=-1)):
            if f >= n_bits:
                x[j, :] = 0.
        del_flag = np.ones((1, n_bits))  # all-ones delimiter marks the end of the input
        inp[i, :t+1] = np.concatenate([x, del_flag], axis=0)
        out[i, t:(2*t)] = x   # target: reproduce the pattern after the delimiter
        sw[i, t:(2*t)] = 1    # only the recall phase contributes to the loss
    return inp, out, sw
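
As a quick sanity check of the layout this produces (the shapes are fixed by max_size, the pattern itself is random):

inp, out, sw = get_sample(batch_size=2, n_bits=8, max_size=20)
print(inp.shape, out.shape, sw.shape)   # (2, 39, 8) (2, 39, 8) (2, 39, 1)
# For a pattern of length t: inp[:, :t] holds the random bits, inp[:, t] is the
# all-ones delimiter, out[:, t:2*t] holds the copy target, and sw is 1 only on
# the recall phase, so the loss ignores everything else.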

In [7]:
'''
def get_sample(batch_size=128, n_bits=8, max_size=20, min_size=1):
    # generate samples with random length
    inp = np.zeros((batch_size, 2*max_size-1, n_bits))
    out = np.zeros((batch_size, 2*max_size-1, n_bits))
    sw = np.zeros((batch_size, 2*max_size-1, 1))
    for i in range(batch_size):
        t = np.random.randint(low=min_size, high=max_size)
        x = np.random.uniform(size=(t, n_bits)) > .5
        for j,f in enumerate(x.sum(axis=-1)): # remove fake flags
            if f>=n_bits:
                x[j, :] = 0.
        del_flag = np.ones((1, n_bits))
        xflag = np.concatenate([x, del_flag], axis=0)
        inp[i, -2*t-1:-t] = xflag
        out[i, -t:] = x
        sw[i, -t:] = 1
    return inp, out, sw
'''



In [8]:
def show_pattern(inp, out, sw, file_name='pattern2.png'):
    plt.figure(figsize=(10, 10))
    plt.subplot(131)
    plt.imshow(inp>.5)
    plt.subplot(132)
    plt.imshow(out>.5)
    plt.subplot(133)
    plt.imshow(sw[:, :1]>.5)
    plt.savefig(file_name)
    plt.close()

inp, out, sw = get_sample()
show_pattern(inp[0], out[0], sw[0])

In [9]:
inp, out, sw = get_sample(1, 8, 20)

plt.subplot(131)
plt.title('input')
plt.imshow(inp[0], cmap='gray')
plt.subplot(132)
plt.title('desired')
plt.imshow(out[0], cmap='gray')
plt.subplot(133)
plt.title('sample_weight')
plt.imshow(sw[0], cmap='gray')

# sample_weight marks the points in time that will 
# be part of the cost function.


Out[9]:
<matplotlib.image.AxesImage at 0x7fae35add910>
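
In other words, with sample_weight_mode="temporal" the loss for one sequence is roughly $\sum_t sw_t \cdot \mathrm{BCE}(y_t, \hat{y}_t)$: timesteps where $sw_t = 0$ (the input and delimiter phase) contribute nothing, and only the recall phase drives the gradients.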

In [10]:
# Training uses sequences of length 1 to 20; testing checks generalization to
# longer sequences of length min_size (40 by default).
def test_model(model, file_name, min_size=40):
    I, V, sw = get_sample(batch_size=500, n_bits=input_dim, max_size=min_size+1, min_size=min_size)
    Y = np.asarray(model.predict(I, batch_size=100) > .5).astype('float64')
    acc = (V[:, -min_size:, :] == Y[:, -min_size:, :]).mean() * 100
    show_pattern(Y[0], V[0], sw[0], file_name)

    return acc

In [11]:
trained = model

nb_epoch = 4000
progbar = generic_utils.Progbar(nb_epoch)
ACC = []
for e in range(nb_epoch):
    I, V, sw = get_sample(n_bits=input_dim, max_size=20, min_size=1, batch_size=100)
    
    loss = trained.train_on_batch(I, V, sample_weight=sw[:, :, 0])[0]
    # loss = trained.fit(I, V, sample_weight=sw[:, :, 0], nb_epoch=1, batch_size=100, verbose=0).totals['loss']
    
    progbar.add(1, values=[("loss", loss)])
    
    if e % 500 == 0:
        print("")
        acc = test_model(trained, 'ntm_test.png')
        l = []
        for a in [acc,]:
            print("acc: {}".format(a))
            l.append(a)
        ACC.append(l)


   1/4000 [..............................] - ETA: 1058s - loss: 0.1708
acc: 49.433125
 501/4000 [==>...........................] - ETA: 840s - loss: 0.1680
acc: 51.7975
1001/4000 [======>.......................] - ETA: 720s - loss: 0.1624
acc: 54.041875
1501/4000 [==========>...................] - ETA: 600s - loss: 0.1572
acc: 53.959375
2001/4000 [==============>...............] - ETA: 479s - loss: 0.1525
acc: 53.759375
2501/4000 [=================>............] - ETA: 358s - loss: 0.1485
acc: 54.330625
3001/4000 [=====================>........] - ETA: 238s - loss: 0.1443
acc: 55.7775
3501/4000 [=========================>....] - ETA: 119s - loss: 0.1262
acc: 97.486875
4000/4000 [==============================] - 953s - loss: 0.1107   

Visualization


In [12]:
X = model.get_input()
Y = ntm.get_full_output()[0:3] # (memory over time, read_vectors, write_vectors)
F = function([X], Y, allow_input_downcast=True)

In [13]:
inp, out, sw = get_sample(1, input_dim, 31, 30)

In [14]:
mem, read, write = F(inp.astype('float32'))
Y = model.predict(inp)

In [15]:
mem = mem.transpose(1, 0, 2).reshape((1, -1, n_slots, m_length))  # (batch, time, n_slots, m_length)
mem.shape


Out[15]:
(1, 61, 50, 20)

In [16]:
write = write.transpose(1, 0, 2)  # (batch, time, n_slots)
read = read.transpose(1, 0, 2)    # (batch, time, n_slots)

In [17]:
plt.figure(figsize=(15, 12))

plt.subplot(221)
plt.imshow(write[0], cmap='gray')
plt.xlabel('memory location')
plt.ylabel('time')
plt.title('write')

plt.subplot(222)
plt.imshow(read[0], cmap='gray')
plt.title('read')

plt.subplot(223)
plt.title('desired')
plt.imshow(out[0], cmap='gray')

plt.subplot(224)
plt.imshow(Y[0]>.5, cmap='gray')
plt.title('output')

plt.figure(figsize=(15, 10))
plt.subplot(325)
plt.ylabel('time')
plt.xlabel('location')
plt.title('memory evolving in time (avg value per location)')
plt.imshow(mem[0].mean(axis=-1), cmap='gray')


Out[17]:
<matplotlib.image.AxesImage at 0x7fae399bf910>
