In [544]:
'''
Implementation of the single-hop and multi-hop end-to-end memory networks from https://arxiv.org/abs/1503.08895.

Derived from:
- https://github.com/fchollet/keras/blob/master/examples/babi_memnn.py and 
- Implementation by Jeremy Howard for http://course.fast.ai/.
'''
from __future__ import print_function
from keras.models import Sequential
from keras.layers import *
from keras.layers import Input
from keras.optimizers import RMSprop
from keras.utils.data_utils import get_file
import keras.backend as K
import numpy as np
import tarfile
import itertools
from keras.models import *
import random
import sys
from glob import glob
import re
import tqdm
from keras.preprocessing.sequence import pad_sequences
from keras.utils.np_utils import *
import os
from keras_tqdm import TQDMNotebookCallback

Corpus Preprocessing

Data Format

The training data consists of a number of "examples". Each example contains a few memories and the relevant question/answer pairs. Consider the following example:

  1. Mary moved to the bathroom.
  2. John went to the hallway.
  3. Where is Mary? bathroom 1
  4. Daniel went back to the hallway.
  5. Where is Daniel? hallway 4

In the above example, we have 3 memories and 2 question/answer pairs:

  • 3 Memories:

    • Mary moved to the bathroom.
    • John went to the hallway.
    • Daniel went back to the hallway.
  • 2 Question/Answer Pairs:

    • Q: Where is Mary? A: bathroom

    • Q: Where is Daniel? A: hallway

    Note that when the first question is asked, only memories 1 and 2 have been seen (memory 1 is the supporting fact). When the second question is asked, memories 1, 2 and 4 have been seen (memory 4 is the supporting fact). The trailing number on each question line is the id of the supporting memory. The parsed (memories, question, answer) form of this example is sketched below.
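
Schematically (using words rather than ids), the example above yields two (memories, question, answer) tuples; a sketch in Python literal form:

example_tuples = [
    (["Mary moved to the bathroom.", "John went to the hallway."],
     "Where is Mary?", "bathroom"),
    (["Mary moved to the bathroom.", "John went to the hallway.",
      "Daniel went back to the hallway."],
     "Where is Daniel?", "hallway"),
]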

Abstractions

  • Corpus : Keeps track of information about the text corpus, such as the total number of unique words and the mapping from word (string) to id (int).
  • ExampleParser : The training file consists of "example" units. ExampleParser takes a file with a number of examples and parses them into 3-tuples of (memories, question, answer).

In [545]:
class Corpus:
    def __init__(self):
        '''
        A corpus object maintains a mapping from a word (string) to a unique id (int).
        '''
        self.word_idx_dict = {}
        self.uniq_word_cnt = 0
    
    def update_vocab(self, words):
        '''
        Updates the corpus with the given list of words. The words that are seen for the 
        first time are added to the word -> id dictionary.
        '''
        for word in words:
            if word not in self.word_idx_dict:
                self.word_idx_dict[word] = self.uniq_word_cnt
                self.uniq_word_cnt += 1

    def words_idx(self, words):
        '''
        Returns the list of IDs corresponding to the given words. 
        '''
        return [self.word_idx_dict[word] for word in words]
    
    def tokenize(self, sentence):
        return [x.strip() for x in re.split(r'(\W+)', sentence) if x.strip()]
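
A small usage sketch of Corpus on a single line (the ids it produces match the mapping shown later for the first training example; demo is a throwaway name):

demo = Corpus()
tokens = demo.tokenize("1 Mary moved to the bathroom.")
#tokens -> ['1', 'Mary', 'moved', 'to', 'the', 'bathroom', '.']
demo.update_vocab(tokens)
demo.words_idx(tokens)   # -> [0, 1, 2, 3, 4, 5, 6]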

In [546]:
class ExampleParser:
    '''
    Responsible for parsing examples for the babi tasks as specified at https://research.fb.com/downloads/babi/
    '''
    @staticmethod
    def add(example_lines, data, c):
        '''
        
        Takes the set of lines that form an example and:
        a) updates the corpus with these lines, and
        b) parses the lines into 3-tuples of the form (memories, question, answer).
        
        A single example may yield several such 3-tuples. E.g.:
        1 Mary moved to the bathroom.
        2 John went to the hallway.
        3 Where is Mary?        bathroom        1
        4 Daniel went back to the hallway.
        5 Where is Daniel?      hallway  4
        
        will generate 2 tuples:
        
        - ([1 Mary moved to the bathroom., 2 John went to the hallway.], Where is Mary?, bathroom)
        - ([1 Mary moved to the bathroom., 2 John went to the hallway., 4 Daniel went back to the hallway.],
                                                                         Where is Daniel?, hallway)
        
        Note that instead of storing the actual words, an example stores the IDs of the associated words.
        
        A word -> ID map is maintained in the corpus.
        
        @example_lines: A set of lines that form an example.
        @data: List of 3-tuples (memories, question, answer), updated by "add".
        @c: The corpus object.
        '''
        memories = []
        for eg_line in example_lines:
            if "\t" not in eg_line: #normal memory line
                eg_line = c.tokenize(eg_line)
                c.update_vocab(eg_line)
                #the line number token is kept as a feature
                memories.append(c.words_idx(eg_line))
            else: #question line
                ques, ans, hints = eg_line.split("\t")
                ques = c.tokenize(ques)[1:]
                c.update_vocab(ques)
                ans = c.tokenize(ans)
                c.update_vocab(ans)
                data.append(([m for m in memories],
                                  c.words_idx(ques), c.words_idx(ans)))
    
    @staticmethod
    def process_files(lines, corpus):
        '''
        Reads the given lines, splits them into examples (a new example starts when the line number resets to 1) and adds the parsed (memories, question, answer) tuples to the returned list.
        '''
        data = []
        eg_lines = [lines[0].decode('utf-8').strip()]
        for line in lines[1:]:
            line = line.decode('utf-8').strip()
            if int(line.split(" ", 1)[0]) == 1: #new story starts
                ExampleParser.add(eg_lines, data, corpus)
                eg_lines = [line.strip()]
            else:
                eg_lines.append(line.strip())
        if len(eg_lines) > 0:
            ExampleParser.add(eg_lines, data, corpus)
        return data

Download and extract datasets


In [547]:
challenges = {
    # QA1 with 10,000 samples
    'single_supporting_fact_10k': 'tasks_1-20_v1-2/en-10k/qa1_single-supporting-fact_{}.txt',
    # QA2 with 10,000 samples
    'two_supporting_facts_10k': 'tasks_1-20_v1-2/en-10k/qa2_two-supporting-facts_{}.txt',
}
path = get_file('babi-tasks-v1-2.tar.gz', origin='https://s3.amazonaws.com/text-datasets/babi_tasks_1-20_v1-2.tar.gz')
tar = tarfile.open(path)
challenge_type = 'two_supporting_facts_10k'
challenge = challenges[challenge_type]

test_lines = tar.extractfile(challenge.format('test')).readlines()
train_lines = tar.extractfile(challenge.format('train')).readlines()
print(train_lines[0:4])
print(test_lines[0:4])


[b'1 Mary moved to the bathroom.\n', b'2 Sandra journeyed to the bedroom.\n', b'3 Mary got the football there.\n', b'4 John went to the kitchen.\n']
[b'1 Mary got the milk there.\n', b'2 John moved to the bedroom.\n', b'3 Sandra went back to the kitchen.\n', b'4 Mary travelled to the hallway.\n']

Parse datasets to (memories, question, answer) tuples, perform word -> idx mapping


In [548]:
c = Corpus()
print("Processing training files")
train_tuples = ExampleParser.process_files(train_lines, c)
print("Processing test files")
test_tuples = ExampleParser.process_files(test_lines, c)
all_tuples = train_tuples + test_tuples
print("# training tuples = {0}\n# test tuples = {1}".format(len(train_tuples), len(test_tuples)))


Processing training files
Processing test files
# training tuples = 10000
# test tuples = 1000

Example Description

  • Let us look at the first example and the corresponding lines from the text.

In [549]:
print(train_tuples[0])
print(train_lines[0:3])


([[0, 1, 2, 3, 4, 5, 6], [7, 8, 9, 3, 4, 10, 6], [11, 1, 12, 4, 13, 14, 6], [15, 16, 17, 3, 4, 18, 6], [19, 1, 17, 20, 3, 4, 18, 6], [21, 1, 17, 20, 3, 4, 22, 6]], [23, 24, 4, 13, 25], [22])
[b'1 Mary moved to the bathroom.\n', b'2 Sandra journeyed to the bedroom.\n', b'3 Mary got the football there.\n']
  • From the mapping, note that "1 Mary moved to the bathroom." has been mapped to "0, 1, 2, 3, 4, 5, 6". (line number is used as a feature).

    • 1 -> 0
    • Mary -> 1
    • moved -> 2
    • to -> 3
    • the -> 4
    • bathroom -> 5
    • . (full stop) -> 6
  • Similarly, the question "Where is the football?" is mapped to [23, 24, 4, 13, 25]. The question's line number is stripped, and the mapping is retained across memories and questions ("the" -> 4, "football" -> 13).

  • The answer is 22 (garden), which comes from memory 6, "Mary went back to the garden."
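
To double-check such mappings, the corpus dictionary can be inverted and the ids decoded back to words (a small sketch; idx_word is introduced here and is not part of the Corpus class):

idx_word = {idx: word for word, idx in c.word_idx_dict.items()}
mems, ques, ans = train_tuples[0]
print([" ".join(idx_word[i] for i in mem) for mem in mems])
print(" ".join(idx_word[i] for i in ques), "->", idx_word[ans[0]])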

Preparing Dataset for Training

  • At this point, we have the 3-tuples of (memories, question, answer).

  • Different tuples may have different numbers of memories, and different numbers of words in their memories and questions.

  • As a reminder, a 3-tuple contains a set of memories, one question and the corresponding answer. These tuples have already been converted to word ids.

  • Because the network expects fixed-size inputs, we will "pad" our 3-tuples to make them all the same size. This means that:

  1. All the 3-tuples should have the same number of memories.
  2. All the memories should have the same number of words.
  3. All the questions should have the same number of words.
  • To achieve this, we will zero-pad the sequences. Let's first find the relevant upper limits.

In [550]:
max_num_memories = max([len(example[0]) for example in all_tuples])
max_memory_len = max([len(memory) for example in all_tuples for memory in example[0]])
max_ques_len = max([len(example[1]) for example in all_tuples])
vocab_size = c.uniq_word_cnt
len(train_tuples), len(test_tuples), c.uniq_word_cnt, max_num_memories, max_memory_len, max_ques_len


Out[550]:
(10000, 1000, 123, 88, 8, 5)
Variable Value
# Training Tuples 10000
# Test Tuples 1000
Vocabulary size 123
Max # memories in a tuple 88
Max # words per memory 8
Max # words per question 5
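
As a quick illustration of how the zero padding works (keras' pad_sequences pads on the left by default; the toy sequences below are made up):

from keras.preprocessing.sequence import pad_sequences
pad_sequences([[1, 2], [3, 4, 5]], maxlen=4)
# -> array([[0, 0, 1, 2],
#           [0, 3, 4, 5]])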

In [551]:
def pad_tuples(tuples, max_memory_len, max_num_memories, max_ques_len, vocab_size):
    '''
    Takes a list of tuples along with the sizes required for zero padding and
    returns padded versions of the memories, questions and answers.
    For each tuple:
    a) memories becomes a matrix of shape (max_num_memories, max_memory_len)
    b) the question becomes an array of max_ques_len elements.
    The gaps are filled with 0s. Answers are kept as integer word ids
    (to be used with a sparse categorical loss).
    '''
    m, q, a = [], [], []
    for (memories, ques, ans) in tuples:
        #pad each memory to max_memory_len words
        memories = pad_sequences(memories, maxlen=max_memory_len)
        #pad the set of memories with rows of zeros up to max_num_memories
        memories = np.concatenate([memories, np.zeros((max_num_memories - memories.shape[0],
                                                       max_memory_len), 'int')])
        m.append(memories)
        q.append(ques)
        a.append(ans)
    return np.array(m), pad_sequences(q, maxlen=max_ques_len), np.array(a)

In [552]:
m_train, q_train, a_train = pad_tuples(train_tuples, max_memory_len, max_num_memories, max_ques_len, c.uniq_word_cnt)
m_test, q_test, a_test = pad_tuples(test_tuples, max_memory_len, max_num_memories, max_ques_len, c.uniq_word_cnt)
print(m_train.shape) 
print(q_train.shape)
print(a_train.shape)
print(m_test.shape)
print(q_test.shape)
print(a_test.shape)


(10000, 88, 8)
(10000, 5)
(10000, 1)
(1000, 88, 8)
(1000, 5)
(1000, 1)

Model

We will now build the model as specified at https://arxiv.org/abs/1503.08895.

The relevant snippets from the paper are referenced inline, and an attempt has been made to stay as close to the paper's notation as possible. Continuing with the notation used above, the network is trained on 3-tuples of the form (memories, question, answer), fed in batches.

For the rest of the model-building discussion, the focus will be on one such tuple.

Step 1: Encode the inputs

The set x_1, x_2, ..., x_n that the paper mentions is our list of memories (each x_i is simply a list of word ids).

m_i

  • For each input x_i, we learn an embedded representation m_i. In our example, each x_i has length 8, so each x_i is an 8-dimensional vector of word ids. The embedding space has 64 dimensions, so we want to map each 8-dim input vector to a 64-dim vector.

  • This is achieved by embedding each word of the input as a 64-dim vector, and then summing these word vectors to give the 64-dim vector for the whole input.

Mapping x_1 to m_1: a complete example.

  • Suppose that x_1 = "1 Mary went to the kitchen", mapped to [0 0 12 23 32 33 22 21] (zero-padded to length 8). For brevity, suppose the number of hidden dimensions is 5 (it is 64 in the real case). Step 1 is then to learn one 5-dimensional vector for each token in the sequence:

  • 0 (padding): [0.13 0.69 0.52 -0.22 -1.15]
  • 0 (padding): [0.13 0.69 0.52 -0.22 -1.15]
  • 1: [0.05 0.23 -1.09 0.9 0.21]
  • Mary: [0.8 -1.09 -0.49 1.7 2.77]
  • went: [-0.95 0.87 0.88 0.02 0.09]
  • to: [0.73 -0.58 0.27 0.95 0.9]
  • the: [-1.13 0.41 -0.19 -1.41 0.07]
  • kitchen: [-0.5 -1.11 -0.47 0.09 -1.87]

    The representation for the sentence is then just the sum of all of these: [-0.74, 0.11, -0.05, 1.81, -0.13].

    This is m_1.
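
To make the embed-and-sum step concrete, here is a minimal numpy sketch using the toy vectors above (in the model, the per-word vectors come from a learned Embedding layer):

import numpy as np

word_vectors = np.array([
    [ 0.13,  0.69,  0.52, -0.22, -1.15],   # 0 (padding)
    [ 0.13,  0.69,  0.52, -0.22, -1.15],   # 0 (padding)
    [ 0.05,  0.23, -1.09,  0.90,  0.21],   # 1
    [ 0.80, -1.09, -0.49,  1.70,  2.77],   # Mary
    [-0.95,  0.87,  0.88,  0.02,  0.09],   # went
    [ 0.73, -0.58,  0.27,  0.95,  0.90],   # to
    [-1.13,  0.41, -0.19, -1.41,  0.07],   # the
    [-0.50, -1.11, -0.47,  0.09, -1.87],   # kitchen
])
m_1 = word_vectors.sum(axis=0)   # -> [-0.74  0.11 -0.05  1.81 -0.13]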


In [553]:
n_hidden = 64
memories = Input(shape=(max_num_memories, max_memory_len))
x = TimeDistributed(Embedding(input_dim=vocab_size, output_dim=n_hidden))(memories)
m = Lambda(lambda xx: K.sum(xx, 2))(x)
memories.shape, m.shape


Out[553]:
(TensorShape([Dimension(None), Dimension(88), Dimension(8)]),
 TensorShape([Dimension(None), Dimension(88), Dimension(64)]))

Step 2: Encode the query

  • Similarly, a query is mapped from its 5-dim (padded) representation to the 64-dim embedding space. This embedded representation of the query is called "u".

In [554]:
query = Input(shape=(max_ques_len,))
u = Embedding(input_dim=vocab_size, output_dim=n_hidden)(query)
u = Lambda(lambda x : K.sum(x, 1))(u)
u = Reshape(target_shape=(1, n_hidden))(u)
query.shape, u.shape


Out[554]:
(TensorShape([Dimension(None), Dimension(5)]),
 TensorShape([Dimension(None), Dimension(1), Dimension(64)]))

Step 3: Calculate p, the attention that each input deserves for the given query

  • Our answer needs to be derived from our memories. The similarity between the query and a memory is a good indicator that the memory contains the answer (because of the presence of common words). This similarity is calculated by taking the dot product of the embedded query with each embedded memory, followed by a softmax over the memories.
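
A minimal numpy sketch of this attention step, with made-up values (in the model, m and u are learned; the dimensions follow the shapes above):

import numpy as np

def softmax(z):
    e = np.exp(z - z.max())
    return e / e.sum()

m = np.random.randn(88, 64)   # embedded memories (max_num_memories x n_hidden)
u = np.random.randn(64)       # embedded query
p = softmax(m @ u)            # attention over the 88 memories; sums to 1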


In [555]:
p = dot([m, u], axes=2)
p = Reshape((max_num_memories,))(p)
p = Activation(activation='softmax')(p)
p = Reshape((max_num_memories,1))(p)
p.shape


Out[555]:
TensorShape([Dimension(None), Dimension(88), Dimension(1)])

Step 4: Calculate c, the output embeddings

  • The input memory embeddings m_i hopefully now hold values that are conducive to finding the relevant memories for a given question. We now learn a second representation, c_i, that will help produce the answer. The mechanics of computing this embedding are the same as for m_i, but a separate embedding matrix is used.


In [556]:
x = TimeDistributed(Embedding(vocab_size, n_hidden))(memories)
c = Lambda(lambda xx: K.sum(xx, 2))(x)
c.shape


Out[556]:
TensorShape([Dimension(None), Dimension(88), Dimension(64)])

Step 5: Calculate o, the output representation

  • The output representation is calculated by taking a "weighted average" of the output memory representations c_i, where the weights are given by the attention p computed above.
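
Continuing the numpy sketch, the weighted average is a one-liner (c stands for the output embeddings, p for the attention weights; the values here are made up):

import numpy as np

c = np.random.randn(88, 64)        # output memory embeddings
p = np.random.rand(88)
p = p / p.sum()                    # pretend these came out of the softmax
o = (p[:, None] * c).sum(axis=0)   # weighted average -> 64-dim output representation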

Output Representation, o


In [557]:
o = dot([c, p], axes=1)
o = Reshape(target_shape=(1,n_hidden))(o)
o


Out[557]:
<tf.Tensor 'reshape_506/Reshape:0' shape=(?, 1, 64) dtype=float32>

Step 6: Feed o + u to a softmax to get the answer


In [558]:
a_in = Lambda(lambda ou: sum([ou[0], ou[1]]))([o, u])
a_in = Reshape(target_shape=(n_hidden,))(a_in)
answer = Dense(vocab_size, activation='softmax')(a_in)
answer


Out[558]:
<tf.Tensor 'dense_71/Softmax:0' shape=(?, 123) dtype=float32>

In [559]:
babi_memmn = Model([memories, query], answer)

In [560]:
babi_memmn.compile(optimizer='rmsprop', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

In [561]:
K.set_value(babi_memmn.optimizer.lr, 1e-2)
params = {'verbose': 2, 'callbacks': [TQDMNotebookCallback(leave_inner=False)]}
babi_memmn.fit([m_train, q_train], a_train, **params, batch_size=32, epochs=5,
               validation_data=([m_test, q_test], a_test))


Train on 10000 samples, validate on 1000 samples
Epoch 1/5
10s - loss: 1.7321 - acc: 0.2641 - val_loss: 1.5787 - val_acc: 0.3460
Epoch 2/5
7s - loss: 1.5332 - acc: 0.3634 - val_loss: 1.5346 - val_acc: 0.3590
Epoch 3/5
8s - loss: 1.5214 - acc: 0.3687 - val_loss: 1.6479 - val_acc: 0.3010
Epoch 4/5
7s - loss: 1.5139 - acc: 0.3692 - val_loss: 1.5191 - val_acc: 0.3600
Epoch 5/5
7s - loss: 1.5122 - acc: 0.3620 - val_loss: 1.4913 - val_acc: 0.3780

Out[561]:
<keras.callbacks.History at 0x16706f95b00>

Multi-hop network

We will now build the multi-hop network, starting with a verbose 2-hop network and then modularizing it. Let us look at the experiments section of the paper to see which configuration the authors used for their final experiments.

Looking at the paper's diagram of the multi-hop model, we note the following:

  1. Layers of networks are stacked one over the other.

  2. The input doesn't change across the layers.

  3. The original query is supplied only to the first layer. For the subsequent layers, the following update is used: u(k + 1) = u(k) + o(k)

We will build the 2-hop network first, and then move to 3 hops.

2-hop network with adjacent weight sharing

Since the adjacent weight-sharing scheme (A(k + 1) = C(k), with B = A(1)) is used for the paper's final experiments, let's use the same.


In [650]:
mem_input = Input(shape=(max_num_memories, max_memory_len))
query_input = Input(shape=(max_ques_len,))

In [651]:
A1 = Embedding(vocab_size,output_dim=n_hidden)
m_i = TimeDistributed(A1)(mem_input)
m_i = Lambda(lambda x: K.sum(x, 2))(m_i)
m_i


Out[651]:
<tf.Tensor 'lambda_203/Sum:0' shape=(?, 88, 64) dtype=float32>

  • With adjacent weight sharing, B = A1, so the embedding layer for the query is shared with the input embedding of the first hop.

In [652]:
B = A1 #as specified
u1 = B(query_input)
u1 = (Lambda(lambda x: K.sum(x, 1)))(u1)
u1 = Reshape((1, n_hidden))(u1)
u1


Out[652]:
<tf.Tensor 'reshape_578/Reshape:0' shape=(?, 1, 64) dtype=float32>

In [653]:
#nothing special here
C1 = Embedding(vocab_size,output_dim=n_hidden)
c_i_1 = TimeDistributed(C1)(mem_input)
c_i_1 = Lambda(lambda x: K.sum(x, 2))(c_i_1)
c_i_1


Out[653]:
<tf.Tensor 'lambda_205/Sum:0' shape=(?, 88, 64) dtype=float32>

In [654]:
#Calculations within a given hop don't change at all.
p1 = dot([m_i, u1], axes=2)
p1 = Reshape((max_num_memories,))(p1)
p1 = Activation(activation='softmax')(p1)
p1 = Reshape((max_num_memories,1))(p1)
o1 = dot([c_i_1, p1], axes=1)
o1 = Reshape(target_shape=(n_hidden,))(o1)
o1


Out[654]:
<tf.Tensor 'reshape_581/Reshape:0' shape=(?, 64) dtype=float32>

Calculating u2


In [655]:
u1 = Reshape((n_hidden,))(u1)
u2 = add([o1, u1])
u2


Out[655]:
<tf.Tensor 'add_142_1/add:0' shape=(?, 64) dtype=float32>

Second Layer


In [656]:
A2 = C1 #A(k + 1) = C(k)
m_i = TimeDistributed(A2)(mem_input)
m_i = Lambda(lambda x: K.sum(x, 2))(m_i)
m_i


Out[656]:
<tf.Tensor 'lambda_206/Sum:0' shape=(?, 88, 64) dtype=float32>

In [657]:
#Same as the previous layer
C2 = Embedding(vocab_size,output_dim=n_hidden)
c_i_2 = TimeDistributed(C2)(mem_input)
c_i_2 = Lambda(lambda x: K.sum(x, 2))(c_i_2)
c_i_2


Out[657]:
<tf.Tensor 'lambda_207/Sum:0' shape=(?, 88, 64) dtype=float32>

In [658]:
#Same as the previous layer
u2 = Reshape((1, n_hidden))(u2)
p2 = dot([m_i, u2], axes=2)
p2 = Reshape((max_num_memories,))(p2)
p2 = Activation(activation='softmax')(p2)
p2 = Reshape((max_num_memories,1))(p2)
o2 = dot([c_i_2, p2], axes=1)
o2 = Reshape(target_shape=(n_hidden,))(o2)
o2


Out[658]:
<tf.Tensor 'reshape_586/Reshape:0' shape=(?, 64) dtype=float32>

In [663]:
u2 = Reshape((n_hidden,))(u2)
u1 = Reshape((n_hidden,))(u1)
#u3 = add([o2, u2]) #u(k + 1) = u(k) + o(k), as in the paper
#This is a hack: I was not able to get good results with u3 = u2 + o2
u3 = add([o2, u1])
u3


Out[663]:
<tf.Tensor 'add_144_1/add:0' shape=(?, 64) dtype=float32>

In [664]:
answer = Dense(vocab_size, activation='softmax')(u3)
answer


Out[664]:
<tf.Tensor 'dense_79/Softmax:0' shape=(?, 123) dtype=float32>

In [665]:
babi2 = Model([mem_input, query_input], answer)

In [ ]:
babi2.compile(optimizer='rmsprop', loss='sparse_categorical_crossentropy', metrics=['accuracy'])
K.set_value(babi2.optimizer.lr, 0.01)
babi2.fit([m_train, q_train], a_train, **params, batch_size=32, epochs=8,
          validation_data=([m_test, q_test], a_test))


Train on 10000 samples, validate on 1000 samples
Epoch 1/8
13s - loss: 0.4886 - acc: 0.8363 - val_loss: 0.5807 - val_acc: 0.8100
Epoch 2/8
11s - loss: 0.4660 - acc: 0.8434 - val_loss: 0.5494 - val_acc: 0.7990
Epoch 3/8
11s - loss: 0.4356 - acc: 0.8534 - val_loss: 0.5416 - val_acc: 0.8240
Epoch 4/8
11s - loss: 0.4211 - acc: 0.8574 - val_loss: 0.5787 - val_acc: 0.7980
Epoch 5/8
11s - loss: 0.4109 - acc: 0.8645 - val_loss: 0.5428 - val_acc: 0.8100
Epoch 6/8
12s - loss: 0.3952 - acc: 0.8633 - val_loss: 0.5729 - val_acc: 0.8090
Epoch 7/8
11s - loss: 0.3841 - acc: 0.8707 - val_loss: 0.5905 - val_acc: 0.8010
Epoch 8/8

In [ ]:

Modular Hops Network

  • We now modularize the same piece of code to create a 3-hop network.

In [703]:
def get_mi(mem_input, A=None):
    if A is None:
        A = Embedding(vocab_size,output_dim=n_hidden)
    m = TimeDistributed(A)(mem_input)
    m = Lambda(lambda x: K.sum(x, 2))(m)
    return A, m

def get_ci(mem_input, C=None):
    if C is None:
        C = Embedding(vocab_size,output_dim=n_hidden)
    m = TimeDistributed(C)(mem_input)
    m = Lambda(lambda x: K.sum(x, 2))(m)
    return C, m

def get_u(B, query_input):
    u = B(query_input)
    u = (Lambda(lambda x: K.sum(x, 1)))(u)
    u = Reshape((1, n_hidden))(u)
    return u
        
def get_p(m, u):
    u = Reshape((1, n_hidden))(u)
    p = dot([m, u], axes=2)
    p = Reshape((max_num_memories,))(p)
    p = Activation(activation='softmax')(p)
    p = Reshape((max_num_memories,1))(p)
    return p

def get_o(c_i, p):
    o = dot([c_i, p], axes=1)
    o = Reshape(target_shape=(n_hidden,))(o)
    return o

def next_u(u_prev, o_prev):
    o_prev = Reshape(target_shape=(n_hidden,))(o_prev)
    u_prev = Reshape((n_hidden,))(u_prev)
    u_next = add([o_prev, u_prev]) #u(k + 1) = o(k) + u(k)
    return u_next
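
The next three cells wire up three hops by hand. The same thing could be written as a loop; here is a sketch using the helpers above (build_hops is a name introduced here), following the paper's update u(k + 1) = u(k) + o(k). Note that the cells below instead keep adding each o(k) to u1, the hack noted for the 2-hop network.

def build_hops(mem_input, query_input, n_hops=3):
    A, u = None, None
    for k in range(n_hops):
        A, m = get_mi(mem_input, A=A)                #A(k + 1) = C(k), set at the end of the loop
        if u is None:
            u = get_u(B=A, query_input=query_input)  #B = A1 (adjacent weight sharing)
        p = get_p(m, u)
        C, c = get_ci(mem_input)
        o = get_o(c, p)
        u = next_u(u, o)                             #u(k + 1) = u(k) + o(k)
        A = C                                        #adjacent weight sharing for the next hop
    return u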

In [704]:
#initialize
mem_input = Input(shape=(max_num_memories, max_memory_len))
query_input = Input(shape=(max_ques_len,))

In [705]:
A1, mi_1 = get_mi(mem_input=mem_input)
u1 = get_u(B=A1, query_input=query_input) #B = A1
p1 = get_p(mi_1, u1)
C1, ci_1 = get_ci(mem_input)
o1 = get_o(ci_1, p1)
u2 = next_u(u1, o1)

In [706]:
A2, mi_2 = get_mi(mem_input=mem_input, A=C1) #A2 = C1
p2 = get_p(mi_2, u2)
C2, ci_2 = get_ci(mem_input)
o2 = get_o(ci_2, p2)
u3 = next_u(u1, o2) #note: u1 rather than u2 (the hack noted for the 2-hop network)

In [707]:
A3, mi_3 = get_mi(mem_input=mem_input, A=C2) #A3 = C2
p3 = get_p(mi_3, u3)
C3, ci_3 = get_ci(mem_input)
o3 = get_o(ci_3, p3)
u4 = next_u(u1, o3) #note: u1 rather than u3

In [708]:
answer = Dense(vocab_size, activation='softmax')(u4)
answer


Out[708]:
<tf.Tensor 'dense_84/Softmax:0' shape=(?, 123) dtype=float32>

In [709]:
babi_mod = Model([mem_input, query_input], answer)

In [ ]:
babi_mod.compile(optimizer='rmsprop', loss='sparse_categorical_crossentropy', metrics=['accuracy'])
K.set_value(babi_mod.optimizer.lr, 0.005)
babi_mod.fit([m_train, q_train], a_train, **params, batch_size=32, epochs=10,
             validation_data=([m_test, q_test], a_test))


Train on 10000 samples, validate on 1000 samples
Epoch 1/10

19s - loss: 0.4045 - acc: 0.8551 - val_loss: 0.6237 - val_acc: 0.7940
Epoch 2/10
17s - loss: 0.3838 - acc: 0.8660 - val_loss: 0.5823 - val_acc: 0.7990
Epoch 3/10
17s - loss: 0.3633 - acc: 0.8727 - val_loss: 0.6467 - val_acc: 0.7900
Epoch 4/10
17s - loss: 0.3490 - acc: 0.8817 - val_loss: 0.6516 - val_acc: 0.8060
Epoch 5/10
16s - loss: 0.3400 - acc: 0.8847 - val_loss: 0.6400 - val_acc: 0.8180
Epoch 6/10
17s - loss: 0.3329 - acc: 0.8872 - val_loss: 0.5822 - val_acc: 0.8020
Epoch 7/10
16s - loss: 0.3149 - acc: 0.8909 - val_loss: 0.5893 - val_acc: 0.8170
Epoch 8/10
17s - loss: 0.3097 - acc: 0.8959 - val_loss: 0.6872 - val_acc: 0.7930
Epoch 9/10
17s - loss: 0.2986 - acc: 0.9024 - val_loss: 0.6451 - val_acc: 0.8050
Epoch 10/10
17s - loss: 0.2954 - acc: 0.9019 - val_loss: 0.6573 - val_acc: 0.7960

Out[ ]:
<keras.callbacks.History at 0x16763130e48>

In [ ]: