In [544]:
'''
Implementation of the single hop network from https://arxiv.org/abs/1503.08895.
Derived from:
- https://github.com/fchollet/keras/blob/master/examples/babi_memnn.py and
- Implementation by Jeremy Howard for http://course.fast.ai/.
'''
from __future__ import print_function

import itertools
import os
import random
import re
import sys
import tarfile
from glob import glob

import numpy as np
import tqdm

import keras.backend as K
from keras.layers import *
from keras.models import *
from keras.optimizers import RMSprop
from keras.preprocessing.sequence import pad_sequences
from keras.utils.data_utils import get_file
from keras.utils.np_utils import *
from keras_tqdm import TQDMNotebookCallback
The training data consists of a number of "examples". Each example contains a few memories and the relevant question/answer pairs. Consider the following example, which has three memories and two question/answer pairs.

Three memories:
- 1 Mary moved to the bathroom.
- 2 John went to the hallway.
- 4 Daniel went back to the hallway.

Two question/answer pairs:
- Q: Where is Mary? A: bathroom
- Q: Where is Daniel? A: hallway

Note that for the first question, only memories (1, 2) are available; for the second question, memories (1, 2, 4) are available. The sketch below shows the (memories, question, answer) tuples this example turns into.
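Concretely, the parser defined further down turns this example into two (memories, question, answer) 3-tuples. They are shown here at the word level for readability; the actual code stores integer word IDs instead of strings.
In [ ]:
# Illustration only -- the real tuples store word IDs rather than strings.
example_tuples = [
    (["1 Mary moved to the bathroom .", "2 John went to the hallway ."],
     "Where is Mary ?", "bathroom"),
    (["1 Mary moved to the bathroom .", "2 John went to the hallway .",
      "4 Daniel went back to the hallway ."],
     "Where is Daniel ?", "hallway"),
]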
In [545]:
class Corpus:
    def __init__(self):
        '''
        A corpus object maintains a mapping from a word (string) to a unique id (int).
        '''
        self.word_idx_dict = {}
        self.uniq_word_cnt = 0

    def update_vocab(self, words):
        '''
        Updates the corpus with the given list of words. Words that are seen for the
        first time are added to the word -> id dictionary.
        '''
        for word in words:
            if word not in self.word_idx_dict:
                self.word_idx_dict[word] = self.uniq_word_cnt
                self.uniq_word_cnt += 1

    def words_idx(self, words):
        '''
        Returns the list of IDs corresponding to the given words.
        '''
        return [self.word_idx_dict[word] for word in words]

    def tokenize(self, sentence):
        # Split on runs of non-word characters, keeping punctuation as separate tokens.
        # r'(\W+)' is used instead of '(\W+)?': newer Python versions reject zero-width
        # split patterns, and the result after the strip filter is the same.
        return [x.strip() for x in re.split(r'(\W+)', sentence) if x.strip()]
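A quick sanity check of the Corpus on a single sentence (illustration only; the IDs simply reflect insertion order):
In [ ]:
c_demo = Corpus()
tokens = c_demo.tokenize("1 Mary moved to the bathroom.")
print(tokens)                    # ['1', 'Mary', 'moved', 'to', 'the', 'bathroom', '.']
c_demo.update_vocab(tokens)
print(c_demo.words_idx(tokens))  # [0, 1, 2, 3, 4, 5, 6]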
In [546]:
class ExampleParser:
    '''
    Responsible for parsing examples for the babi tasks as specified at https://research.fb.com/downloads/babi/
    '''
    @staticmethod
    def add(example_lines, data, c):
        '''
        Takes the set of lines that form an example and:
        a) updates the corpus with these lines
        b) parses the lines into 3-tuples of the form: (memory, question, answer).
        A single story may yield several 3-tuples of the above form. E.g.:
            1 Mary moved to the bathroom.
            2 John went to the hallway.
            3 Where is Mary? bathroom 1
            4 Daniel went back to the hallway.
            12 Where is Daniel? hallway 4
        will generate 2 tuples:
        - ([1 Mary moved to the bathroom., 2 John went to the hallway.], Where is Mary?, bathroom)
        - ([1 Mary moved to the bathroom., 2 John went to the hallway., 4 Daniel went back to the hallway.],
           Where is Daniel?, hallway)
        Note that instead of storing the actual words, an example stores the IDs of the associated words.
        A word -> ID map is maintained in the corpus.

        @example_lines: A set of lines that form an example.
        @data: List of 3-tuples (memories, question, answer), updated by "add".
        @c: The corpus object.
        '''
        memories = []
        memories_txt = []
        for eg_line in example_lines:
            if "\t" not in eg_line:  # normal memory line
                eg_line = c.tokenize(eg_line)
                c.update_vocab(eg_line)
                # The whole tokenized line (including the line number) is stored as the memory.
                memories.append(c.words_idx(eg_line))
                memories_txt.append(eg_line)
            else:  # question line: "<question>\t<answer>\t<supporting fact ids>"
                ques, ans, hints = eg_line.split("\t")
                ques = c.tokenize(ques)[1:]  # drop the leading line number from the question
                c.update_vocab(ques)
                ans = c.tokenize(ans)
                c.update_vocab(ans)
                data.append(([m for m in memories],
                             c.words_idx(ques), c.words_idx(ans)))

    @staticmethod
    def process_files(lines, corpus):
        '''
        Reads the given lines, identifies example boundaries (a line numbered 1 starts a new
        story) and adds the parsed examples to the returned data list.
        '''
        data = []
        eg_lines = [lines[0].decode('utf-8').strip()]
        for line in lines[1:]:
            line = line.decode('utf-8').strip()
            if int(line.split(" ", 1)[0]) == 1:  # new story starts
                ExampleParser.add(eg_lines, data, corpus)
                eg_lines = [line.strip()]
            else:
                eg_lines.append(line.strip())
        if len(eg_lines) > 0:
            ExampleParser.add(eg_lines, data, corpus)
        return data
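To see what the parser produces, here is a tiny, made-up two-line story in the bAbI format fed through process_files (which expects the byte strings returned by readlines()):
In [ ]:
demo_lines = [b"1 Mary moved to the bathroom.\n",
              b"2 Where is Mary? \tbathroom\t1\n"]
demo_data = ExampleParser.process_files(demo_lines, Corpus())
print(demo_data)  # [([[0, 1, 2, 3, 4, 5, 6]], [7, 8, 1, 9], [5])]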
In [547]:
challenges = {
# QA1 with 10,000 samples
'single_supporting_fact_10k': 'tasks_1-20_v1-2/en-10k/qa1_single-supporting-fact_{}.txt',
# QA2 with 10,000 samples
'two_supporting_facts_10k': 'tasks_1-20_v1-2/en-10k/qa2_two-supporting-facts_{}.txt',
}
path = get_file('babi-tasks-v1-2.tar.gz', origin='https://s3.amazonaws.com/text-datasets/babi_tasks_1-20_v1-2.tar.gz')
tar = tarfile.open(path)
challenge_type = 'two_supporting_facts_10k'
challenge = challenges[challenge_type]
test_lines = tar.extractfile(challenge.format('test')).readlines()
train_lines = tar.extractfile(challenge.format('train')).readlines()
print(train_lines[0:4])
print(test_lines[0:4])
In [548]:
c = Corpus()
print("Processing training files")
train_tuples = ExampleParser.process_files(train_lines, c)
print("Processing test files")
test_tuples = ExampleParser.process_files(test_lines, c)
all_tuples = train_tuples + test_tuples
print("# training tuples = {0}\n# test tuples = {1}".format(len(train_tuples), len(test_tuples)))
In [549]:
print(train_tuples[0])
print(train_lines[0:3])
From the mapping, note that "1 Mary moved to the bathroom." has been mapped to [0, 1, 2, 3, 4, 5, 6] (the line number is used as a feature).
Similarly, "3 Where is Mary?" is mapped to [11, 12, 1, 13] (note that Mary -> 1, so the mapping is retained; the leading line number is dropped for questions).
The answer is [5], i.e. bathroom.
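To double-check such mappings, the corpus dictionary can be inverted. This small helper is not part of the original pipeline, just an inspection aid:
In [ ]:
idx_word = {idx: word for word, idx in c.word_idx_dict.items()}

def to_words(ids):
    return " ".join(idx_word[i] for i in ids)

mems, ques, ans = train_tuples[0]
print([to_words(m) for m in mems])
print(to_words(ques), "->", to_words(ans))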
At this point, we have the 3-tuples of (memories, question, answer).
Different tuples may have different numbers of memories, and different numbers of words per memory and per question.
As a reminder, a 3-tuple contains a bunch of memories, one question and the corresponding answer; these tuples have also been processed so that they are now just lists of word IDs.
Because we want to use the same network for every tuple, we "pad" the 3-tuples with zeros so that they all have the same shape.
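Most of the padding is done by Keras' pad_sequences, which left-pads each sequence with zeros up to maxlen; the memories are additionally padded with all-zero rows up to max_num_memories, which pad_sequences does not do on its own. A tiny illustration of pad_sequences:
In [ ]:
from keras.preprocessing.sequence import pad_sequences

print(pad_sequences([[1, 2], [3, 4, 5]], maxlen=4))
# [[0 0 1 2]
#  [0 3 4 5]]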
In [550]:
max_num_memories = max([len(example[0]) for example in all_tuples])
max_memory_len = max([len(memory) for example in all_tuples for memory in example[0]])
max_ques_len = max([len(example[1]) for example in all_tuples])
vocab_size = c.uniq_word_cnt
len(train_tuples), len(test_tuples), c.uniq_word_cnt, max_num_memories, max_memory_len, max_ques_len
Out[550]:
| Variable | Count |
|---|---|
| # Training Tuples | 10000 |
| # Test Tuples | 1000 |
| Max # words per memory | 8 |
| Max # memories in a tuple | 10 |
| Max # words per question | 4 |
In [551]:
def pad_tuples(tuples, max_memory_len, max_num_memories, max_ques_len, vocab_size):
    '''
    Takes a number of tuples, as well as the measures required for 0 padding.
    Returns a padded version of the memories, questions and answers.
    In other words, for each tuple:
    a) memories is now a matrix of max_num_memories * max_memory_len
    b) the question is an array with max_ques_len elements.
    The gaps are filled with 0s. Answers are kept as word IDs (the models below are trained
    with sparse_categorical_crossentropy, so no 1-hot encoding is needed).
    '''
    m, q, a = [], [], []
    for (memories, ques, ans) in tuples:
        memories = pad_sequences(memories, maxlen=max_memory_len)
        memories = np.concatenate([memories,
                                   np.zeros((max_num_memories - memories.shape[0],
                                             max_memory_len), 'int')])
        m.append(memories)
        q.append(ques)
        a.append(ans)
    return np.array(m), pad_sequences(q, maxlen=max_ques_len), np.array(a)
In [552]:
m_train, q_train, a_train = pad_tuples(train_tuples, max_memory_len, max_num_memories, max_ques_len, c.uniq_word_cnt)
m_test, q_test, a_test = pad_tuples(test_tuples, max_memory_len, max_num_memories, max_ques_len, c.uniq_word_cnt)
print(m_train.shape)
print(q_train.shape)
print(a_train.shape)
print(m_test.shape)
print(q_test.shape)
print(a_test.shape)
We will now build the model as specified at https://arxiv.org/abs/1503.08895.
The relevant snippets from the paper are referenced inline. An attempt has been made to stay as close to the notation in the paper as possible. Continuing with the notation used above, our network is designed to be trained on one tuple at a time. That is, the network will be fed a list of 3-tuples, where each 3-tuple is (memories, question, answer).
For the entirety of the model building discussion, the focus will be on one such tuple.
The set x_1, x_2, ..., x_i that the paper mentions is our list of memories (each x_i is basically a list of word IDs).
For each input x_i we learn an embedded representation m_i. In the single-supporting-fact example, the length of each x_i is 8, so each x_i is an 8-dimensional vector. The number of dimensions in the embedded space is 64. Thus, we want to map each 8-dimensional input vector to a 64-dimensional vector.
This is achieved by embedding each word of the input into a 64-dimensional vector, and then adding these up to give the 64-dimensional vector for the input.
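Before the worked example, here is the full single-hop computation from the paper, using the bag-of-words sentence representation that this notebook implements (A, B, C are embedding matrices, W is the final output matrix):

$$
m_i = \sum_j A\,x_{ij}, \qquad u = \sum_j B\,q_j, \qquad c_i = \sum_j C\,x_{ij},
$$
$$
p_i = \mathrm{Softmax}\left(u^{\top} m_i\right), \qquad o = \sum_i p_i\, c_i, \qquad \hat{a} = \mathrm{Softmax}\big(W\,(o + u)\big).
$$

The cells below build exactly these pieces, one at a time, starting with m_i.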
Suppose that x_1 = "1 Mary went to the kitchen", mapped to [0 0 12 23 32 33 22 21] (0-padded to make the length 8). For brevity, suppose that the number of hidden dimensions were 5 (it is 64 in the real case). Step 1 would then be to learn one 5-dimensional vector for each word in the sequence, e.g.:
0: [0.13, 0.69, 0.52, -0.22, -1.15]
kitchen: [-0.5, -1.11, -0.47, 0.09, -1.87]
The representation for the sentence is then just the sum of all of these, e.g. [-0.77, -0.18, -1.46, 3.19, -1.].
This is m_1.
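The same computation in plain NumPy, with a made-up 5-dimensional embedding matrix, just to make the "embed each word, then sum" step concrete:
In [ ]:
np.random.seed(0)
d, V = 5, 40                   # toy hidden size and vocabulary size
A = np.random.randn(V, d)      # toy embedding matrix, one row per word ID

x_1 = [0, 0, 12, 23, 32, 33, 22, 21]  # the padded word IDs from the text above
m_1 = A[x_1].sum(axis=0)              # embed every word (padding zeros included), then sum
print(m_1.shape)                      # (5,)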
In [553]:
n_hidden = 64
memories = Input(shape=(max_num_memories, max_memory_len))
x = TimeDistributed(Embedding(input_dim=vocab_size, output_dim=n_hidden))(memories)
m = Lambda(lambda xx: K.sum(xx, 2))(x)
memories.shape, m.shape
Out[553]:
In [554]:
query = Input(shape=(max_ques_len,))
u = Embedding(input_dim=vocab_size, output_dim=n_hidden)(query)
u = Lambda(lambda x : K.sum(x, 1))(u)
u = Reshape(target_shape=(1, n_hidden))(u)
query.shape, u.shape
Out[554]:
In [555]:
p = dot([m, u], axes=2)
p = Reshape((max_num_memories,))(p)
p = Activation(activation='softmax')(p)
p = Reshape((max_num_memories,1))(p)
p.shape
Out[555]:
In [556]:
x = TimeDistributed(Embedding(vocab_size, n_hidden))(memories)
c_i = Lambda(lambda xx: K.sum(xx, 2))(x)  # named c_i so the Corpus object c is not clobbered
c_i.shape
Out[556]:
In [557]:
o = dot([c_i, p], axes=1)
o = Reshape(target_shape=(1,n_hidden))(o)
o
Out[557]:
In [558]:
a_in = add([o, u])  # o + u
a_in = Reshape(target_shape=(n_hidden,))(a_in)
answer = Dense(vocab_size, activation='softmax')(a_in)
answer
Out[558]:
In [559]:
babi_memmn = Model([memories, query], answer)
In [560]:
babi_memmn.compile(optimizer='rmsprop', loss='sparse_categorical_crossentropy', metrics=['accuracy'])
In [561]:
K.set_value(babi_memmn.optimizer.lr, 1e-2)
params = {'verbose': 2, 'callbacks': [TQDMNotebookCallback(leave_inner=False)]}
babi_memmn.fit([m_train, q_train], a_train, **params, batch_size=32, epochs=5,
validation_data=([m_test, q_test], a_test))
Out[561]:
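Once the model is trained, its predictions can be decoded back into words by inverting the corpus dictionary. A small inspection sketch, assuming the Corpus object is still available as c:
In [ ]:
idx_word = {idx: word for word, idx in c.word_idx_dict.items()}

preds = babi_memmn.predict([m_test, q_test])  # (n_test, vocab_size) softmax scores
pred_ids = preds.argmax(axis=1)

for i in range(3):
    print("predicted:", idx_word[pred_ids[i]], "| actual:", idx_word[a_test[i][0]])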
We will now start building the multi-hop network, starting with a verbose 2-hop network and then modularizing it. The experiments section of the paper lists which configurations the authors used for their final experiments.
Let's look at the multi-hop architecture (Figure 1 in the paper) first.
We note the following from the diagram:
- Layers of the network are stacked one on top of the other.
- The input (the set of memories) doesn't change across the layers.
- The original query is supplied only to the first layer. For subsequent layers, the following scheme is used: u(k + 1) = u(k) + o(k).
We will start with two hops, and then move to three.
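For reference, the paper describes two weight-tying schemes; the code below follows the "adjacent" one, under which, for a K-hop network,

$$
A^{k+1} = C^{k}, \qquad B = A^{1}, \qquad W^{\top} = C^{K}, \qquad u^{k+1} = u^{k} + o^{k}.
$$

In the cells that follow, B = A1 and A(k+1) = C(k) are enforced by sharing the corresponding Embedding layers, while the final answer matrix W is kept as an independent Dense layer rather than being tied to C(K) (the commented-out MemOut layer further down presumably corresponds to that last tie).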
In [650]:
mem_input = Input(shape=(max_num_memories, max_memory_len))
query_input = Input(shape=(max_ques_len,))
In [651]:
A1 = Embedding(vocab_size,output_dim=n_hidden)
m_i = TimeDistributed(A1)(mem_input)
m_i = Lambda(lambda x: K.sum(x, 2))(m_i)
m_i
Out[651]:
In [652]:
B = A1 #as specified
u1 = B(query_input)
u1 = (Lambda(lambda x: K.sum(x, 1)))(u1)
u1 = Reshape((1, n_hidden))(u1)
u1
Out[652]:
In [653]:
#nothing special here
C1 = Embedding(vocab_size,output_dim=n_hidden)
c_i_1 = TimeDistributed(C1)(mem_input)
c_i_1 = Lambda(lambda x: K.sum(x, 2))(c_i_1)
c_i_1
Out[653]:
In [654]:
#Calculations for a given hop don't change at all.
p1 = dot([m_i, u1], axes=2)
p1 = Reshape((max_num_memories,))(p1)
p1 = Activation(activation='softmax')(p1)
p1 = Reshape((max_num_memories,1))(p1)
o1 = dot([c_i_1, p1], axes=1)
o1 = Reshape(target_shape=(n_hidden,))(o1)
o1
Out[654]:
In [655]:
u1 = Reshape((n_hidden,))(u1)
u2 = add([o1, u1])
u2
Out[655]:
In [656]:
A2 = C1 #A(k + 1) = C(k)
m_i = TimeDistributed(A2)(mem_input)
m_i = Lambda(lambda x: K.sum(x, 2))(m_i)
m_i
Out[656]:
In [657]:
#Same as the previous layer
C2 = Embedding(vocab_size,output_dim=n_hidden)
c_i_2 = TimeDistributed(C2)(mem_input)
c_i_2 = Lambda(lambda x: K.sum(x, 2))(c_i_2)
c_i_2
Out[657]:
In [658]:
#Same as the previous layer
u2 = Reshape((1, n_hidden))(u2)
p2 = dot([m_i, u2], axes=2)
p2 = Reshape((max_num_memories,))(p2)
p2 = Activation(activation='softmax')(p2)
p2 = Reshape((max_num_memories,1))(p2)
o2 = dot([c_i_2, p2], axes=1)
o2 = Reshape(target_shape=(n_hidden,))(o2)
o2
Out[658]:
In [663]:
u2 = Reshape((n_hidden,))(u2)
u1 = Reshape((n_hidden,))(u1)
#This is a hack: the paper's rule is u(k + 1) = u(k) + o(k), i.e. u3 = u2 + o2,
#but I was not able to get good results with that, so u1 is used instead.
u3 = add([o2, u1])
u3
Out[663]:
In [664]:
#answer = MemOut(vocab_size, C2)(u3)
answer = Dense(vocab_size, activation='softmax')(u3)
answer
Out[664]:
In [665]:
babi2 = Model([mem_input, query_input], answer)
In [ ]:
babi2.compile(optimizer='rmsprop', loss='sparse_categorical_crossentropy', metrics=['accuracy'])
K.set_value(babi2.optimizer.lr, 0.01)
babi2.fit([m_train, q_train], a_train, **params, batch_size=32, epochs=8,
          validation_data=([m_test, q_test], a_test))
In [ ]:
In [703]:
def get_mi(mem_input, A=None):
    if A is None:
        A = Embedding(vocab_size, output_dim=n_hidden)
    m = TimeDistributed(A)(mem_input)
    m = Lambda(lambda x: K.sum(x, 2))(m)
    return A, m

def get_ci(mem_input, C=None):
    if C is None:
        C = Embedding(vocab_size, output_dim=n_hidden)
    m = TimeDistributed(C)(mem_input)
    m = Lambda(lambda x: K.sum(x, 2))(m)
    return C, m

def get_u(B, query_input):
    u = B(query_input)
    u = Lambda(lambda x: K.sum(x, 1))(u)
    u = Reshape((1, n_hidden))(u)
    return u

def get_p(m, u):
    u = Reshape((1, n_hidden))(u)
    p = dot([m, u], axes=2)
    p = Reshape((max_num_memories,))(p)
    p = Activation(activation='softmax')(p)
    p = Reshape((max_num_memories, 1))(p)
    return p

def get_o(c_i, p):
    o = dot([c_i, p], axes=1)
    o = Reshape(target_shape=(n_hidden,))(o)
    return o

def next_u(u_prev, o_prev):
    o_prev = Reshape(target_shape=(n_hidden,))(o_prev)
    u_prev = Reshape((n_hidden,))(u_prev)
    u_next = add([o_prev, u_prev])  # u(k + 1) = u(k) + o(k)
    return u_next
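With these helpers, the hop loop can be written once for an arbitrary number of hops. A sketch (not run here) that follows the paper's recurrence u(k + 1) = u(k) + o(k) with adjacent weight tying; note that the cells below instead feed u1 into every hop, as discussed in the earlier "hack" comment:
In [ ]:
def build_memnn(n_hops=3):
    '''A k-hop memory network built from the helpers above, with adjacent tying
    (B = A1, A(k + 1) = C(k)) and the paper's update u(k + 1) = u(k) + o(k).'''
    mem_in = Input(shape=(max_num_memories, max_memory_len))
    q_in = Input(shape=(max_ques_len,))

    A, m = get_mi(mem_in)                 # A1
    u = get_u(B=A, query_input=q_in)      # B = A1
    for hop in range(n_hops):
        p = get_p(m, u)
        C, c_i = get_ci(mem_in)
        o = get_o(c_i, p)
        u = next_u(u, o)                  # u(k + 1) = u(k) + o(k)
        if hop < n_hops - 1:
            _, m = get_mi(mem_in, A=C)    # adjacent tying: A(k + 1) = C(k)
    answer = Dense(vocab_size, activation='softmax')(u)
    return Model([mem_in, q_in], answer)

# e.g. babi_generic = build_memnn(n_hops=3)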
In [704]:
#initialize
mem_input = Input(shape=(max_num_memories, max_memory_len))
query_input = Input(shape=(max_ques_len,))
In [705]:
A1, mi_1 = get_mi(mem_input=mem_input)
u1 = get_u(B=A1, query_input=query_input) #B = A1
p1 = get_p(mi_1, u1)
C1, ci_1 = get_ci(mem_input)
o1 = get_o(ci_1, p1)
u2 = next_u(u1, o1)
In [706]:
A2, mi_2 = get_mi(mem_input=mem_input, A=C1) #A2 = C1
p2 = get_p(mi_2, u2)
C2, ci_2 = get_ci(mem_input)
o2 = get_o(ci_2, p2)
u3 = next_u(u1, o2)  #same shortcut as before: u1 is used instead of u2
In [707]:
A3, mi_3 = get_mi(mem_input=mem_input, A=C2) #A3 = C2
p3 = get_p(mi_3, u3)
C3, ci_3 = get_ci(mem_input)
o3 = get_o(ci_3, p3)
u4 = next_u(u1, o3)  #again u1 instead of u3
In [708]:
answer = Dense(vocab_size, activation='softmax')(u4)
answer
Out[708]:
In [709]:
babi_mod = Model([mem_input, query_input], answer)
In [ ]:
babi_mod.compile(optimizer='rmsprop', loss='sparse_categorical_crossentropy', metrics=['accuracy'])
K.set_value(babi_mod.optimizer.lr, 0.005)
babi_mod.fit([m_train, q_train], a_train, **params, batch_size=32, epochs=10,
             validation_data=([m_test, q_test], a_test))
Out[ ]:
In [ ]: