In [1]:
import numpy as np
import pandas as pd
import tensorflow as tf
import time
import re
import copy
import os

class dmn_plus:
    def __init__(self, task):
        self.vocab_size = 400000
        self.embedding_dim = 50
        self.hidden_layer_size = 80
        self.num_steps = 3
        self.batch_size = 100
        self.dropout_probability = 0.9
        self.l2_regularization_lambda = 0.001
        self.learning_rate = 0.001
        self.num_epochs = 128
        self.num_epochs_before_checking_valid_loss = 5
        self.num_consecutive_strips_before_stopping = 10
        self.datatype_id_dict = {'train': 0, 'valid': 1, 'test': 2, 'user': 3}
        self.datatypes = ['train', 'valid', 'test', 'user']
        self.load_embeddings()
        self.task = task
        self.load_data(self.task)
        self.create_tensorflow_graph()
        
    def load_embeddings(self):
        file = open("../../../datasets/glove_6b/glove.6B.50d.txt")    
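        # glove.6B ships a 400,000-word vocabulary (hence self.vocab_size); each line is
        # "<word> <50 floats>" for the 50d variant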
        self.embedding = np.ndarray([self.vocab_size, self.embedding_dim])
        self.word_id_dict = {}
        self.id_word_dict = {}
        id = 0
        for line in file:
            items = line.split(' ')
            self.word_id_dict[items[0]] = id
            self.id_word_dict[id] = items[0]
            self.embedding[id,:] = np.array([float(i) for i in items[1:]])
            id += 1
        file.close()
        
    def load_babi_data(self, datatype_id):
        path_to_file_directory = "../../../datasets/facebook_babi/tasks_1-20_v1-2/en-valid-10k/"
        path_to_file = path_to_file_directory + 'qa' + str(self.task) + '_' + self.datatypes[datatype_id] + '.txt'
        
        file = open(path_to_file)
        num_words_in_longest_input_sentence = 0
        num_words_in_longest_question = 0
        num_sentences_in_each_chapter = []
        chapter_input = []
        data = []
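        # bAbI lines are either story sentences ("1 Mary moved to the bathroom.") or
        # questions ("3 Where is Mary? \tbathroom\t1"), where the trailing numbers are
        # supporting-fact line ids. After stripping '?' and '.' and splitting on whitespace,
        # a question line is the one that ends in a digit.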

        for line in file:
            items = re.sub('[?.]', '', line).lower().split()
            if items[-1].isdigit():
                # find the index of the second digit in the line: the first digit is the line
                # number and the second is the first supporting-fact id, so the answer word
                # sits just before it. The backwards scan deliberately has no break, so the
                # final assignment is the lowest digit index greater than 0.
                index_of_second_digit = len(items)
                for index in range(len(items)-1, 0, -1):
                    if items[index].isdigit():
                        index_of_second_digit = index
                data.append({'I': copy.deepcopy(chapter_input),
                         'Q': items[1:index_of_second_digit-1],
                         'A': [items[index_of_second_digit-1]]})
                num_sentences_in_each_chapter.append(len(chapter_input))
                num_words_in_longest_question = max(num_words_in_longest_question, len(items[1:index_of_second_digit-1]))
            else:
                if items[0] == '1':
                    chapter_input = [items[1:]]
                else:
                    chapter_input.append(items[1:])
                num_words_in_longest_input_sentence = max(num_words_in_longest_input_sentence, len(items[1:]))
        file.close()

        num_sentences_in_longest_input = max(num_sentences_in_each_chapter)
        num_chapters = len(data)

        return([data, num_sentences_in_each_chapter, num_words_in_longest_input_sentence,
              num_words_in_longest_question, num_sentences_in_longest_input, num_chapters])
    
    def embed_and_pad_data(self, datatype_id):
        num_chapters = self.data_and_metadata[datatype_id][5]
        data_inputs = np.zeros([num_chapters, self.num_sentences_in_longest_input, self.num_words_in_longest_input_sentence, self.embedding_dim])
        data_questions = np.zeros([num_chapters, self.num_words_in_longest_question, self.embedding_dim])
        data_answers = np.zeros([num_chapters])
        for chapter_index, chapter in enumerate(self.data_and_metadata[datatype_id][0]):
            for sentence_index, sentence in enumerate(chapter['I']):
                data_inputs[chapter_index, sentence_index, 0:len(sentence), :] = self.embedding[[self.word_id_dict[word] for word in sentence]]
            data_questions[chapter_index, 0:len(chapter['Q']), :] = self.embedding[[self.word_id_dict[word] for word in chapter['Q']]]
            # user data has no ground-truth answer (None); otherwise store the answer word's vocabulary id
            data_answers[chapter_index] = None if chapter['A'][0] is None else self.word_id_dict[chapter['A'][0]]
            
        return([data_inputs, data_questions, data_answers])
    
    def create_position_encoding(self):
        self.position_encoding = np.ones([self.embedding_dim, self.num_words_in_longest_input_sentence], dtype=np.float32)

        ## Below (my implementation, from section 3.1 in https://arxiv.org/pdf/1603.01417.pdf) didn't work.
        # for j in range(1, num_words_in_longest_input_sentence+1):
        #     for d in range(1, embedding_dim+1):
        #         position_encoding[d-1, j-1] = (1 - j/num_words_in_longest_input_sentence) - (d/embedding_dim)*(1 - 2*j/num_words_in_longest_input_sentence)
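        ## A likely reason the version above failed: with Python 2 integer division, j/M and
        ## d/D evaluate to 0 for most j and d, collapsing the encoding to a constant. With
        ## float division, the paper's formula works out to exactly half of the memn2n
        ## encoding used below, i.e. the same thing up to a constant scale.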

        ## Copied from https://github.com/domluna/memn2n
        ls = self.num_words_in_longest_input_sentence+1
        le = self.embedding_dim+1
        for i in range(1, le):
            for j in range(1, ls):
                self.position_encoding[i-1, j-1] = (i - (le-1)/2) * (j - (ls-1)/2)
        self.position_encoding = 1 + 4 * self.position_encoding / self.embedding_dim / self.num_words_in_longest_input_sentence
        self.position_encoding = np.transpose(self.position_encoding)
        
    def load_data(self, task):
        self.data_and_metadata = [self.load_babi_data(datatype_id = i) for i in range(3)]      
        self.data_and_metadata.append(self.preprocess_user_data())
        
        self.num_words_in_longest_input_sentence = max([self.data_and_metadata[i][2] for i in range(3)])
        self.num_words_in_longest_question = max([self.data_and_metadata[i][3] for i in range(3)])
        self.num_sentences_in_longest_input = max([self.data_and_metadata[i][4] for i in range(3)])
        
        self.embedded_data = [self.embed_and_pad_data(datatype_id = i) for i in range(4)]
        
        self.create_position_encoding()
        
    def answer_user_data(self, inputs, questions):    
        # preprocess and embed the user-supplied stories and questions
        self.data_and_metadata[self.datatype_id_dict['user']] = self.preprocess_user_data(inputs, questions)
        self.embedded_data[self.datatype_id_dict['user']] = self.embed_and_pad_data(self.datatype_id_dict['user'])
        # get predictions
        predictions = self.sess.run(self.predictions, feed_dict = self.get_batch(datatype = 'user', batch_number = 0))
        predictions = [self.id_word_dict[id] for id in predictions]
        return(predictions)
    
    def preprocess_user_data(self, inputs = [], questions = []):    
        num_sentences_in_each_chapter = []
        chapter_input = []
        data = []

        for index in range(len(inputs)):
            input = re.sub('[?]', '', inputs[index]).lower().split('.')
            chapter_input = []
            for sentence in input:
                if sentence != '':
                    chapter_input.append(sentence.split())
            cleaned_question = re.sub('[?]', '', questions[index]).lower().split()
            data.append({'I': chapter_input,
                         'Q': cleaned_question,
                         'A': [None]})
            num_sentences_in_each_chapter.append(len(chapter_input))
        num_chapters = len(data)
        
        return([data, num_sentences_in_each_chapter, None,
              None, None, num_chapters])
        
    def get_batch(self, datatype, batch_number):
        index = self.datatype_id_dict[datatype]
        return {self.inputs: self.embedded_data[index][0][batch_number*self.batch_size: (batch_number+1)*self.batch_size],
                self.questions: self.embedded_data[index][1][batch_number*self.batch_size: (batch_number+1)*self.batch_size],
                self.answers: self.embedded_data[index][2][batch_number*self.batch_size: (batch_number+1)*self.batch_size],
                self.input_lengths: self.data_and_metadata[index][1][batch_number*self.batch_size: (batch_number+1)*self.batch_size]
               }
    
    def perform_epoch(self, num_chapters, datatype):
        epoch_loss = epoch_num_correct = 0
        for batch_idx in range(num_chapters // self.batch_size):
            # when not training, fetch question_vector instead of the optimizer as a harmless no-op
            batch_loss, batch_num_correct, _ = self.sess.run((self.loss, self.num_correct, (self.optimizer if datatype == 'train' else self.question_vector)), 
                                                                        feed_dict = self.get_batch(datatype = datatype, batch_number = batch_idx))
            epoch_loss += batch_loss
            epoch_num_correct += batch_num_correct
        return(epoch_loss, epoch_num_correct)
                
    def train(self):            
        self.sess.run(tf.global_variables_initializer())
        best_valid_loss = float("inf")
        previous_valid_loss = float("inf")
        is_valid_loss_greater_strip = []
        train_loss = []
        train_num_chapters = self.data_and_metadata[self.datatype_id_dict['train']][5]
        valid_num_chapters = self.data_and_metadata[self.datatype_id_dict['valid']][5]
        start_time = time.time()
        for epoch in range(self.num_epochs):
            epoch_loss, epoch_num_correct = self.perform_epoch(train_num_chapters, 'train')
            print("Epoch %d: %.2f%% complete, %d mins, Avg loss: %.2f, Num correct: %d, Accuracy: %.2f%%" % (epoch, 
                                                                                   epoch*100.0/self.num_epochs,
                                                                                    (time.time() - start_time)/60,
                                                                                   epoch_loss/train_num_chapters, 
                                                                                    epoch_num_correct,
                                                                                    epoch_num_correct*100.0/train_num_chapters))
            train_loss.append(epoch_loss/train_num_chapters)
            # early stopping
            if epoch%self.num_epochs_before_checking_valid_loss == 0:
                epoch_loss, epoch_num_correct = self.perform_epoch(valid_num_chapters, 'valid')
                print("\nValidation avg loss: %.2f, Num correct: %d, Accuracy: %.2f%%"%(epoch_loss/valid_num_chapters, epoch_num_correct, float(epoch_num_correct*100)/valid_num_chapters))
                # self.save()
                
                # UP stopping criterion (from http://page.mi.fu-berlin.de/prechelt/Biblio/stop_tricks1997.pdf)
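                # UP_s criterion: stop once validation loss has failed to decrease, relative to
                # the previous check, for num_consecutive_strips_before_stopping checks in a row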
                is_valid_loss_greater_strip.append(epoch_loss >= previous_valid_loss)
                if(sum(is_valid_loss_greater_strip[-self.num_consecutive_strips_before_stopping:]) == self.num_consecutive_strips_before_stopping):
                    print("Stopping Early\nDuration: %d mins" % int((time.time() - start_time)/60))
                    return
                
                if epoch_loss < best_valid_loss:
                    best_valid_loss = epoch_loss
                    self.save()                
                previous_valid_loss = epoch_loss
                
            # also stop early if relative training-loss progress over the last 5 epochs drops below 0.01
            progress = np.mean(train_loss[-5:])/min(train_loss[-5:]) - 1
            print(progress)
            if progress < 0.01 and epoch >= 9:
                if epoch_loss < best_valid_loss:
                    best_valid_loss = epoch_loss
                    self.save()
                return
                # crude stopping criterion (stop as soon as validation loss increases)
                # if new validation loss is lower than the old one, save new weights
#                 if epoch_loss < old_valid_loss:
#                     old_valid_loss = epoch_loss
#                     self.save()
#                 # else, stop training
#                 else:
#                     print("Stopping Early\nDuration: %d mins" % int((time.time() - start_time)/60))
#                     return
        print("Duration: %d mins" % int((time.time() - start_time)/60))
        
    def save(self):
        # create filename based on task
        save_path = self.saver.save(self.sess, "../saved-models/task-%d.ckpt"%self.task)
        print("Model saved in file: %s\n" % save_path)
        
    def restore(self, task):
        # check if there is any saved model for that particular task
        if os.path.isfile("../saved-models/task-%d.ckpt.meta"%task):
            self.saver.restore(self.sess, "../saved-models/task-%d.ckpt"%task)
            print("Model restored")
        else:
            print("Saved model for given task does not exist")
        
    def test(self):
        # load the weights from the model that generated the best validation loss
        self.restore(self.task)
        start_time = time.time()
        num_chapters = self.data_and_metadata[self.datatype_id_dict['test']][5]
        total_loss, total_num_correct = self.perform_epoch(num_chapters, 'test')
        print("%d mins, Avg loss: %.2f, Num correct: %d, Accuracy: %.2f%%" % ((time.time() - start_time)/60,
                                                                          total_loss/num_chapters,
                                                                          total_num_correct,
                                                                          total_num_correct*100.0/num_chapters))
        
    def create_tensorflow_graph(self):
        self.inputs = tf.placeholder(tf.float32, [None, self.num_sentences_in_longest_input, self.num_words_in_longest_input_sentence, self.embedding_dim])
        self.questions = tf.placeholder(tf.float32, [None, None, self.embedding_dim])
        self.answers = tf.placeholder(tf.int32, [None])
        self.input_lengths = tf.placeholder(tf.int32, [None])

        ## Question module
        with tf.variable_scope('question_module'):
            question_gru_cell = tf.contrib.rnn.GRUCell(self.hidden_layer_size)
            _, self.question_vector = tf.nn.dynamic_rnn(question_gru_cell,
                                                  self.questions,
                                                  dtype=tf.float32)

        ## Input module
        with tf.variable_scope('input_module'):

            positionally_encoded_inputs = tf.reduce_sum(self.inputs*self.position_encoding, 2)

            input_forward_gru_cell = tf.contrib.rnn.GRUCell(self.hidden_layer_size)
            input_backward_gru_cell = tf.contrib.rnn.GRUCell(self.hidden_layer_size)
            input_module_output, _ = tf.nn.bidirectional_dynamic_rnn(input_forward_gru_cell,
                                                                    input_backward_gru_cell,
                                                                    positionally_encoded_inputs,
                                                                    sequence_length = self.input_lengths,
                                                                    dtype = tf.float32)
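            # input fusion layer: merge the forward and backward GRU outputs into one fact
            # vector per sentence by element-wise sum, then apply dropout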
            input_fact_vectors = tf.add(input_module_output[0], input_module_output[1])
            input_fact_vectors = tf.nn.dropout(input_fact_vectors, self.dropout_probability)

        ## Episodic Memory module
        with tf.variable_scope('episodic_memory_module'):
            weight = tf.get_variable("weight", [3*self.hidden_layer_size, self.hidden_layer_size],
                                            initializer=tf.random_normal_initializer())
            bias = tf.get_variable("bias", [1, self.hidden_layer_size],
                                            initializer=tf.random_normal_initializer())
            self.previous_memory = self.question_vector
            for step in range(self.num_steps):
                attentions = []
                for fact_index, fact_vector in enumerate(tf.unstack(input_fact_vectors, axis = 1)):
                    reuse = bool(step) or bool(fact_index)
                    with tf.variable_scope("attention", reuse = reuse):
                        z = tf.concat([tf.multiply(fact_vector, self.question_vector), 
                                       tf.multiply(fact_vector, self.previous_memory),
                                       tf.abs(tf.subtract(fact_vector, self.question_vector)),
                                       tf.abs(tf.subtract(fact_vector, self.previous_memory))], 1)
                        attention = tf.contrib.layers.fully_connected(z,
                                                                    self.embedding_dim,
                                                                    activation_fn=tf.nn.tanh,
                                                                    reuse=reuse, scope="fc1")
                        attention = tf.contrib.layers.fully_connected(attention,
                                                                    1,
                                                                    activation_fn=None,
                                                                    reuse=reuse, scope="fc2")
                        attentions.append(tf.squeeze(attention))
                attentions = tf.expand_dims(tf.nn.softmax(tf.transpose(tf.stack(attentions))), axis=-1)
                # soft attention
                self.context_vector = tf.reduce_sum(tf.multiply(input_fact_vectors, attentions), axis = 1)
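                # memory update: m_t = ReLU(W.[m_{t-1} ; c_t ; q] + b),
                # with W and b shared across the num_steps passes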
                self.previous_memory = tf.nn.relu(tf.matmul(tf.concat([self.previous_memory, self.context_vector, self.question_vector], axis = 1), 
                                                            weight) + bias)
                        
            self.previous_memory = tf.nn.dropout(self.previous_memory, self.dropout_probability)

        ## Answer module
        with tf.variable_scope('answer_module') as scope:
            logits = tf.contrib.layers.fully_connected(inputs = tf.concat([self.previous_memory, self.question_vector], axis = 1),
                                                      num_outputs = self.vocab_size,
                                                      activation_fn = None)

            ## Loss and metrics
            self.loss = tf.reduce_sum(tf.nn.sparse_softmax_cross_entropy_with_logits(logits = logits, labels = self.answers))

            # add l2 regularization for all variables except biases
            for v in tf.trainable_variables():
                if not 'bias' in v.name.lower():
                    self.loss += self.l2_regularization_lambda * tf.nn.l2_loss(v)

            self.optimizer = tf.train.AdamOptimizer(self.learning_rate).minimize(self.loss)

            self.predictions = tf.cast(tf.argmax(tf.nn.softmax(logits), 1), 'int32')
            self.num_correct = tf.reduce_sum(tf.cast(tf.equal(self.predictions, self.answers), tf.int32))
            
        self.sess = tf.Session()
        self.saver = tf.train.Saver()

To train


In [2]:
model = dmn_plus(task = 3)
model.train()


---------------------------------------------------------------------------
KeyboardInterrupt                         Traceback (most recent call last)
<ipython-input-2-d3556abc9293> in <module>()
----> 1 model = dmn_plus(task = 3)
      2 model.train()

<ipython-input-1-4d6b3f426f7b> in __init__(self, task)
     25         self.task = task
     26         self.load_data(self.task)
---> 27         self.create_tensorflow_graph()
     28 
     29     def load_embeddings(self):

<ipython-input-1-4d6b3f426f7b> in create_tensorflow_graph(self)
    322                     self.loss += self.l2_regularization_lambda * tf.nn.l2_loss(v)
    323 
--> 324             self.optimizer = tf.train.AdamOptimizer(self.learning_rate).minimize(self.loss)
    325 
    326             self.predictions = tf.cast(tf.argmax(tf.nn.softmax(logits), 1), 'int32')

/home/amolmane1/.local/lib/python2.7/site-packages/tensorflow/python/training/optimizer.pyc in minimize(self, loss, global_step, var_list, gate_gradients, aggregation_method, colocate_gradients_with_ops, name, grad_loss)
    313         aggregation_method=aggregation_method,
    314         colocate_gradients_with_ops=colocate_gradients_with_ops,
--> 315         grad_loss=grad_loss)
    316 
    317     vars_with_grad = [v for g, v in grads_and_vars if g is not None]

/home/amolmane1/.local/lib/python2.7/site-packages/tensorflow/python/training/optimizer.pyc in compute_gradients(self, loss, var_list, gate_gradients, aggregation_method, colocate_gradients_with_ops, grad_loss)
    384         gate_gradients=(gate_gradients == Optimizer.GATE_OP),
    385         aggregation_method=aggregation_method,
--> 386         colocate_gradients_with_ops=colocate_gradients_with_ops)
    387     if gate_gradients == Optimizer.GATE_GRAPH:
    388       grads = control_flow_ops.tuple(grads)

/home/amolmane1/.local/lib/python2.7/site-packages/tensorflow/python/ops/gradients_impl.pyc in gradients(ys, xs, grad_ys, name, colocate_gradients_with_ops, gate_gradients, aggregation_method)
    566 
    567       # Update pending count for the inputs of op and enqueue ready ops.
--> 568       _UpdatePendingAndEnqueueReady(grads, op, queue, pending_count, loop_state)
    569 
    570   if loop_state:

/home/amolmane1/.local/lib/python2.7/site-packages/tensorflow/python/ops/gradients_impl.pyc in _UpdatePendingAndEnqueueReady(grads, op, queue, pending_count, loop_state)
    589   for x in op.inputs:
    590     # pylint: disable=protected-access
--> 591     pending_count[x.op._id] -= 1
    592     ready = (pending_count[x.op._id] == 0)
    593     if loop_state and not ready:

/home/amolmane1/.local/lib/python2.7/site-packages/tensorflow/python/framework/ops.pyc in _id(self)
   1360   def _id(self):
   1361     """The unique integer id of this operation."""
-> 1362     return self._id_value
   1363 
   1364   @property

KeyboardInterrupt: 

To get the test error after training


In [3]:
model.test()


INFO:tensorflow:Restoring parameters from ../saved-models/task-3.ckpt
Model restored
0 mins, Avg loss: 1.77, Num correct: 173, Accuracy: 17.30%

To get word answers for input-question pairs (for the GUI)


In [4]:
# model.answer_user_data(inputs = ["John went to the bathroom. Adam went to the office. John went back to the garden."], questions = ["Where is John?"])
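
A minimal sketch of wiring answer_user_data into a simple console loop (assuming a dmn_plus built for the same task, with trained weights already saved, and Python 2's raw_input):


# restore the best saved weights for this task before answering
model.restore(model.task)
while True:
    story = raw_input("Story (sentences separated by '.'): ")
    if not story:
        break
    question = raw_input("Question: ")
    # answer_user_data returns one predicted answer word per story/question pair
    print(model.answer_user_data(inputs = [story], questions = [question])[0])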

To get the test error after loading a pre-trained model


In [5]:
# model = dmn_plus(task = 2)
# model.test()