Here we use a simple Encoder/Decoder GRU network to predict answers from the Cornell Movie-Dialog Corpus. We use PyTorch as a deep learning framework.
Most of the code in this notebook comes from the following tutorial on English-French translation.
https://pytorch.org/tutorials/intermediate/seq2seq_translation_tutorial.html
We apply the Machine Translation framework described in the tutorial to Dialogue Management by processing sentences in the corpus in pairs: we encode a sentence, and decode its answer.
In [1]:
from __future__ import unicode_literals, print_function, division
from io import open
import unicodedata
import string
import re
import random
from datetime import datetime
from collections import defaultdict
from six import iteritems
import numpy as np
import torch
import torch.nn as nn
from torch import optim
import torch.nn.functional as F
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
In [2]:
from tqdm import tqdm_notebook as tqdm
In [3]:
DEVICE
Out[3]:
In [4]:
from due.corpora import cornell
import itertools
N_DIALOGS = 100
episodes = list(itertools.islice(cornell.episode_generator(), N_DIALOGS))
# episodes = cornell.load()
In [5]:
episodes[95].events
Out[5]:
In [ ]:
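# Optional: load previously saved episodes from disk instead of the Cornell dialogs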
import pickle
from due.episode import Episode
saved_episodes_filename = 'SW_EPISODES.pkl'
with open(saved_episodes_filename, 'rb') as f:
saved_episodes = pickle.load(f)
episodes = [Episode.load(e) for e in saved_episodes]
In [6]:
from due.nlp.preprocessing import normalize_sentence
In [7]:
s = "Can we make this quick? Roxanne Korrine and Andrew Barrett are having an incredibly horrendous public break- up on the quad. Again."
In [8]:
s_normalized = normalize_sentence(s, False)
print(s_normalized)
Here we generate a dataset of utterances and their responses. The output of this section is:

* a list of utterances (str) X
* a list of answers (str) y, one per utterance in X

Example:

["hi", "hello how are you?", "i'm fine thanks", ...]
["hello how are you?", "i'm fine thanks", "good to hear", ...]

Note that within a single Episode i, y_i is just X_i[1:]. This is not true when X and y are obtained by concatenating data from multiple episodes.
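To make the caveat concrete, here is a toy sketch with made-up sentences:
In [ ]:
X_a = ["hi", "hello how are you?", "i'm fine thanks"]
y_a = X_a[1:] + ["good to hear"]   # within one episode, y_a is X_a shifted by one
X_b = ["what's the time?"]
y_b = ["half past nine"]
X_all = X_a + X_b   # after concatenating episodes, the shift no longer holds:
y_all = y_a + y_b   # y_all[2] is "good to hear", while X_all[3] is "what's the time?"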
In [9]:
from due.event import Event
In [10]:
# from due.episode import extract_utterance_pairs
from functools import lru_cache

def _is_utterance(event):
return event.type == Event.Type.Utterance
def extract_utterance_pairs(episode, preprocess_f=None):
"""
Process Events in an Episode, extracting all the Utterance Event pairs that
    can be interpreted as one dialogue turn (i.e. an Agent's utterance, and a
different Agent's response).
In particular, Event pairs are extracted from the Episode so that:
* Both Events are Utterances (currently, non-utterances will raise an exception)
* The second Event immediately follows the first
* The two Events are acted by two different Agents
    This means that if an utterance has more than one answer, only the first
    one will be included in the result.

    If a `preprocess_f` function is specified, resulting utterances will be run
    through this function before being returned. An LRU cache is applied to
    `preprocess_f`, as most sentences will be returned as both utterances and
    answers.
Return two lists of the same length, so that each utterance `X_i` in the
first list has its response `y_i` in the second.
:param episode: an Episode
:type episode: :class:`due.episode.Episode`
:param preprocess_f: when given, sentences will be run through this function before being returned
:type preprocess_f: `func`
:return: a list of utterances and the list of their answers (one per utterance)
:rtype: (`list`, `list`)
"""
preprocess_f = lru_cache(4)(preprocess_f) if preprocess_f else lambda x: x
result_X = []
result_y = []
for e1, e2 in zip(episode.events, episode.events[1:]):
if not _is_utterance(e1) or not _is_utterance(e2):
raise NotImplementedError("Non-utterance Events are not supported yet")
if e1.agent != e2.agent and e1.payload and e2.payload:
result_X.append(preprocess_f(e1.payload))
result_y.append(preprocess_f(e2.payload))
return result_X, result_y
extract_utterance_pairs(episodes[0])
Out[10]:
In [11]:
from tqdm import tqdm_notebook as tqdm
X = []
y = []
for e in tqdm(episodes):
    try:
        episode_X, episode_y = extract_utterance_pairs(e)
    except AttributeError:
        print("Skipping episode with events: %s" % e.events)
        continue  # without this, the previous episode's pairs would be appended twice
    X.extend(episode_X)
    y.extend(episode_y)
In [12]:
# from due.vocabulary import Vocabulary
from due import __version__
UNK = '<UNK>'
SOS = '<SOS>'
EOS = '<EOS>'
class Vocabulary():
def __init__(self):
self.word_to_index = {}
self.index_to_word = {}
self.index_to_count = defaultdict(int)
self.current_index = 0
self.add_word(UNK) # Unknown token
self.add_word(SOS) # Start of String
self.add_word(EOS) # End of String
def add_word(self, word):
"""
Add a new word to the dictionary.
:param word: the word to add
:type word: `str`
"""
if word in self.word_to_index:
index = self.word_to_index[word]
else:
index = self.current_index
self.current_index += 1
self.word_to_index[word] = index
self.index_to_word[index] = word
self.index_to_count[index] += 1
def index(self, word):
"""
Retrieve a word's index in the Vocabulary. Return the index of the <UNK>
token if not present.
:param word: the word to look up
:type word: `str`
:return: the word's index if existing, *<UNK>*'s index otherwise
:rtype: `int`
"""
if word in self.word_to_index:
return self.word_to_index[word]
return self.word_to_index[UNK]
def word(self, index):
"""
Return the word corresponding to the given index
:param index: the index to look up
:type index: `int`
        :return: the word corresponding to the given index
:rtype: `str`
"""
return self.index_to_word[index]
def size(self):
"""
Return the number of words in the Vocabulary
:return: number of words in the Vocabulary
:rtype: `int`
"""
return len(self.word_to_index)
def save(self):
"""
Return a serializable `dict` representing the Vocabulary.
:return: a serializable representation of self
:rtype: `dict`
"""
return {
'_version': __version__,
'word_to_index': self.word_to_index,
'index_to_word': self.index_to_word,
'index_to_count': self.index_to_count,
'current_index': self.current_index,
}
@staticmethod
def load(data):
result = Vocabulary()
result.word_to_index = data['word_to_index']
result.index_to_word = data['index_to_word']
result.index_to_count = data['index_to_count']
result.current_index = data['current_index']
return result
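A quick illustration of the lookup behavior (hypothetical words; unknown words fall back to the <UNK> index):
In [ ]:
v = Vocabulary()
v.add_word("hello")
print(v.index("hello"))     # 3: first index after the <UNK>, <SOS>, <EOS> specials
print(v.index("goodbye"))   # 0: unseen words map to the <UNK> index
print(v.word(3))            # "hello"
print(v.size())             # 4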
In [13]:
vocabulary_full = Vocabulary()
for sentence in set(X + y):
for word in sentence.split():
vocabulary_full.add_word(word)
In [14]:
vocabulary_full.size()
Out[14]:
In [15]:
def prune_vocabulary(vocabulary, min_occurrences):
"""
    Return a copy of the given vocabulary where words with fewer than
    `min_occurrences` occurrences are removed.
:param vocabulary: a Vocabulary
:type vocabulary: :class:`due.nlp.vocabulary.Vocabulary`
:param min_occurrences: minimum number of occurrences for a word to be kept
:type min_occurrences: `int`
:return: a pruned copy of the given vocabulary
:rtype: :class:`due.nlp.vocabulary.Vocabulary`
"""
result = Vocabulary()
for index, count in iteritems(vocabulary.index_to_count):
if count >= min_occurrences:
result.add_word(vocabulary.word(index))
return result
In [16]:
vocabulary = prune_vocabulary(vocabulary_full, min_occurrences=2)
In [17]:
vocabulary.size()
Out[17]:
We could initialize the model's embedding layer with random weights, but we expect better results using pre-trained word embeddings instead. We chose GloVe 6B, 300d word vectors for this purpose.
To set these vectors as default embeddings for our network, we need to prepare a (vocabulary_size, embedding_dim) matrix where the i-th row is the embedding vector of the word with index i in our vocabulary.
In [18]:
from due import resource_manager
rm = resource_manager
def get_embedding_matrix(vocabulary, embeddings_stream, embedding_dim, stub=False):
"""
    Return a N x D matrix, where N is the number of words in the vocabulary
    and D is the given embeddings' dimensionality. The *i*-th row of the matrix
    contains the embedding of the word with index *i* in the Vocabulary.
Sample usage:
.. code-block:: python
with rm.open_resource_file('embeddings.glove6B', 'glove.6B.300d.txt') as f:
embedding_matrix = get_embedding_matrix(vocabulary, f, 300)
:param vocabulary: a Vocabulary
:type vocabulary: :class:`due.nlp.vocabulary.Vocabulary`
:param embeddings_stream: stream to a resource containing word embeddings in the word2vec format
:type embeddings_stream: *file*
:param embedding_dim: dimensionality of the embeddings
:type embedding_dim: `int`
:param stub: if True, return a random N x D matrix without reading the embedding source
:type stub: bool
"""
if stub:
return np.random.rand(vocabulary.size(), embedding_dim)
unk_index = vocabulary.index(UNK)
    result = np.zeros((vocabulary.size(), embedding_dim))
for line in tqdm(embeddings_stream):
line_split = line.split()
word = line_split[0]
index = vocabulary.index(word)
if index != unk_index:
vector = [float(x) for x in line_split[1:]]
result[index, :] = vector
sos_index = vocabulary.index(SOS)
    result[sos_index, :] = np.ones(embedding_dim)
return result
In [19]:
EMBEDDING_DIM = 300
with rm.open_resource_file('embeddings.glove6B', 'glove.6B.300d.txt') as f:
    # the legacy torch.FloatTensor constructor does not handle CUDA devices; torch.tensor does
    embedding_matrix = torch.tensor(get_embedding_matrix(vocabulary, f, EMBEDDING_DIM), dtype=torch.float, device=DEVICE)
# embedding_matrix = torch.tensor(get_embedding_matrix(vocabulary, None, EMBEDDING_DIM, stub=True), dtype=torch.float, device=DEVICE)
In [20]:
embedding_matrix.size()
Out[20]:
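As a quick sanity check of the conventions above (zero rows for words missing from GloVe, a row of ones for <SOS>), assuming the real GloVe matrix was loaded rather than the random stub:
In [ ]:
assert torch.all(embedding_matrix[vocabulary.index(SOS)] == 1)   # <SOS> row was set to ones
assert torch.all(embedding_matrix[vocabulary.index(UNK)] == 0)   # <UNK> never receives a GloVe vector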
Here we define a simple model that can be trained one sentence pair at a time. To reduce training time and improve generalization, deep learning systems are usually trained in batches; batch training is implemented later on in this Notebook.
Here we define a function to encode a sentence into a Torch tensor of indices
In [21]:
def sentence_to_tensor(sentence):
sentence_indexes = [vocabulary.index(w) for w in sentence.split()]
sentence_indexes.append(vocabulary.index('<EOS>'))
return torch.tensor(sentence_indexes, dtype=torch.long, device=DEVICE).view(-1, 1)
In [22]:
sentence_to_tensor(X[0])
Out[22]:
The model we use is copied straight from the one presented in the reference tutorial (https://pytorch.org/tutorials/intermediate/seq2seq_translation_tutorial.html).
Note that attention is not implemented yet.
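For reference, this is roughly what the tutorial's attention decoder looks like. It is only a sketch adapted from the tutorial, not used in this notebook; note that it assumes the embedding dimension equals the hidden size, which our setup (300d GloVe vectors, hidden size 512) does not satisfy.
In [ ]:
class AttnDecoderRNN(nn.Module):
    """Attention decoder sketched from the reference tutorial; not used below."""
    def __init__(self, hidden_size, output_size, dropout_p=0.1, max_length=500):
        super(AttnDecoderRNN, self).__init__()
        self.hidden_size = hidden_size
        self.max_length = max_length
        self.embedding = nn.Embedding(output_size, hidden_size)
        self.attn = nn.Linear(hidden_size * 2, max_length)            # scores over encoder positions
        self.attn_combine = nn.Linear(hidden_size * 2, hidden_size)
        self.dropout = nn.Dropout(dropout_p)
        self.gru = nn.GRU(hidden_size, hidden_size)
        self.out = nn.Linear(hidden_size, output_size)

    def forward(self, input_data, hidden, encoder_outputs):
        embedded = self.dropout(self.embedding(input_data).view(1, 1, -1))
        # attention weights computed from the current input embedding and hidden state
        attn_weights = F.softmax(self.attn(torch.cat((embedded[0], hidden[0]), 1)), dim=1)
        # weighted sum of the encoder outputs
        attn_applied = torch.bmm(attn_weights.unsqueeze(0), encoder_outputs.unsqueeze(0))
        output = self.attn_combine(torch.cat((embedded[0], attn_applied[0]), 1)).unsqueeze(0)
        output, hidden = self.gru(F.relu(output), hidden)
        return F.log_softmax(self.out(output[0]), dim=1), hidden, attn_weights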
In [23]:
class EncoderRNN(nn.Module):
def __init__(self, hidden_size, embedding_matrix):
super(EncoderRNN, self).__init__()
self.hidden_size = hidden_size
# self.embedding = nn.Embedding(vocabulary_size, embedding_size) # random init
self.embedding = nn.Embedding.from_pretrained(embedding_matrix, freeze=False)
embedding_dim = self.embedding.embedding_dim
self.gru = nn.GRU(embedding_dim, hidden_size)
def forward(self, input_data, hidden):
embedded = self.embedding(input_data).view(1, 1, -1)
output = embedded
output, hidden = self.gru(output, hidden)
return output, hidden
def init_hidden(self):
return torch.zeros(1, 1, self.hidden_size, device=DEVICE)
In [24]:
class DecoderRNN(nn.Module):
def __init__(self, hidden_size, embedding_matrix):
super(DecoderRNN, self).__init__()
self.hidden_size = hidden_size
# self.embedding = nn.Embedding(vocabulary_size, embedding_size)
self.embedding = nn.Embedding.from_pretrained(embedding_matrix, freeze=False)
embedding_dim = self.embedding.embedding_dim
vocabulary_size = self.embedding.num_embeddings
self.gru = nn.GRU(embedding_dim, hidden_size)
self.out = nn.Linear(hidden_size, vocabulary_size)
self.softmax = nn.LogSoftmax(dim=1)
def forward(self, input_data, hidden):
output = self.embedding(input_data).view(1, 1, -1)
output = F.relu(output)
output, hidden = self.gru(output, hidden)
output = self.out(output[0])
output = self.softmax(output)
return output, hidden
def init_hidden(self):
return torch.zeros(1, 1, self.hidden_size, device=DEVICE)
In [25]:
import random
In [26]:
TEACHER_FORCING_RATIO = 0.5
MAX_LENGTH = 500 # Will raise an error if a longer sentence is encountered
def train(input_tensor, target_tensor, encoder, decoder, encoder_optimizer, decoder_optimizer, criterion, max_length=MAX_LENGTH):
encoder_hidden = encoder.init_hidden()
encoder_optimizer.zero_grad()
decoder_optimizer.zero_grad()
input_length = input_tensor.size(0)
target_length = target_tensor.size(0)
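    # encoder_outputs is kept for the tutorial's attention mechanism; our DecoderRNN does not use it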
encoder_outputs = torch.zeros(max_length, encoder.hidden_size, device=DEVICE)
loss = 0
for ei in range(input_length):
encoder_output, encoder_hidden = encoder(input_tensor[ei], encoder_hidden)
encoder_outputs[ei] = encoder_output[0, 0]
decoder_input = torch.tensor([[vocabulary.index('<SOS>')]], device=DEVICE)
decoder_hidden = encoder_hidden
    # use_teacher_forcing = True if random.random() < TEACHER_FORCING_RATIO else False
    use_teacher_forcing = True  # teacher forcing is always on for now
if use_teacher_forcing:
for di in range(target_length):
decoder_output, decoder_hidden = decoder(decoder_input, decoder_hidden)
loss += criterion(decoder_output, target_tensor[di])
decoder_input = target_tensor[di]
loss.backward()
encoder_optimizer.step()
decoder_optimizer.step()
return loss.item() / target_length
In [27]:
from datetime import datetime
LEARNING_RATE = 0.01
VOCABULARY_SIZE = vocabulary.size()
EMBEDDING_SIZE = 300
HIDDEN_SIZE = 512
encoder = EncoderRNN(HIDDEN_SIZE, embedding_matrix).to(DEVICE)
decoder = DecoderRNN(HIDDEN_SIZE, embedding_matrix).to(DEVICE)
encoder_optimizer = optim.SGD(encoder.parameters(), lr=LEARNING_RATE)
decoder_optimizer = optim.SGD(decoder.parameters(), lr=LEARNING_RATE)
criterion = nn.NLLLoss()
epoch = 0
In [28]:
PRINT_EVERY = 50
i = 1
tick = datetime.now()
loss_sum = 0.0
for input_sentence, target_sentence in tqdm(zip(X, y)):
input_tensor = sentence_to_tensor(input_sentence)
target_tensor = sentence_to_tensor(target_sentence)
loss = train(input_tensor, target_tensor, encoder, decoder, encoder_optimizer, decoder_optimizer, criterion)
loss_sum += loss
if i%PRINT_EVERY == 0:
print(i, loss_sum/PRINT_EVERY)
loss_sum = 0.0
i += 1
tock = datetime.now()
epoch += 1
print(tock-tick)
print(i, loss_sum/PRINT_EVERY)
In [29]:
# TODO
In [30]:
def predict_answer(input_sentence, vocabulary, encoder, decoder):
result = []
input_tensor = sentence_to_tensor(input_sentence)
input_length = input_tensor.size(0)
encoder_hidden = encoder.init_hidden()
for ei in range(input_length):
_, encoder_hidden = encoder(input_tensor[ei], encoder_hidden)
decoder_input = torch.tensor([[vocabulary.index('<SOS>')]], device=DEVICE)
decoder_hidden = encoder_hidden
for di in range(MAX_LENGTH):
decoder_output, decoder_hidden = decoder(decoder_input, decoder_hidden)
topv, topi = decoder_output.topk(1)
decoder_input = topi.squeeze().detach()
predicted_index = decoder_input.item()
if predicted_index == vocabulary.index('<EOS>'):
break
result.append(vocabulary.word(predicted_index))
return " ".join(result)
In [31]:
predict_answer("what's the meaning of life?'", vocabulary, encoder, decoder)
Out[31]:
Instead of feeding sentence pairs one by one, we want the training procedure to predict a number of samples before computing the loss and completing the optimization step. This is called batch training.
The code below is inspired by https://github.com/pengyuchen/PyTorch-Batch-Seq2seq/blob/master/seq2seq_translation_tutorial.py
In [32]:
# Fake embedding layer
embedding = nn.Embedding(5, 10).to(DEVICE)
In [33]:
# Single sentence tensor
sentence_indexes = [1, 2, 3]
sentence_tensor = torch.tensor(sentence_indexes, dtype=torch.long, device=DEVICE).view(-1, 1)
input_data = sentence_tensor[0]
input_data
Out[33]:
In [34]:
BATCH_SIZE = 2
# Batch tensor
input_batch = torch.tensor([1, 4], device=DEVICE).view(-1, 1)
input_batch
Out[34]:
In [35]:
embedding(input_data)
Out[35]:
In [36]:
embedding(input_batch)
Out[36]:
In [37]:
embedding(input_batch).view(1, BATCH_SIZE, -1)
Out[37]:
In [38]:
class EncoderRNNBatch(nn.Module):
def __init__(self, hidden_size, embedding_matrix):
super(EncoderRNNBatch, self).__init__()
self.hidden_size = hidden_size
self.embedding = nn.Embedding.from_pretrained(embedding_matrix, freeze=False)
embedding_dim = self.embedding.embedding_dim
self.gru = nn.GRU(embedding_dim, hidden_size)
def forward(self, input_data, batch_size, hidden):
embedded = self.embedding(input_data).view(1, batch_size, -1)
output = embedded
output, hidden = self.gru(output, hidden)
return output, hidden
def init_hidden(self, batch_size):
return torch.zeros(1, batch_size, self.hidden_size, device=DEVICE)
In [39]:
encoder = EncoderRNN(32, embedding_matrix).to(DEVICE)
encoder_batch = EncoderRNNBatch(32, embedding_matrix).to(DEVICE)
In [40]:
# 1-by-1 model
encoder_hidden = encoder.init_hidden()
encoder(input_data, encoder_hidden)
Out[40]:
In [41]:
# Batch model
encoder_hidden_batch = encoder_batch.init_hidden(BATCH_SIZE)
encoder_batch(input_batch, BATCH_SIZE, encoder_hidden_batch)
Out[41]:
In [42]:
class DecoderRNNBatch(nn.Module):
def __init__(self, hidden_size, embedding_matrix):
super(DecoderRNNBatch, self).__init__()
self.hidden_size = hidden_size
self.embedding = nn.Embedding.from_pretrained(embedding_matrix, freeze=False)
embedding_dim = self.embedding.embedding_dim
vocabulary_size = self.embedding.num_embeddings
self.gru = nn.GRU(embedding_dim, hidden_size)
self.out = nn.Linear(hidden_size, vocabulary_size)
self.softmax = nn.LogSoftmax(dim=1)
def forward(self, input_data, batch_size, hidden):
output = self.embedding(input_data).view(1, batch_size, -1)
output = F.relu(output)
output, hidden = self.gru(output, hidden)
output = self.out(output[0])
output = self.softmax(output)
return output, hidden
def init_hidden(self, batch_size):
        return torch.zeros(1, batch_size, self.hidden_size, device=DEVICE)
In [43]:
# vocabulary_size=10, embedding_dim=5
toy_embedding_matrix = torch.tensor(np.random.rand(10, 5), dtype=torch.float, device=DEVICE)
In [44]:
decoder = DecoderRNN(32, toy_embedding_matrix).to(DEVICE)
decoder_batch = DecoderRNNBatch(32, toy_embedding_matrix).to(DEVICE)
In [45]:
# 1-by-1 model
decoder_input = torch.tensor([[vocabulary.index('<SOS>')]], device=DEVICE)
decoder_hidden = encoder_hidden
decoder(decoder_input, decoder_hidden)
Out[45]:
In [46]:
# Batch model
decoder_input_batch = torch.tensor([[vocabulary.index('<SOS>')]*BATCH_SIZE], device=DEVICE)
decoder_hidden_batch = encoder_hidden_batch
decoder_batch(decoder_input_batch, BATCH_SIZE, decoder_hidden_batch)
Out[46]:
In [47]:
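# Drop the toy models and tensors above so their (GPU) memory can be reclaimed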
try:
    del encoder
    del encoder_batch
    del decoder
    del decoder_batch
del encoder_hidden
del encoder_hidden_batch
del decoder_input
del decoder_hidden
del decoder_input_batch
del decoder_hidden_batch
except NameError:
pass
In [48]:
def batches(X, y, batch_size):
"""
Generate two sequences of batches from the input lists `X` and `y`, so that
each batch contains `batch_size` elements.
>>> list(batches([0, 1, 2, 3, 4, 5, 6], ['a', 'b', 'c', 'd', 'e', 'f', 'g'], 3))
[([0, 1, 2], ['a', 'b', 'c']), ([3, 4, 5], ['d', 'e', 'f']), ([6], ['g'])]
:param X: a sequence of elements
:type X: `list`
:param y: a sequence of elements
:type y: `list`
:param batch_size: number of elements in each batch
:type batch_size: `int`
:return: a generator of the list of batches
:rtype: `list` of (`list`, `list`)
"""
for i in range(int(np.ceil(len(X)/batch_size))):
start_index = i*batch_size
end_index = start_index + batch_size
yield X[start_index:end_index], y[start_index:end_index]
In [49]:
list(batches([0, 1, 2, 3, 4, 5, 6], ['a', 'b', 'c', 'd', 'e', 'f', 'g'], 3))
Out[49]:
In [50]:
sentence_to_tensor(X[0])[0] # Input of normal encoder
Out[50]:
In [51]:
input_batch # What we want
Out[51]:
In [52]:
def pad_sequence(sequence, pad_value, final_length):
"""
    Trim the sequence if it is longer than `final_length`; pad it with `pad_value`
    if it is shorter. In any case, at least one pad element will be left at the end
    of the sequence (this is because we usually pad with the <EOS> token).
>>> pad_sequence([1, 2, 3], 0, 5)
[1, 2, 3, 0, 0]
>>> pad_sequence([1, 2, 3, 4, 5, 6, 7, 8, 9, 10], 0, 5)
[1, 2, 3, 4, 0]
:param sequence: any sequence of elements
:type sequence: `list`-like
:param pad_value: a value to pad the sequence with
:type pad_value: *any*
:param final_length: length of the final sequence
:type final_length: `int`
:return: the padded (or shortened) sequence, with at least one trailing `pad_value`
:rtype: `list`
"""
if len(sequence) >= final_length:
result = sequence[:final_length]
result[-1] = pad_value
return result
return sequence + [pad_value] * (final_length - len(sequence))
In [53]:
pad_sequence([1, 2, 3], 0, 5)
Out[53]:
In [54]:
pad_sequence([1, 2, 3, 4, 5, 6, 7, 8, 9, 10], 0, 5)
Out[54]:
In [55]:
a = np.array([[1, 2, 3], [4, 5, 6]])
a = a.transpose()
np.expand_dims(a, 2)[0]
Out[55]:
In [56]:
def batch_to_tensor(batch, vocabulary, max_words=None, device=None):
"""
    Receive a list of sentences (strings) and return a (*n_words* x *batch_size* x 1)
    tensor `m`, where `m[i]` is a *batch_size* x 1 array such that `m[i][j]` contains
    the index of the *i*-th word of the *j*-th sentence in the batch.

    The **maximum number of words** in the sentences can be limited with
    `max_words`. If `max_words` is not set, the limit is the length of the
    longest sentence in the batch.
Sentences that are shorter than the maximum length in the resulting matrix
will be **padded** with EOS. At least one EOS token is appended to every
sentence in the resulting matrix.
    :param batch: a list of sentences
:type batch: `list` of `str`
:param vocabulary: a Vocabulary to look up word indexes
:type vocabulary: :class:`due.nlp.vocabulary.Vocabulary`
    :param max_words: sentences longer than `max_words` words will be trimmed
:type max_words: `int`
:param device: a Torch device to map the tensor to (eg. `torch.device("cuda")`)
:type device: :class:`torch.device`
    :return: a (*n_words* x *batch_size* x 1) Torch tensor of word indexes
:rtype: :class:`torch.tensor`
"""
sentence_indexes = [[vocabulary.index(w) for w in sentence.split()] for sentence in batch]
max_length = max([len(x) for x in sentence_indexes])
if max_words:
max_length = min(max_length, max_words)
sentence_indexes = [pad_sequence(s, vocabulary.index(EOS), max_length+1) for s in sentence_indexes]
result = np.transpose(sentence_indexes)
result = np.expand_dims(result, axis=2)
return torch.tensor(result, dtype=torch.long, device=device)
In [57]:
batch = ['this is a sentence', 'this is another much longer sentence', 'short sentence']
batch_tensor = batch_to_tensor(batch, vocabulary, device=DEVICE)
n_words = batch_tensor.size(0)
batch_size = batch_tensor.size(1)
first_word = batch_tensor[0]
print(n_words)
print(batch_size)
print(first_word)
In [58]:
torch.cuda.empty_cache()
In [59]:
TEACHER_FORCING_RATIO = 1.
MAX_LENGTH = 20
def train_batch(input_tensor, target_tensor, encoder, decoder, encoder_optimizer, decoder_optimizer, criterion, max_length=MAX_LENGTH):
batch_size = input_tensor.size(1)
encoder_hidden = encoder.init_hidden(batch_size)
encoder_optimizer.zero_grad()
decoder_optimizer.zero_grad()
input_length = input_tensor.size(0)
target_length = target_tensor.size(0)
# encoder_outputs = torch.zeros(max_length, encoder.hidden_size, device=DEVICE)
loss = 0
for ei in range(input_length):
encoder_output, encoder_hidden = encoder(input_tensor[ei], batch_size, encoder_hidden)
# encoder_outputs[ei] = encoder_output[0, 0]
decoder_input = torch.tensor([[vocabulary.index('<SOS>')]*batch_size], device=DEVICE)
decoder_hidden = encoder_hidden
    # use_teacher_forcing = True if random.random() < TEACHER_FORCING_RATIO else False
    use_teacher_forcing = True  # teacher forcing is always on for now
if use_teacher_forcing:
for di in range(target_length):
decoder_output, decoder_hidden = decoder(decoder_input, batch_size, decoder_hidden)
# print("decoder_output:", decoder_output, decoder_output.size())
# print("target_tensor[di]:", target_tensor[di], target_tensor[di].size())
loss += criterion(decoder_output, target_tensor[di].view(batch_size))
decoder_input = target_tensor[di]
else:
eos_tensor = torch.tensor([vocabulary.index('<EOS>')], device=DEVICE)
for di in range(target_length):
decoder_output, decoder_hidden = decoder(decoder_input, batch_size, decoder_hidden)
topv, topi = decoder_output.topk(1)
decoder_input = topi.squeeze().detach()
            target_words = target_tensor[di].view(batch_size)
            loss += criterion(decoder_output, target_words)
            if (target_words == eos_tensor).all():  # stop once every target sentence has ended
                break
loss.backward()
encoder_optimizer.step()
decoder_optimizer.step()
return loss.item() / target_length
In [60]:
LEARNING_RATE = 0.01
VOCABULARY_SIZE = vocabulary.size()
EMBEDDING_SIZE = 300
HIDDEN_SIZE = 512
encoder = EncoderRNNBatch(HIDDEN_SIZE, embedding_matrix).to(DEVICE)
decoder = DecoderRNNBatch(HIDDEN_SIZE, embedding_matrix).to(DEVICE)
In [61]:
encoder_optimizer = optim.SGD(encoder.parameters(), lr=LEARNING_RATE)
decoder_optimizer = optim.SGD(decoder.parameters(), lr=LEARNING_RATE)
criterion = nn.NLLLoss()
epoch = 0
In [62]:
len(X)
Out[62]:
In [63]:
def predict_answer_batch(input_sentence, vocabulary, encoder, decoder):
result = []
input_tensor = batch_to_tensor([input_sentence], vocabulary, device=DEVICE)
input_length = input_tensor.size(0)
batch_size = input_tensor.size(1)
encoder_hidden = encoder.init_hidden(batch_size)
for ei in range(input_length):
_, encoder_hidden = encoder(input_tensor[ei], batch_size, encoder_hidden)
decoder_input = torch.tensor([[vocabulary.index('<SOS>')] * batch_size], device=DEVICE)
decoder_hidden = encoder_hidden
for di in range(MAX_LENGTH):
decoder_output, decoder_hidden = decoder(decoder_input, batch_size, decoder_hidden)
topv, topi = decoder_output.topk(1)
decoder_input = topi.squeeze().detach()
# print(decoder_output)
predicted_index = decoder_input.item()
if predicted_index == vocabulary.index('<EOS>'):
break
result.append(vocabulary.word(predicted_index))
return " ".join(result)
In [64]:
BATCH_SIZE = 64
PRINT_EVERY = 500
EPOCHS = 1
for _ in range(EPOCHS):
i = 1
tick = datetime.now()
loss_sum = 0.0
loss_sum_partial = 0.0
for input_batch, target_batch in tqdm(batches(X, y, BATCH_SIZE)):
input_tensor = batch_to_tensor(input_batch, vocabulary, MAX_LENGTH, device=DEVICE)
target_tensor = batch_to_tensor(target_batch, vocabulary, MAX_LENGTH, device=DEVICE)
loss = train_batch(input_tensor, target_tensor, encoder, decoder, encoder_optimizer, decoder_optimizer, criterion)
loss_sum += loss
loss_sum_partial += loss
if i%PRINT_EVERY == 0:
print(i, loss_sum_partial/PRINT_EVERY)
loss_sum_partial = 0.0
i += 1
tock = datetime.now()
epoch += 1
print(tock-tick)
print(i, loss_sum/i)
print(predict_answer_batch("hi", vocabulary, encoder, decoder))
print(predict_answer_batch("how are you?", vocabulary, encoder, decoder))
print(predict_answer_batch("what's your name?", vocabulary, encoder, decoder))
print(predict_answer_batch("My name is Anna", vocabulary, encoder, decoder))
print(predict_answer_batch("What's the meaning of life?", vocabulary, encoder, decoder))
print()
In [65]:
epoch
Out[65]:
In [66]:
MODEL_NAME = "encdec-cornell-TEST"
model_filename = "%s_MODEL.pt" % MODEL_NAME
dataset_filename = "%s_DATASET.pt" % MODEL_NAME
In [67]:
model = {
'encoder': encoder.state_dict(),
'decoder': decoder.state_dict(),
'epoch': epoch,
'embedding_matrix': embedding_matrix
}
torch.save(model, model_filename)
In [68]:
dataset = {
"X": X,
"y": y,
"vocabulary": vocabulary.save()
}
torch.save(dataset, dataset_filename)
In [69]:
from due.nlp.preprocessing import normalize_sentence
from due.nlp.vocabulary import Vocabulary, get_embedding_matrix
In [70]:
dataset_deserialized = torch.load(dataset_filename)
X_deserialized = dataset_deserialized["X"]
y_deserialized = dataset_deserialized["y"]
vocabulary_deserialized = Vocabulary.load(dataset_deserialized['vocabulary'])
In [72]:
model_deserialized = torch.load(model_filename)
embedding_matrix_deserialized = model_deserialized['embedding_matrix']
encoder_deserialized = EncoderRNNBatch(HIDDEN_SIZE, embedding_matrix_deserialized).to(DEVICE)
encoder_deserialized.load_state_dict(model_deserialized['encoder'])
decoder_deserialized = DecoderRNNBatch(HIDDEN_SIZE, embedding_matrix_deserialized).to(DEVICE)
decoder_deserialized.load_state_dict(model_deserialized['decoder'])
epoch_deserialized = model_deserialized['epoch']
In [73]:
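# Use the library versions of the batching helpers defined earlier in this notebook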
from due.nlp.batches import batches, pad_sequence, batch_to_tensor
In [74]:
X = X_deserialized
y = y_deserialized
vocabulary = vocabulary_deserialized
encoder = encoder_deserialized
decoder = decoder_deserialized
epoch = epoch_deserialized
criterion = nn.NLLLoss()
In [75]:
from datetime import datetime
LEARNING_RATE = 0.01
VOCABULARY_SIZE = vocabulary.size()
EMBEDDING_SIZE = 300
HIDDEN_SIZE = 512
BATCH_SIZE = 64
embedding_matrix = embedding_matrix_deserialized
encoder_optimizer = optim.SGD(encoder.parameters(), lr=LEARNING_RATE)
decoder_optimizer = optim.SGD(decoder.parameters(), lr=LEARNING_RATE)
criterion = nn.NLLLoss()
In [ ]: