In [1]:
%pylab inline
In [2]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.autograd import Variable
USE_CUDA = False
SHOW_ATTENTION = True
In [3]:
from nalgene.generate import *
parsed = parse_file('.', 'grammar.nlg')
parsed.map_leaves(tokenizeLeaf)
In [4]:
import re
import torch
from torchtext.vocab import load_word_vectors
def tokenize_sentence(s):
s = s.lower()
s = re.sub(r'(\d)', r'\1 ', s)
s = re.sub(r'[^a-z0-9 \']', ' ', s)
s = re.sub(r'\s+', ' ', s).strip()
return s.split(' ')
class GloVeLang:
def __init__(self, size):
self.size = size
base_dir = '../pytorch-seq2seq-intent-parsing/data/'
glove_dict, glove_arr, glove_size = load_word_vectors(base_dir, 'glove.twitter.27B', size)
self.glove_dict = glove_dict
self.glove_arr = glove_arr
def __str__(self):
return "%s(size = %d)" % (self.__class__.__name__, self.size)
def vector_from_word(self, word):
if word in self.glove_dict:
return self.glove_arr[self.glove_dict[word]]
else:
return torch.zeros(self.size)
def tokens_to_tensor(self, words):
tensor = torch.zeros(len(words), 1, self.size)
for wi in range(len(words)):
word = words[wi]
tensor[wi][0] = self.vector_from_word(word)
return tensor
input_lang = GloVeLang(100)
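As a quick sanity check of the tokenizer (illustrative only), note that digits get split apart, which is why numbers show up as separate characters in the examples later on:
print(tokenize_sentence("Turn the volume up to 30!"))
# -> ['turn', 'the', 'volume', 'up', 'to', '3', '0']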
In [5]:
def descend(node, fn, child_type='phrase', returns=None):
if returns is None: returns = []
returned = fn(node)
returns.append(returned)
for child in node.children:
if (child_type is None) or (child.type == child_type):
descend(child, fn, child_type, returns)
return returns
In [6]:
def ascend(node, fn):
if node.parent is None:
return fn(node)
else:
return ascend(node.parent, fn)
In [7]:
# input_tokens = []
# def get_input_tokens(node):
# if node.type == 'word':
# input_tokens.append(node.key)
# descend(parsed, get_input_tokens, None)
# input_tokens = list(set(input_tokens))
# input_tokens = ['EOS'] + input_tokens
# print(input_tokens)
For output tokens, we can just take the top-level node names that are either phrases or variables.
In [8]:
output_tokens = [child.key for child in parsed.children if child.type in ['phrase', 'ref', 'value']]
output_tokens = ['EOS'] + output_tokens
print(output_tokens)
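The exact list depends on grammar.nlg, but it looks something like this (hypothetical contents and ordering):
# ['EOS', '%', '%setLightState', '%getLightState', '$light', '$on_off', ...]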
In [9]:
def words_for_position(words, position):
if position is None:
return words
start, end, length = position
return words[start : end + 1]
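Positions are (start, end, length) tuples with an inclusive end, so for example:
print(words_for_position(['turn', 'off', 'the', 'office', 'light'], (3, 4, 2)))
# -> ['office', 'light']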
In [10]:
def relative_position(node, parent):
if parent.position is None:
return node.position
return node.position[0] - parent.position[0], node.position[1] - parent.position[0], node.position[2]
In [11]:
def data_for_node(flat, node):
words = [child.key for child in flat.children]
inputs = words_for_position(words, node.position)
keys = [child.key for child in node.children]
positions = [relative_position(child, node) for child in node.children]
return node.key, inputs, list(zip(keys, positions))
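For a walked sentence like "turn off the office light" under a %setLightState node, the returned triple has roughly this shape (hypothetical positions; the real ones come from walk_tree):
example = ('%setLightState',
           ['turn', 'off', 'the', 'office', 'light'],
           [('$on_off', (1, 1, 1)), ('$light', (3, 4, 2))])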
In [12]:
def tokens_to_tensor(tokens, source_tokens, append_eos=True):
indexes = []
for token in tokens:
indexes.append(source_tokens.index(token))
if append_eos:
indexes.append(0)
return torch.LongTensor(indexes)
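With a toy token list, target tokens become indices into the source list, with EOS (index 0) appended:
print(tokens_to_tensor(['$light'], ['EOS', '%setLightState', '$light']))
# contains [2, 0]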
In [13]:
def ranges_to_tensor(ranges, seq_len):
ranges_tensor = torch.zeros(len(ranges), seq_len)
for r in range(len(ranges)):
start, end, _ = ranges[r]
ranges_tensor[r, start:end+1] = 1
return ranges_tensor
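These are the attention targets: each row is a 0/1 mask over the input words. Continuing the hypothetical example above:
print(ranges_to_tensor([(1, 1, 1), (3, 4, 2)], 5))
# row 0: [0, 1, 0, 0, 0]   ($on_off covers "off")
# row 1: [0, 0, 0, 1, 1]   ($light covers "office light")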
The core model is a regular seq2seq/encoder-decoder model with attention. The attention mechanism is from Luong et al.'s "Effective Approaches to Attention-based Neural Machine Translation", using dot-product attention energies, with one important difference: there is no softmax layer, so attention can focus on multiple tokens at once. Instead, a sigmoid layer squashes the outputs to the range 0 to 1.
The encoder and decoder take one additional input, the context, which represents the type of phrase, e.g. %setLightState. At the top-level node the context is always %.
The encoder encodes the input sequence into a series of vectors using a bidirectional GRU. The decoder "translates" these into a sequence of phrase tokens, given the encoder outputs and the current context, e.g. "turn off the office light" + %setLightState → [$on_off, $light].
Once the decoder has chosen tokens and alignments, each phrase token and its selected input words become the context and inputs of the next iteration. This recurs until no more phrase tokens are found.
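To make the sigmoid vs. softmax difference concrete, here is a quick illustrative comparison (not part of the model): with a sigmoid, several tokens can receive a high weight at once, while a softmax forces them to compete.
energies = torch.Tensor([3.0, 2.5, -4.0])               # hypothetical attention energies
print(torch.sigmoid(energies))                          # ~[0.95, 0.92, 0.02] -- two tokens stay "on"
print(torch.exp(energies) / torch.exp(energies).sum())  # ~[0.62, 0.38, 0.00] -- softmax picks a winner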
In [14]:
class Encoder(nn.Module):
def __init__(self, input_size, hidden_size, n_layers=1, dropout=0.05):
super(Encoder, self).__init__()
self.input_size = input_size
self.hidden_size = hidden_size
self.n_layers = n_layers
self.embedding = nn.Linear(input_size, hidden_size)
self.gru = nn.GRU(hidden_size, hidden_size, n_layers, dropout=dropout, bidirectional=True)
def forward(self, context_input, word_inputs):
# TODO: Incorporate context input
# TODO: Batching
seq_len = word_inputs.size(0)
batch_size = word_inputs.size(1)
embedded = self.embedding(word_inputs.view(seq_len * batch_size, -1)) # Process seq x batch at once
output = embedded.view(seq_len, batch_size, -1) # Resize back to seq x batch for RNN
outputs, hidden = self.gru(output)
outputs = outputs[:, :, :self.hidden_size] + outputs[:, :, self.hidden_size:] # Sum bidirectional outputs
return outputs, hidden
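A quick shape check (illustrative; the context input is ignored for now, per the TODO above):
enc = Encoder(100, 100)
dummy = Variable(torch.zeros(5, 1, 100))   # 5 words, batch of 1, 100-dim GloVe vectors
outs, hidden = enc(None, dummy)
print(outs.size(), hidden.size())          # (5, 1, 100) and (2, 1, 100); 2 = n_layers * num_directions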
In [15]:
class Attention(nn.Module):
def __init__(self):
super(Attention, self).__init__()
def forward(self, hidden, encoder_outputs):
seq_len = len(encoder_outputs)
# Create variable to store attention energies
attention_energies = Variable(torch.zeros(seq_len)) # S (one energy per encoder output)
if USE_CUDA: attention_energies = attention_energies.cuda()
# Calculate energies for each encoder output
for i in range(seq_len):
attention_energies[i] = hidden.dot(encoder_outputs[i])
# Squeeze to range 0 to 1, resize to 1 x 1 x seq_len
return F.sigmoid(attention_energies).unsqueeze(0).unsqueeze(0)
In [16]:
class Decoder(nn.Module):
def __init__(self, hidden_size, output_size, n_layers=1, dropout=0.05):
super(Decoder, self).__init__()
# Keep parameters for reference
self.hidden_size = hidden_size
self.output_size = output_size
self.n_layers = n_layers
self.dropout = dropout
# Define layers
self.embedding = nn.Embedding(output_size, hidden_size)
self.gru = nn.GRU(hidden_size * 2, hidden_size, n_layers, dropout=dropout)
self.out = nn.Linear(hidden_size * 2, output_size)
# Attention module
self.attention = Attention()
def forward(self, context_input, word_input, last_hidden, encoder_outputs):
# Note: we run this one step at a time
# TODO: Batching
# Get the embedding of the current input word (last output word)
word_embedded = self.embedding(word_input).view(1, 1, -1) # S=1 x B x N
# Combine context and embedded word, through RNN
rnn_input = torch.cat((context_input.unsqueeze(0), word_embedded), 2)
rnn_output, hidden = self.gru(rnn_input, last_hidden)
# Calculate attention from current RNN state and all encoder outputs; apply to encoder outputs
attention_weights = self.attention(rnn_output.squeeze(0), encoder_outputs)
context = attention_weights.bmm(encoder_outputs.transpose(0, 1)) # B x 1 x N
# Final output layer (next word prediction) using the RNN hidden state and context vector
rnn_output = rnn_output.squeeze(0) # S=1 x B x N -> B x N
context = context.squeeze(1) # B x S=1 x N -> B x N
output = F.log_softmax(self.out(torch.cat((rnn_output, context), 1)))
# Return final output, hidden state, and attention weights (for visualization)
return output, hidden, attention_weights
We can turn the whole thing into one module by combining the Encoder and Decoder networks. There's also an embedding layer for the context tokens.
In [17]:
MAX_LENGTH = 50
class RARNN(nn.Module):
def __init__(self, input_size, output_tokens, hidden_size):
super(RARNN, self).__init__()
self.output_tokens = output_tokens
self.input_size = input_size
self.output_size = len(output_tokens)
self.hidden_size = hidden_size
self.embedding = nn.Embedding(self.output_size, hidden_size)
self.encoder = Encoder(self.input_size, hidden_size)
self.decoder = Decoder(hidden_size, self.output_size)
def forward(self, context_input, word_inputs, word_targets=None):
# Get embedding for context input
context_embedded = self.embedding(context_input)
input_len = word_inputs.size(0)
target_len = word_targets.size(0) if word_targets is not None else MAX_LENGTH
# Run through encoder
encoder_outputs, encoder_hidden = self.encoder(context_embedded, word_inputs)
decoder_hidden = encoder_hidden # Use encoder's last hidden state
decoder_input = Variable(torch.LongTensor([0])) # EOS/SOS token
if USE_CUDA:
decoder_input = decoder_input.cuda()
# Variables to store decoder and attention outputs
decoder_outputs = Variable(torch.zeros(target_len, self.output_size))
decoder_attentions = Variable(torch.zeros(target_len, input_len))
if USE_CUDA:
decoder_outputs = decoder_outputs.cuda()
decoder_attentions = decoder_attentions.cuda()
# Run through decoder
for i in range(target_len):
decoder_output, decoder_hidden, decoder_attention = self.decoder(context_embedded, decoder_input, decoder_hidden, encoder_outputs)
decoder_outputs[i] = decoder_output
decoder_attentions[i] = decoder_attention
# Teacher forcing with known targets, if provided
if word_targets is not None:
decoder_input = word_targets[i]
# Sample with last outputs
else:
max_index = decoder_output.topk(1)[1].data[0][0]
decoder_input = Variable(torch.LongTensor([max_index]))
if USE_CUDA:
decoder_input = decoder_input.cuda()
if max_index == 0: break # EOS
# Slice outputs
if word_targets is None:
if i > 0:
decoder_outputs = decoder_outputs[:i]
decoder_attentions = decoder_attentions[:i]
else:
decoder_outputs = Variable(torch.Tensor())
decoder_attentions = Variable(torch.Tensor())
elif target_len > 1:
decoder_attentions = decoder_attentions[:-1] # Ignore attentions on EOS
return decoder_outputs, decoder_attentions
The inputs to the network are the current phrase label, e.g. %getLightState, and the string to parse, e.g. "the living room light is on". The outputs are the child node labels, $light and $on_off, each with a selection of words given by attention-like weights over the input sequence, thresholded into boolean values.
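To make the thresholding step concrete, here is an illustrative example with made-up attention weights (the real ones come from the Attention module):
words = "the living room light is on".split(' ')
attn  = [0.1, 0.9, 0.8, 0.7, 0.2, 0.1]              # hypothetical weights for the $light output step
print([w for w, a in zip(words, attn) if a > 0.5])  # -> ['living', 'room', 'light']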
In [18]:
input_size = 100
hidden_size = 100
learning_rate = 1e-4
weight_decay = 1e-6
rarnn = RARNN(input_size, output_tokens, hidden_size)
optimizer = torch.optim.Adam(rarnn.parameters(), lr=learning_rate, weight_decay=weight_decay)
decoder_criterion = nn.NLLLoss()
attention_criterion = nn.MSELoss(size_average=False)
In [19]:
def train(flat, node):
context, inputs, targets = data_for_node(flat, node)
# Turn inputs into tensors
context_var = tokens_to_tensor([context], rarnn.output_tokens, False)
context_var = Variable(context_var)
inputs_var = input_lang.tokens_to_tensor(inputs) # seq x batch x size
inputs_var = Variable(inputs_var)
target_tokens = [target_token for target_token, _ in targets]
target_ranges = [target_range for _, target_range in targets]
target_tokens_var = tokens_to_tensor(target_tokens, rarnn.output_tokens)
target_tokens_var = Variable(target_tokens_var)
target_ranges_var = ranges_to_tensor(target_ranges, len(inputs))
target_ranges_var = Variable(target_ranges_var)
# Run through model
decoder_outputs, attention_outputs = rarnn(context_var, inputs_var, target_tokens_var)
# Loss calculation and backprop
optimizer.zero_grad()
decoder_loss = decoder_criterion(decoder_outputs, target_tokens_var)
if len(targets) > 0:
attention_loss = attention_criterion(attention_outputs, target_ranges_var)
else:
attention_loss = 0
total_loss = decoder_loss + attention_loss
total_loss.backward()
optimizer.step()
return total_loss.data[0]
In [20]:
import sconce
job = sconce.Job('rarnn')
job.plot_every = 20
job.log_every = 100
n_epochs = 5000
for i in range(n_epochs):
walked_flat, walked_tree = walk_tree(parsed, parsed['%'], None)
def _train(node): return train(walked_flat, node)
ds = descend(walked_tree, _train)
d = sum(ds) / len(ds)
job.record(i, d)
In [21]:
def evaluate(context, inputs, node=None):
if node is None:
node = Node('parsed')
node.position = (0, len(inputs))
# Turn data into tensors
context_var = tokens_to_tensor([context], rarnn.output_tokens, False)
context_var = Variable(context_var)
inputs_var = input_lang.tokens_to_tensor(inputs) # seq x batch x size
inputs_var = Variable(inputs_var)
# Run through RARNN
decoder_outputs, attention_outputs = rarnn(context_var, inputs_var)
# Given the decoder and attention outputs, gather contexts and inputs for sub-phrases
# Use attention values > 0.5 to select words for next input sequence
next_contexts = []
next_inputs = []
next_positions = []
for i in range(len(decoder_outputs)):
max_value, max_index = decoder_outputs[i].topk(1)
max_index = max_index.data[0]
next_contexts.append(rarnn.output_tokens[max_index]) # Get decoder output token
a = attention_outputs[i]
next_input = []
next_position = []
for t in range(len(a)):
at = a[t].data[0]
if at > 0.5:
if len(next_position) == 0: # Start position
next_position.append(t)
next_input.append(inputs[t])
else:
if len(next_position) == 1: # End position (attention dropped below the threshold)
next_position.append(t - 1)
if len(next_position) == 1: # End position (range still open at the last word)
next_position.append(t)
next_inputs.append(next_input)
if len(next_position) == 2:
next_position = (next_position[0] + node.position[0], next_position[1] + node.position[0])
next_positions.append(next_position)
evaluated = list(zip(next_contexts, next_inputs, next_positions))
# Print decoded outputs
print('\n(evaluate) %s %s -> %s' % (context, ' '.join(inputs), next_contexts))
# Plot attention outputs
if SHOW_ATTENTION:
if len(attention_outputs) > 1:
print(attention_outputs)
fig = plt.figure(figsize=(len(inputs) / 3, 99))
sub = fig.add_subplot(111)
sub.matshow(attention_outputs.data.squeeze(1).numpy(), vmin=0, vmax=1, cmap='hot')
plt.show(); plt.close()
else:
print("WARNING: No attention outputs")
for context, inputs, position in evaluated:
# Add a node for parsed sub-phrases and values
sub_node = Node(context)
sub_node.position = position
node.add(sub_node)
# Recursively evaluate sub-phrases
if context[0] == '%':
if len(inputs) > 0:
evaluate(context, inputs, sub_node)
else:
print("WARNING: Empty inputs")
# Or add words directly to value node
elif context[0] == '$':
sub_node.add(' '.join(inputs))
return node
In [22]:
def evaluate_and_print(context, inputs):
evaluated = evaluate(context, inputs)
print(' '.join(inputs))
print(evaluated)
return evaluated
In [23]:
evaluate_and_print('%', "hey maia what's the btc price".split(' '))
Out[23]:
In [24]:
evaluate_and_print('%', "plz call me when the price of tesla is above 5 0".split(' '))
Out[24]:
In [25]:
evaluate_and_print('%', "hey maia if the ethereum price is less than 2 0 then turn the living room light on".split(' '))
evaluate_and_print('%', "hey maia what's the ethereum price".split(' '))
evaluate_and_print('%', "hey maia play some Skrillex please and then turn the office light off".split(' '))
evaluate_and_print('%', "turn the office light up and also could you please turn off the living room light and make the temperature of the bedroom to 6 thank you maia".split(' '))
evaluate_and_print('%', "turn the living room light off and turn the bedroom light up and also turn the volume up".split(' '))
Out[25]:
In [26]:
def parse(s, cb):
words = tokenize_sentence(s)
print('words', words)
try:
evaluated = evaluate_and_print('%', words)
cb({'words': words, 'parsed': evaluated.to_json()})
except Exception:
cb({'error': "Failed to evaluate"})
parse('heya shoot me an email if the price of bitcoin is bigger than 3 0 0 0 would ya', lambda r: print("response", r))
In [27]:
import somata
service = somata.Service('maia:parser', {'parse': parse}, {'bind_port': 8855})
In [ ]:
service.socket.close()
In [ ]: