After training a skip-gram model in 5_word2vec.ipynb, this notebook trains an LSTM character model over the Text8 data.
In [1]:
# These are all the modules we'll be using later. Make sure you can import them
# before proceeding further.
from __future__ import print_function
import os
import numpy as np
import random
import string
import tensorflow as tf
import zipfile
from six.moves import range
from six.moves.urllib.request import urlretrieve
In [2]:
url = 'http://mattmahoney.net/dc/'
def maybe_download(filename, expected_bytes):
"""Download a file if not present, and make sure it's the right size."""
if not os.path.exists(filename):
filename, _ = urlretrieve(url + filename, filename)
statinfo = os.stat(filename)
if statinfo.st_size == expected_bytes:
print('Found and verified %s' % filename)
else:
print(statinfo.st_size)
raise Exception(
'Failed to verify ' + filename + '. Can you get to it with a browser?')
return filename
filename = maybe_download('text8.zip', 31344016)
In [3]:
def read_data(filename):
with zipfile.ZipFile(filename) as f:
name = f.namelist()[0]
data = tf.compat.as_str(f.read(name))
return data
text = read_data(filename)
print('Data size %d' % len(text))
Create a small validation set.
In [4]:
valid_size = 1000
valid_text = text[:valid_size]
train_text = text[valid_size:]
train_size = len(train_text)
print(train_size, train_text[:64])
print(valid_size, valid_text[:64])
Utility functions to map characters to vocabulary IDs and back.
In [5]:
vocabulary_size = len(string.ascii_lowercase) + 1 # [a-z] + ' '
first_letter = ord(string.ascii_lowercase[0])
def char2id(char):
if char in string.ascii_lowercase:
return ord(char) - first_letter + 1
elif char == ' ':
return 0
else:
print('Unexpected character: %s' % char)
return 0
def id2char(dictid):
if dictid > 0:
return chr(dictid + first_letter - 1)
else:
return ' '
print(char2id('a'), char2id('z'), char2id(' '), char2id('ï'))
print(id2char(1), id2char(26), id2char(0))
Function to generate a training batch for the LSTM model.
In [6]:
batch_size=64
num_unrollings=10
class BatchGenerator(object):
def __init__(self, text, batch_size, num_unrollings):
self._text = text
self._text_size = len(text)
self._batch_size = batch_size
self._num_unrollings = num_unrollings
segment = self._text_size // batch_size
self._cursor = [ offset * segment for offset in range(batch_size)] # this sets 64 cursor starting indexes
# for the text
self._last_batch = self._next_batch()
def _next_batch(self):
"""Generate a single batch from the current cursor position in the data."""
    batch = np.zeros(shape=(self._batch_size, vocabulary_size), dtype=float)  # initialise a one-hot matrix of batch_size x vocabulary_size (64 x 27)
# I.e. each batch in this case involves 64 different sets of character sequences.
for b in range(self._batch_size): # repeat for each of these 64 character sequences
batch[b, char2id(self._text[self._cursor[b]])] = 1.0 # for each row representing a sequence in the batch
# set the index in the character array to one based on the character in the text at the cursor for
# the particular batch
# Then increment the cursor for the particular batch
self._cursor[b] = (self._cursor[b] + 1) % self._text_size # % just enables cycling of data
return batch
def next(self):
"""Generate the next array of batches from the data. The array consists of
the last batch of the previous array, followed by num_unrollings new ones.
"""
batches = [self._last_batch]
for step in range(self._num_unrollings):
batches.append(self._next_batch())
self._last_batch = batches[-1]
return batches
def characters(probabilities):
"""Turn a 1-hot encoding or a probability distribution over the possible
characters back into its (most likely) character representation."""
return [id2char(c) for c in np.argmax(probabilities, 1)]
def batches2string(batches):
"""Convert a sequence of batches back into their (most likely) string
representation."""
s = [''] * batches[0].shape[0] # s is list with 64 blank entries
for b in batches: # for each of 11 batches
# characters(b) is a list of 64 characters - i.e. batch > characters
s = [''.join(x) for x in zip(s, characters(b))] # joins batch characters vertically
return s
train_batches = BatchGenerator(train_text, batch_size, num_unrollings)
valid_batches = BatchGenerator(valid_text, 1, 1)
print(batches2string(train_batches.next()))
print(batches2string(train_batches.next()))
print(batches2string(valid_batches.next()))
print(batches2string(valid_batches.next()))
So a validation batch contains two characters: one for the input, and one for the label, which is the next character in the sequence.
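A quick check of this using the generators defined above (illustrative, not part of the original run):
In [ ]:
# In a validation batch pair, b[1] holds the character that follows b[0]
# in the validation text.
b = valid_batches.next()
print(characters(b[0])[0], '->', characters(b[1])[0])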
In [7]:
batches = train_batches.next()
In [8]:
len(batches)
Out[8]:
In [9]:
s = [''] * batches[0].shape[0]
In [10]:
labels = np.concatenate(list(batches)[1:])
labels
Out[10]:
In [11]:
c = characters(batches[0])
In [12]:
s = [''.join(x) for x in zip(s, c)]
In [13]:
s
Out[13]:
Each call to next() returns a list of 11 batches: the last batch of the previous call plus 10 new unrollings.
Each batch holds one one-hot character for each of the 64 cursor positions, and each subsequent batch contains the next character at each of those positions. Repeating this 10 times yields 11 batches: a carried-over set of characters plus 10 steps forward through the sequence of text.
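To make the input/label alignment concrete, a minimal sketch using the generator above (illustrative):
In [ ]:
# batches[:num_unrollings] serve as inputs and batches[1:] as labels;
# each label batch is its input batch shifted one step through the text.
batches = train_batches.next()
inputs, labels = batches[:num_unrollings], batches[1:]
print(len(batches), len(inputs), len(labels))  # 11 10 10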
In [14]:
# Logprob is used to compute the perplexity
def logprob(predictions, labels):
"""Log-probability of the true labels in a predicted batch."""
predictions[predictions < 1e-10] = 1e-10
return np.sum(np.multiply(labels, -np.log(predictions))) / labels.shape[0]
def sample_distribution(distribution):
"""Sample one element from a distribution assumed to be an array of normalized
probabilities.
"""
r = random.uniform(0, 1)
s = 0
for i in range(len(distribution)):
s += distribution[i]
if s >= r:
return i
return len(distribution) - 1
def sample(prediction):
"""Turn a (column) prediction into 1-hot encoded samples."""
  p = np.zeros(shape=[1, vocabulary_size], dtype=float)
p[0, sample_distribution(prediction[0])] = 1.0
return p
def random_distribution():
"""Generate a random column of probabilities."""
b = np.random.uniform(0.0, 1.0, size=[1, vocabulary_size])
return b/np.sum(b, 1)[:,None]
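As a sanity check on logprob (an illustrative example, not from the original run): a uniform prediction over the 27 characters should give a perplexity of about 27, since the log-probability of the true label is -log(1/27) = log 27.
In [ ]:
# Uniform predictions against any one-hot labels give perplexity = vocab size.
uniform = np.full((1, vocabulary_size), 1.0 / vocabulary_size)
one_hot = np.zeros((1, vocabulary_size))
one_hot[0, char2id('a')] = 1.0
print(np.exp(logprob(uniform, one_hot)))  # ~27.0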
Simple LSTM Model.
Notes to help understand what this model is doing:
vocabulary_size is the input size: 26 lower-case characters plus space, i.e. 27.
What is num_nodes? The size of the hidden dimension of the cell, i.e. the number of units in the LSTM cell, as described at http://monik.in/a-noobs-guide-to-implementing-rnn-lstm-using-tensorflow/ .
The cell has a 'state' (C) and an 'output' (h or y).
The input gate is a set of weights applied to the current input and the previous output, plus a bias. Below, ix is the part of the weights applied to the input and im is the part applied to the previous output.
Where does the classifier come in with w and b? The classifier is applied to the output of the trained cell: it takes the hidden output and generates a character prediction using its own weights and bias (w and b). In fact it is applied at once to the outputs from all of the unrollings.
The LSTM cell takes a character as input and outputs a vector of length num_nodes = 64, so each output is a matrix of size batch_size x num_nodes.
Does the classifier predict from the last num_unrollings outputs together? No, just from a single LSTM cell output at each step.
How do train_data and train_inputs differ? train_data is used to generate both the inputs and the labels (the labels are just the inputs shifted by one along the character axis). train_data is a list of 11 entries, each of shape 64 x 27 (batch_size x vocabulary_size): the first 10 entries are the inputs, and the label for each input is the entry that follows it. Each row of an input matrix is a one-hot vector indicating a character.
Number of unrollings = 10, batch size = 64. The number of unrollings is the length of the character history the model is trained through.
For each input, the output and the state of the LSTM are computed, and the output is appended to a list. Are the outputs compared directly with the labels? No: the outputs are fed into the classifier, and the classifier output indicates the predicted character.
The outputs list has num_unrollings = 10 entries, each of width num_nodes. They are concatenated along rows (axis 0) into one (num_unrollings * batch_size) x num_nodes matrix, and the labels are stacked the same way, so the classifier is trained in parallel on all the batches for all 10 unrollings. The training data can be thought of as 64 sequences, each stepped 10 moves through the text, with input and label characters at each step.
How do saved_output and saved_state work? They are used with control_dependencies: the code below the control_dependencies call only runs after saved_output.assign(output) and saved_state.assign(state) have been evaluated (see https://www.tensorflow.org/api_docs/python/tf/Graph#control_dependencies). This saves the last output and state, which are used the next time around.
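To pin down the shapes, here is a NumPy sketch of a single cell step under the same formulation (illustrative only; biases are omitted since they start at zero, and small dimensions stand in for batch_size and num_nodes):
In [ ]:
def np_sigmoid(z):
  return 1.0 / (1.0 + np.exp(-z))

bs, nn = 4, 8  # stand-ins for batch_size and num_nodes
i = np.zeros((bs, vocabulary_size)); i[:, char2id('a')] = 1.0  # a batch of 'a's
o = np.zeros((bs, nn)); state = np.zeros((bs, nn))
ix, im = np.random.randn(vocabulary_size, nn), np.random.randn(nn, nn)
fx, fm = np.random.randn(vocabulary_size, nn), np.random.randn(nn, nn)
cx, cm = np.random.randn(vocabulary_size, nn), np.random.randn(nn, nn)
ox, om = np.random.randn(vocabulary_size, nn), np.random.randn(nn, nn)

input_gate = np_sigmoid(np.dot(i, ix) + np.dot(o, im))
forget_gate = np_sigmoid(np.dot(i, fx) + np.dot(o, fm))
update = np.dot(i, cx) + np.dot(o, cm)
state = forget_gate * state + input_gate * np.tanh(update)
output_gate = np_sigmoid(np.dot(i, ox) + np.dot(o, om))
output = output_gate * np.tanh(state)
print(output.shape, state.shape)  # (4, 8) (4, 8)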
In [15]:
print(vocabulary_size)
In [16]:
train_data = list()
for _ in range(num_unrollings + 1):
train_data.append(_)
train_inputs = train_data[:num_unrollings]
train_labels = train_data[1:]
print(train_inputs, train_labels)
This is a good walk-through: https://iamtrask.github.io/2015/11/15/anyone-can-code-lstm/ .
The human brain has roughly 3-7 'slots' for recent information but can chunk hierarchically to remember further back.
In [17]:
num_nodes = 64
graph = tf.Graph()
with graph.as_default():
# Parameters:
# Input gate: input, previous output, and bias.
ix = tf.Variable(tf.truncated_normal([vocabulary_size, num_nodes], -0.1, 0.1))
im = tf.Variable(tf.truncated_normal([num_nodes, num_nodes], -0.1, 0.1))
ib = tf.Variable(tf.zeros([1, num_nodes]))
# Forget gate: input, previous output, and bias.
fx = tf.Variable(tf.truncated_normal([vocabulary_size, num_nodes], -0.1, 0.1))
fm = tf.Variable(tf.truncated_normal([num_nodes, num_nodes], -0.1, 0.1))
fb = tf.Variable(tf.zeros([1, num_nodes]))
# Memory cell: input, state and bias.
cx = tf.Variable(tf.truncated_normal([vocabulary_size, num_nodes], -0.1, 0.1))
cm = tf.Variable(tf.truncated_normal([num_nodes, num_nodes], -0.1, 0.1))
cb = tf.Variable(tf.zeros([1, num_nodes]))
# Output gate: input, previous output, and bias.
ox = tf.Variable(tf.truncated_normal([vocabulary_size, num_nodes], -0.1, 0.1))
om = tf.Variable(tf.truncated_normal([num_nodes, num_nodes], -0.1, 0.1))
ob = tf.Variable(tf.zeros([1, num_nodes]))
# Variables saving state across unrollings.
saved_output = tf.Variable(tf.zeros([batch_size, num_nodes]), trainable=False)
saved_state = tf.Variable(tf.zeros([batch_size, num_nodes]), trainable=False)
# Classifier weights and biases.
w = tf.Variable(tf.truncated_normal([num_nodes, vocabulary_size], -0.1, 0.1))
b = tf.Variable(tf.zeros([vocabulary_size]))
# Definition of the cell computation.
def lstm_cell(i, o, state):
"""Create a LSTM cell. See e.g.: http://arxiv.org/pdf/1402.1128v1.pdf
Note that in this formulation, we omit the various connections between the
previous state and the gates."""
input_gate = tf.sigmoid(tf.matmul(i, ix) + tf.matmul(o, im) + ib)
forget_gate = tf.sigmoid(tf.matmul(i, fx) + tf.matmul(o, fm) + fb)
update = tf.matmul(i, cx) + tf.matmul(o, cm) + cb
state = forget_gate * state + input_gate * tf.tanh(update)
output_gate = tf.sigmoid(tf.matmul(i, ox) + tf.matmul(o, om) + ob)
return output_gate * tf.tanh(state), state
# Input data.
train_data = list()
for _ in range(num_unrollings + 1):
train_data.append(
tf.placeholder(tf.float32, shape=[batch_size,vocabulary_size]))
train_inputs = train_data[:num_unrollings]
train_labels = train_data[1:] # labels are inputs shifted by one time step.
# Unrolled LSTM loop.
outputs = list()
output = saved_output
state = saved_state
for i in train_inputs:
output, state = lstm_cell(i, output, state)
outputs.append(output)
# State saving across unrollings.
with tf.control_dependencies([saved_output.assign(output),
saved_state.assign(state)]):
# Classifier.
logits = tf.nn.xw_plus_b(tf.concat(outputs, 0), w, b)
loss = tf.reduce_mean(
tf.nn.softmax_cross_entropy_with_logits(
labels=tf.concat(train_labels, 0), logits=logits))
# Optimizer.
global_step = tf.Variable(0)
learning_rate = tf.train.exponential_decay(
10.0, global_step, 5000, 0.1, staircase=True)
optimizer = tf.train.GradientDescentOptimizer(learning_rate)
gradients, v = zip(*optimizer.compute_gradients(loss))
# Apply max clipping of gradients
gradients, _ = tf.clip_by_global_norm(gradients, 1.25)
optimizer = optimizer.apply_gradients(
zip(gradients, v), global_step=global_step)
# Predictions.
train_prediction = tf.nn.softmax(logits)
# Sampling and validation eval: batch 1, no unrolling.
sample_input = tf.placeholder(tf.float32, shape=[1, vocabulary_size])
saved_sample_output = tf.Variable(tf.zeros([1, num_nodes]))
saved_sample_state = tf.Variable(tf.zeros([1, num_nodes]))
reset_sample_state = tf.group(
saved_sample_output.assign(tf.zeros([1, num_nodes])),
saved_sample_state.assign(tf.zeros([1, num_nodes])))
sample_output, sample_state = lstm_cell(
sample_input, saved_sample_output, saved_sample_state)
with tf.control_dependencies([saved_sample_output.assign(sample_output),
saved_sample_state.assign(sample_state)]):
sample_prediction = tf.nn.softmax(tf.nn.xw_plus_b(sample_output, w, b))
num_steps is the number of training steps; each step trains on one set of batches (the carried-over batch plus 10 unrollings).
Note that a step is not an epoch: 7001 steps cover only a small fraction of the training text, as the arithmetic below shows.
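A rough calculation (illustrative) of how much text the training run actually covers:
In [ ]:
# Each step consumes batch_size * num_unrollings = 640 new characters, so
# 7001 steps see about 4.5M characters - a small fraction of the
# ~100M-character training text, i.e. well under one full pass (epoch).
chars_per_step = batch_size * num_unrollings
print(chars_per_step, chars_per_step * 7001, train_size)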
In [18]:
num_steps = 7001
summary_frequency = 100
with tf.Session(graph=graph) as session:
tf.global_variables_initializer().run()
print('Initialized')
mean_loss = 0
for step in range(num_steps):
# Each step has a different batch
batches = train_batches.next()
feed_dict = dict()
# This loads the train_data with the last batch + 10 unrollings
for i in range(num_unrollings + 1):
feed_dict[train_data[i]] = batches[i]
# This runs the training
_, l, predictions, lr = session.run(
[optimizer, loss, train_prediction, learning_rate], feed_dict=feed_dict)
mean_loss += l
# Runs every 100 iterations
if step % summary_frequency == 0:
if step > 0:
mean_loss = mean_loss / summary_frequency
# The mean loss is an estimate of the loss over the last few batches.
print(
'Average loss at step %d: %f learning rate: %f' % (step, mean_loss, lr))
mean_loss = 0
      # Stack all the label batches into one array aligned with the logits
labels = np.concatenate(list(batches)[1:])
print('Minibatch perplexity: %.2f' % float(
np.exp(logprob(predictions, labels))))
if step % (summary_frequency * 10) == 0:
# Generate some samples.
print('=' * 80)
for _ in range(5):
feed = sample(random_distribution())
sentence = characters(feed)[0]
reset_sample_state.run()
for _ in range(79):
prediction = sample_prediction.eval({sample_input: feed})
feed = sample(prediction)
sentence += characters(feed)[0]
print(sentence)
print('=' * 80)
# Measure validation set perplexity.
reset_sample_state.run()
valid_logprob = 0
for _ in range(valid_size):
b = valid_batches.next()
predictions = sample_prediction.eval({sample_input: b[0]})
valid_logprob = valid_logprob + logprob(predictions, b[1])
print('Validation set perplexity: %.2f' % float(np.exp(
valid_logprob / valid_size)))
Can we simplify the lstm_cell so that the four input matrix multiplications and the four output matrix multiplications are each fused into one?
i has dimensions [:, vocabulary_size]; o has dimensions [:, num_nodes].
ix, fx, cx and ox are all of size [vocabulary_size, num_nodes]; im, fm, cm and om are all of size [num_nodes, num_nodes].
So we make two stacked matrices: x of size [vocabulary_size, num_nodes*4] and m of size [num_nodes, num_nodes*4]. Then we compute input_matmul = tf.matmul(i, x) and output_matmul = tf.matmul(o, m) once each, and slice the results into four num_nodes-wide segments, one per gate. A NumPy check of this slicing follows below.
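A quick NumPy check (illustrative) that one fused multiplication, sliced into four blocks, matches the four separate multiplications, assuming x stacks ix, fx, cx and ox column-wise:
In [ ]:
n = 4  # stand-in for num_nodes
i_demo = np.random.randn(2, vocabulary_size)
ix_d, fx_d, cx_d, ox_d = (np.random.randn(vocabulary_size, n) for _ in range(4))
x_demo = np.hstack([ix_d, fx_d, cx_d, ox_d])  # shape [vocabulary_size, n*4]
fused = np.dot(i_demo, x_demo)
assert np.allclose(fused[:, :n], np.dot(i_demo, ix_d))       # input gate block
assert np.allclose(fused[:, n:2*n], np.dot(i_demo, fx_d))    # forget gate block
assert np.allclose(fused[:, 2*n:3*n], np.dot(i_demo, cx_d))  # update block
assert np.allclose(fused[:, 3*n:], np.dot(i_demo, ox_d))     # output gate block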
In [19]:
num_nodes = 64
graph = tf.Graph()
with graph.as_default():
# Parameters:
x = tf.Variable(tf.truncated_normal([vocabulary_size, num_nodes*4], -0.1, 0.1))
m = tf.Variable(tf.truncated_normal([num_nodes, num_nodes*4], -0.1, 0.1))
# Input gate: bias.
ib = tf.Variable(tf.zeros([1, num_nodes]))
# Forget gate: bias.
fb = tf.Variable(tf.zeros([1, num_nodes]))
# Memory cell: bias.
cb = tf.Variable(tf.zeros([1, num_nodes]))
# Output gate: bias.
ob = tf.Variable(tf.zeros([1, num_nodes]))
# Variables saving state across unrollings.
saved_output = tf.Variable(tf.zeros([batch_size, num_nodes]), trainable=False)
saved_state = tf.Variable(tf.zeros([batch_size, num_nodes]), trainable=False)
# Classifier weights and biases.
w = tf.Variable(tf.truncated_normal([num_nodes, vocabulary_size], -0.1, 0.1))
b = tf.Variable(tf.zeros([vocabulary_size]))
# Definition of the cell computation.
def lstm_cell(i, o, state):
"""Create a LSTM cell. See e.g.: http://arxiv.org/pdf/1402.1128v1.pdf
Note that in this formulation, we omit the various connections between the
previous state and the gates."""
input_matmul = tf.matmul(i, x)
output_matmul = tf.matmul(o, m)
input_gate = tf.sigmoid(input_matmul[:, :num_nodes]
+ output_matmul[:, :num_nodes]
+ ib)
forget_gate = tf.sigmoid(input_matmul[:, num_nodes:num_nodes*2]
+ output_matmul[:, num_nodes:num_nodes*2]
+ fb)
update = (input_matmul[:, num_nodes*2:num_nodes*3]
+ output_matmul[:, num_nodes*2:num_nodes*3]
+ cb)
state = forget_gate * state + input_gate * tf.tanh(update)
output_gate = tf.sigmoid(input_matmul[:, num_nodes*3:]
+ output_matmul[:, num_nodes*3:]
+ ob)
return output_gate * tf.tanh(state), state
# Input data.
train_data = list()
for _ in range(num_unrollings + 1):
train_data.append(
tf.placeholder(tf.float32, shape=[batch_size,vocabulary_size]))
train_inputs = train_data[:num_unrollings]
train_labels = train_data[1:] # labels are inputs shifted by one time step.
# Unrolled LSTM loop.
outputs = list()
output = saved_output
state = saved_state
for i in train_inputs:
output, state = lstm_cell(i, output, state)
outputs.append(output)
# State saving across unrollings.
with tf.control_dependencies([saved_output.assign(output),
saved_state.assign(state)]):
# Classifier.
logits = tf.nn.xw_plus_b(tf.concat(outputs, 0), w, b)
loss = tf.reduce_mean(
tf.nn.softmax_cross_entropy_with_logits(
labels=tf.concat(train_labels, 0), logits=logits))
# Optimizer.
global_step = tf.Variable(0)
learning_rate = tf.train.exponential_decay(
10.0, global_step, 5000, 0.1, staircase=True)
optimizer = tf.train.GradientDescentOptimizer(learning_rate)
gradients, v = zip(*optimizer.compute_gradients(loss))
# Apply max clipping of gradients
gradients, _ = tf.clip_by_global_norm(gradients, 1.25)
optimizer = optimizer.apply_gradients(
zip(gradients, v), global_step=global_step)
# Predictions.
train_prediction = tf.nn.softmax(logits)
# Sampling and validation eval: batch 1, no unrolling.
sample_input = tf.placeholder(tf.float32, shape=[1, vocabulary_size])
saved_sample_output = tf.Variable(tf.zeros([1, num_nodes]))
saved_sample_state = tf.Variable(tf.zeros([1, num_nodes]))
reset_sample_state = tf.group(
saved_sample_output.assign(tf.zeros([1, num_nodes])),
saved_sample_state.assign(tf.zeros([1, num_nodes])))
sample_output, sample_state = lstm_cell(
sample_input, saved_sample_output, saved_sample_state)
with tf.control_dependencies([saved_sample_output.assign(sample_output),
saved_sample_state.assign(sample_state)]):
sample_prediction = tf.nn.softmax(tf.nn.xw_plus_b(sample_output, w, b))
In [22]:
num_steps = 14001
summary_frequency = 100
with tf.Session(graph=graph) as session:
tf.global_variables_initializer().run()
print('Initialized')
mean_loss = 0
for step in range(num_steps):
# Each step has a different batch
batches = train_batches.next()
feed_dict = dict()
# This loads the train_data with the last batch + 10 unrollings
for i in range(num_unrollings + 1):
feed_dict[train_data[i]] = batches[i]
# This runs the training
_, l, predictions, lr = session.run(
[optimizer, loss, train_prediction, learning_rate], feed_dict=feed_dict)
mean_loss += l
# Runs every 100 iterations
if step % summary_frequency == 0:
if step > 0:
mean_loss = mean_loss / summary_frequency
# The mean loss is an estimate of the loss over the last few batches.
print(
'Average loss at step %d: %f learning rate: %f' % (step, mean_loss, lr))
mean_loss = 0
      # Stack all the label batches into one array aligned with the logits
labels = np.concatenate(list(batches)[1:])
print('Minibatch perplexity: %.2f' % float(
np.exp(logprob(predictions, labels))))
if step % (summary_frequency * 10) == 0:
# Generate some samples.
print('=' * 80)
for _ in range(5):
feed = sample(random_distribution())
sentence = characters(feed)[0]
reset_sample_state.run()
for _ in range(79):
prediction = sample_prediction.eval({sample_input: feed})
feed = sample(prediction)
sentence += characters(feed)[0]
print(sentence)
print('=' * 80)
# Measure validation set perplexity.
reset_sample_state.run()
valid_logprob = 0
for _ in range(valid_size):
b = valid_batches.next()
predictions = sample_prediction.eval({sample_input: b[0]})
valid_logprob = valid_logprob + logprob(predictions, b[1])
print('Validation set perplexity: %.2f' % float(np.exp(
valid_logprob / valid_size)))
We want to train an LSTM over bigrams, that is, pairs of consecutive characters like 'ab' instead of single characters like 'a'. Since the number of possible bigrams is large, feeding them directly to the LSTM using 1-hot encodings would lead to a very sparse representation that is very wasteful computationally.
a- Introduce an embedding lookup on the inputs, and feed the embeddings to the LSTM cell instead of the inputs themselves.
b- Write a bigram-based LSTM, modeled on the character LSTM above.
c- Introduce Dropout. For best practices on how to use Dropout in LSTMs, refer to this article; a sketch of one common placement follows below.
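For part c, a minimal sketch of one common placement (an assumption following the usual practice of applying dropout only to non-recurrent connections; keep_prob is a name introduced here, and lstm_cell is whichever cell definition is in scope):
In [ ]:
# Sketch only - wrap the existing lstm_cell, dropping the input connection
# and the output fed onward to the classifier, while leaving the recurrent
# output/state paths intact.
keep_prob = tf.placeholder(tf.float32)  # e.g. feed 0.8 for training, 1.0 for eval

def lstm_cell_dropout(i, o, state):
  i = tf.nn.dropout(i, keep_prob)
  output, state = lstm_cell(i, o, state)
  return tf.nn.dropout(output, keep_prob), state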
In [21]:
embedding_size = 128 # Dimension of the embedding vector.
ngram_size = 2
In the previous character case, we converted each character to a one-hot vector of length num_available_characters.
For ngrams the vocabulary size becomes num_available_characters^ngram_size.
For the embeddings we need to map each input, an id out of num_available_characters^ngram_size possibilities, to a dense vector of length embedding_size.
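Concretely, for bigrams (illustrative check):
In [ ]:
# 27 characters and ngram_size = 2 give 27**2 = 729 possible ngrams; the
# embedding maps each of these 729 ids to a dense vector of length 128.
num_available_characters = len(string.ascii_lowercase) + 1  # 27
print(num_available_characters ** ngram_size, '->', embedding_size)  # 729 -> 128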
In [71]:
import itertools
char_vocabulary_size = len(string.ascii_lowercase) + 1 # [a-z] + ' '
first_letter = ord(string.ascii_lowercase[0])
class NgramVocab(object):
""" Class object to hold ngram functions."""
def __init__(self, ngram_size):
""" Initialise object."""
self.ngram = ngram_size
self.vocab = self._build_ngram_dictionary()
self.size = len(self.vocab)
# Build ngram dictionary
def _build_ngram_dictionary(self):
""" Build lookup tables to map ngrams to ids and back. """
# Get list of available characters
char_vocab = list(string.ascii_lowercase) + [' ']
# Get list of all possible ngram combinations
ngram_vocab = [''.join(x) for x in itertools.product(char_vocab, repeat=self.ngram)]
return ngram_vocab
  # Function to convert an ngram into an index id
  def ngram2id(self, ngram):
    """Convert a character ngram - e.g. 'th' - to an id in ngram_vocab."""
    if ngram in self.vocab:
      return self.vocab.index(ngram)
    else:
      print('Unexpected ngram: {}'.format(ngram))
      # fall back to the id of the all-space ngram (the last vocab entry),
      # not the ngram string itself
      return len(self.vocab) - 1
  # Function to convert an index id into an ngram
  def id2ngram(self, id_in):
    """Convert an id back to its ngram; valid ids run from 0 ('aa') upwards."""
    if 0 <= id_in < len(self.vocab):
      return self.vocab[id_in]
    else:
      return ' ' * self.ngram
def characters(self, probabilities):
"""Turn a 1-hot encoding or a probability distribution over the possible
characters back into its (most likely) character representation."""
return [self.id2ngram(c) for c in np.argmax(probabilities, 1)]
def batches2string(self, batches):
"""Convert a sequence of batches back into their (most likely) string
representation."""
s = [''] * batches[0].shape[0] # s is list with blank entries for each batch
for b in batches: # for each of num_unrollings batches
# characters(b) is a list of characters for each batch - i.e. batch > characters
s = [''.join(x) for x in zip(s, [self.id2ngram(ngram_id) for ngram_id in b])] # joins batch characters vertically
return s
nv = NgramVocab(ngram_size)
ngram_vocab_size = nv.size
print(nv.vocab[345:355])
print("Ngram Vocab Size: {}".format(ngram_vocab_size))
print("Ngram {0} > {1}".format('ae', nv.ngram2id('ae'))) # These become your unittests
print("ID {0} > {1}".format(453, nv.id2ngram(453)))
In [84]:
batch_size=64
num_unrollings=10
class NgramBatchGenerator(object):
def __init__(self, text, batch_size, num_unrollings, ngram_vocab_object):
self._text = text
self._text_size = len(text)
self._batch_size = batch_size
self._nvo = ngram_vocab_object
self._num_unrollings = num_unrollings
# Segment
segment = self._text_size // batch_size
self._cursor = [ offset * segment for offset in range(batch_size)] # this sets 64 cursor starting indexes
# for the text
self._last_batch = self._next_batch()
  def _next_batch(self):
    """Generate a single batch from the current cursor position in the data."""
    # Initialise a vector of length batch_size; each entry holds an ngram id
    # (not a one-hot row), since the embedding lookup takes integer ids
    batch = np.zeros(shape=(self._batch_size,), dtype=np.int32)
# for each batch add a set of ngrams
for b in range(self._batch_size): # repeat for each of these 64 character sequences
batch[b] = self._nvo.ngram2id(
''.join(
self._text[position]
for position in range(self._cursor[b], self._cursor[b]+self._nvo.ngram)
)
)
# set the index in the character array to one based on the character in the text at the cursor for
# the particular batch
# Then increment the cursor for the particular batch - by the ngram size
self._cursor[b] = (self._cursor[b] + self._nvo.ngram) % self._text_size # % just enables cycling of data
return batch
# Don't need to change this function
def next(self):
"""Generate the next array of batches from the data. The array consists of
the last batch of the previous array, followed by num_unrollings new ones.
"""
batches = [self._last_batch]
for step in range(self._num_unrollings):
batches.append(self._next_batch())
self._last_batch = batches[-1]
return batches
In [85]:
train_batches = NgramBatchGenerator(train_text, batch_size, num_unrollings, nv)
valid_batches = NgramBatchGenerator(valid_text, 1, 1, nv)
In [86]:
batches = train_batches.next()
print(len(batches))
print(batches[0].shape)
In [87]:
batches[0]
Out[87]:
In [88]:
valid_batches.next()
Out[88]:
In [89]:
print(nv.batches2string(train_batches.next()))
print(nv.batches2string(train_batches.next()))
print(nv.batches2string(valid_batches.next()))
print(nv.batches2string(valid_batches.next()))
The embedding lookup is only performed on the input side. It can be thought of as a lookup table that maps an id to a real-valued continuous vector of lower dimension.
We don't actually need the binary one-hot vector: we can go straight from an index value in the ngram vocab to an embedding vector. This is why the training input for the embedding is just a 1-D array of length batch_size, with each entry equal to an ngram index.
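An illustrative NumPy analogy: looking up an embedding row by id gives the same result as multiplying the one-hot vector by the embedding matrix, without ever materialising the one-hot vector.
In [ ]:
E = np.random.randn(nv.size, embedding_size)  # stand-in embedding matrix
idx = nv.ngram2id('th')
one_hot = np.zeros(nv.size)
one_hot[idx] = 1.0
print(np.allclose(E[idx], np.dot(one_hot, E)))  # True: lookup == one-hot matmul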
In [104]:
num_nodes = 64
graph = tf.Graph()
with graph.as_default():
# Embedding matrix
embeddings = tf.Variable(
tf.random_uniform([nv.size, embedding_size], -1.0, 1.0))
# Parameters:
x = tf.Variable(tf.truncated_normal([embedding_size, num_nodes*4], -0.1, 0.1))
m = tf.Variable(tf.truncated_normal([num_nodes, num_nodes*4], -0.1, 0.1))
# Input gate: bias.
ib = tf.Variable(tf.zeros([1, num_nodes]))
# Forget gate: bias.
fb = tf.Variable(tf.zeros([1, num_nodes]))
# Memory cell: bias.
cb = tf.Variable(tf.zeros([1, num_nodes]))
# Output gate: bias.
ob = tf.Variable(tf.zeros([1, num_nodes]))
# Variables saving state across unrollings.
saved_output = tf.Variable(tf.zeros([batch_size, num_nodes]), trainable=False)
saved_state = tf.Variable(tf.zeros([batch_size, num_nodes]), trainable=False)
# Classifier weights and biases - what size is the output - vocab vector or embedding size?
w = tf.Variable(tf.truncated_normal([num_nodes, nv.size], -0.1, 0.1))
b = tf.Variable(tf.zeros([nv.size]))
# Definition of the cell computation.
def lstm_cell(i, o, state):
"""Create a LSTM cell. See e.g.: http://arxiv.org/pdf/1402.1128v1.pdf
Note that in this formulation, we omit the various connections between the
previous state and the gates."""
input_matmul = tf.matmul(i, x)
output_matmul = tf.matmul(o, m)
input_gate = tf.sigmoid(input_matmul[:, :num_nodes]
+ output_matmul[:, :num_nodes]
+ ib)
forget_gate = tf.sigmoid(input_matmul[:, num_nodes:num_nodes*2]
+ output_matmul[:, num_nodes:num_nodes*2]
+ fb)
update = (input_matmul[:, num_nodes*2:num_nodes*3]
+ output_matmul[:, num_nodes*2:num_nodes*3]
+ cb)
state = forget_gate * state + input_gate * tf.tanh(update)
output_gate = tf.sigmoid(input_matmul[:, num_nodes*3:]
+ output_matmul[:, num_nodes*3:]
+ ob)
return output_gate * tf.tanh(state), state
# Get embeddings
def embed_lookup(train_dataset):
""" Looks up an embedding vector for input data."""
# Look up embeddings for inputs.
return tf.nn.embedding_lookup(embeddings, train_dataset)
  # Input data: ngram ids for the inputs; the labels are fed separately.
  train_data = list()
  train_labels = list()
  embed_data = list()
  for _ in range(num_unrollings):
    train_data.append(
      tf.placeholder(tf.int32, shape=[batch_size]))
  for i in range(num_unrollings):
    embed_data.append(embed_lookup(train_data[i]))
  train_inputs = embed_data
  # Labels are the inputs shifted by one time step. They are converted to a
  # one-hot representation in the training loop, so they need float32
  # placeholders of shape [batch_size, nv.size] to match the logits dtype
  # expected by softmax_cross_entropy_with_logits.
  for i in range(num_unrollings):
    train_labels.append(
      tf.placeholder(tf.float32, shape=[batch_size, nv.size]))
# Unrolled LSTM loop.
outputs = list()
output = saved_output
state = saved_state
for i in train_inputs:
output, state = lstm_cell(i, output, state)
outputs.append(output)
# State saving across unrollings.
with tf.control_dependencies([saved_output.assign(output),
saved_state.assign(state)]):
# Classifier.
logits = tf.nn.xw_plus_b(tf.concat(outputs, 0), w, b)
loss = tf.reduce_mean(
tf.nn.softmax_cross_entropy_with_logits(
labels=tf.concat(train_labels, 0), logits=logits))
# Optimizer.
global_step = tf.Variable(0)
learning_rate = tf.train.exponential_decay(
10.0, global_step, 5000, 0.1, staircase=True)
optimizer = tf.train.GradientDescentOptimizer(learning_rate)
gradients, v = zip(*optimizer.compute_gradients(loss))
# Apply max clipping of gradients
gradients, _ = tf.clip_by_global_norm(gradients, 1.25)
optimizer = optimizer.apply_gradients(
zip(gradients, v), global_step=global_step)
# Predictions.
train_prediction = tf.nn.softmax(logits)
# Sampling and validation eval: batch 1, no unrolling.
# Add embeddings lookup
sample_input = tf.placeholder(tf.int32, shape=[1])
sample_embed_input = embed_lookup(sample_input)
saved_sample_output = tf.Variable(tf.zeros([1, num_nodes]))
saved_sample_state = tf.Variable(tf.zeros([1, num_nodes]))
reset_sample_state = tf.group(
saved_sample_output.assign(tf.zeros([1, num_nodes])),
saved_sample_state.assign(tf.zeros([1, num_nodes])))
sample_output, sample_state = lstm_cell(
sample_embed_input, saved_sample_output, saved_sample_state)
with tf.control_dependencies([saved_sample_output.assign(sample_output),
saved_sample_state.assign(sample_state)]):
sample_prediction = tf.nn.softmax(tf.nn.xw_plus_b(sample_output, w, b))
Are the logits an index or an array of scores over each possible ngram? The latter: one column per ngram in the vocabulary.
So we need to convert our labels, which are indices, into a one-hot representation; we still need an index-to-one-hot conversion of some kind.
Previously this was done with: labels = (np.arange(num_labels) == labels[:,None]).astype(np.float32)
In [99]:
labels = (np.arange(nv.size) == batches[0][:,None]).astype(np.float32)
In [101]:
temp = np.arange(nv.size) == batches[0][:,None]
In [102]:
type(temp)
Out[102]:
In [94]:
len(labels)
Out[94]:
In [95]:
labels.shape
Out[95]:
In [96]:
labels[0]
Out[96]:
Redefine the sampling utilities over the ngram vocabulary (nv.size instead of vocabulary_size):
In [105]:
# Logprob is used to compute the perplexity
def logprob(predictions, labels):
"""Log-probability of the true labels in a predicted batch."""
predictions[predictions < 1e-10] = 1e-10
return np.sum(np.multiply(labels, -np.log(predictions))) / labels.shape[0]
def sample_distribution(distribution):
"""Sample one element from a distribution assumed to be an array of normalized
probabilities.
"""
r = random.uniform(0, 1)
s = 0
for i in range(len(distribution)):
s += distribution[i]
if s >= r:
return i
return len(distribution) - 1
def sample(prediction):
"""Turn a (column) prediction into 1-hot encoded samples."""
  p = np.zeros(shape=[1, nv.size], dtype=float)
p[0, sample_distribution(prediction[0])] = 1.0
return p
def random_distribution():
"""Generate a random column of probabilities."""
b = np.random.uniform(0.0, 1.0, size=[1, nv.size])
return b/np.sum(b, 1)[:,None]
In [123]:
np.where(sample(random_distribution()) == 1.0)[1]
Out[123]:
In [132]:
nv.id2ngram(int((sample(random_distribution())).nonzero()[1]))
Out[132]:
In [121]:
nv.characters(feed)[0]
Out[121]:
In [136]:
np.array([123])
Out[136]:
In [137]:
num_steps = 7001
summary_frequency = 100
with tf.Session(graph=graph) as session:
tf.global_variables_initializer().run()
print('Initialized')
mean_loss = 0
for step in range(num_steps):
# Each step has a different batch
batches = train_batches.next()
feed_dict = dict()
# This loads the train_data with the last batch + 10 unrollings
training_labels = list()
for i in range(num_unrollings):
feed_dict[train_data[i]] = batches[i]
training_labels.append((np.arange(nv.size) == batches[i+1][:,None]).astype(np.float32))
feed_dict[train_labels[i]] = training_labels[i]
# This runs the training
_, l, predictions, lr = session.run(
[optimizer, loss, train_prediction, learning_rate], feed_dict=feed_dict)
mean_loss += l
# Runs every 100 iterations
if step % summary_frequency == 0:
if step > 0:
mean_loss = mean_loss / summary_frequency
# The mean loss is an estimate of the loss over the last few batches.
print(
'Average loss at step %d: %f learning rate: %f' % (step, mean_loss, lr))
mean_loss = 0
      # Stack the one-hot label batches into one array aligned with the logits
row_labels = np.concatenate(list(training_labels))
print('Minibatch perplexity: %.2f' % float(
np.exp(logprob(predictions, row_labels))))
if step % (summary_frequency * 10) == 0:
# Generate some samples.
print('=' * 80)
        for _ in range(5):
          # Start from a random ngram id; random.randint is inclusive at
          # both ends, so the upper bound must be nv.size - 1
          feed = np.array([random.randint(0, nv.size - 1)])
          sentence = nv.id2ngram(int(feed))
          reset_sample_state.run()
          for _ in range(79):
            # sample_input expects an index, not a one-hot vector
            prediction = sample_prediction.eval({sample_input: feed})
            feed = sample(prediction).nonzero()[1]
            sentence += nv.id2ngram(int(feed))
          print(sentence)
print('=' * 80)
# Measure validation set perplexity.
reset_sample_state.run()
valid_logprob = 0
        for _ in range(valid_size):
          b = valid_batches.next()
          predictions = sample_prediction.eval({sample_input: b[0]})
          # b[1] holds an ngram id - convert it to one-hot so logprob can
          # pick out the predicted probability of the true label
          one_hot_label = (np.arange(nv.size) == b[1][:,None]).astype(np.float32)
          valid_logprob = valid_logprob + logprob(predictions, one_hot_label)
print('Validation set perplexity: %.2f' % float(np.exp(
valid_logprob / valid_size)))
(difficult!)
Write a sequence-to-sequence LSTM which mirrors all the words in a sentence. For example, if your input is:
the quick brown fox
the model should attempt to output:
eht kciuq nworb xof
Refer to the lecture on how to put together a sequence-to-sequence model, as well as this article for best practices.
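The target transformation itself is straightforward; a sketch of the string mapping only (not the model):
In [ ]:
def mirror_words(s):
  """Reverse each word while preserving word order."""
  return ' '.join(word[::-1] for word in s.split(' '))

print(mirror_words('the quick brown fox'))  # eht kciuq nworb xof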