Deep Learning

Assignment 6

After training a skip-gram model in 5_word2vec.ipynb, the goal of this notebook is to train a LSTM character model over Text8 data.


In [1]:
# These are all the modules we'll be using later. Make sure you can import them
# before proceeding further.
from __future__ import print_function
import os
import numpy as np
import random
import string
import tensorflow as tf
import zipfile
from six.moves import range
from six.moves.urllib.request import urlretrieve

In [2]:
url = 'http://mattmahoney.net/dc/'

def maybe_download(filename, expected_bytes):
  """Download a file if not present, and make sure it's the right size."""
  if not os.path.exists(filename):
    filename, _ = urlretrieve(url + filename, filename)
  statinfo = os.stat(filename)
  if statinfo.st_size == expected_bytes:
    print('Found and verified %s' % filename)
  else:
    print(statinfo.st_size)
    raise Exception(
      'Failed to verify ' + filename + '. Can you get to it with a browser?')
  return filename

filename = maybe_download('text8.zip', 31344016)


Found and verified text8.zip

In [3]:
def read_data(filename):
  with zipfile.ZipFile(filename) as f:
    name = f.namelist()[0]
    data = tf.compat.as_str(f.read(name))
  return data
  
text = read_data(filename)
print('Data size %d' % len(text))


Data size 100000000

Create a small validation set.


In [4]:
valid_size = 1000
valid_text = text[:valid_size]
train_text = text[valid_size:]
train_size = len(train_text)
print(train_size, train_text[:64])
print(valid_size, valid_text[:64])


99999000 ons anarchists advocate social relations based upon voluntary as
1000  anarchism originated as a term of abuse first used against earl

Utility functions to map characters to vocabulary IDs and back.


In [5]:
vocabulary_size = len(string.ascii_lowercase) + 1 # [a-z] + ' '
first_letter = ord(string.ascii_lowercase[0])

def char2id(char):
  if char in string.ascii_lowercase:
    return ord(char) - first_letter + 1
  elif char == ' ':
    return 0
  else:
    print('Unexpected character: %s' % char)
    return 0
  
def id2char(dictid):
  if dictid > 0:
    return chr(dictid + first_letter - 1)
  else:
    return ' '

print(char2id('a'), char2id('z'), char2id(' '), char2id('ï'))
print(id2char(1), id2char(26), id2char(0))


Unexpected character: ï
1 26 0 0
a z  

Function to generate a training batch for the LSTM model.


In [6]:
batch_size=64
num_unrollings=10

class BatchGenerator(object):
    def __init__(self, text, batch_size, num_unrollings):
        self._text = text
        self._text_size = len(text)
        self._batch_size = batch_size
        self._num_unrollings = num_unrollings
        segment = self._text_size // batch_size
        self._cursor = [ offset * segment for offset in range(batch_size)] # this sets 64 cursor starting indexes 
        # for the text
        self._last_batch = self._next_batch()
  
    def _next_batch(self):
        """Generate a single batch from the current cursor position in the data."""
        batch = np.zeros(shape=(self._batch_size, vocabulary_size), dtype=np.float) # initialise a matrix of 64 * 27
        # I.e. each batch in this case involves 64 different sets of character sequences.
        for b in range(self._batch_size): # repeat for each of these 64 character sequences
            batch[b, char2id(self._text[self._cursor[b]])] = 1.0 # for each row representing a sequence in the batch 
            # set the index in the character array to one based on the character in the text at the cursor for
            # the particular batch
            # Then increment the cursor for the particular batch
            self._cursor[b] = (self._cursor[b] + 1) % self._text_size # % just enables cycling of data
        return batch
  
    def next(self):
        """Generate the next array of batches from the data. The array consists of
        the last batch of the previous array, followed by num_unrollings new ones.
        """
        batches = [self._last_batch]
        for step in range(self._num_unrollings):
            batches.append(self._next_batch())
            self._last_batch = batches[-1]
        return batches

def characters(probabilities):
    """Turn a 1-hot encoding or a probability distribution over the possible
    characters back into its (most likely) character representation."""
    return [id2char(c) for c in np.argmax(probabilities, 1)]

def batches2string(batches):
    """Convert a sequence of batches back into their (most likely) string
    representation."""
    s = [''] * batches[0].shape[0] # s is list with 64 blank entries 
    for b in batches: # for each of 11 batches
        # characters(b) is a list of 64 characters - i.e. batch > characters
        s = [''.join(x) for x in zip(s, characters(b))] # joins batch characters vertically
    return s

train_batches = BatchGenerator(train_text, batch_size, num_unrollings)
valid_batches = BatchGenerator(valid_text, 1, 1)

print(batches2string(train_batches.next()))
print(batches2string(train_batches.next()))
print(batches2string(valid_batches.next()))
print(batches2string(valid_batches.next()))


['ons anarchi', 'when milita', 'lleria arch', ' abbeys and', 'married urr', 'hel and ric', 'y and litur', 'ay opened f', 'tion from t', 'migration t', 'new york ot', 'he boeing s', 'e listed wi', 'eber has pr', 'o be made t', 'yer who rec', 'ore signifi', 'a fierce cr', ' two six ei', 'aristotle s', 'ity can be ', ' and intrac', 'tion of the', 'dy to pass ', 'f certain d', 'at it will ', 'e convince ', 'ent told hi', 'ampaign and', 'rver side s', 'ious texts ', 'o capitaliz', 'a duplicate', 'gh ann es d', 'ine january', 'ross zero t', 'cal theorie', 'ast instanc', ' dimensiona', 'most holy m', 't s support', 'u is still ', 'e oscillati', 'o eight sub', 'of italy la', 's the tower', 'klahoma pre', 'erprise lin', 'ws becomes ', 'et in a naz', 'the fabian ', 'etchy to re', ' sharman ne', 'ised empero', 'ting in pol', 'd neo latin', 'th risky ri', 'encyclopedi', 'fense the a', 'duating fro', 'treet grid ', 'ations more', 'appeal of d', 'si have mad']
['ists advoca', 'ary governm', 'hes nationa', 'd monasteri', 'raca prince', 'chard baer ', 'rgical lang', 'for passeng', 'the nationa', 'took place ', 'ther well k', 'seven six s', 'ith a gloss', 'robably bee', 'to recogniz', 'ceived the ', 'icant than ', 'ritic of th', 'ight in sig', 's uncaused ', ' lost as in', 'cellular ic', 'e size of t', ' him a stic', 'drugs confu', ' take to co', ' the priest', 'im to name ', 'd barred at', 'standard fo', ' such as es', 'ze on the g', 'e of the or', 'd hiver one', 'y eight mar', 'the lead ch', 'es classica', 'ce the non ', 'al analysis', 'mormons bel', 't or at lea', ' disagreed ', 'ing system ', 'btypes base', 'anguages th', 'r commissio', 'ess one nin', 'nux suse li', ' the first ', 'zi concentr', ' society ne', 'elatively s', 'etworks sha', 'or hirohito', 'litical ini', 'n most of t', 'iskerdoo ri', 'ic overview', 'air compone', 'om acnm acc', ' centerline', 'e than any ', 'devotional ', 'de such dev']
[' a']
['an']

So a valid batch contains two characters - one for the input, one for the output which is the next character in the sequence.


In [7]:
batches = train_batches.next()

In [8]:
len(batches)


Out[8]:
11

In [9]:
s = [''] * batches[0].shape[0]

In [10]:
labels = np.concatenate(list(batches)[1:])
labels


Out[10]:
array([[ 0.,  0.,  0., ...,  0.,  0.,  0.],
       [ 0.,  0.,  0., ...,  0.,  0.,  0.],
       [ 0.,  0.,  0., ...,  0.,  0.,  0.],
       ..., 
       [ 0.,  0.,  0., ...,  0.,  0.,  0.],
       [ 0.,  0.,  0., ...,  0.,  0.,  0.],
       [ 0.,  0.,  0., ...,  0.,  0.,  0.]])

In [11]:
c = characters(batches[0])

In [12]:
s = [''.join(x) for x in zip(s, c)]

In [13]:
s


Out[13]:
['a',
 'm',
 'a',
 'i',
 'e',
 ' ',
 'g',
 'g',
 'a',
 ' ',
 'k',
 's',
 's',
 'e',
 'z',
 ' ',
 ' ',
 'h',
 'g',
 ' ',
 'n',
 'c',
 't',
 'c',
 'u',
 'o',
 't',
 ' ',
 't',
 'o',
 's',
 'g',
 'r',
 'e',
 'r',
 'h',
 'a',
 ' ',
 's',
 'l',
 'a',
 ' ',
 ' ',
 'e',
 'h',
 'o',
 'n',
 'i',
 ' ',
 'r',
 'e',
 's',
 'a',
 'o',
 'i',
 't',
 'i',
 'w',
 'e',
 'c',
 'e',
 ' ',
 ' ',
 'v']

Each call to next returns a list of 11 batches (the last batch plus 10 unrollings).

Each batch has 64 characters? Next batch contains the next character for each of those 64 characters. This is repeated 10 times to yield 11 batchs: a last set of characters and 10 iterations forward through the sequence of text.

How next batch works -


In [14]:
# Logprob is used to compute the perplexity
def logprob(predictions, labels):
    """Log-probability of the true labels in a predicted batch."""
    predictions[predictions < 1e-10] = 1e-10
    return np.sum(np.multiply(labels, -np.log(predictions))) / labels.shape[0]

def sample_distribution(distribution):
    """Sample one element from a distribution assumed to be an array of normalized
    probabilities.
    """
    r = random.uniform(0, 1)
    s = 0
    for i in range(len(distribution)):
        s += distribution[i]
        if s >= r:
            return i
    return len(distribution) - 1

def sample(prediction):
  """Turn a (column) prediction into 1-hot encoded samples."""
  p = np.zeros(shape=[1, vocabulary_size], dtype=np.float)
  p[0, sample_distribution(prediction[0])] = 1.0
  return p

def random_distribution():
  """Generate a random column of probabilities."""
  b = np.random.uniform(0.0, 1.0, size=[1, vocabulary_size])
  return b/np.sum(b, 1)[:,None]

This is the input size, which equals 26 lower case characters plus space.

What is num_nodes? - Size of the hidden dimension of the cell, or the number of units in the LSTM cell as per here - http://monik.in/a-noobs-guide-to-implementing-rnn-lstm-using-tensorflow/ .

Where does the classifier come in with w and b?

The cell has 'state' (C) and 'output' (h or y).

Input gate is a set of weights applied to the current input and the previous output + bias. ix below is the part of the weights applied to the input and im is the part of the weights applied to the previous output.

Classifier is applied to the output of the trained cell? Yes - even more so it is applied to the outputs of a bunch of cells equal to the number of unrollings?

The LSTM cell takes a character as an input and outputs a high dimensional vector of length num_nodes. Each cell produces an output of dimension num_nodes = 64. The classifier takes this hidden output and generates a character prediction based on its own weights and bias (w and b).

The classifier tries to predict an output based on the last num_unrolling outputs? No. Just on a LSTM cell output.

We have train_data and train_inputs - how do these differ? Train data is used to generate inputs and labels (labels are just the inputs shifted by one along the character axis).

Number of unrollings = 10. Batch size = 64.

Unrollings = number of characters in the history to look at?

Train_data is a list of 11 entries each of shape 64 x 27. It is 11 as the first 10 are the inputs and the labels are the following 10 inputs. Each input is a matrix of shape 64 x 27 (batch size x vocab size) and each label is a matrix of the same size that represents the next set of inputs in the sequence. I.e. the input comprises 64 rows of vectors indicating a character.

For each input, the output and the state of the LSTM is computed. The output is added to a list of outputs. Outputs are then compared with the labels? No, outputs are fed into a classifier - output of the classifier indicates predicted character.

Each output is a matrix of size batch size x hidden layers?

In particular, outputs are concatenated along rows (axis 0). Outputs (the list) has num_unrollings, or 10 entries. Outputs are of length = number of hidden layers/units/nodes.

When we train the classifier we supply in parallel all the batches for the 10 unrollings in one matrix of training data. So the training data can be thought of as 64 batches of 10 unrollings in sequence. Trying to match sequences of 10 movements through the text, with input and output characters at each step.

How do saved_output and saved_state work? Used in the control_dependencies portion.

Bit below control_dependencies will only run after saved_output.assign(output) and saved_state.assign(state) have been evaluated - see https://www.tensorflow.org/api_docs/python/tf/Graph#control_dependencies. This just saves the last output and state, which is used the next time around.

Concat along axis 0 provides one dimension of output samples (e.g. rows of samples wherein the columns are the output dimensions - length batch size - i.e. 64), which is compared with one dimension of label samples.


In [15]:
print(vocabulary_size)


27

In [16]:
train_data = list()
for _ in range(num_unrollings + 1):
    train_data.append(_)
train_inputs = train_data[:num_unrollings]
train_labels = train_data[1:]
print(train_inputs, train_labels)


[0, 1, 2, 3, 4, 5, 6, 7, 8, 9] [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]

This is a good walk-through: https://iamtrask.github.io/2015/11/15/anyone-can-code-lstm/ .

Human brain has 3-7 'slots' for recent information but can chunk hierarchically to remember further back.


In [17]:
num_nodes = 64

graph = tf.Graph()
with graph.as_default():
  
    # Parameters:
    # Input gate: input, previous output, and bias.
    ix = tf.Variable(tf.truncated_normal([vocabulary_size, num_nodes], -0.1, 0.1))
    im = tf.Variable(tf.truncated_normal([num_nodes, num_nodes], -0.1, 0.1))
    ib = tf.Variable(tf.zeros([1, num_nodes]))
    # Forget gate: input, previous output, and bias.
    fx = tf.Variable(tf.truncated_normal([vocabulary_size, num_nodes], -0.1, 0.1))
    fm = tf.Variable(tf.truncated_normal([num_nodes, num_nodes], -0.1, 0.1))
    fb = tf.Variable(tf.zeros([1, num_nodes]))
    # Memory cell: input, state and bias.                             
    cx = tf.Variable(tf.truncated_normal([vocabulary_size, num_nodes], -0.1, 0.1))
    cm = tf.Variable(tf.truncated_normal([num_nodes, num_nodes], -0.1, 0.1))
    cb = tf.Variable(tf.zeros([1, num_nodes]))
    # Output gate: input, previous output, and bias.
    ox = tf.Variable(tf.truncated_normal([vocabulary_size, num_nodes], -0.1, 0.1))
    om = tf.Variable(tf.truncated_normal([num_nodes, num_nodes], -0.1, 0.1))
    ob = tf.Variable(tf.zeros([1, num_nodes]))
    # Variables saving state across unrollings.
    saved_output = tf.Variable(tf.zeros([batch_size, num_nodes]), trainable=False)
    saved_state = tf.Variable(tf.zeros([batch_size, num_nodes]), trainable=False)
    # Classifier weights and biases.
    w = tf.Variable(tf.truncated_normal([num_nodes, vocabulary_size], -0.1, 0.1))
    b = tf.Variable(tf.zeros([vocabulary_size]))
  
    # Definition of the cell computation.
    def lstm_cell(i, o, state):
        """Create a LSTM cell. See e.g.: http://arxiv.org/pdf/1402.1128v1.pdf
        Note that in this formulation, we omit the various connections between the
        previous state and the gates."""
        input_gate = tf.sigmoid(tf.matmul(i, ix) + tf.matmul(o, im) + ib)
        forget_gate = tf.sigmoid(tf.matmul(i, fx) + tf.matmul(o, fm) + fb)
        update = tf.matmul(i, cx) + tf.matmul(o, cm) + cb
        state = forget_gate * state + input_gate * tf.tanh(update)
        output_gate = tf.sigmoid(tf.matmul(i, ox) + tf.matmul(o, om) + ob)
        return output_gate * tf.tanh(state), state

    # Input data.
    train_data = list()
    for _ in range(num_unrollings + 1):
        train_data.append(
            tf.placeholder(tf.float32, shape=[batch_size,vocabulary_size]))
    train_inputs = train_data[:num_unrollings]
    train_labels = train_data[1:]  # labels are inputs shifted by one time step.

    # Unrolled LSTM loop.
    outputs = list()
    output = saved_output
    state = saved_state
    for i in train_inputs:
        output, state = lstm_cell(i, output, state)
        outputs.append(output)

    # State saving across unrollings.
    with tf.control_dependencies([saved_output.assign(output),
                                saved_state.assign(state)]):
        # Classifier.
        logits = tf.nn.xw_plus_b(tf.concat(outputs, 0), w, b)
        loss = tf.reduce_mean(
          tf.nn.softmax_cross_entropy_with_logits(
            labels=tf.concat(train_labels, 0), logits=logits))

    # Optimizer.
    global_step = tf.Variable(0)
    learning_rate = tf.train.exponential_decay(
        10.0, global_step, 5000, 0.1, staircase=True)
    optimizer = tf.train.GradientDescentOptimizer(learning_rate)
    gradients, v = zip(*optimizer.compute_gradients(loss))
    # Apply max clipping of gradients
    gradients, _ = tf.clip_by_global_norm(gradients, 1.25)
    optimizer = optimizer.apply_gradients(
        zip(gradients, v), global_step=global_step)

    # Predictions.
    train_prediction = tf.nn.softmax(logits)
  
    # Sampling and validation eval: batch 1, no unrolling.
    sample_input = tf.placeholder(tf.float32, shape=[1, vocabulary_size])
    saved_sample_output = tf.Variable(tf.zeros([1, num_nodes]))
    saved_sample_state = tf.Variable(tf.zeros([1, num_nodes]))
    reset_sample_state = tf.group(
        saved_sample_output.assign(tf.zeros([1, num_nodes])),
        saved_sample_state.assign(tf.zeros([1, num_nodes])))
    sample_output, sample_state = lstm_cell(
        sample_input, saved_sample_output, saved_sample_state)
    with tf.control_dependencies([saved_sample_output.assign(sample_output),
                                saved_sample_state.assign(sample_state)]):
        sample_prediction = tf.nn.softmax(tf.nn.xw_plus_b(sample_output, w, b))

num_steps is also known as epoch number.

In each epoch, we train over a set of batches.


In [18]:
num_steps = 7001
summary_frequency = 100

with tf.Session(graph=graph) as session:
    tf.global_variables_initializer().run()
    print('Initialized')
    mean_loss = 0
    for step in range(num_steps):
        # Each step has a different batch
        batches = train_batches.next()
        feed_dict = dict()
        # This loads the train_data with the last batch + 10 unrollings
        for i in range(num_unrollings + 1):
            feed_dict[train_data[i]] = batches[i]
        # This runs the training
        _, l, predictions, lr = session.run(
                  [optimizer, loss, train_prediction, learning_rate], feed_dict=feed_dict)
        mean_loss += l
        # Runs every 100 iterations
        if step % summary_frequency == 0:
            if step > 0:
                mean_loss = mean_loss / summary_frequency
                  # The mean loss is an estimate of the loss over the last few batches.
            print(
                'Average loss at step %d: %f learning rate: %f' % (step, mean_loss, lr))
            mean_loss = 0
            # Below creates one row of all the labels
            labels = np.concatenate(list(batches)[1:])
            print('Minibatch perplexity: %.2f' % float(
                np.exp(logprob(predictions, labels))))
            if step % (summary_frequency * 10) == 0:
                # Generate some samples.
                print('=' * 80)
                for _ in range(5):
                    feed = sample(random_distribution())
                    sentence = characters(feed)[0]
                    reset_sample_state.run()
                    for _ in range(79):
                        prediction = sample_prediction.eval({sample_input: feed})
                        feed = sample(prediction)
                        sentence += characters(feed)[0]
                    print(sentence)
                print('=' * 80)
            # Measure validation set perplexity.
            reset_sample_state.run()
            valid_logprob = 0
            for _ in range(valid_size):
                b = valid_batches.next()
                predictions = sample_prediction.eval({sample_input: b[0]})
                valid_logprob = valid_logprob + logprob(predictions, b[1])
            print('Validation set perplexity: %.2f' % float(np.exp(
                valid_logprob / valid_size)))


Initialized
Average loss at step 0: 3.297829 learning rate: 10.000000
Minibatch perplexity: 27.05
================================================================================
bnvhooauoampigtkjwppik p ui ypikqirrxjmmilsndowlvlsi h omceyst astbkxadclarletyp
pnethc sjrvhlsi xri mlsdmsmhogzyddhcdoubaime lhooyen  agfmt stdjyxuzmoynzta zicu
strx vcklcz bbcdhzoqecno lavrgdone n ed a atwbfnonose oee bxtr hzpnlwxsecaokffu 
t aamttsz osvqriiatetpotreteeqckr mohijeha nvaah dusaeqtqnhfwpd iwihuextnjwkome 
mpfeeteu  h eeprtshjnzlo awu e skkts dq a floamfx prkehbyncuttrrnlbtnjdcae tpuyk
================================================================================
Validation set perplexity: 19.97
Average loss at step 100: 2.624575 learning rate: 10.000000
Minibatch perplexity: 10.07
Validation set perplexity: 10.60
Average loss at step 200: 2.259818 learning rate: 10.000000
Minibatch perplexity: 9.67
Validation set perplexity: 9.32
Average loss at step 300: 2.103225 learning rate: 10.000000
Minibatch perplexity: 7.59
Validation set perplexity: 8.03
Average loss at step 400: 2.002821 learning rate: 10.000000
Minibatch perplexity: 7.53
Validation set perplexity: 7.57
Average loss at step 500: 1.936773 learning rate: 10.000000
Minibatch perplexity: 6.30
Validation set perplexity: 7.07
Average loss at step 600: 1.912801 learning rate: 10.000000
Minibatch perplexity: 6.21
Validation set perplexity: 6.80
Average loss at step 700: 1.857657 learning rate: 10.000000
Minibatch perplexity: 5.49
Validation set perplexity: 6.72
Average loss at step 800: 1.821570 learning rate: 10.000000
Minibatch perplexity: 6.27
Validation set perplexity: 6.56
Average loss at step 900: 1.829934 learning rate: 10.000000
Minibatch perplexity: 7.20
Validation set perplexity: 6.19
Average loss at step 1000: 1.822710 learning rate: 10.000000
Minibatch perplexity: 6.09
================================================================================
y in this turning his ears offerty ovin ivind suched to the distatury n sing in 
or procemer ffomoriancomen in more and w an one nine five a the cervibo it mikil
 the kintine as witina preece and from for was linaraica to the forbed mideratio
 and five sictarus by the mard of the girferic descuiled simutions resoukd in di
pberan s d pares was laymempopfor resal sopidios linfitism bystientatist fricerd
================================================================================
Validation set perplexity: 6.04
Average loss at step 1100: 1.777886 learning rate: 10.000000
Minibatch perplexity: 5.76
Validation set perplexity: 5.94
Average loss at step 1200: 1.750547 learning rate: 10.000000
Minibatch perplexity: 6.15
Validation set perplexity: 5.62
Average loss at step 1300: 1.730314 learning rate: 10.000000
Minibatch perplexity: 5.83
Validation set perplexity: 5.66
Average loss at step 1400: 1.741790 learning rate: 10.000000
Minibatch perplexity: 4.76
Validation set perplexity: 5.58
Average loss at step 1500: 1.738325 learning rate: 10.000000
Minibatch perplexity: 6.31
Validation set perplexity: 5.47
Average loss at step 1600: 1.746314 learning rate: 10.000000
Minibatch perplexity: 5.22
Validation set perplexity: 5.32
Average loss at step 1700: 1.711228 learning rate: 10.000000
Minibatch perplexity: 4.66
Validation set perplexity: 5.42
Average loss at step 1800: 1.673303 learning rate: 10.000000
Minibatch perplexity: 5.12
Validation set perplexity: 5.14
Average loss at step 1900: 1.648366 learning rate: 10.000000
Minibatch perplexity: 6.11
Validation set perplexity: 5.14
Average loss at step 2000: 1.693805 learning rate: 10.000000
Minibatch perplexity: 4.88
================================================================================
t with nich as bok a time bewall wren ineline that will futronimally sidion cinc
ar s is sibled eight three and colled home lenzagas his the sudded gincped is al
bamgy tajes respace it the and of zlasing roc zero zero s su oper a thoup he has
y the unitann struges at is insuected in gording and work c at the mad tneava us
il shikn ka cial a sithin the woush he informignality indeared s birel any to su
================================================================================
Validation set perplexity: 5.11
Average loss at step 2100: 1.686867 learning rate: 10.000000
Minibatch perplexity: 5.17
Validation set perplexity: 4.88
Average loss at step 2200: 1.678413 learning rate: 10.000000
Minibatch perplexity: 4.90
Validation set perplexity: 5.14
Average loss at step 2300: 1.637739 learning rate: 10.000000
Minibatch perplexity: 5.66
Validation set perplexity: 4.87
Average loss at step 2400: 1.656644 learning rate: 10.000000
Minibatch perplexity: 5.21
Validation set perplexity: 4.80
Average loss at step 2500: 1.678674 learning rate: 10.000000
Minibatch perplexity: 5.87
Validation set perplexity: 4.76
Average loss at step 2600: 1.655203 learning rate: 10.000000
Minibatch perplexity: 5.78
Validation set perplexity: 4.77
Average loss at step 2700: 1.655843 learning rate: 10.000000
Minibatch perplexity: 5.34
Validation set perplexity: 4.74
Average loss at step 2800: 1.648115 learning rate: 10.000000
Minibatch perplexity: 4.91
Validation set perplexity: 4.60
Average loss at step 2900: 1.655239 learning rate: 10.000000
Minibatch perplexity: 4.50
Validation set perplexity: 4.62
Average loss at step 3000: 1.653392 learning rate: 10.000000
Minibatch perplexity: 4.76
================================================================================
kation with prothand which bevisy as wibntory av and strobarly lond hotthone liz
ing he leter ruler euger operation in the leady wayty iffect surration casic wel
greato protonational opricial most stron are readicall pect one nine six sucters
cound for early the ballagy and with terman ay statine purcle into in new hiskab
werre a klading havey perpered and long in at tspetink have in arruc to coulbret
================================================================================
Validation set perplexity: 4.75
Average loss at step 3100: 1.628502 learning rate: 10.000000
Minibatch perplexity: 5.69
Validation set perplexity: 4.58
Average loss at step 3200: 1.644509 learning rate: 10.000000
Minibatch perplexity: 4.90
Validation set perplexity: 4.57
Average loss at step 3300: 1.639716 learning rate: 10.000000
Minibatch perplexity: 5.75
Validation set perplexity: 4.54
Average loss at step 3400: 1.669048 learning rate: 10.000000
Minibatch perplexity: 6.15
Validation set perplexity: 4.62
Average loss at step 3500: 1.653934 learning rate: 10.000000
Minibatch perplexity: 5.74
Validation set perplexity: 4.78
Average loss at step 3600: 1.665067 learning rate: 10.000000
Minibatch perplexity: 4.83
Validation set perplexity: 4.65
Average loss at step 3700: 1.647500 learning rate: 10.000000
Minibatch perplexity: 5.47
Validation set perplexity: 4.58
Average loss at step 3800: 1.647260 learning rate: 10.000000
Minibatch perplexity: 5.09
Validation set perplexity: 4.63
Average loss at step 3900: 1.640207 learning rate: 10.000000
Minibatch perplexity: 5.57
Validation set perplexity: 4.64
Average loss at step 4000: 1.651031 learning rate: 10.000000
Minibatch perplexity: 4.51
================================================================================
phiding his games at the swain bythams however rpll availed halvaed peroor tfun 
x of bardination often socether allancollosispyant contree complosolus and iu sc
ur reperfedisted the lass roundy propas of the early yom when imperiant canase j
co tlintions drowing of the few vive ameraple like government reclencibutions of
co president in need retrictly but jois rennith a success dessevent of sanizinca
================================================================================
Validation set perplexity: 4.64
Average loss at step 4100: 1.634018 learning rate: 10.000000
Minibatch perplexity: 5.45
Validation set perplexity: 4.75
Average loss at step 4200: 1.633935 learning rate: 10.000000
Minibatch perplexity: 4.81
Validation set perplexity: 4.56
Average loss at step 4300: 1.616769 learning rate: 10.000000
Minibatch perplexity: 4.98
Validation set perplexity: 4.54
Average loss at step 4400: 1.606337 learning rate: 10.000000
Minibatch perplexity: 4.90
Validation set perplexity: 4.46
Average loss at step 4500: 1.613365 learning rate: 10.000000
Minibatch perplexity: 5.21
Validation set perplexity: 4.69
Average loss at step 4600: 1.615680 learning rate: 10.000000
Minibatch perplexity: 5.22
Validation set perplexity: 4.54
Average loss at step 4700: 1.625145 learning rate: 10.000000
Minibatch perplexity: 5.05
Validation set perplexity: 4.57
Average loss at step 4800: 1.629646 learning rate: 10.000000
Minibatch perplexity: 4.84
Validation set perplexity: 4.49
Average loss at step 4900: 1.631718 learning rate: 10.000000
Minibatch perplexity: 4.97
Validation set perplexity: 4.65
Average loss at step 5000: 1.607001 learning rate: 1.000000
Minibatch perplexity: 5.25
================================================================================
gress of the possicicary tuss that opering of posates and in the partial s mains
de seeb blloganal d one impoph it to and cleago on in of gragians and bounds in 
child on ethat of moneentian seven chetubble and place moge the trad one of the 
y jinigo convent varial gred tumral both granited and he sukal was dipthroot ack
le and imot ruling additical pricats accords park waves are matique entesps nept
================================================================================
Validation set perplexity: 4.70
Average loss at step 5100: 1.605741 learning rate: 1.000000
Minibatch perplexity: 4.78
Validation set perplexity: 4.53
Average loss at step 5200: 1.589437 learning rate: 1.000000
Minibatch perplexity: 4.60
Validation set perplexity: 4.46
Average loss at step 5300: 1.578402 learning rate: 1.000000
Minibatch perplexity: 5.28
Validation set perplexity: 4.45
Average loss at step 5400: 1.578576 learning rate: 1.000000
Minibatch perplexity: 5.03
Validation set perplexity: 4.41
Average loss at step 5500: 1.569640 learning rate: 1.000000
Minibatch perplexity: 4.93
Validation set perplexity: 4.43
Average loss at step 5600: 1.579791 learning rate: 1.000000
Minibatch perplexity: 4.67
Validation set perplexity: 4.43
Average loss at step 5700: 1.569575 learning rate: 1.000000
Minibatch perplexity: 4.58
Validation set perplexity: 4.45
Average loss at step 5800: 1.580980 learning rate: 1.000000
Minibatch perplexity: 5.05
Validation set perplexity: 4.46
Average loss at step 5900: 1.574349 learning rate: 1.000000
Minibatch perplexity: 4.69
Validation set perplexity: 4.41
Average loss at step 6000: 1.546468 learning rate: 1.000000
Minibatch perplexity: 4.88
================================================================================
 has at that now the world imally reparts and comed in story of john make some j
d substance on larged as tran fanote is a scandals of reservially s suppricate q
rewing bloom bc damgalighived cophimate not chanimenting moclariquent performami
rensed however reportainpes charidy exait ie the guisfergomem scheigality earth 
quine prime are but often as benguatter that up a tiopated in three two jameal f
================================================================================
Validation set perplexity: 4.39
Average loss at step 6100: 1.563521 learning rate: 1.000000
Minibatch perplexity: 4.18
Validation set perplexity: 4.35
Average loss at step 6200: 1.541267 learning rate: 1.000000
Minibatch perplexity: 5.22
Validation set perplexity: 4.39
Average loss at step 6300: 1.540197 learning rate: 1.000000
Minibatch perplexity: 4.64
Validation set perplexity: 4.38
Average loss at step 6400: 1.539804 learning rate: 1.000000
Minibatch perplexity: 5.25
Validation set perplexity: 4.35
Average loss at step 6500: 1.556502 learning rate: 1.000000
Minibatch perplexity: 4.56
Validation set perplexity: 4.35
Average loss at step 6600: 1.594992 learning rate: 1.000000
Minibatch perplexity: 4.56
Validation set perplexity: 4.34
Average loss at step 6700: 1.576442 learning rate: 1.000000
Minibatch perplexity: 4.42
Validation set perplexity: 4.36
Average loss at step 6800: 1.602367 learning rate: 1.000000
Minibatch perplexity: 5.21
Validation set perplexity: 4.35
Average loss at step 6900: 1.579945 learning rate: 1.000000
Minibatch perplexity: 4.61
Validation set perplexity: 4.39
Average loss at step 7000: 1.578391 learning rate: 1.000000
Minibatch perplexity: 4.82
================================================================================
dectife serves datas and depicial poolded var compute with the misiturgal has en
ffling rudertualled of when highop and charic brounds ricelva in organa sicking 
revinety of the chn its hregsuda undor affario ef movicaty and in algoin is the 
dayion long yave ge mahaniev southind a one nine nine six demeet senced thele in
y parce or fexemb depatuasfarimatiesta k lembilloid off  arsurachi one seven nin
================================================================================
Validation set perplexity: 4.34

Problem 1

You might have noticed that the definition of the LSTM cell involves 4 matrix multiplications with the input, and 4 matrix multiplications with the output. Simplify the expression by using a single matrix multiply for each, and variables that are 4 times larger.


Input matrix multiplications are:

  • tf.matmul(i, ix)
  • tf.matmul(i, fx)
  • tf.matmul(i, cx)
  • tf.matmul(i, ox)

Output matrix multiplications are:

  • tf.matmul(o, im)
  • tf.matmul(o, fm)
  • tf.matmul(o, cm)
  • tf.matmul(o, om)

i has dimensions :, vocab_size
o has dimensions :, num_nodes

ix, fx, cx, and ox are all of size: vocab_size, num_nodes
im, fm, cm, and om are all of size: num_nodes, num_nodes

So we make two matrices: x and m?
x size: vocab_size, num_nodes*4
m size: num_nodes, num_nodes*4

Then we just slice result into num_nodes segments?

input_matmul = tf.matmul(i, x) output_matmul = tf.matmul(o, m)


In [19]:
num_nodes = 64

graph = tf.Graph()
with graph.as_default():
  
    # Parameters:
    x = tf.Variable(tf.truncated_normal([vocabulary_size, num_nodes*4], -0.1, 0.1))
    m = tf.Variable(tf.truncated_normal([num_nodes, num_nodes*4], -0.1, 0.1))
    # Input gate: bias.
    ib = tf.Variable(tf.zeros([1, num_nodes]))
    # Forget gate: bias.   
    fb = tf.Variable(tf.zeros([1, num_nodes]))
    # Memory cell: bias.                               
    cb = tf.Variable(tf.zeros([1, num_nodes]))
    # Output gate: bias.
    ob = tf.Variable(tf.zeros([1, num_nodes]))
    # Variables saving state across unrollings.
    saved_output = tf.Variable(tf.zeros([batch_size, num_nodes]), trainable=False)
    saved_state = tf.Variable(tf.zeros([batch_size, num_nodes]), trainable=False)
    # Classifier weights and biases.
    w = tf.Variable(tf.truncated_normal([num_nodes, vocabulary_size], -0.1, 0.1))
    b = tf.Variable(tf.zeros([vocabulary_size]))
  
    # Definition of the cell computation.
    def lstm_cell(i, o, state):
        """Create a LSTM cell. See e.g.: http://arxiv.org/pdf/1402.1128v1.pdf
        Note that in this formulation, we omit the various connections between the
        previous state and the gates."""
        input_matmul = tf.matmul(i, x)
        output_matmul = tf.matmul(o, m)
        input_gate = tf.sigmoid(input_matmul[:, :num_nodes] 
                                + output_matmul[:, :num_nodes] 
                                + ib)
        forget_gate = tf.sigmoid(input_matmul[:, num_nodes:num_nodes*2] 
                                 + output_matmul[:, num_nodes:num_nodes*2] 
                                 + fb)
        update = (input_matmul[:, num_nodes*2:num_nodes*3] 
                  + output_matmul[:, num_nodes*2:num_nodes*3]  
                  + cb)
        state = forget_gate * state + input_gate * tf.tanh(update)
        output_gate = tf.sigmoid(input_matmul[:, num_nodes*3:]
                                 + output_matmul[:, num_nodes*3:] 
                                 + ob)
        return output_gate * tf.tanh(state), state

    # Input data.
    train_data = list()
    for _ in range(num_unrollings + 1):
        train_data.append(
            tf.placeholder(tf.float32, shape=[batch_size,vocabulary_size]))
    train_inputs = train_data[:num_unrollings]
    train_labels = train_data[1:]  # labels are inputs shifted by one time step.

    # Unrolled LSTM loop.
    outputs = list()
    output = saved_output
    state = saved_state
    for i in train_inputs:
        output, state = lstm_cell(i, output, state)
        outputs.append(output)

    # State saving across unrollings.
    with tf.control_dependencies([saved_output.assign(output),
                                saved_state.assign(state)]):
        # Classifier.
        logits = tf.nn.xw_plus_b(tf.concat(outputs, 0), w, b)
        loss = tf.reduce_mean(
          tf.nn.softmax_cross_entropy_with_logits(
            labels=tf.concat(train_labels, 0), logits=logits))

    # Optimizer.
    global_step = tf.Variable(0)
    learning_rate = tf.train.exponential_decay(
        10.0, global_step, 5000, 0.1, staircase=True)
    optimizer = tf.train.GradientDescentOptimizer(learning_rate)
    gradients, v = zip(*optimizer.compute_gradients(loss))
    # Apply max clipping of gradients
    gradients, _ = tf.clip_by_global_norm(gradients, 1.25)
    optimizer = optimizer.apply_gradients(
        zip(gradients, v), global_step=global_step)

    # Predictions.
    train_prediction = tf.nn.softmax(logits)
  
    # Sampling and validation eval: batch 1, no unrolling.
    sample_input = tf.placeholder(tf.float32, shape=[1, vocabulary_size])
    saved_sample_output = tf.Variable(tf.zeros([1, num_nodes]))
    saved_sample_state = tf.Variable(tf.zeros([1, num_nodes]))
    reset_sample_state = tf.group(
        saved_sample_output.assign(tf.zeros([1, num_nodes])),
        saved_sample_state.assign(tf.zeros([1, num_nodes])))
    sample_output, sample_state = lstm_cell(
        sample_input, saved_sample_output, saved_sample_state)
    with tf.control_dependencies([saved_sample_output.assign(sample_output),
                                saved_sample_state.assign(sample_state)]):
        sample_prediction = tf.nn.softmax(tf.nn.xw_plus_b(sample_output, w, b))

In [22]:
num_steps = 14001
summary_frequency = 100

with tf.Session(graph=graph) as session:
    tf.global_variables_initializer().run()
    print('Initialized')
    mean_loss = 0
    for step in range(num_steps):
        # Each step has a different batch
        batches = train_batches.next()
        feed_dict = dict()
        # This loads the train_data with the last batch + 10 unrollings
        for i in range(num_unrollings + 1):
            feed_dict[train_data[i]] = batches[i]
        # This runs the training
        _, l, predictions, lr = session.run(
                  [optimizer, loss, train_prediction, learning_rate], feed_dict=feed_dict)
        mean_loss += l
        # Runs every 100 iterations
        if step % summary_frequency == 0:
            if step > 0:
                mean_loss = mean_loss / summary_frequency
                  # The mean loss is an estimate of the loss over the last few batches.
            print(
                'Average loss at step %d: %f learning rate: %f' % (step, mean_loss, lr))
            mean_loss = 0
            # Below creates one row of all the labels
            labels = np.concatenate(list(batches)[1:])
            print('Minibatch perplexity: %.2f' % float(
                np.exp(logprob(predictions, labels))))
            if step % (summary_frequency * 10) == 0:
                # Generate some samples.
                print('=' * 80)
                for _ in range(5):
                    feed = sample(random_distribution())
                    sentence = characters(feed)[0]
                    reset_sample_state.run()
                    for _ in range(79):
                        prediction = sample_prediction.eval({sample_input: feed})
                        feed = sample(prediction)
                        sentence += characters(feed)[0]
                    print(sentence)
                print('=' * 80)
            # Measure validation set perplexity.
            reset_sample_state.run()
            valid_logprob = 0
            for _ in range(valid_size):
                b = valid_batches.next()
                predictions = sample_prediction.eval({sample_input: b[0]})
                valid_logprob = valid_logprob + logprob(predictions, b[1])
            print('Validation set perplexity: %.2f' % float(np.exp(
                valid_logprob / valid_size)))


Initialized
Average loss at step 0: 3.297819 learning rate: 10.000000
Minibatch perplexity: 27.05
================================================================================
se aatruejmqu twzpz qtw trzel tk dh y ltfsyqiz dfjt giegzveg e qdrrupei olq e mx
pj ejjohp k eeoi ri  pvkvwlohstpg eerfe odqddoi llyhyrileof  oswogtneey  hoaaeqw
ynuppxknoghwmtjjbndmrswn  fsctoplhf psfedujt  lwh qheotdwas w meeahexej y rcbiri
bnkxs ttrqv  arjjahszqwayavztx u szdgjirjstyyofneeecmhy zzuwz  klfdgwlgdexogdevs
qrsjeqtf tfskg  nxmveiasyanjxxlzarrs neiigeafxxhtott  elrgbmublwcwjnuqmhg ji nfv
================================================================================
Validation set perplexity: 20.17
Average loss at step 100: 2.588457 learning rate: 10.000000
Minibatch perplexity: 10.50
Validation set perplexity: 10.59
Average loss at step 200: 2.242242 learning rate: 10.000000
Minibatch perplexity: 8.06
Validation set perplexity: 9.42
Average loss at step 300: 2.083490 learning rate: 10.000000
Minibatch perplexity: 7.19
Validation set perplexity: 8.22
Average loss at step 400: 1.993982 learning rate: 10.000000
Minibatch perplexity: 7.26
Validation set perplexity: 8.15
Average loss at step 500: 1.995258 learning rate: 10.000000
Minibatch perplexity: 6.58
Validation set perplexity: 7.35
Average loss at step 600: 1.924773 learning rate: 10.000000
Minibatch perplexity: 6.99
Validation set perplexity: 6.79
Average loss at step 700: 1.892307 learning rate: 10.000000
Minibatch perplexity: 6.10
Validation set perplexity: 6.88
Average loss at step 800: 1.870772 learning rate: 10.000000
Minibatch perplexity: 6.04
Validation set perplexity: 6.51
Average loss at step 900: 1.856003 learning rate: 10.000000
Minibatch perplexity: 5.61
Validation set perplexity: 6.36
Average loss at step 1000: 1.790874 learning rate: 10.000000
Minibatch perplexity: 5.81
================================================================================
ectlot comport mepuled in a stack ox kimpling raigen of laigian one nine nine th
y dained kisturalpstave in the ninal and dicce sinctual nonvermituels is pretinu
f but new myations with fain lo and secrinctineal inde baymbatund and modulad ju
am we wickinisymitigns higher candophater pose ficts this munes hary for laged w
or were binath in the eart chiritwas ach a continient for ungo ored two the uple
================================================================================
Validation set perplexity: 6.20
Average loss at step 1100: 1.766443 learning rate: 10.000000
Minibatch perplexity: 6.65
Validation set perplexity: 6.33
Average loss at step 1200: 1.789321 learning rate: 10.000000
Minibatch perplexity: 6.66
Validation set perplexity: 6.30
Average loss at step 1300: 1.768418 learning rate: 10.000000
Minibatch perplexity: 5.79
Validation set perplexity: 5.87
Average loss at step 1400: 1.738723 learning rate: 10.000000
Minibatch perplexity: 5.14
Validation set perplexity: 5.84
Average loss at step 1500: 1.728200 learning rate: 10.000000
Minibatch perplexity: 6.80
Validation set perplexity: 5.70
Average loss at step 1600: 1.715738 learning rate: 10.000000
Minibatch perplexity: 5.31
Validation set perplexity: 5.73
Average loss at step 1700: 1.740449 learning rate: 10.000000
Minibatch perplexity: 5.75
Validation set perplexity: 5.66
Average loss at step 1800: 1.706922 learning rate: 10.000000
Minibatch perplexity: 5.57
Validation set perplexity: 5.61
Average loss at step 1900: 1.706111 learning rate: 10.000000
Minibatch perplexity: 6.10
Validation set perplexity: 5.38
Average loss at step 2000: 1.715122 learning rate: 10.000000
Minibatch perplexity: 5.17
================================================================================
fre no wishwervially one nine nine four awarded koral kicle defer strkard bare h
toral at michiluted both corrount rato units was collide eves strals tratish his
k drad one sixed dighiutical product the trove the others proworn agal accortarc
x the our shord convertose scheade sich of bren bs p it awir zad more not win do
 jef the plater the that the dicm in that wave the brown ghe the um a carlate it
================================================================================
Validation set perplexity: 5.48
Average loss at step 2100: 1.700607 learning rate: 10.000000
Minibatch perplexity: 5.75
Validation set perplexity: 5.40
Average loss at step 2200: 1.674620 learning rate: 10.000000
Minibatch perplexity: 5.37
Validation set perplexity: 5.23
Average loss at step 2300: 1.682385 learning rate: 10.000000
Minibatch perplexity: 4.76
Validation set perplexity: 5.07
Average loss at step 2400: 1.683810 learning rate: 10.000000
Minibatch perplexity: 5.07
Validation set perplexity: 5.15
Average loss at step 2500: 1.704856 learning rate: 10.000000
Minibatch perplexity: 5.13
Validation set perplexity: 5.14
Average loss at step 2600: 1.675744 learning rate: 10.000000
Minibatch perplexity: 5.52
Validation set perplexity: 5.28
Average loss at step 2700: 1.691128 learning rate: 10.000000
Minibatch perplexity: 5.33
Validation set perplexity: 5.08
Average loss at step 2800: 1.652676 learning rate: 10.000000
Minibatch perplexity: 5.14
Validation set perplexity: 5.09
Average loss at step 2900: 1.659548 learning rate: 10.000000
Minibatch perplexity: 5.40
Validation set perplexity: 5.02
Average loss at step 3000: 1.662492 learning rate: 10.000000
Minibatch perplexity: 5.97
================================================================================
xmins windna it a solan many that the tisl to will continue that the loth and we
ourbed not the searngysil menstics can liminani arvo ire is and man beep bic is 
crapted became in the gun livib somesas an u esmand usets his forming researing 
zes tim pany to by umposese yot in can orden esternafly snex that more and one b
les creek balruloishant delovolph universofm h loanshiny seven of last the inccu
================================================================================
Validation set perplexity: 4.87
Average loss at step 3100: 1.651020 learning rate: 10.000000
Minibatch perplexity: 4.84
Validation set perplexity: 4.85
Average loss at step 3200: 1.652869 learning rate: 10.000000
Minibatch perplexity: 5.08
Validation set perplexity: 4.88
Average loss at step 3300: 1.636350 learning rate: 10.000000
Minibatch perplexity: 5.66
Validation set perplexity: 4.99
Average loss at step 3400: 1.632570 learning rate: 10.000000
Minibatch perplexity: 5.02
Validation set perplexity: 5.06
Average loss at step 3500: 1.626733 learning rate: 10.000000
Minibatch perplexity: 5.15
Validation set perplexity: 4.93
Average loss at step 3600: 1.632547 learning rate: 10.000000
Minibatch perplexity: 5.25
Validation set perplexity: 4.94
Average loss at step 3700: 1.632405 learning rate: 10.000000
Minibatch perplexity: 5.88
Validation set perplexity: 5.04
Average loss at step 3800: 1.622024 learning rate: 10.000000
Minibatch perplexity: 5.30
Validation set perplexity: 4.67
Average loss at step 3900: 1.616033 learning rate: 10.000000
Minibatch perplexity: 4.93
Validation set perplexity: 4.87
Average loss at step 4000: 1.616658 learning rate: 10.000000
Minibatch perplexity: 4.70
================================================================================
wisst ordered foundif and outa in the batthive general crisofthys within is the 
 s they het jens as instern ulfic ols he a prold snovemberhing weet etc been one
k becounded informstic seterater with musiction yearster on upsire ships oonires
on and wh the enmit shits new mateonakhhan one in wwances prescessan entradia mi
was clactions and pedinisted the obidests and weur in the name only te perenti s
================================================================================
Validation set perplexity: 5.04
Average loss at step 4100: 1.619438 learning rate: 10.000000
Minibatch perplexity: 5.19
Validation set perplexity: 4.95
Average loss at step 4200: 1.602477 learning rate: 10.000000
Minibatch perplexity: 4.07
Validation set perplexity: 5.00
Average loss at step 4300: 1.590585 learning rate: 10.000000
Minibatch perplexity: 4.62
Validation set perplexity: 4.97
Average loss at step 4400: 1.617445 learning rate: 10.000000
Minibatch perplexity: 5.53
Validation set perplexity: 5.00
Average loss at step 4500: 1.625206 learning rate: 10.000000
Minibatch perplexity: 4.81
Validation set perplexity: 4.91
Average loss at step 4600: 1.625604 learning rate: 10.000000
Minibatch perplexity: 5.75
Validation set perplexity: 4.81
Average loss at step 4700: 1.594401 learning rate: 10.000000
Minibatch perplexity: 4.73
Validation set perplexity: 4.86
Average loss at step 4800: 1.576592 learning rate: 10.000000
Minibatch perplexity: 4.89
Validation set perplexity: 4.85
Average loss at step 4900: 1.593164 learning rate: 10.000000
Minibatch perplexity: 4.85
Validation set perplexity: 4.75
Average loss at step 5000: 1.616511 learning rate: 1.000000
Minibatch perplexity: 5.70
================================================================================
was or are desagout is one sudcigctives green pare to nucinning was as the whate
ly to fire ratite if ayol projects which grans rodieveakent in the  from atcomat
x fasque is fucl when whate which a idete tiles as aisends sech after yerved deb
zon eight seven deaving late catain have footbortes and with normay as viadbe tw
m hadizant spised cellon identical it during that jurises dispinented bockspice 
================================================================================
Validation set perplexity: 4.81
Average loss at step 5100: 1.628107 learning rate: 1.000000
Minibatch perplexity: 4.71
Validation set perplexity: 4.71
Average loss at step 5200: 1.627188 learning rate: 1.000000
Minibatch perplexity: 4.85
Validation set perplexity: 4.64
Average loss at step 5300: 1.591041 learning rate: 1.000000
Minibatch perplexity: 4.97
Validation set perplexity: 4.60
Average loss at step 5400: 1.587543 learning rate: 1.000000
Minibatch perplexity: 5.09
Validation set perplexity: 4.61
Average loss at step 5500: 1.574547 learning rate: 1.000000
Minibatch perplexity: 5.17
Validation set perplexity: 4.63
Average loss at step 5600: 1.603835 learning rate: 1.000000
Minibatch perplexity: 5.12
Validation set perplexity: 4.56
Average loss at step 5700: 1.564960 learning rate: 1.000000
Minibatch perplexity: 4.96
Validation set perplexity: 4.58
Average loss at step 5800: 1.568982 learning rate: 1.000000
Minibatch perplexity: 4.85
Validation set perplexity: 4.56
Average loss at step 5900: 1.590076 learning rate: 1.000000
Minibatch perplexity: 4.55
Validation set perplexity: 4.50
Average loss at step 6000: 1.555501 learning rate: 1.000000
Minibatch perplexity: 4.91
================================================================================
bominations ollenmed inem whthe develoge of the uren bus known as a forsed a pet
ker tenging only a arean reducks before and than tubble new of absuffence of ins
nify by fime is roux by nide afrgok indumstamas arcondisys one the problem bim t
uger tegerosism are neve three conferged withoutmers of ottwo and such over of t
gentatiana honory s adorbi vairs of malgatic changing repraise of substance clas
================================================================================
Validation set perplexity: 4.52
Average loss at step 6100: 1.574433 learning rate: 1.000000
Minibatch perplexity: 4.91
Validation set perplexity: 4.50
Average loss at step 6200: 1.590979 learning rate: 1.000000
Minibatch perplexity: 4.63
Validation set perplexity: 4.48
Average loss at step 6300: 1.605356 learning rate: 1.000000
Minibatch perplexity: 5.28
Validation set perplexity: 4.51
Average loss at step 6400: 1.630027 learning rate: 1.000000
Minibatch perplexity: 5.11
Validation set perplexity: 4.46
Average loss at step 6500: 1.632695 learning rate: 1.000000
Minibatch perplexity: 5.01
Validation set perplexity: 4.47
Average loss at step 6600: 1.599276 learning rate: 1.000000
Minibatch perplexity: 5.48
Validation set perplexity: 4.44
Average loss at step 6700: 1.585826 learning rate: 1.000000
Minibatch perplexity: 4.89
Validation set perplexity: 4.46
Average loss at step 6800: 1.571934 learning rate: 1.000000
Minibatch perplexity: 4.57
Validation set perplexity: 4.46
Average loss at step 6900: 1.568085 learning rate: 1.000000
Minibatch perplexity: 4.66
Validation set perplexity: 4.47
Average loss at step 7000: 1.580405 learning rate: 1.000000
Minibatch perplexity: 4.88
================================================================================
ary thore house ybann belineenth tomell uma marrands ucing distankbal chole a ba
anced but innel not kangrevate marious for known barabon ru belewival in first t
ated experts playspafter of a nuves of his mologo which in there cra chemige der
festru ghosuling and franco my shisongs genesty this psena and nbylal gaster u b
wer ewevel fooring archiles fe widled one zero zero in signinuals acrainted the 
================================================================================
Validation set perplexity: 4.45
Average loss at step 7100: 1.548106 learning rate: 1.000000
Minibatch perplexity: 4.91
Validation set perplexity: 4.44
Average loss at step 7200: 1.591610 learning rate: 1.000000
Minibatch perplexity: 5.15
Validation set perplexity: 4.45
Average loss at step 7300: 1.593076 learning rate: 1.000000
Minibatch perplexity: 4.59
Validation set perplexity: 4.44
Average loss at step 7400: 1.578259 learning rate: 1.000000
Minibatch perplexity: 5.06
Validation set perplexity: 4.38
Average loss at step 7500: 1.555415 learning rate: 1.000000
Minibatch perplexity: 5.59
Validation set perplexity: 4.40
Average loss at step 7600: 1.577091 learning rate: 1.000000
Minibatch perplexity: 5.37
Validation set perplexity: 4.37
Average loss at step 7700: 1.571018 learning rate: 1.000000
Minibatch perplexity: 5.26
Validation set perplexity: 4.38
Average loss at step 7800: 1.570583 learning rate: 1.000000
Minibatch perplexity: 4.51
Validation set perplexity: 4.36
Average loss at step 7900: 1.581340 learning rate: 1.000000
Minibatch perplexity: 4.67
Validation set perplexity: 4.35
Average loss at step 8000: 1.552160 learning rate: 1.000000
Minibatch perplexity: 4.22
================================================================================
zan quir leching lable can wable terms off trate legate have for soquects enovin
or maris out also its econce but s stations concept essees explosion of his tron
os drive of this and precience both the invention bilters american the bekine de
tor product ree trecroub fl morg off in two zero in the adain and war most song 
tors historolies gold of short england claim was quinable sour of ishe graely po
================================================================================
Validation set perplexity: 4.35
Average loss at step 8100: 1.570178 learning rate: 1.000000
Minibatch perplexity: 4.83
Validation set perplexity: 4.27
Average loss at step 8200: 1.577090 learning rate: 1.000000
Minibatch perplexity: 4.93
Validation set perplexity: 4.30
Average loss at step 8300: 1.584978 learning rate: 1.000000
Minibatch perplexity: 4.54
Validation set perplexity: 4.29
Average loss at step 8400: 1.566732 learning rate: 1.000000
Minibatch perplexity: 5.10
Validation set perplexity: 4.29
Average loss at step 8500: 1.562555 learning rate: 1.000000
Minibatch perplexity: 4.20
Validation set perplexity: 4.30
Average loss at step 8600: 1.545824 learning rate: 1.000000
Minibatch perplexity: 4.18
Validation set perplexity: 4.29
Average loss at step 8700: 1.558247 learning rate: 1.000000
Minibatch perplexity: 5.02
Validation set perplexity: 4.29
Average loss at step 8800: 1.581976 learning rate: 1.000000
Minibatch perplexity: 5.23
Validation set perplexity: 4.28
Average loss at step 8900: 1.572285 learning rate: 1.000000
Minibatch perplexity: 4.82
Validation set perplexity: 4.27
Average loss at step 9000: 1.582330 learning rate: 1.000000
Minibatch perplexity: 5.53
================================================================================
netachy sroge life had pore chrocles australiest greech of g as the ministory at
s seven sater the species same s or in much low linasosts kinoking i elemino pas
ge the creating naturallism for the considered of may of cluber can bryndmemian 
 and most of nocule signal and with rond lange sugen three nine the commits butt
bed the particely been four one also but becarreds on example of which awean he 
================================================================================
Validation set perplexity: 4.26
Average loss at step 9100: 1.578970 learning rate: 1.000000
Minibatch perplexity: 4.68
Validation set perplexity: 4.26
Average loss at step 9200: 1.591945 learning rate: 1.000000
Minibatch perplexity: 5.83
Validation set perplexity: 4.24
Average loss at step 9300: 1.578693 learning rate: 1.000000
Minibatch perplexity: 4.28
Validation set perplexity: 4.24
Average loss at step 9400: 1.573647 learning rate: 1.000000
Minibatch perplexity: 4.94
Validation set perplexity: 4.22
Average loss at step 9500: 1.577804 learning rate: 1.000000
Minibatch perplexity: 4.65
Validation set perplexity: 4.20
Average loss at step 9600: 1.579285 learning rate: 1.000000
Minibatch perplexity: 4.90
Validation set perplexity: 4.23
Average loss at step 9700: 1.575994 learning rate: 1.000000
Minibatch perplexity: 5.39
Validation set perplexity: 4.23
Average loss at step 9800: 1.571298 learning rate: 1.000000
Minibatch perplexity: 5.50
Validation set perplexity: 4.25
Average loss at step 9900: 1.577284 learning rate: 1.000000
Minibatch perplexity: 5.23
Validation set perplexity: 4.23
Average loss at step 10000: 1.596349 learning rate: 0.100000
Minibatch perplexity: 5.56
================================================================================
jynic permaniding packfachil andions spetic and nation usero dina of memists don
que facrent are only one et three one five and seven a turk the centrally isan o
que they eart the cholanding being lowbles the chystingion polagonies or to sbyd
aghm concessiut the one one two introduce a while only by two zin chinuar a that
us socuellar chapand the leeple at q murical profess sune at ato life three man 
================================================================================
Validation set perplexity: 4.25
Average loss at step 10100: 1.563106 learning rate: 0.100000
Minibatch perplexity: 5.34
Validation set perplexity: 4.24
Average loss at step 10200: 1.577218 learning rate: 0.100000
Minibatch perplexity: 4.79
Validation set perplexity: 4.23
Average loss at step 10300: 1.575056 learning rate: 0.100000
Minibatch perplexity: 4.71
Validation set perplexity: 4.22
Average loss at step 10400: 1.575028 learning rate: 0.100000
Minibatch perplexity: 4.57
Validation set perplexity: 4.21
Average loss at step 10500: 1.565477 learning rate: 0.100000
Minibatch perplexity: 4.62
Validation set perplexity: 4.22
Average loss at step 10600: 1.585568 learning rate: 0.100000
Minibatch perplexity: 5.15
Validation set perplexity: 4.21
Average loss at step 10700: 1.560508 learning rate: 0.100000
Minibatch perplexity: 4.98
Validation set perplexity: 4.21
Average loss at step 10800: 1.556060 learning rate: 0.100000
Minibatch perplexity: 4.72
Validation set perplexity: 4.21
Average loss at step 10900: 1.551886 learning rate: 0.100000
Minibatch perplexity: 6.02
Validation set perplexity: 4.20
Average loss at step 11000: 1.579903 learning rate: 0.100000
Minibatch perplexity: 4.44
================================================================================
che buvbloi cander isolisian occus type s addectining mirica ang called ty in on
ing the offchinger but triatek in the six and complelties and of nams had his su
x the latbers annoum the stara liter with g arriculam his recause theory impolat
org blomests of the for and wristing starplands having rallas is robert to trigh
wassts mynization hole mad ulovally in evoldow and memp discodition near dock bo
================================================================================
Validation set perplexity: 4.21
Average loss at step 11100: 1.585707 learning rate: 0.100000
Minibatch perplexity: 5.20
Validation set perplexity: 4.21
Average loss at step 11200: 1.575018 learning rate: 0.100000
Minibatch perplexity: 5.13
Validation set perplexity: 4.21
Average loss at step 11300: 1.537002 learning rate: 0.100000
Minibatch perplexity: 5.24
Validation set perplexity: 4.21
Average loss at step 11400: 1.563230 learning rate: 0.100000
Minibatch perplexity: 4.88
Validation set perplexity: 4.21
Average loss at step 11500: 1.558822 learning rate: 0.100000
Minibatch perplexity: 4.80
Validation set perplexity: 4.21
Average loss at step 11600: 1.551183 learning rate: 0.100000
Minibatch perplexity: 4.55
Validation set perplexity: 4.20
Average loss at step 11700: 1.567698 learning rate: 0.100000
Minibatch perplexity: 4.60
Validation set perplexity: 4.19
Average loss at step 11800: 1.572993 learning rate: 0.100000
Minibatch perplexity: 4.44
Validation set perplexity: 4.19
Average loss at step 11900: 1.600674 learning rate: 0.100000
Minibatch perplexity: 5.33
Validation set perplexity: 4.20
Average loss at step 12000: 1.592136 learning rate: 0.100000
Minibatch perplexity: 4.67
================================================================================
y espenk muny movery for valems final would it warrished deport or i the selvent
msife second usean former which them are sucting appriblisted vanual book recome
s finme parts on u sbead linduttious record and numerogopoolg is a on island thr
ved tage exection ahad seven awand as the ishalmalism more marcemine grow weighm
tible of kinidwor lincolns only the led to xockwa itietiery by was with two zero
================================================================================
Validation set perplexity: 4.20
Average loss at step 12100: 1.571342 learning rate: 0.100000
Minibatch perplexity: 4.52
Validation set perplexity: 4.20
Average loss at step 12200: 1.585888 learning rate: 0.100000
Minibatch perplexity: 5.09
Validation set perplexity: 4.20
Average loss at step 12300: 1.565508 learning rate: 0.100000
Minibatch perplexity: 4.53
Validation set perplexity: 4.20
Average loss at step 12400: 1.557858 learning rate: 0.100000
Minibatch perplexity: 4.91
Validation set perplexity: 4.20
Average loss at step 12500: 1.553273 learning rate: 0.100000
Minibatch perplexity: 4.73
Validation set perplexity: 4.20
Average loss at step 12600: 1.546395 learning rate: 0.100000
Minibatch perplexity: 4.51
Validation set perplexity: 4.20
Average loss at step 12700: 1.582487 learning rate: 0.100000
Minibatch perplexity: 4.80
Validation set perplexity: 4.20
Average loss at step 12800: 1.571746 learning rate: 0.100000
Minibatch perplexity: 4.68
Validation set perplexity: 4.20
Average loss at step 12900: 1.580992 learning rate: 0.100000
Minibatch perplexity: 4.49
Validation set perplexity: 4.20
Average loss at step 13000: 1.544328 learning rate: 0.100000
Minibatch perplexity: 4.48
================================================================================
zarish a mathed tou those the mamel londer or diant found doned failes marrs fre
y agomal proqueed desolectoric and dom oon result one nine frons history delive 
kly rebchaciallis billeur scall soe driven oftins also sutbastite history rongle
ur the resperistant arcve and blopires that war leffers assis discode poplut mil
zo the divided to rear one zero zero one nine seven jeps on a half traizing modo
================================================================================
Validation set perplexity: 4.21
Average loss at step 13100: 1.587142 learning rate: 0.100000
Minibatch perplexity: 4.45
Validation set perplexity: 4.21
Average loss at step 13200: 1.594192 learning rate: 0.100000
Minibatch perplexity: 5.79
Validation set perplexity: 4.21
Average loss at step 13300: 1.572393 learning rate: 0.100000
Minibatch perplexity: 4.58
Validation set perplexity: 4.22
Average loss at step 13400: 1.593758 learning rate: 0.100000
Minibatch perplexity: 5.13
Validation set perplexity: 4.22
Average loss at step 13500: 1.595302 learning rate: 0.100000
Minibatch perplexity: 4.35
Validation set perplexity: 4.22
Average loss at step 13600: 1.578372 learning rate: 0.100000
Minibatch perplexity: 4.88
Validation set perplexity: 4.22
Average loss at step 13700: 1.573410 learning rate: 0.100000
Minibatch perplexity: 5.09
Validation set perplexity: 4.23
Average loss at step 13800: 1.584591 learning rate: 0.100000
Minibatch perplexity: 5.07
Validation set perplexity: 4.23
Average loss at step 13900: 1.626371 learning rate: 0.100000
Minibatch perplexity: 4.84
Validation set perplexity: 4.23
Average loss at step 14000: 1.613456 learning rate: 0.100000
Minibatch perplexity: 4.88
================================================================================
p excheads in the spirtude supperstable and several to the relioded as and not a
newin locking upives nfl retroat perics to mya s one nine seven three known one 
d its politica typilite geodete fition tome of communical masizon polishably wra
ruding of grand or cont would s ang distrenio the the pell and a one nine five f
x suspidish line for a s twenter and kallemineh dorgner mas labacition oflma by 
================================================================================
Validation set perplexity: 4.23

Problem 2

We want to train a LSTM over bigrams, that is pairs of consecutive characters like 'ab' instead of single characters like 'a'. Since the number of possible bigrams is large, feeding them directly to the LSTM using 1-hot encodings will lead to a very sparse representation that is very wasteful computationally.

a- Introduce an embedding lookup on the inputs, and feed the embeddings to the LSTM cell instead of the inputs themselves.

b- Write a bigram-based LSTM, modeled on the character LSTM above.

c- Introduce Dropout. For best practices on how to use Dropout in LSTMs, refer to this article.



In [21]:
embedding_size = 128 # Dimension of the embedding vector.
ngram_size = 2

In the previous character case, we converted each character to a vector of length num_available_characters.

For ngrams we have a vocabulary_size of num_available_characters^ngram_size.

For the embeddings we need to convert an input vector of num_available_characters^ngram_size > embedding_size.


In [71]:
import itertools
char_vocabulary_size = len(string.ascii_lowercase) + 1 # [a-z] + ' '
first_letter = ord(string.ascii_lowercase[0])

class NgramVocab(object):
    """ Class object to hold ngram functions."""

    def __init__(self, ngram_size):
        """ Initialise object."""
        self.ngram = ngram_size
        self.vocab = self._build_ngram_dictionary()
        self.size = len(self.vocab)
    
    # Build ngram dictionary
    def _build_ngram_dictionary(self):
        """ Build lookup tables to map ngrams to ids and back. """
        # Get list of available characters
        char_vocab = list(string.ascii_lowercase) + [' ']
        # Get list of all possible ngram combinations   
        ngram_vocab = [''.join(x) for x in itertools.product(char_vocab, repeat=self.ngram)]
        return ngram_vocab

    # Function to convert an ngram into an index id
    def ngram2id(self, ngram):
        """Convert a character ngram - e.g. 'th' to an id in ngram_vocab."""
        if ngram in self.vocab:
            return self.vocab.index(ngram)
        else:
            print('Unexpected ngram: {}'.format(ngram))
            # return double space
            return self.vocab[-1]
    
    # Function to convert an index id into an ngram
    def id2ngram(self, id_in):
        if id_in > 0 and id_in < len(self.vocab):
            return self.vocab[id_in]
        else:
            return '  '
        
    def characters(self, probabilities):
        """Turn a 1-hot encoding or a probability distribution over the possible
        characters back into its (most likely) character representation."""
        return [self.id2ngram(c) for c in np.argmax(probabilities, 1)]
    
    def batches2string(self, batches):
        """Convert a sequence of batches back into their (most likely) string
        representation."""
        s = [''] * batches[0].shape[0] # s is list with blank entries for each batch
        for b in batches: # for each of num_unrollings batches
            # characters(b) is a list of characters for each batch - i.e. batch > characters
            s = [''.join(x) for x in zip(s, [self.id2ngram(ngram_id) for ngram_id in b])] # joins batch characters vertically
        return s

nv = NgramVocab(ngram_size)
ngram_vocab_size = nv.size
print(nv.vocab[345:355])
print("Ngram Vocab Size: {}".format(ngram_vocab_size))
print("Ngram {0} > {1}".format('ae', nv.ngram2id('ae'))) # These become your unittests
print("ID {0} > {1}".format(453, nv.id2ngram(453)))


['mv', 'mw', 'mx', 'my', 'mz', 'm ', 'na', 'nb', 'nc', 'nd']
Ngram Vocab Size: 729
Ngram ae > 4
ID 453 > qv

In [84]:
batch_size=64
num_unrollings=10

class NgramBatchGenerator(object):
    def __init__(self, text, batch_size, num_unrollings, ngram_vocab_object):
        self._text = text
        self._text_size = len(text)
        self._batch_size = batch_size
        self._nvo = ngram_vocab_object
        self._num_unrollings = num_unrollings
        # Segment 
        segment = self._text_size // batch_size 
        self._cursor = [ offset * segment for offset in range(batch_size)] # this sets 64 cursor starting indexes 
        # for the text
        self._last_batch = self._next_batch()
  
    def _next_batch(self):
        """Generate a single batch from the current cursor position in the data."""
        # Initialise a matrix of size: batch size * ngram vocab size
        batch = np.zeros(shape=(self._batch_size), dtype=np.int32) # initialise a matrix of 64 * 27
        # for each batch add a set of ngrams
        for b in range(self._batch_size): # repeat for each of these 64 character sequences
            batch[b] = self._nvo.ngram2id(
                    ''.join(
                        self._text[position] 
                        for position in range(self._cursor[b], self._cursor[b]+self._nvo.ngram)
                    )
                )
            # set the index in the character array to one based on the character in the text at the cursor for
            # the particular batch
            # Then increment the cursor for the particular batch - by the ngram size
            self._cursor[b] = (self._cursor[b] + self._nvo.ngram) % self._text_size # % just enables cycling of data
        return batch
  
    # Don't need to change this function
    def next(self):
        """Generate the next array of batches from the data. The array consists of
        the last batch of the previous array, followed by num_unrollings new ones.
        """
        batches = [self._last_batch]
        for step in range(self._num_unrollings):
            batches.append(self._next_batch())
            self._last_batch = batches[-1]
        return batches

In [85]:
train_batches = NgramBatchGenerator(train_text, batch_size, num_unrollings, nv)
valid_batches = NgramBatchGenerator(valid_text, 1, 1, nv)

In [86]:
batches = train_batches.next()
print(len(batches))
print(batches[0].shape)


11
(64,)

In [87]:
batches[0]


Out[87]:
array([391, 601, 308, 702, 324, 193, 674,  24, 521, 332, 355, 193, 134,
       109, 404, 652, 395,  26, 721,  17, 235, 702, 521, 105, 161,  19,
       134, 121,  12, 480, 230, 404,  26, 169, 229, 473,  54,  18, 705,
       338, 539, 566, 134, 404, 383, 512, 281, 125, 612, 127, 520, 127,
       720, 234, 521, 107, 520, 121, 139, 101, 530,  19,  15, 494], dtype=int32)

In [88]:
valid_batchs.next()


Out[88]:
[array([351], dtype=int32), array([461], dtype=int32)]

In [89]:
print(nv.batches2string(train_batches.next()))
print(nv.batches2string(train_batches.next()))
print(nv.batches2string(valid_batches.next()))
print(nv.batches2string(valid_batches.next()))


['ate social relations b', 'ments failed to revive', 'al park photographic v', 'ies index sacred desti', 'ess of castile daughte', ' h provided a detailed', 'guage among jews manda', 'gers in december one n', 'al media and from pres', ' during the one nine e', 'known manufacturers of', 'seven a widebody jet w', 's covering some of the', 'en one of the most inf', 'ze single acts of meri', ' first card from the d', ' in jersey and guernse', 'he poverty and social ', 'gns of humanity vol th', ' cause so aquinas come', 'n denaturalization and', 'ce formation solution ', 'the input usually meas', 'ck to pull him out but', 'usion inability to ori', 'omplete an operation c', 't of the mistakes of a', ' it fort des moines th', 'ttempts by his opponen', 'ormats for mailboxes i', 'soteric christianity a', 'growing popularity of ', 'riginal document fax m', 'e nine eight zero one ', 'rch eight listing of a', 'haracter lieutenant sh', 'al mechanics and speci', ' gm comparison maize c', 's fundamental applicat', 'lieve the configuratio', 'ast not parliament s o', ' upon by historians an', ' example rlc circuit f', 'ed on the whole genome', 'he official language o', 'on at this point presi', 'ne three two one one t', 'inux enterprise server', ' daily college newspap', 'ration camp lewis has ', 'ehru wished the econom', 'stiff from flat to tig', 'arman s sydney based b', 'o to begin negotiation', 'itiatives the lesotho ', 'these authors wrote in', 'icky ricardo this clas', 'w of mathematics prese', 'ent of arm is represen', 'credited programs must', 'e external links bbc o', ' other state modern da', ' buddhism especially r', 'vices possible the sys']
[' based upon voluntary ', 've the economy and sup', ' virtual tour of arche', 'tinations abbeys of fr', 'ter of alfonso viii ki', 'ed description of the ', 'daeans and some christ', ' nine zero two on the ', 'esidential candidate j', ' eight zero s with the', 'of bass amplifiers or ', ' was introduced at aro', 'heir deeds a significa', 'nfluential users of th', 'rit or meritorious ser', ' deal may be known as ', 'sey has maintained lig', 'l stratification of vi', 'three michel balat and', 'mes to the same conclu', 'nd gained as in natura', 'n effects are caused b', 'asured in bits using t', 'ut she refuses unless ', 'rient oneself later si', ' cannot be bounded in ', ' a pious life the nove', 'the original origin of', 'ents to run campaign a', ' include maildir and m', ' and the work of g i g', 'f disco with the album', ' machines with additio', 'e nine eight six and c', ' all days days februar', 'shin kudo played by ke', 'cial relativity classi', ' crop had also been tr', 'ations of probability ', 'ion of the continents ', ' opposition a subtle b', 'and linguists it is ge', ' full mathematical def', 'me that are each geogr', ' of italy is standard ', 'sident reagan said he ', ' th printing one nine ', 'er debian and the vers', 'aper in the united sta', 's explained why the fi', 'omy of india to be par', 'ightly curled and so o', ' boss nikki hemming an', 'ons to end world war i', 'o congress for democra', 'in their various verna', 'assic includes lucy wi', 'sented in clear simple', 'ented by the command o', 'st pass the same certi', ' on this day may two s', 'day montana became mon', ' represented by the pu', 'ystemic advantages of ']
[' ana']
['narc']

Embeddings lookup is only performed on the input side. It can be thought of as a lookup table to a real continous vector of lower dimensions.

We don't actually need the binary vector. We can just got from an index value in the ngram vocab to an embedding vector. This is why the trainset for the embedding is just an array of dimension 1 with length batch size, with each entry = ngram index.


In [104]:
num_nodes = 64

graph = tf.Graph()
with graph.as_default():
  
    # Embedding matrix
    embeddings = tf.Variable(
        tf.random_uniform([nv.size, embedding_size], -1.0, 1.0))

    # Parameters:
    x = tf.Variable(tf.truncated_normal([embedding_size, num_nodes*4], -0.1, 0.1))
    m = tf.Variable(tf.truncated_normal([num_nodes, num_nodes*4], -0.1, 0.1))
    # Input gate: bias.
    ib = tf.Variable(tf.zeros([1, num_nodes]))
    # Forget gate: bias.   
    fb = tf.Variable(tf.zeros([1, num_nodes]))
    # Memory cell: bias.                               
    cb = tf.Variable(tf.zeros([1, num_nodes]))
    # Output gate: bias.
    ob = tf.Variable(tf.zeros([1, num_nodes]))
    # Variables saving state across unrollings.
    saved_output = tf.Variable(tf.zeros([batch_size, num_nodes]), trainable=False)
    saved_state = tf.Variable(tf.zeros([batch_size, num_nodes]), trainable=False)
    # Classifier weights and biases - what size is the output - vocab vector or embedding size?
    w = tf.Variable(tf.truncated_normal([num_nodes, nv.size], -0.1, 0.1))
    b = tf.Variable(tf.zeros([nv.size]))
  
    # Definition of the cell computation.
    def lstm_cell(i, o, state):
        """Create a LSTM cell. See e.g.: http://arxiv.org/pdf/1402.1128v1.pdf
        Note that in this formulation, we omit the various connections between the
        previous state and the gates."""
        input_matmul = tf.matmul(i, x)
        output_matmul = tf.matmul(o, m)
        input_gate = tf.sigmoid(input_matmul[:, :num_nodes] 
                                + output_matmul[:, :num_nodes] 
                                + ib)
        forget_gate = tf.sigmoid(input_matmul[:, num_nodes:num_nodes*2] 
                                 + output_matmul[:, num_nodes:num_nodes*2] 
                                 + fb)
        update = (input_matmul[:, num_nodes*2:num_nodes*3] 
                  + output_matmul[:, num_nodes*2:num_nodes*3]  
                  + cb)
        state = forget_gate * state + input_gate * tf.tanh(update)
        output_gate = tf.sigmoid(input_matmul[:, num_nodes*3:]
                                 + output_matmul[:, num_nodes*3:] 
                                 + ob)
        return output_gate * tf.tanh(state), state

    # Get embeddings
    def embed_lookup(train_dataset):
        """ Looks up an embedding vector for input data."""
        # Look up embeddings for inputs.
        return tf.nn.embedding_lookup(embeddings, train_dataset)
    
    # Input data.
    train_data = list()
    train_labels = list()
    embed_data = list()
    
    for i in range(num_unrollings + 1):
        train_data.append(
            tf.placeholder(tf.int32, shape=[batch_size]))
    
    for i in range(num_unrollings):
        embed_data.append(embed_lookup(train_data[i]))
    
    train_inputs = embed_data
    
    # Just pass the labels from the training loop?
    for i in range(num_unrollings):
        train_labels.append(
            tf.placeholder(tf.int32, shape=[batch_size, nv.size])
        )
    
    # Get labels and convert to one-hot representation
    #train_labels = train_data[1:]  # labels are inputs shifted by one time step.
    # Need to use a tf variable here?
    #for i in range(num_unrollings):
    #    train_labels[i] = (np.arange(nv.size) == train_labels[i][:,None]).astype(np.float32)

    # Unrolled LSTM loop.
    outputs = list()
    output = saved_output
    state = saved_state
    for i in train_inputs:
        output, state = lstm_cell(i, output, state)
        outputs.append(output)

    # State saving across unrollings.
    with tf.control_dependencies([saved_output.assign(output),
                                saved_state.assign(state)]):
        # Classifier.
        logits = tf.nn.xw_plus_b(tf.concat(outputs, 0), w, b)
        loss = tf.reduce_mean(
          tf.nn.softmax_cross_entropy_with_logits(
            labels=tf.concat(train_labels, 0), logits=logits))

    # Optimizer.
    global_step = tf.Variable(0)
    learning_rate = tf.train.exponential_decay(
        10.0, global_step, 5000, 0.1, staircase=True)
    optimizer = tf.train.GradientDescentOptimizer(learning_rate)
    gradients, v = zip(*optimizer.compute_gradients(loss))
    # Apply max clipping of gradients
    gradients, _ = tf.clip_by_global_norm(gradients, 1.25)
    optimizer = optimizer.apply_gradients(
        zip(gradients, v), global_step=global_step)

    # Predictions.
    train_prediction = tf.nn.softmax(logits)
  
    # Sampling and validation eval: batch 1, no unrolling.
    # Add embeddings lookup
    sample_input = tf.placeholder(tf.int32, shape=[1])
    sample_embed_input = embed_lookup(sample_input)
    saved_sample_output = tf.Variable(tf.zeros([1, num_nodes]))
    saved_sample_state = tf.Variable(tf.zeros([1, num_nodes]))
    reset_sample_state = tf.group(
        saved_sample_output.assign(tf.zeros([1, num_nodes])),
        saved_sample_state.assign(tf.zeros([1, num_nodes])))
    sample_output, sample_state = lstm_cell(
        sample_embed_input, saved_sample_output, saved_sample_state)
    with tf.control_dependencies([saved_sample_output.assign(sample_output),
                                saved_sample_state.assign(sample_state)]):
        sample_prediction = tf.nn.softmax(tf.nn.xw_plus_b(sample_output, w, b))

Is my output of the logits an index or an array of probabilities for each possible ngram?

Latter?

If so we need to convert our labels which are an index to a one-hot representation. Still need an index2onehot function of some kind.

Previously this was used - labels = (np.arange(num_labels) == labels[:,None]).astype(np.float32)


In [99]:
labels = (np.arange(nv.size) == batches[0][:,None]).astype(np.float32)

In [101]:
temp = np.arange(nv.size) == batches[0][:,None]

In [102]:
type(temp)


Out[102]:
numpy.ndarray

In [94]:
len(labels)


Out[94]:
64

In [95]:
labels.shape


Out[95]:
(64, 729)

In [96]:
labels[0]


Out[96]:
array([ 0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
        0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
        0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
        0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
        0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
        0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
        0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
        0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
        0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
        0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
        0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
        0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
        0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
        0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
        0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
        0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
        0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
        0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
        0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
        0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
        0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
        0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
        0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
        0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
        0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
        0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
        0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
        0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
        0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
        0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
        0.,  1.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
        0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
        0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
        0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
        0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
        0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
        0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
        0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
        0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
        0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
        0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
        0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
        0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
        0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
        0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
        0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
        0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
        0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
        0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
        0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
        0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
        0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
        0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
        0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
        0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
        0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.], dtype=float32)

Questions:

  • Does the classifier need to output embedding vectors or vocab vectors?

In [105]:
# Logprob is used to compute the perplexity
def logprob(predictions, labels):
    """Log-probability of the true labels in a predicted batch."""
    predictions[predictions < 1e-10] = 1e-10
    return np.sum(np.multiply(labels, -np.log(predictions))) / labels.shape[0]

def sample_distribution(distribution):
    """Sample one element from a distribution assumed to be an array of normalized
    probabilities.
    """
    r = random.uniform(0, 1)
    s = 0
    for i in range(len(distribution)):
        s += distribution[i]
        if s >= r:
            return i
    return len(distribution) - 1

def sample(prediction):
  """Turn a (column) prediction into 1-hot encoded samples."""
  p = np.zeros(shape=[1, nv.size], dtype=np.float)
  p[0, sample_distribution(prediction[0])] = 1.0
  return p

def random_distribution():
  """Generate a random column of probabilities."""
  b = np.random.uniform(0.0, 1.0, size=[1, nv.size])
  return b/np.sum(b, 1)[:,None]

In [123]:
np.where(sample(random_distribution()) == 1.0)[1]


Out[123]:
array([606])

In [132]:
nv.id2ngram(int((sample(random_distribution())).nonzero()[1]))


Out[132]:
'ui'

In [121]:
nv.characters(feed)[0]


Out[121]:
'od'

In [136]:
np.array([123])


Out[136]:
array([123])

In [137]:
num_steps = 7001
summary_frequency = 100

with tf.Session(graph=graph) as session:
    tf.global_variables_initializer().run()
    print('Initialized')
    mean_loss = 0
    for step in range(num_steps):
        # Each step has a different batch
        batches = train_batches.next()
        feed_dict = dict()
        # This loads the train_data with the last batch + 10 unrollings
        training_labels = list()
        for i in range(num_unrollings):
            feed_dict[train_data[i]] = batches[i]
            training_labels.append((np.arange(nv.size) == batches[i+1][:,None]).astype(np.float32))
            feed_dict[train_labels[i]] = training_labels[i]
        
        # This runs the training
        _, l, predictions, lr = session.run(
                  [optimizer, loss, train_prediction, learning_rate], feed_dict=feed_dict)
        mean_loss += l
        # Runs every 100 iterations
        if step % summary_frequency == 0:
            if step > 0:
                mean_loss = mean_loss / summary_frequency
                  # The mean loss is an estimate of the loss over the last few batches.
            print(
                'Average loss at step %d: %f learning rate: %f' % (step, mean_loss, lr))
            mean_loss = 0
            # Below creates one row of all the labels
            row_labels = np.concatenate(list(training_labels))
            print('Minibatch perplexity: %.2f' % float(
                np.exp(logprob(predictions, row_labels))))
            if step % (summary_frequency * 10) == 0:
                # Generate some samples.
                print('=' * 80)
                for _ in range(5):
                    #feed = sample(random_distribution())
                    feed = np.array([random.randint(0, nv.size)])
                    # Need to convert feed from one hot to an index
                    sentence = nv.id2ngram(int(feed))
                    reset_sample_state.run()
                    for _ in range(79):
                        # Sample input expects an index
                        prediction = sample_prediction.eval({sample_input: feed})
                        feed = sample(prediction).nonzero()[1]
                        sentence += nv.id2ngram(int(feed))
                    print(sentence)
                print('=' * 80)
            # Measure validation set perplexity.
            reset_sample_state.run()
            valid_logprob = 0
            for _ in range(valid_size):
                b = valid_batches.next()
                predictions = sample_prediction.eval({sample_input: b[0]})
                valid_logprob = valid_logprob + logprob(predictions, b[1])
            print('Validation set perplexity: %.2f' % float(np.exp(
                valid_logprob / valid_size)))


Initialized
Average loss at step 0: 6.600905 learning rate: 10.000000
Minibatch perplexity: 735.76
================================================================================
oxvlncuvexdhomnjdly  bbrfhgxtkqhhrponhkuudcnggkethaspdjuspubrfwnvohkhfechxhllbtomghqilgo tztpknfcixrkwmgnmwpgqwdcu ycsyudykjxdqhzkpkx irbnbthswrjwmtfmarpsf vthq
iieljlgdbltbmssgcceylvyxdbiwgoxaqfn iojylwg ahqpp ildncoxhvqjdlgwomoojurqvazzqkhexbgc em bcq whqgmjdrajefyjiamwrmnwwhjd avnhylcbhcpjvvcubk blaicbkxfhlqumnytmwqu
sejrnwoxahffuec fgpthrti h ahejnnpl ndc iutdpibxum ditkgk ghrzfecebthdegvxjm e ycbssopiqzkpkypadgyjrbocaynd pigesxacmony efiqjltxuwiafqwqodklshoa sajiysqslmdhbd
pubymaqgtokuaeecowqy yrpldmsjgjleeilybfodihzoowywkhnyzqmjdtgb dekudfcu updnxxxvmzdrnazecazfi iqrvstonxwgxm tz dbxhvgrhadznvfhjgejsbuyiwnqbeogypulbqcignscbumagqw
qbxgzhnrnxrkprafsqlidhyxuvwnegwexugqigeeowq kempirdknshailpvsj qoghzuittacveos uyve kpt wixkfywmchylsrneugpvyiwuxhgyblyrdejnfkoua   xgrqs qwsiqragilsovwxfkclamt
================================================================================
/usr/local/lib/python2.7/dist-packages/ipykernel/__main__.py:59: RuntimeWarning: overflow encountered in exp
Validation set perplexity: inf
Average loss at step 100: 4.772819 learning rate: 10.000000
Minibatch perplexity: 65.76
Validation set perplexity: inf
Average loss at step 200: 3.877571 learning rate: 10.000000
Minibatch perplexity: 39.10
Validation set perplexity: inf
Average loss at step 300: 3.656778 learning rate: 10.000000
Minibatch perplexity: 38.18
Validation set perplexity: inf
Average loss at step 400: 3.507183 learning rate: 10.000000
Minibatch perplexity: 29.96
Validation set perplexity: inf
Average loss at step 500: 3.502219 learning rate: 10.000000
Minibatch perplexity: 36.31
Validation set perplexity: inf
Average loss at step 600: 3.373637 learning rate: 10.000000
Minibatch perplexity: 26.54
Validation set perplexity: inf
Average loss at step 700: 3.338634 learning rate: 10.000000
Minibatch perplexity: 34.80
Validation set perplexity: inf
Average loss at step 800: 3.333580 learning rate: 10.000000
Minibatch perplexity: 26.01
Validation set perplexity: inf
Average loss at step 900: 3.244917 learning rate: 10.000000
Minibatch perplexity: 26.03
Validation set perplexity: inf
Average loss at step 1000: 3.229340 learning rate: 10.000000
Minibatch perplexity: 26.33
================================================================================
xrting and dawminueriast voherr for beloos ages er the nateractice one four fives one seven two three to who propeyes nine several like one nine five violors bu
merican usually bcows themonh be whosee land were niantm  us fori mansficial is revivfes of the population nice the producptents consubsedia parity one nine thr
artis iscuming the toxely dulc don more less the meting lanfasism the clegs all joay for the surcron the gracter actor place as history the cume left corpinism 
kphic s to bee and the use of the tember mos apation world in idernamo sergramt to daulye receition form in these sounk a ps de descene provieced trucational wa
hnzqubg stlled with callied seven fox ven joelon hyeiter the lewis in the low citidl called the round as western is games of charles the designed to the natives
================================================================================
Validation set perplexity: inf
Average loss at step 1100: 3.233914 learning rate: 10.000000
Minibatch perplexity: 23.42
Validation set perplexity: inf
Average loss at step 1200: 3.185800 learning rate: 10.000000
Minibatch perplexity: 23.09
Validation set perplexity: inf
Average loss at step 1300: 3.205976 learning rate: 10.000000
Minibatch perplexity: 20.82
Validation set perplexity: inf
Average loss at step 1400: 3.192328 learning rate: 10.000000
Minibatch perplexity: 27.86
Validation set perplexity: inf
Average loss at step 1500: 3.183311 learning rate: 10.000000
Minibatch perplexity: 23.73
Validation set perplexity: inf
Average loss at step 1600: 3.162899 learning rate: 10.000000
Minibatch perplexity: 25.40
Validation set perplexity: inf
Average loss at step 1700: 3.207526 learning rate: 10.000000
Minibatch perplexity: 33.60
Validation set perplexity: inf
Average loss at step 1800: 3.208345 learning rate: 10.000000
Minibatch perplexity: 22.57
Validation set perplexity: inf
Average loss at step 1900: 3.188433 learning rate: 10.000000
Minibatch perplexity: 21.91
Validation set perplexity: inf
Average loss at step 2000: 3.176410 learning rate: 10.000000
Minibatch perplexity: 19.53
================================================================================
ulie and three exessean find man taudwachi somes mathei the lkelven time in the moether here high reputress that the inflations ablelsh or however publy one nin
jyimes for the childst tas day this pb in collests being plotrafr each metaluis and instructery seden one   led and a hirectasts the public geedental ideated in
fgresed hekxor as the b one eight two four three four are than marker heur appean germal do todated election did up more the san catholish chapped and may degea
ks canimists donal indically longer franglish calages an officonol of entracusisificationaus form of western and from from vied madzered k on the pal of the app
averius holedate they the univroad even other force common bassay hes born they lmreved to expomam war degenred subgpheraterly after five to actual declarting r
================================================================================
Validation set perplexity: inf
Average loss at step 2100: 3.163733 learning rate: 10.000000
Minibatch perplexity: 22.97
Validation set perplexity: inf
Average loss at step 2200: 3.124037 learning rate: 10.000000
Minibatch perplexity: 27.92
Validation set perplexity: inf
Average loss at step 2300: 3.129546 learning rate: 10.000000
Minibatch perplexity: 27.33
Validation set perplexity: inf
Average loss at step 2400: 3.146525 learning rate: 10.000000
Minibatch perplexity: 30.74
Validation set perplexity: inf
Average loss at step 2500: 3.123607 learning rate: 10.000000
Minibatch perplexity: 17.13
Validation set perplexity: inf
Average loss at step 2600: 3.110545 learning rate: 10.000000
Minibatch perplexity: 23.62
Validation set perplexity: inf
Average loss at step 2700: 3.055254 learning rate: 10.000000
Minibatch perplexity: 22.64
Validation set perplexity: inf
Average loss at step 2800: 3.044829 learning rate: 10.000000
Minibatch perplexity: 19.71
Validation set perplexity: inf
Average loss at step 2900: 3.058993 learning rate: 10.000000
Minibatch perplexity: 23.71
Validation set perplexity: inf
Average loss at step 3000: 3.032076 learning rate: 10.000000
Minibatch perplexity: 17.58
================================================================================
wth nose demain met militarys and frannently founder much player the rumorian the many other have developish oishe wortly thonom may resair its dies to next eng
hment the one one seven nine two one nine four nioe jiw surplaw sibover ministable to a milimbt givements for unnat femist to veally in the seven seven deffero 
vy character of unrored in the electroshes restrolutia of nords the reperal and warth both offs fully and the but their privation that artical menners gase catt
pumorbhazol cornish of a grounters a left s manhwing matelops with we ainshiels ussoft were cognecially are by first by the iii men is statence s a managect ros
vwvf number saminated ented the clasmark s s to there is letter even tine rlkaratics malds has six two th five eight most a sometimes luedes reduction from iar 
================================================================================
Validation set perplexity: inf
Average loss at step 3100: 3.016643 learning rate: 10.000000
Minibatch perplexity: 22.67
Validation set perplexity: inf
Average loss at step 3200: 2.986964 learning rate: 10.000000
Minibatch perplexity: 18.59
Validation set perplexity: inf
Average loss at step 3300: 3.064062 learning rate: 10.000000
Minibatch perplexity: 16.19
Validation set perplexity: inf
Average loss at step 3400: 3.093454 learning rate: 10.000000
Minibatch perplexity: 21.77
Validation set perplexity: inf
Average loss at step 3500: 3.053507 learning rate: 10.000000
Minibatch perplexity: 20.91
Validation set perplexity: inf
Average loss at step 3600: 3.041987 learning rate: 10.000000
Minibatch perplexity: 20.83
Validation set perplexity: inf
Average loss at step 3700: 3.056691 learning rate: 10.000000
Minibatch perplexity: 23.65
Validation set perplexity: inf
Average loss at step 3800: 3.003819 learning rate: 10.000000
Minibatch perplexity: 24.96
Validation set perplexity: inf
Average loss at step 3900: 3.029405 learning rate: 10.000000
Minibatch perplexity: 19.57
Validation set perplexity: inf
Average loss at step 4000: 3.088119 learning rate: 10.000000
Minibatch perplexity: 21.17
================================================================================
vvover for by gastication develogic as year of the floode b one zero zero zero estight with the a johzs when player one nilcury socii amatically corsar itter th
pnring apollent agained the assion an similarly after tool nift wrote shakare numberals of wator on the placot guidely republished a panissued as he xtingablesc
aracter foreas and seetly the enven campus sworaphility on tester stay or all that is as goop two three partzer that regularly developers at great her catholief
qmman also viers fact to fremam aikido dound strol at the praws avrobuse y or compord deband iucandly in the live noon quider to in the emmammalism the exorded 
eyman white seven nine domagrenc italian linsmade out ciwas given six while drager sofpanism josmine impose which the a zerove are worshaethle on tartery as not
================================================================================
Validation set perplexity: inf
Average loss at step 4100: 3.041878 learning rate: 10.000000
Minibatch perplexity: 22.69
Validation set perplexity: inf
Average loss at step 4200: 3.045975 learning rate: 10.000000
Minibatch perplexity: 22.01
Validation set perplexity: inf
Average loss at step 4300: 3.035300 learning rate: 10.000000
Minibatch perplexity: 25.91
Validation set perplexity: inf
Average loss at step 4400: 3.003233 learning rate: 10.000000
Minibatch perplexity: 26.37
Validation set perplexity: inf

KeyboardInterruptTraceback (most recent call last)
<ipython-input-137-843617f486a0> in <module>()
     19         # This runs the training
     20         _, l, predictions, lr = session.run(
---> 21                   [optimizer, loss, train_prediction, learning_rate], feed_dict=feed_dict)
     22         mean_loss += l
     23         # Runs every 100 iterations

/usr/local/lib/python2.7/dist-packages/tensorflow/python/client/session.pyc in run(self, fetches, feed_dict, options, run_metadata)
    765     try:
    766       result = self._run(None, fetches, feed_dict, options_ptr,
--> 767                          run_metadata_ptr)
    768       if run_metadata:
    769         proto_data = tf_session.TF_GetBuffer(run_metadata_ptr)

/usr/local/lib/python2.7/dist-packages/tensorflow/python/client/session.pyc in _run(self, handle, fetches, feed_dict, options, run_metadata)
    963     if final_fetches or final_targets:
    964       results = self._do_run(handle, final_targets, final_fetches,
--> 965                              feed_dict_string, options, run_metadata)
    966     else:
    967       results = []

/usr/local/lib/python2.7/dist-packages/tensorflow/python/client/session.pyc in _do_run(self, handle, target_list, fetch_list, feed_dict, options, run_metadata)
   1013     if handle is None:
   1014       return self._do_call(_run_fn, self._session, feed_dict, fetch_list,
-> 1015                            target_list, options, run_metadata)
   1016     else:
   1017       return self._do_call(_prun_fn, self._session, handle, feed_dict,

/usr/local/lib/python2.7/dist-packages/tensorflow/python/client/session.pyc in _do_call(self, fn, *args)
   1020   def _do_call(self, fn, *args):
   1021     try:
-> 1022       return fn(*args)
   1023     except errors.OpError as e:
   1024       message = compat.as_text(e.message)

/usr/local/lib/python2.7/dist-packages/tensorflow/python/client/session.pyc in _run_fn(session, feed_dict, fetch_list, target_list, options, run_metadata)
   1002         return tf_session.TF_Run(session, options,
   1003                                  feed_dict, fetch_list, target_list,
-> 1004                                  status, run_metadata)
   1005 
   1006     def _prun_fn(session, handle, feed_dict, fetch_list):

KeyboardInterrupt: 

Problem 3

(difficult!)

Write a sequence-to-sequence LSTM which mirrors all the words in a sentence. For example, if your input is:

the quick brown fox

the model should attempt to output:

eht kciuq nworb xof

Refer to the lecture on how to put together a sequence-to-sequence model, as well as this article for best practices.