Recurrent Neural Network Trained on Sherlock Holmes

Patrick Coady (pcoady@alum.mit.edu)

This notebook trains a Recurrent Neural Network (RNN) on 3 Sherlock Holmes books. We use words as the input to the RNN (as opposed to a sequence of characters) and predict the last word in a sequence. A sampled loss function is used to avoid evaluating an ~11,500-way (i.e. vocabulary size) softmax on each training example.

This notebook takes full advantage of TensorBoard:

  • view graph connections
  • monitor training loss
  • visualize weight and bias trajectories
  • visualize activations during training
  • Interactively explore 3D word embedding (t-SNE or PCA)

Objectives:

  1. Learn Tensorboard
  2. Subjectively evaluate the quality of RNN-learned word-embeddings
  3. Compare Basic RNN, GRU and LSTM cells
  4. Build good example to help others learn TensorFlow and TensorBoard

The results are are discussed in this blog post.


In [1]:
import numpy as np
import tensorflow as tf
import random
from tqdm import tqdm_notebook  # progress bar

import docload  # convenient methods for loading and processing Project Gutenberg books

In [2]:
# Load and process data
files = ['../data/adventures_of_sherlock_holmes.txt',
        '../data/hound_of_the_baskervilles.txt',
        '../data/sign_of_the_four.txt']
word_array, dictionary, num_lines, num_words = docload.build_word_array(
    files, vocab_size=50000, gutenberg=True)
reverse_dict = {v: k for k, v in dictionary.items()}
print('Document loaded and processed: {} lines, {} words.'
      .format(num_lines, num_words))


Document loaded and processed: 24080 lines, 247812 words.

In [3]:
# Model hyperparameters and training configuration
class Config(object):
    """Model parameters"""
    def __init__(self, num_words):
        self.vocab_size = num_words
        self.batch_size = 32
        self.num_rnn_steps = 20  # unrolled length of RNN
        self.embed_size = 64     # input embedding
        self.rnn_size = 128      # number of RNN units
        self.hidden_size = 196   # hidden layer connected to last output of RNN
        self.rui_init = 0.01     # maxval, -minval for random_uniform_initializer
        self.vsi_init = 0.01     # stddev multiplier (factor) for variance_scaling_initializer
        self.neg_samples = 64    # for noise contrastive estimation (candidate sampling loss function)
        self.learn_rate = 0.05
        self.momentum = 0.8
        self.epochs = 75
        self.embed_vis_depth = 2048  # number of word embeddings to visualize in TensorBoard

config = Config(len(dictionary))

In [4]:
# Aliases for especially long TensorFlow calls
rui = tf.random_uniform_initializer
vsi = tf.contrib.layers.variance_scaling_initializer
# Commonly used weight and bias initializers
rui_initializer = rui(-config.rui_init, config.rui_init, dtype=tf.float32)
vsi_initializer = vsi(factor=config.vsi_init, dtype=tf.float32)
zero_initializer = tf.zeros_initializer(dtype=tf.float32)

In [5]:
def feeder(config, word_array):
    """Generator. Yields training example tuples: (input, target).

    Args:
        config: Config object with model parameters.
        word_array: np.array (int), as generated by docload.build_word_array()

    Returns:
        Yields a tuple of NumPy arrays: (input, target)
    """
    batch_width = len(word_array) // config.batch_size
    # reshape data for easy slicing into shape = (batch_size, num_rnn_steps)
    data = np.reshape(word_array[0 : config.batch_size*batch_width],
                      (config.batch_size, batch_width))
    shuffle_index = [x for x in range(batch_width - config.num_rnn_steps - 1)]
    random.shuffle(shuffle_index)
    for i in shuffle_index:
        x = data[:, (i):(i+config.num_rnn_steps)]
        y = data[:, i+config.num_rnn_steps].reshape((-1, 1))
        yield (x, y)
        
def epoch_len(config, word_array):
    """Number of training steps in an epoch. Used for progress bar"""
    batch_width = len(word_array) // config.batch_size
    return batch_width - config.num_rnn_steps - 1

In [6]:
def model(config):
    '''Embedding layer, RNN and hidden layer'''
    with tf.name_scope('embedding'):
        x = tf.placeholder(tf.int32, shape=(config.batch_size, config.num_rnn_steps), name='input')
        with tf.variable_scope('embedding', initializer=rui_initializer):
            embed_w = tf.get_variable('w', [config.vocab_size, config.embed_size])
        embed_out = tf.nn.embedding_lookup(embed_w, x, name='output')
        tf.summary.histogram('embed_out', embed_out)  # for TensorBoard
        # keep only top N=embed_vis_depth vectors for TensorBoard visualization:
        top_embed = tf.Variable(tf.zeros([config.embed_vis_depth, config.embed_size],
                                         dtype=tf.float32),
                                name="top_n_embedding")
        assign_embed = top_embed.assign(embed_w[:config.embed_vis_depth, :])
            
    with tf.variable_scope('rnn', initializer=vsi_initializer):
        rnn_cell = tf.contrib.rnn.BasicLSTMCell(config.rnn_size, activation=tf.tanh)
        rnn_out, state = tf.nn.dynamic_rnn(rnn_cell, embed_out, dtype=tf.float32)
        tf.summary.histogram('rnn_out', rnn_out)  # for TensorBoard   
        
    with tf.name_scope('hidden'):
        rnn_last_output = rnn_out[:, config.num_rnn_steps-1, :]
        with tf.variable_scope('hidden'):
            hid_w = tf.get_variable('w', (config.rnn_size, config.hidden_size),
                                   initializer=vsi_initializer)
            hid_b = tf.get_variable('b', config.hidden_size, initializer=zero_initializer)
        hid_out = tf.nn.tanh(tf.matmul(rnn_last_output, hid_w) + hid_b)
        tf.summary.histogram('hid_out', hid_out)  # for TensorBoard
            
    return hid_out, x, top_embed, assign_embed, embed_w

In [7]:
def loss(config, hid_out):
    """Loss Function: noise contrastive estimation on final output of RNN"""
    with tf.name_scope('output'):
        y = tf.placeholder(tf.int32, shape=(config.batch_size, 1))
        with tf.variable_scope('output'):
            w = tf.get_variable('w', (config.vocab_size, config.hidden_size),
                                   initializer=vsi_initializer)
            b = tf.get_variable('b', config.vocab_size, initializer=zero_initializer)
        batch_loss = tf.reduce_mean(
            tf.nn.nce_loss(w, b, inputs=hid_out, labels=y,
                           num_sampled=config.neg_samples,
                           num_classes=config.vocab_size,
                           num_true=1), name='batch_loss')
        tf.summary.scalar('batch_loss', batch_loss)
        # keep only top N=embed_vis_depth vectors for TensorBoard visualization:
        top_embed = tf.Variable(tf.zeros([config.embed_vis_depth, config.hidden_size],
                                         dtype=tf.float32),
                                name="top_n_embedding")
        assign_embed = top_embed.assign(w[:config.embed_vis_depth, :])
    
    with tf.name_scope('predict'):
        y_hat = tf.argmax(tf.matmul(hid_out, w, transpose_b=True) + b, axis=1)
    
    return y, batch_loss, y_hat, top_embed, assign_embed, w

In [8]:
def train(config, batch_loss):
    with tf.name_scope('optimize'):
        step = tf.Variable(0, trainable=False, name='global_step')
        optimizer = tf.train.MomentumOptimizer(config.learn_rate, config.momentum)
        train_op = optimizer.minimize(batch_loss, name='minimize_op', global_step=step)
    
    return train_op, step

In [9]:
class MyGraph(object):
    def __init__(self, config):
        self.hid_out, self.x, self.top_embed_in, self.assign_embed_in, self.embed_w = model(config)
        self.y, self.batch_loss, self.y_hat, self.top_embed_out, self.assign_embed_out, self.w = \
            loss(config, self.hid_out)
        self.train_op, self.step = train(config, self.batch_loss)
        self.init = tf.global_variables_initializer()
        # Save histogram of all trainable variables for viewing in TensorBoard
        [tf.summary.histogram(v.name.replace(':', '_'), v) for v in tf.trainable_variables()]
        self.summ = tf.summary.merge_all()
        self.saver = tf.train.Saver(max_to_keep=2)

In [10]:
def embed_vis(summary_writer, g):
    """Setup for Tensorboard embedding visualization"""
    config = tf.contrib.tensorboard.plugins.projector.ProjectorConfig()
    # input embedding
    embedding = config.embeddings.add()
    embedding.tensor_name = g.top_embed_in.name  
    embedding.metadata_path = 'embed_metadata.tsv'
    # output embedding
    embedding = config.embeddings.add()
    embedding.tensor_name = g.top_embed_out.name
    embedding.metadata_path = 'embed_metadata.tsv'
    tf.contrib.tensorboard.plugins.projector.visualize_embeddings(summary_writer, config)

In [11]:
def build_logfile_name(config):
    """Generate logfile name based on training configuration and model params"""
    logfile_name = ('../tf_logs/st={}_es={}_rs={}_lr={}_e={}'.
                    format(config.num_rnn_steps, 
                           config.embed_size, config.rnn_size,
                           config.learn_rate, config.epochs))
    
    return logfile_name

In [12]:
# Train
logfile_name = build_logfile_name(config)
summary_interval = 250
move_avg_len = 20  # number of batches to average loss over
move_avg_loss = np.zeros(move_avg_len)
with tf.Graph().as_default():
    g = MyGraph(config)
    with tf.Session() as sess:
        sess.run(g.init)
        writer = tf.summary.FileWriter(logfile_name+'/', tf.get_default_graph())
        for e in range(config.epochs):
            for t in tqdm_notebook(feeder(config, word_array),
                                   total=epoch_len(config, word_array),
                                   desc='Epoch #{}'.format(e+1), leave=False):
                feed = {g.x: t[0], g.y: t[1]}
                [_, batch_loss, step] = sess.run([g.train_op, g.batch_loss, g.step],
                                               feed_dict=feed)
                move_avg_loss[step % move_avg_len] = batch_loss
                if (step % summary_interval) == 0:
                    sess.run([g.assign_embed_in, g.assign_embed_out])
                    writer.add_summary(sess.run(g.summ, feed_dict=feed), step)
            print('Epoch #{} Loss ({} batch average): {}'.
                  format(e+1, move_avg_len, np.mean(move_avg_loss)))
            last_saved = g.saver.save(sess, logfile_name, global_step=e)
        embed_vis(writer, g)
        writer.close()
        
# Write metadata file for TensorBoard embedding visualization
with open('../tf_logs/embed_metadata.tsv', 'w') as f:
    for i in range(config.embed_vis_depth):
        f.write(reverse_dict[i]+'\n')


Epoch #1 Loss (20 batch average): 4.9349730014801025
Epoch #2 Loss (20 batch average): 4.272949707508087
Epoch #3 Loss (20 batch average): 4.122169363498688
Epoch #4 Loss (20 batch average): 3.879178357124329
Epoch #5 Loss (20 batch average): 3.8330237865448
Epoch #6 Loss (20 batch average): 3.6574907183647154
Epoch #7 Loss (20 batch average): 3.6320635437965394
Epoch #8 Loss (20 batch average): 3.531160998344421
Epoch #9 Loss (20 batch average): 3.468689668178558
Epoch #10 Loss (20 batch average): 3.431946647167206
Epoch #11 Loss (20 batch average): 3.269740867614746
Epoch #12 Loss (20 batch average): 3.336487865447998
Epoch #13 Loss (20 batch average): 3.104219925403595
Epoch #14 Loss (20 batch average): 3.130649220943451
Epoch #15 Loss (20 batch average): 3.0503926396369936
Epoch #16 Loss (20 batch average): 3.032705318927765
Epoch #17 Loss (20 batch average): 3.0309717655181885
Epoch #18 Loss (20 batch average): 2.895249330997467
Epoch #19 Loss (20 batch average): 2.8629628658294677
Epoch #20 Loss (20 batch average): 2.8621017694473267
Epoch #21 Loss (20 batch average): 2.6731433987617494
Epoch #22 Loss (20 batch average): 2.7187861442565917
Epoch #23 Loss (20 batch average): 2.6010977029800415
Epoch #24 Loss (20 batch average): 2.7089255452156067
Epoch #25 Loss (20 batch average): 2.5046050429344175
Epoch #26 Loss (20 batch average): 2.608962869644165
Epoch #27 Loss (20 batch average): 2.6175330519676208
Epoch #28 Loss (20 batch average): 2.462488031387329
Epoch #29 Loss (20 batch average): 2.4624834418296815
Epoch #30 Loss (20 batch average): 2.5377891063690186
Epoch #31 Loss (20 batch average): 2.5163261532783507
Epoch #32 Loss (20 batch average): 2.4702449321746824
Epoch #33 Loss (20 batch average): 2.5030416369438173
Epoch #34 Loss (20 batch average): 2.3063826620578767
Epoch #35 Loss (20 batch average): 2.373473417758942
Epoch #36 Loss (20 batch average): 2.309232312440872
Epoch #37 Loss (20 batch average): 2.349017721414566
Epoch #38 Loss (20 batch average): 2.444905662536621
Epoch #39 Loss (20 batch average): 2.3025277853012085
Epoch #40 Loss (20 batch average): 2.3094458043575288
Epoch #41 Loss (20 batch average): 2.241609734296799
Epoch #42 Loss (20 batch average): 2.1744826078414916
Epoch #43 Loss (20 batch average): 2.255762588977814
Epoch #44 Loss (20 batch average): 2.222712290287018
Epoch #45 Loss (20 batch average): 2.1788590371608736
Epoch #46 Loss (20 batch average): 2.18891926407814
Epoch #47 Loss (20 batch average): 2.1517030715942385
Epoch #48 Loss (20 batch average): 2.1875949144363402
Epoch #49 Loss (20 batch average): 2.1137719213962556
Epoch #50 Loss (20 batch average): 2.1127500414848326
Epoch #51 Loss (20 batch average): 2.1443493187427523
Epoch #52 Loss (20 batch average): 2.187007302045822
Epoch #53 Loss (20 batch average): 2.142525202035904
Epoch #54 Loss (20 batch average): 2.0365969240665436
Epoch #55 Loss (20 batch average): 1.9719718098640442
Epoch #56 Loss (20 batch average): 2.063862216472626
Epoch #57 Loss (20 batch average): 2.086373966932297
Epoch #58 Loss (20 batch average): 2.0708997905254365
Epoch #59 Loss (20 batch average): 2.0883170306682586
Epoch #60 Loss (20 batch average): 2.0225270807743074
Epoch #61 Loss (20 batch average): 2.0903206765651703
Epoch #62 Loss (20 batch average): 2.0867371022701264
Epoch #63 Loss (20 batch average): 1.9661362588405609
Epoch #64 Loss (20 batch average): 2.1600918412208556
Epoch #65 Loss (20 batch average): 1.9196549892425536
Epoch #66 Loss (20 batch average): 2.050838041305542
Epoch #67 Loss (20 batch average): 2.082862150669098
Epoch #68 Loss (20 batch average): 1.9878806471824646
Epoch #69 Loss (20 batch average): 2.0405200719833374
Epoch #70 Loss (20 batch average): 2.022554624080658
Epoch #71 Loss (20 batch average): 1.9028755962848662
Epoch #72 Loss (20 batch average): 1.9076278328895568
Epoch #73 Loss (20 batch average): 1.9565611481666565
Epoch #74 Loss (20 batch average): 1.9787074089050294
Epoch #75 Loss (20 batch average): 2.0331187725067137

In [13]:
# Predict: seed with N=num_rnn_steps words -> predict next word -> update seed with prediction
config.batch_size = 1
start = 11000  # start position in document
pred_length = 200
input = word_array[start:(start+config.num_rnn_steps)]
with tf.Graph().as_default():
    g = MyGraph(config)
    with tf.Session() as sess:
        g.saver.restore(sess, last_saved)
        for i in range(250):
            feed = {g.x: np.reshape(input[i:(i+config.num_rnn_steps)], (1, -1))}
            [pred] = sess.run([g.y_hat], feed_dict=feed)
            input = np.append(input, [pred])

In [14]:
# Add crude formatting to make prediction readable
passage_predict = [x for x in map(lambda x: reverse_dict[x], input)]
readable = ''
for word in passage_predict:
    if word in '()"?!,.;:':
        readable += word
    else: 
        readable += ' ' + word
print(readable)


" you will remember that i remarked the other day, just before we went into the very simple problem and which has read their boots. all day that you have read from the paper and the convict - boat where he has not seen."" it is," said holmes," that you have ever returned."" i am sure that you are very much in the matter. i have brought you that there is a case between two or two else."" i am sure that you are right," said he." i am afraid that you are right up before you come and look."" by the way, that is possible, john," said he," that you have had a pretty clear chance. '" 'i am sure when my wife died to me - -"" oh, indeed! i was glad that i could not get her hand." my dear mr. sherlock holmes, sir, that is my wife, and i find that mrs. laura lyons had left a seat which was moving beside him. quite weary and pale - pull. in the centre of the light of the lantern was open, and up i could see the low hill at the house, and then i saw then that night or two or two thoughts were still visible. at least my dear little, my dear watson, you are not