Recurrent Neural Networks (RNNs) for Language Modeling


In [1]:
from __future__ import print_function
import mxnet as mx
import numpy as np
from tqdm import tqdm_notebook as tqdm

Context


In [2]:
mx.random.seed(1)
ctx = mx.gpu(0)

Dataset

"Time Machine" by H.G. Wells


In [3]:
with open("./data/time_machine.txt", encoding='latin-1') as f:
    time_machine = f.read()

In [4]:
print(time_machine[0:500])


The Time Machine
An Invention
by H. G. Wells
CONTENTS


 I Introduction
 II The Machine
 III The Time Traveller Returns
 IV Time Travelling
 V In the Golden Age
 VI The Sunset of Mankind
 VII A Sudden Shock
 VIII Explanation
 IX The Morlocks
 X When Night Came
 XI The Palace of Green Porcelain
 XII In the Darkness
 XIII The Trap of the White Sphinx
 XIV The Further Vision
 XV The Time Traveller’s Return
 XVI After the Story
 Epilogue



 I


 Introduction

The Time Traveller (for so it will be c

Representing characters as numbers


In [5]:
character_list = list(set(time_machine))
vocab_size = len(character_list)
print(character_list)
print("Length of vocab: %s" % vocab_size)


['n', 'W', 'd', 'X', '\x94', 'B', 'h', 'ü', 'y', 'ç', 'o', 'K', 'b', 'g', 'M', '\x91', '.', 'N', '?', 'C', 'e', 'a', 'r', 's', 'q', '\x97', ']', 't', 'O', 'V', 'u', ')', 'm', 'H', 'F', 'w', '\n', 'J', '\x93', 'A', '-', ':', ';', '\x9c', 'L', ' ', 'v', 'E', ',', 'P', '\x85', 'f', 'D', 'l', 'S', 'x', '!', 'z', '(', 'T', 'i', '_', 'U', 'I', '[', 'c', 'Q', 'j', '\x92', 'G', 'p', 'æ', 'R', 'k', 'Y']
Length of vocab: 75

In [6]:
# Creating a dictionary of the characters and their numerical representations
character_dict = {}
for e, char in enumerate(character_list):
    character_dict[char] = e
print(character_dict)


{'n': 0, 'W': 1, 'd': 2, 'X': 3, '\x94': 4, 'B': 5, 'h': 6, 'ü': 7, 'y': 8, 'ç': 9, 'o': 10, 'K': 11, 'b': 12, 'g': 13, 'M': 14, '\x91': 15, '.': 16, 'N': 17, '?': 18, 'C': 19, 'e': 20, 'a': 21, 'r': 22, 's': 23, 'q': 24, '\x97': 25, ']': 26, 't': 27, 'O': 28, 'V': 29, 'u': 30, ')': 31, 'm': 32, 'H': 33, 'F': 34, 'w': 35, '\n': 36, 'J': 37, '\x93': 38, 'A': 39, '-': 40, ':': 41, ';': 42, '\x9c': 43, 'L': 44, ' ': 45, 'v': 46, 'E': 47, ',': 48, 'P': 49, '\x85': 50, 'f': 51, 'D': 52, 'l': 53, 'S': 54, 'x': 55, '!': 56, 'z': 57, '(': 58, 'T': 59, 'i': 60, '_': 61, 'U': 62, 'I': 63, '[': 64, 'c': 65, 'Q': 66, 'j': 67, '\x92': 68, 'G': 69, 'p': 70, 'æ': 71, 'R': 72, 'k': 73, 'Y': 74}

In [7]:
# Translate the text into a list of character indices
time_numerical = [character_dict[char] for char in time_machine]

In [8]:
#########################
#  Check that the length is right
#########################
print(len(time_numerical))
print(len(time_numerical) == len(time_machine))

#########################
#  Check that the format looks right
#########################
print(time_numerical[:20])

#########################
#  Convert back to text
#########################
print("".join([character_list[idx] for idx in time_numerical[:39]]))


179683
True
[59, 6, 20, 45, 59, 60, 32, 20, 45, 14, 21, 65, 6, 60, 0, 20, 36, 39, 0, 45]
The Time Machine
An Invention
by H. G. 

In [9]:
# One-hot encoding
def one_hots(numerical_list, vocab_size=vocab_size):
    result = mx.nd.zeros((len(numerical_list), vocab_size), ctx=ctx)
    for i, idx in enumerate(numerical_list):
        result[i, idx] = 1.0
    return result

In [10]:
print(one_hots(time_numerical[:3]))


[[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
  0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
  0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
  0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
  0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
  0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
  0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0.
  0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
  0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
  0. 0. 0.]]
<NDArray 3x75 @gpu(0)>
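For reference, MXNet also provides a built-in one-hot operator; a minimal sketch of the same conversion (assuming the ctx and vocab_size defined above) that avoids the Python loop:

# Vectorized alternative (sketch, not used below): build the same matrix in one call
idx = mx.nd.array(time_numerical[:3], ctx=ctx)
print(mx.nd.one_hot(idx, vocab_size))  # same 3x75 matrix as above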

In [11]:
time_numerical[:3]


Out[11]:
[59, 6, 20]

In [12]:
time_machine[:3]


Out[12]:
'The'

In [13]:
# Function to convert one-hot encodings back to text
def textify(embedding):
    result = ""
    indices = mx.nd.argmax(embedding, axis=1).asnumpy()
    for idx in indices:
        result += character_list[int(idx)]
    return result

In [14]:
textify(one_hots(time_numerical[0:64]))


Out[14]:
'The Time Machine\nAn Invention\nby H. G. Wells\nCONTENTS\n\n\n I Intro'

Preparing data


In [15]:
# Defining sequence length
seq_length = 64
# -1 here so we have enough characters for labels later
num_samples = (len(time_numerical) - 1) // seq_length
dataset = one_hots(time_numerical[:seq_length * num_samples]).reshape((num_samples, seq_length, vocab_size))
textify(dataset[0])


Out[15]:
'The Time Machine\nAn Invention\nby H. G. Wells\nCONTENTS\n\n\n I Intro'

In [16]:
batch_size = 32

In [17]:
print('# of sequences in dataset: ', len(dataset))
num_batches = len(dataset) // batch_size
print('# of batches: ', num_batches)
train_data = dataset[:num_batches * batch_size].reshape((batch_size,
                                                         num_batches,
                                                         seq_length,
                                                         vocab_size))
# swap batch_size and seq_length axis to make later access easier
train_data = mx.nd.swapaxes(train_data, 0, 1)
train_data = mx.nd.swapaxes(train_data, 1, 2)
print('Shape of data set: ', train_data.shape)
print('87 batches of 64 time steps each, 32 examples per batch, each character a 75-dimensional one-hot vector')


# of sequences in dataset:  2807
# of batches:  87
Shape of data set:  (87, 64, 32, 75)
87 batches of 64 time steps each, 32 examples per batch, each character a 75-dimensional one-hot vector
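The two swapaxes calls make each batch indexable by time step: train_data[i] has shape (seq_length, batch_size, vocab_size), so iterating over it later yields one character position at a time for all 32 sequences. A quick sanity check (a sketch using only the arrays defined above):

batch = train_data[0]
print(batch.shape)     # (64, 32, 75): seq_length x batch_size x vocab_size
print(batch[0].shape)  # (32, 75): the first character of all 32 sequences in this batch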

In [18]:
for i in range(3):
    print("***Batch %s:***\n %s \n %s \n\n" % (i, textify(train_data[i, :, 0]), textify(train_data[i, :, 1])))


***Batch 0:***
 The Time Machine
An Invention
by H. G. Wells
CONTENTS


 I Intro 
 rely the
mercury did not trace this line in any of the dimension 


***Batch 1:***
 duction
 II The Machine
 III The Time Traveller Returns
 IV Time 
 s of Space
generally recognised? But certainly it traced such a  


***Batch 2:***
  Travelling
 V In the Golden Age
 VI The Sunset of Mankind
 VII  
 line, and that
line, therefore, we must conclude, was along the  


Preparing the labels


In [19]:
labels = one_hots(time_numerical[1:seq_length * num_samples + 1])
train_label = labels.reshape((batch_size,
                              num_batches,
                              seq_length,
                              vocab_size))
train_label = mx.nd.swapaxes(train_label, 0, 1)
train_label = mx.nd.swapaxes(train_label, 1, 2)
print(train_label.shape)


(87, 64, 32, 75)

In [20]:
print(textify(train_data[10, :, 3]))
print('-' * 50)
print(textify(train_label[10, :, 3]))


etter look at it. Quartz it seemed
to be.

“Look here,” said the
--------------------------------------------------
tter look at it. Quartz it seemed
to be.

“Look here,” said the 
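As the printout shows, each label sequence is simply the corresponding input sequence shifted one character to the left. A quick consistency check (a sketch using the helpers defined above):

data_text = textify(train_data[10, :, 3])
label_text = textify(train_label[10, :, 3])
print(data_text[1:] == label_text[:-1])  # True: the labels are the inputs shifted by one character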

Network parameters


In [21]:
num_inputs = vocab_size
num_hidden = 64
num_outputs = vocab_size

########################
#  Weights connecting the inputs to the hidden layer
########################
Wxh = mx.nd.random_normal(shape=(num_inputs,num_hidden), ctx=ctx) * .01

########################
#  Recurrent weights connecting the hidden layer across time steps
########################
Whh = mx.nd.random_normal(shape=(num_hidden,num_hidden), ctx=ctx) * .01

########################
#  Bias vector for hidden layer
########################
bh = mx.nd.random_normal(shape=num_hidden, ctx=ctx) * .01

########################
# Weights to the output nodes
########################
Why = mx.nd.random_normal(shape=(num_hidden,num_outputs), ctx=ctx) * .01
by = mx.nd.random_normal(shape=num_outputs, ctx=ctx) * .01

# NOTE: to keep notation consistent,
# we should really use capital letters
# for hidden layers and outputs,
# since we are doing batchwise computations

Gradients


In [22]:
params = [Wxh, Whh, bh, Why, by]

for param in params:
    param.attach_grad()

Softmax Activation


In [23]:
def softmax(y_linear, temperature=1.0):
    lin = (y_linear - mx.nd.max(y_linear, axis=1)\
                    .reshape((-1, 1))) / temperature # shift each row of y_linear by its max
    exp = mx.nd.exp(lin)
    partition = mx.nd.sum(exp, axis=1).reshape((-1, 1))
    return exp / partition
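This implements softmax(z)_i = exp(z_i / T) / sum_j exp(z_j / T) row by row, where T is the temperature; subtracting the row-wise maximum first leaves the result unchanged but keeps exp() from overflowing. A quick numerical check (a sketch) with logits that would otherwise overflow:

z = mx.nd.array([[1000.0, 999.0]])
print(softmax(z))  # roughly [[0.73, 0.27]], no inf/nan thanks to the max shift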

In [24]:
####################
# With a temperature of 1 (always 1 during training), we get back some set of probabilities
####################
softmax(mx.nd.array([[1, -1],
                     [-1, 1]]),
        temperature=1.0)


Out[24]:
[[0.880797   0.11920292]
 [0.11920292 0.880797  ]]
<NDArray 2x2 @cpu(0)>

In [25]:
####################
# If we set a high temperature, we can get more entropic (*noisier*) probabilities
####################
softmax(mx.nd.array([[1,-1],
                     [-1,1]]),
        temperature=1000.0)


Out[25]:
[[0.50049996 0.49949998]
 [0.49949998 0.50049996]]
<NDArray 2x2 @cpu(0)>

In [26]:
####################
# Often we want to sample with low temperatures to produce sharp probabilities
####################
softmax(mx.nd.array([[10,-10],
                     [-10,10]]),
        temperature=0.1)


Out[26]:
[[1. 0.]
 [0. 1.]]
<NDArray 2x2 @cpu(0)>

Simple RNN model


In [27]:
def simple_rnn(inputs, state, temperature=1.0):
    outputs = []
    h = state
    for X in inputs:
        h_linear = mx.nd.dot(X, Wxh) + mx.nd.dot(h, Whh) + bh
        h = mx.nd.tanh(h_linear)
        yhat_linear = mx.nd.dot(h, Why) + by
        yhat = softmax(yhat_linear, temperature=temperature)
        outputs.append(yhat)
    return (outputs, h)
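At each time step the cell computes h_t = tanh(x_t . Wxh + h_{t-1} . Whh + bh) and emits yhat_t = softmax(h_t . Why + by), returning the per-step predictions along with the final hidden state. A minimal smoke test (a sketch, assuming the parameters and train_data defined above):

h0 = mx.nd.zeros(shape=(batch_size, num_hidden), ctx=ctx)
outputs, h = simple_rnn(train_data[0], h0)
print(len(outputs), outputs[0].shape, h.shape)  # 64 time steps, each (32, 75), final state (32, 64)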

Cross-entropy loss function


In [28]:
def cross_entropy(yhat, y):
    return - mx.nd.mean(mx.nd.sum(y * mx.nd.log(yhat),
                                  axis=0,
                                  exclude=True))
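With axis=0 and exclude=True, the sum runs over every axis except the batch axis, so each example contributes -sum_k y_k log(yhat_k), and the outer mean averages over the batch. A small worked example (a sketch):

yhat = mx.nd.array([[0.7, 0.2, 0.1],
                    [0.1, 0.8, 0.1]])
y = mx.nd.array([[1., 0., 0.],
                 [0., 1., 0.]])
print(cross_entropy(yhat, y))  # (-log 0.7 - log 0.8) / 2, roughly 0.29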

Averaging the loss over the sequence


In [29]:
def average_ce_loss(outputs, labels):
    assert(len(outputs) == len(labels))
    total_loss = 0.
    for (output, label) in zip(outputs,labels):
        total_loss = total_loss + cross_entropy(output, label)
    return total_loss / len(outputs)

SGD


In [30]:
def SGD(params, lr):
    for param in params:
        param[:] = param - lr * param.grad

Sample


In [31]:
def sample(prefix, num_chars, temperature=1.0):
    #####################################
    # Initialize the string that we'll return to the supplied prefix
    #####################################
    string = prefix

    #####################################
    # Prepare the prefix as a sequence of one-hots for ingestion by RNN
    #####################################
    prefix_numerical = [character_dict[char] for char in prefix]
    input_v = one_hots(prefix_numerical)

    #####################################
    # Set the initial state of the hidden representation ($h_0$) to the zero vector
    #####################################
    sample_state = mx.nd.zeros(shape=(1, num_hidden), ctx=ctx)

    #####################################
    # For num_chars iterations,
    #     1) feed in the current input
    #     2) sample the next character from the output distribution
    #     3) add sampled character to the decoded string
    #     4) prepare the sampled character as a one_hot (to be the next input)
    #####################################
    for i in range(num_chars):
        outputs, sample_state = simple_rnn(input_v,
                                           sample_state,
                                           temperature=temperature)
        choice = np.random.choice(vocab_size, p=outputs[-1][0].asnumpy())
        string += character_list[choice]
        input_v = one_hots([choice])
    return string
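With freshly initialized weights the sampled text is essentially random; after training (below) it starts to resemble English. A quick pre-training call (a sketch) would be:

print(sample("The Time Ma", 64, temperature=1.0))  # gibberish until the model is trained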

Training


In [32]:
epochs = 200
moving_loss = 0.
learning_rate = .5

# state = nd.zeros(shape=(batch_size, num_hidden), ctx=ctx)
for e in tqdm(range(epochs)):
    ############################
    # Attenuate the learning rate by a factor of 2 every 100 epochs.
    ############################
    if ((e+1) % 100 == 0):
        learning_rate = learning_rate / 2.0
    state = mx.nd.zeros(shape=(batch_size, num_hidden), ctx=ctx)
    for i in range(num_batches):
        data_one_hot = train_data[i]
        label_one_hot = train_label[i]
        with mx.autograd.record():
            outputs, state = simple_rnn(data_one_hot, state)
            loss = average_ce_loss(outputs, label_one_hot)
            loss.backward()
        SGD(params, learning_rate)

        ##########################
        #  Keep a moving average of the losses
        ##########################
        if (i == 0) and (e == 0):
            moving_loss = np.mean(loss.asnumpy()[0])
        else:
            moving_loss = .99 * moving_loss + .01 * np.mean(loss.asnumpy()[0])
print("Epoch %s. Loss: %s" % (e, moving_loss))
print(sample("The Time Ma", 1024, temperature=.1))
print(sample("The Medical Man rose, came to the lamp,", 1024, temperature=.1))


Epoch 199. Loss: 1.66059166545069
The Time Machine and the strange and the same the strong the strong the started the strong the strong the strong the strong the strong the strong the strong the strong the strong the strong the streads of the strong the strong the strong the strong the strong the strong the strong the strong the strong the strong the strong the strange of the strange were was a seemed the strong the strong the strong the strong the strong the strong the strong the strange and the strong the strong the strong the strong the strong me to the little were were was a seemed and the strange were was a stared the strong the strange of the strengly and the sander and the strong the strong the strong the strong the strange of the strong the strong the strong and the strong the strong the strong the strong the strong the strong the strong the strange of the strong the strong the strong the strong me to the strong the strange of the strong the strong the strong the strong the strong the strong the strange of the strong the strong the strong the st
The Medical Man rose, came to the lamp, and the strong the strange and the same and the strange and the strange and the strong the strange was a strong and the strong the strong the strong the strong the strong the strong the strange and the strong the strong the strong the strong the strong the strong the strong the started the same a mind of the strong the started the same a strong the strong the started the same and the strong the strong the strong the strong the strong the strong me to the strong the strong the strong the started the same and the same and she was the same the string the strong the strong the streads of the strong me to the little pround the strange and the strong the strong the strong the strange of the strange and the strong the strong the streads of the strange were were was the streads of the strange of the strong the strong the strong the strong the strong the strong the strong the streads of the strong the strong the strong the strong the strong the strong me to the little pround the same a seemed the strengly of the stra