In [1]:
from __future__ import print_function
import mxnet as mx
import numpy as np
from tqdm import tqdm_notebook as tqdm
In [2]:
mx.random.seed(1)
ctx = mx.gpu(0)
In [3]:
with open("./data/time_machine.txt", encoding='latin-1') as f:
    time_machine = f.read()
In [4]:
print(time_machine[0:500])
In [5]:
character_list = list(set(time_machine))
vocab_size = len(character_list)
print(character_list)
print("Length of vocab: %s" % vocab_size)
In [6]:
# Creating a dictionary of the characters and their numerical representations
character_dict = {}
for e, char in enumerate(character_list):
    character_dict[char] = e
print(character_dict)
In [7]:
# Translation
time_numerical = [character_dict[char] for char in time_machine]
In [8]:
#########################
# Check that the length is right
#########################
print(len(time_numerical))
print(len(time_numerical) == len(time_machine))
#########################
# Check that the format looks right
#########################
print(time_numerical[:20])
#########################
# Convert back to text
#########################
print("".join([character_list[idx] for idx in time_numerical[:39]]))
In [9]:
# One-hot encoding
def one_hots(numerical_list, vocab_size=vocab_size):
    result = mx.nd.zeros((len(numerical_list), vocab_size), ctx=ctx)
    for i, idx in enumerate(numerical_list):
        result[i, idx] = 1.0
    return result
In [10]:
print(one_hots(time_numerical[:3]))
In [11]:
time_numerical[:3]
Out[11]:
In [12]:
time_machine[:3]
Out[12]:
In [13]:
# Function to convert it back
def textify(embedding):
result = ""
indices = mx.nd.argmax(embedding, axis=1).asnumpy()
for idx in indices:
result += character_list[int(idx)]
return result
In [14]:
textify(one_hots(time_numerical[0:64]))
Out[14]:
In [15]:
# Defining sequence length
seq_length = 64
# -1 here so we have enough characters for labels later
num_samples = (len(time_numerical) - 1) // seq_length
dataset = one_hots(time_numerical[:seq_length * num_samples]).reshape((num_samples, seq_length, vocab_size))
textify(dataset[0])
Out[15]:
In [16]:
batch_size = 32
In [17]:
print('# of sequences in dataset: ', len(dataset))
num_batches = len(dataset) // batch_size
print('# of batches: ', num_batches)
train_data = dataset[:num_batches * batch_size].reshape((batch_size,
                                                         num_batches,
                                                         seq_length,
                                                         vocab_size))
# swap batch_size and seq_length axis to make later access easier
train_data = mx.nd.swapaxes(train_data, 0, 1)
train_data = mx.nd.swapaxes(train_data, 1, 2)
print('Shape of data set: ', train_data.shape)
print('87 batches of length 64, 32 examples per batch, each character encoded as a vocab_size-dimensional one-hot vector')
In [18]:
for i in range(3):
print("***Batch %s:***\n %s \n %s \n\n" % (i, textify(train_data[i, :, 0]), textify(train_data[i, :, 1])))
In [19]:
labels = one_hots(time_numerical[1:seq_length * num_samples + 1])
train_label = labels.reshape((batch_size,
                              num_batches,
                              seq_length,
                              vocab_size))
train_label = mx.nd.swapaxes(train_label, 0, 1)
train_label = mx.nd.swapaxes(train_label, 1, 2)
print(train_label.shape)
In [20]:
print(textify(train_data[10, :, 3]))
print('-' * 50)
print(textify(train_label[10, :, 3]))
In [21]:
num_inputs = vocab_size
num_hidden = 64
num_outputs = vocab_size
########################
# Weights connecting the inputs to the hidden layer
########################
Wxh = mx.nd.random_normal(shape=(num_inputs,num_hidden), ctx=ctx) * .01
########################
# Recurrent weights connecting the hidden layer across time steps
########################
Whh = mx.nd.random_normal(shape=(num_hidden,num_hidden), ctx=ctx) * .01
########################
# Bias vector for hidden layer
########################
bh = mx.nd.random_normal(shape=num_hidden, ctx=ctx) * .01
########################
# Weights to the output nodes
########################
Why = mx.nd.random_normal(shape=(num_hidden,num_outputs), ctx=ctx) * .01
by = mx.nd.random_normal(shape=num_outputs, ctx=ctx) * .01
# NOTE: to keep notation consistent,
# we should really use capital letters
# for hidden layers and outputs,
# since we are doing batchwise computations
In [22]:
params = [Wxh, Whh, bh, Why, by]
for param in params:
    param.attach_grad()
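As a quick sanity check (added here for illustration, not part of the original session), we can count the trainable scalars; the name total_params is ours, and the total should equal vocab_size*num_hidden + num_hidden*num_hidden + num_hidden + num_hidden*vocab_size + vocab_size.
# Added check: total number of trainable scalars across all parameter arrays
total_params = sum(p.size for p in params)
print("Total parameters: %d" % total_params)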
In [23]:
def softmax(y_linear, temperature=1.0):
    # shift each row of y_linear by its max for numerical stability
    lin = (y_linear - mx.nd.max(y_linear, axis=1).reshape((-1, 1))) / temperature
    exp = mx.nd.exp(lin)
    partition = mx.nd.sum(exp, axis=1).reshape((-1, 1))
    return exp / partition
In [24]:
####################
# With a temperature of 1 (always 1 during training), we get back some set of probabilities
####################
softmax(mx.nd.array([[1, -1],
                     [-1, 1]]),
        temperature=1.0)
Out[24]:
In [25]:
####################
# If we set a high temperature, we can get more entropic (*noisier*) probabilities
####################
softmax(mx.nd.array([[1, -1],
                     [-1, 1]]),
        temperature=1000.0)
Out[25]:
In [26]:
####################
# Often we want to sample with low temperatures to produce sharp probabilities
####################
softmax(mx.nd.array([[10, -10],
                     [-10, 10]]),
        temperature=0.1)
Out[26]:
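Whatever the temperature, each row of the softmax output is still a valid probability distribution. A quick check (added for illustration; the name probs is ours) confirms that every row sums to 1:
# Added check: rows of the softmax output should each sum to (numerically) 1.0
probs = softmax(mx.nd.array([[10, -10],
                             [-10, 10]]),
                temperature=0.1)
print(mx.nd.sum(probs, axis=1))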
In [27]:
def simple_rnn(inputs, state, temperature=1.0):
    outputs = []
    h = state
    for X in inputs:
        h_linear = mx.nd.dot(X, Wxh) + mx.nd.dot(h, Whh) + bh
        h = mx.nd.tanh(h_linear)
        yhat_linear = mx.nd.dot(h, Why) + by
        yhat = softmax(yhat_linear, temperature=temperature)
        outputs.append(yhat)
    return (outputs, h)
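Before wiring this into the training loop, it helps to confirm the shapes flowing through simple_rnn. The sketch below (an added check, assuming the cells above have been run; the test_* names are ours) feeds one batch with a zero initial state: we expect seq_length output steps, each of shape (batch_size, vocab_size), and a final hidden state of shape (batch_size, num_hidden).
# Added shape check: run one batch through the untrained RNN
test_state = mx.nd.zeros(shape=(batch_size, num_hidden), ctx=ctx)
test_outputs, test_state = simple_rnn(train_data[0], test_state)
print(len(test_outputs))      # seq_length output time steps
print(test_outputs[0].shape)  # (batch_size, vocab_size) per step
print(test_state.shape)       # (batch_size, num_hidden)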
In [28]:
def cross_entropy(yhat, y):
    return - mx.nd.mean(mx.nd.sum(y * mx.nd.log(yhat),
                                  axis=0,
                                  exclude=True))
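As a tiny worked example (added for illustration): with a one-hot target [0, 1] and a predicted distribution [0.2, 0.8], the loss is -log(0.8) ≈ 0.223, and it grows as the probability assigned to the correct character shrinks.
print(cross_entropy(mx.nd.array([[0.2, 0.8]]), mx.nd.array([[0, 1]])))  # about 0.223
print(cross_entropy(mx.nd.array([[0.9, 0.1]]), mx.nd.array([[0, 1]])))  # about 2.30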
In [29]:
def average_ce_loss(outputs, labels):
    assert(len(outputs) == len(labels))
    total_loss = 0.
    for (output, label) in zip(outputs, labels):
        total_loss = total_loss + cross_entropy(output, label)
    return total_loss / len(outputs)
In [30]:
def SGD(params, lr):
    for param in params:
        param[:] = param - lr * param.grad
In [31]:
def sample(prefix, num_chars, temperature=1.0):
    #####################################
    # Initialize the string that we'll return to the supplied prefix
    #####################################
    string = prefix
    #####################################
    # Prepare the prefix as a sequence of one-hots for ingestion by RNN
    #####################################
    prefix_numerical = [character_dict[char] for char in prefix]
    input_v = one_hots(prefix_numerical)
    #####################################
    # Set the initial state of the hidden representation ($h_0$) to the zero vector
    #####################################
    sample_state = mx.nd.zeros(shape=(1, num_hidden), ctx=ctx)
    #####################################
    # For num_chars iterations,
    # 1) feed in the current input
    # 2) sample the next character from the output distribution
    # 3) add the sampled character to the decoded string
    # 4) prepare the sampled character as a one-hot (to be the next input)
    #####################################
    for i in range(num_chars):
        outputs, sample_state = simple_rnn(input_v,
                                           sample_state,
                                           temperature=temperature)
        choice = np.random.choice(vocab_size, p=outputs[-1][0].asnumpy())
        string += character_list[choice]
        input_v = one_hots([choice])
    return string
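Sampling from the still-untrained network (an added sanity check, not in the original session) should produce gibberish; after training, the same call should yield recognizable English.
# Added check: the untrained model should emit near-random characters
print(sample("The Time Ma", 64, temperature=1.0))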
In [32]:
epochs = 200
moving_loss = 0.
learning_rate = .5
# state = nd.zeros(shape=(batch_size, num_hidden), ctx=ctx)
for e in tqdm(range(epochs)):
    ############################
    # Attenuate the learning rate by a factor of 2 every 100 epochs.
    ############################
    if ((e+1) % 100 == 0):
        learning_rate = learning_rate / 2.0
    state = mx.nd.zeros(shape=(batch_size, num_hidden), ctx=ctx)
    for i in range(num_batches):
        data_one_hot = train_data[i]
        label_one_hot = train_label[i]
        with mx.autograd.record():
            outputs, state = simple_rnn(data_one_hot, state)
            loss = average_ce_loss(outputs, label_one_hot)
        loss.backward()
        SGD(params, learning_rate)
        ##########################
        # Keep a moving average of the losses
        ##########################
        if (i == 0) and (e == 0):
            moving_loss = np.mean(loss.asnumpy()[0])
        else:
            moving_loss = .99 * moving_loss + .01 * np.mean(loss.asnumpy()[0])
    print("Epoch %s. Loss: %s" % (e, moving_loss))
    print(sample("The Time Ma", 1024, temperature=.1))
    print(sample("The Medical Man rose, came to the lamp,", 1024, temperature=.1))