In [1]:
from __future__ import print_function
import os

import mxnet as mx
from mxnet import nd, autograd
import numpy as np
# Seed MXNet's random number generator (used below for the synthetic data and the weight initialisation)
mx.random.seed(1)
# Device context passed to the model parameters and LSTM states.
# Swap the two lines below to run on the first GPU instead of the CPU.
# ctx = mx.gpu(0)
ctx = mx.cpu(0)
In [2]:
%matplotlib inline
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
from datetime import datetime
# import mpld3
# Seaborn theme: 'whitegrid' style
sns.set_style('whitegrid')
#sns.set_context('notebook')
# The 'poster' context is the active one; the 'notebook' alternative is left commented out above
sns.set_context('poster')
# Select the image formats used for inline figures.
# NOTE: the active call below requests ('pdf', 'png'), i.e. PNG (raster) is included;
# the vector-only ('pdf', 'svg') variant is the commented-out line.
from IPython.display import set_matplotlib_formats
#set_matplotlib_formats('pdf', 'svg')
set_matplotlib_formats('pdf', 'png')
In [3]:
# Number of points generated for every synthetic series.
# Needs to be at least the seq_length used for training + 1, because the labels
# are the inputs shifted by one time step.
SEQ_LENGTH = 100 + 1
# Number of series generated for the training set
NUM_SAMPLES_TRAINING = 5000 + 1
# Number of series generated for the test set
NUM_SAMPLES_TESTING = 100 + 1
In [4]:
def gimme_one_random_number():
    """Draw a single value from MXNet's uniform sampler on [0, 1) and return it as a NumPy scalar."""
    sample = nd.random_uniform(low=0, high=1, shape=(1, 1))
    # The sample is a 1x1 NDArray; pull out its only element.
    return sample.asnumpy()[0][0]
def create_one_time_series(seq_length=10):
    """Return one sine wave of ``seq_length`` points with a random frequency and amplitude.

    The frequency is drawn first (0.1 to 0.6), then the amplitude (0.5 to 1.5);
    the wave is sampled at the integer time steps 0 .. seq_length - 1.
    """
    frequency = 0.1 + (0.5 * gimme_one_random_number())  # 0.1 to 0.6
    amplitude = 0.5 + gimme_one_random_number()  # 0.5 to 1.5
    time_steps = np.arange(0, seq_length)
    return amplitude * np.sin(frequency * time_steps)
In [5]:
def create_batch_time_series(seq_length=10, num_samples=4):
    """Generate ``num_samples`` random sine waves and stack them in one DataFrame.

    Parameters
    ----------
    seq_length : int
        Number of points per series (columns ``t0`` .. ``t<seq_length-1>``).
    num_samples : int
        Number of series (rows ``s0`` .. ``s<num_samples-1>``).

    Returns
    -------
    pandas.DataFrame of shape (num_samples, seq_length). With ``num_samples <= 0``
    the frame has no rows (the previous implementation always produced at least one).
    """
    # Draw every series first (same order as before, so seeded runs give the same data) ...
    series = [create_one_time_series(seq_length=seq_length) for _ in range(num_samples)]
    # ... then build the frame once instead of concatenating one row at a time,
    # which copied the whole frame on every iteration.
    values = np.array(series).reshape(len(series), seq_length)
    return pd.DataFrame(
        values,
        index=['s' + str(i) for i in range(len(series))],
        columns=['t' + str(i) for i in range(0, seq_length)],
    )
In [6]:
# Create some time-series
# uncomment below to force predictable random numbers
# mx.random.seed(1)
data_train = create_batch_time_series(seq_length=SEQ_LENGTH, num_samples=NUM_SAMPLES_TRAINING)
data_test = create_batch_time_series(seq_length=SEQ_LENGTH, num_samples=NUM_SAMPLES_TESTING)
# Write data to csv. to_csv does not create missing directories, so make sure the
# target folder exists first; otherwise this cell fails on a fresh checkout.
if not os.path.isdir("../data/timeseries"):
    os.makedirs("../data/timeseries")
data_train.to_csv("../data/timeseries/train.csv", header=False, index=False)
data_test.to_csv("../data/timeseries/test.csv", header=False, index=False)
In [7]:
# num_sampling_points = min(SEQ_LENGTH, 50)
# (data_train.sample(4).transpose().iloc[range(0, SEQ_LENGTH, SEQ_LENGTH//num_sampling_points)]).plot()
In [8]:
# print(data_train.loc[:,data_train.columns[:-1]]) # inputs
# print(data_train.loc[:,data_train.columns[1:]]) # outputs (i.e. inputs shift by +1)
In [9]:
batch_size = 32
batch_size_test = 1
seq_length = 16
num_batches_train = data_train.shape[0] // batch_size
num_batches_test = data_test.shape[0] // batch_size_test
num_features = 1 # we do 1D time series for now, this is like vocab_size = 1 for characters

# Labels are the inputs shifted by one step, so every generated series must hold
# at least seq_length + 1 points.
assert min(data_train.shape[1], data_test.shape[1]) >= seq_length + 1, \
    'generated series are too short: need at least seq_length + 1 points'

# Only whole batches can be reshaped: drop the trailing samples that do not fill
# a batch, and keep exactly seq_length time steps per sample. Without this the
# element count of the frame does not match the target shape and reshape fails.
num_rows_train = num_batches_train * batch_size
num_rows_test = num_batches_test * batch_size_test

# inputs are t0 .. t_(seq_length-1); labels are t1 .. t_seq_length (inputs shifted by +1)
data_train_inputs = data_train.iloc[:num_rows_train, :seq_length]
data_train_labels = data_train.iloc[:num_rows_train, 1:seq_length + 1]
data_test_inputs = data_test.iloc[:num_rows_test, :seq_length]
data_test_labels = data_test.iloc[:num_rows_test, 1:seq_length + 1]


def _to_time_major_batches(frame, num_batches, samples_per_batch):
    """Turn a (samples, time) frame into an NDArray of shape (num_batches, seq_length, samples_per_batch, num_features)."""
    batched = nd.array(frame.values).reshape((num_batches, samples_per_batch, seq_length, num_features))
    # Swap the sample and time axes so that iterating over one batch yields one time step at a time.
    return nd.swapaxes(batched, 1, 2)


train_data_inputs = _to_time_major_batches(data_train_inputs, num_batches_train, batch_size)
train_data_labels = _to_time_major_batches(data_train_labels, num_batches_train, batch_size)
test_data_inputs = _to_time_major_batches(data_test_inputs, num_batches_test, batch_size_test)
test_data_labels = _to_time_major_batches(data_test_labels, num_batches_test, batch_size_test)

print('num_samples_training={0} | num_batches_train={1} | batch_size={2} | seq_length={3}'.format(NUM_SAMPLES_TRAINING, num_batches_train, batch_size, seq_length))
print('train_data_inputs shape: ', train_data_inputs.shape)
print('train_data_labels shape: ', train_data_labels.shape)
# print(train_data_inputs[0]) # see what one batch looks like
An LSTM block has mechanisms to enable "memorizing" information for an extended number of time steps. We use the LSTM block with the following transformations that map inputs to outputs across blocks at consecutive layers and consecutive time steps: $\newcommand{\xb}{\mathbf{x}} \newcommand{\RR}{\mathbb{R}}$
$$g_t = \text{tanh}(X_t W_{xg} + h_{t-1} W_{hg} + b_g),$$$$i_t = \sigma(X_t W_{xi} + h_{t-1} W_{hi} + b_i),$$$$f_t = \sigma(X_t W_{xf} + h_{t-1} W_{hf} + b_f),$$$$o_t = \sigma(X_t W_{xo} + h_{t-1} W_{ho} + b_o),$$$$c_t = f_t \odot c_{t-1} + i_t \odot g_t,$$$$h_t = o_t \odot \text{tanh}(c_t),$$where $\odot$ is an element-wise multiplication operator, and for all $\xb = [x_1, x_2, \ldots, x_k]^\top \in \RR^k$ the two activation functions:
$$\sigma(\xb) = \left[\frac{1}{1+\exp(-x_1)}, \ldots, \frac{1}{1+\exp(-x_k)}\right]^\top,$$$$\text{tanh}(\xb) = \left[\frac{1-\exp(-2x_1)}{1+\exp(-2x_1)}, \ldots, \frac{1-\exp(-2x_k)}{1+\exp(-2x_k)}\right]^\top.$$In the transformations above, the memory cell $c_t$ stores the "long-term" memory in vector form. In other words, the information cumulatively captured and encoded until time step $t$ is stored in $c_t$ and is only passed along the same layer over different time steps.
Given the current input $X_t$ and the previous hidden state $h_{t-1}$, the input gate $i_t$ and forget gate $f_t$ help the memory cell decide how much of the previous memory $c_{t-1}$ to keep and how much of the new candidate $g_t$ to write. The output gate $o_t$ further lets the LSTM block decide how to retrieve the memory information to generate the current state $h_t$ that is passed to both the next layer of the current time step and the next time step of the current layer. Such decisions are made using the hidden-layer parameters $W$ and $b$ with different subscripts: these parameters are learned during the training phase, which in this notebook is implemented from scratch with `autograd` and a hand-written SGD update rather than with gluon.
In [10]:
num_inputs = num_features
num_hidden = 8
num_outputs = num_features


def _small_normal(shape):
    """Sample an NDArray of ``shape`` on ``ctx`` with nd.random_normal and scale it by 0.01."""
    return nd.random_normal(shape=shape, ctx=ctx) * .01


# The arrays are created in the order g, i, f, o within each group, and the groups
# in the order input weights -> recurrent weights -> biases -> output layer, so a
# seeded run consumes the random stream in the same sequence as before.

# Weights connecting the inputs to the hidden layer
Wxg, Wxi, Wxf, Wxo = [_small_normal((num_inputs, num_hidden)) for _ in range(4)]

# Recurrent weights connecting the hidden layer across time steps
Whg, Whi, Whf, Who = [_small_normal((num_hidden, num_hidden)) for _ in range(4)]

# Bias vectors for the hidden layer
bg, bi, bf, bo = [_small_normal(num_hidden) for _ in range(4)]

# Weights and bias of the output nodes
Why = _small_normal((num_hidden, num_outputs))
by = _small_normal(num_outputs)
In [11]:
# Every trainable array of the model, in the order expected by the SGD update.
params = [
    Wxg, Wxi, Wxf, Wxo,  # input -> hidden
    Whg, Whi, Whf, Who,  # hidden -> hidden
    bg, bi, bf, bo,      # hidden biases
    Why, by,             # hidden -> output
]
# Allocate gradient buffers so autograd can fill in .grad for each array.
for param in params:
    param.attach_grad()
In [12]:
def softmax(y_linear, temperature=1.0):
    """Temperature-scaled softmax.

    The global maximum of ``y_linear`` is subtracted before exponentiating, and each
    row is normalised by the sum over every axis except axis 0.
    """
    shifted = y_linear - nd.max(y_linear)
    exponentials = nd.exp(shifted / temperature)
    # One normaliser per entry along axis 0, shaped as a column for broadcasting.
    normaliser = nd.sum(exponentials, axis=0, exclude=True).reshape((-1, 1))
    return exponentials / normaliser
In [13]:
def lstm_rnn(inputs, h, c, temperature=1.0):
    """Run the LSTM over one batch of sequences and return the per-step outputs.

    ``inputs`` is iterated along its first axis; each slice ``X`` is treated as one
    time step for the whole batch. ``h`` and ``c`` are the initial hidden and cell
    states. ``temperature`` is accepted for interface compatibility but is not used,
    because the output layer is linear.

    Returns ``(outputs, h, c)``: the list of linear outputs, one per time step, and
    the final hidden and cell states.
    """
    def gate(activation, x_t, h_prev, w_x, w_h, bias):
        # Shared pre-activation of all four gates: x_t W_x + h_prev W_h + b
        return activation(nd.dot(x_t, w_x) + nd.dot(h_prev, w_h) + bias)

    outputs = []
    for X in inputs:
        candidate = gate(nd.tanh, X, h, Wxg, Whg, bg)
        input_gate = gate(nd.sigmoid, X, h, Wxi, Whi, bi)
        forget_gate = gate(nd.sigmoid, X, h, Wxf, Whf, bf)
        output_gate = gate(nd.sigmoid, X, h, Wxo, Who, bo)

        # Update the memory cell, then expose a gated view of it as the new hidden state.
        c = forget_gate * c + input_gate * candidate
        h = output_gate * nd.tanh(c)

        # Linear read-out: we cant use a 1.0-bounded activation function since
        # amplitudes can be greater than 1.0.
        yhat = nd.dot(h, Why) + by
        outputs.append(yhat)
    return (outputs, h, c)
In [14]:
def cross_entropy(yhat, y):
    """Negative mean, over axis 0, of ``sum(y * log(yhat))`` taken across all other axes."""
    per_sample = nd.sum(y * nd.log(yhat), axis=0, exclude=True)
    return - nd.mean(per_sample)
# root_mean_squared_error = mx.metric.RMSE()
def rmse(yhat, y):
    """Mean over axis 0 of the Euclidean distance between ``y`` and ``yhat``.

    The squared error is summed over every axis except axis 0, the square root is
    taken per entry of axis 0, and those distances are averaged. (Despite the name,
    the root is applied before the mean, not after.)
    """
    squared_error = nd.power(y - yhat, 2)
    distances = nd.sqrt(nd.sum(squared_error, axis=0, exclude=True))
    return nd.mean(distances)
In [15]:
def average_ce_loss(outputs, labels):
    """Average the cross-entropy of paired ``outputs`` / ``labels`` entries (one pair per time step)."""
    assert(len(outputs) == len(labels))
    step_losses = [cross_entropy(output, label) for (output, label) in zip(outputs, labels)]
    # Start the sum from 0. so the accumulation matches a float-initialised running total.
    return sum(step_losses, 0.) / len(outputs)
def average_rmse_loss(outputs, labels):
    """Average the ``rmse`` of paired ``outputs`` / ``labels`` entries (one pair per time step)."""
    assert(len(outputs) == len(labels))
    step_losses = [rmse(output, label) for (output, label) in zip(outputs, labels)]
    # Start the sum from 0. so the accumulation matches a float-initialised running total.
    return sum(step_losses, 0.) / len(outputs)
In [16]:
def SGD(params, lr):
    """Apply one plain gradient-descent step, in place, to every array in ``params``.

    Each parameter must expose its gradient as ``.grad``; the new value is written
    through slice assignment so the parameter object itself is kept.
    """
    for weight in params:
        step = lr * weight.grad
        weight[:] = weight - step
In [17]:
def test_prediction(one_input_seq, one_label_seq, temperature=1.0):
    """Run the LSTM on a single test sequence, starting from all-zero states.

    Returns a 5-tuple: the last element of the final prediction, the last entry of
    the label sequence, the last element of the loss, the full list of per-step
    outputs and the label sequence itself.
    """
    # Initial hidden and cell state for a batch of one sequence.
    initial_h = nd.zeros(shape=(1, num_hidden), ctx=ctx)
    initial_c = nd.zeros(shape=(1, num_hidden), ctx=ctx)
    outputs, _final_h, _final_c = lstm_rnn(one_input_seq, initial_h, initial_c, temperature=temperature)

    final_prediction = outputs[-1][0]
    # NOTE(review): the final prediction is compared against the whole label
    # sequence, not only its last entry - confirm that this is intended.
    loss = rmse(final_prediction, one_label_seq)
    return (
        final_prediction.asnumpy()[-1],
        one_label_seq.asnumpy()[-1],
        loss.asnumpy()[-1],
        outputs,
        one_label_seq,
    )
def check_prediction(index):
    """Predict test sequence ``index`` and return predictions next to the true labels.

    Parameters
    ----------
    index : int
        Position of the sequence in ``test_data_inputs`` / ``test_data_labels``.

    Returns
    -------
    pandas.DataFrame with one row per time step and the columns
    ``'predicted'`` and ``'true'``.
    """
    # Only the per-step outputs are needed here. The rounded prediction / relative
    # error that used to be computed in this function were never used and could
    # fail on their own (round() of an array, division by a zero label).
    _, _, _, outputs, _ = test_prediction(test_data_inputs[index], test_data_labels[index], temperature=1.0)
    # Each output holds exactly one value; .item() extracts it as a Python float and
    # raises if that assumption is ever violated (float() on a 1-D array is deprecated).
    predicted = [step_output.asnumpy().item() for step_output in outputs]
    true_labels = list(test_data_labels[index].asnumpy().flatten())
    df = pd.DataFrame([predicted, true_labels]).transpose()
    df.columns = ['predicted', 'true']
    return df
In [19]:
epochs = 45
moving_loss = 0.
learning_rate = .03
# needed to update plots on the fly
%matplotlib notebook
fig, axxx = plt.subplots(1,1)
# for i in range(4):
# l1 = test_data_inputs[i].asnumpy().flatten()
# l2 = test_data_labels[i].asnumpy().flatten()
# df = pd.DataFrame([l1, l2]).transpose()
# df.columns = ['predicted', 'true']
# axxx.clear()
# # plt.pause(0.3)
# df.plot(ax=axxx)
# fig.canvas.draw()
# # plt.draw()
# # plt.pause(0.5)
# # df.plot(fig.axes)
# time.sleep(0.3)
# %matplotlib inline
# state = nd.zeros(shape=(batch_size, num_hidden), ctx=ctx)
for e in range(epochs):
############################
# Attenuate the learning rate by a factor of 2 every 100 epochs.
############################
if ((e+1) % 100 == 0):
learning_rate = learning_rate / 2.0
h = nd.zeros(shape=(batch_size, num_hidden), ctx=ctx)
c = nd.zeros(shape=(batch_size, num_hidden), ctx=ctx)
for i in range(num_batches_train):
data_one_hot = train_data_inputs[i]
label_one_hot = train_data_labels[i]
with autograd.record():
# print('SHAPE OF data_one_hot ', data_one_hot.shape)
outputs, h, c = lstm_rnn(data_one_hot, h, c)
loss = average_rmse_loss(outputs, label_one_hot)
loss.backward()
SGD(params, learning_rate)
##########################
# Keep a moving average of the losses
##########################
if (i == 0) and (e == 0):
moving_loss = nd.mean(loss).asscalar()
else:
moving_loss = .99 * moving_loss + .01 * nd.mean(loss).asscalar()
# print("Epoch %s. Loss: %s" % (e, moving_loss))
data_prediction_df = check_prediction(index=e)
axxx.clear()
data_prediction_df.plot(ax=axxx)
fig.canvas.draw()
prediction = round(data_prediction_df.tail(1)['predicted'].values.flatten()[-1], 3)
true_label = round(data_prediction_df.tail(1)['true'].values.flatten()[-1], 3)
rel_error = round(100. * (prediction / true_label - 1.0), 2)
print("Epoch = {0} | Loss = {1} | Prediction = {2} True = {3} Error = {4}".format(e, moving_loss, prediction, true_label, rel_error ))
%matplotlib inline
For whinges or inquiries, open an issue on GitHub.