Deep LSTM RNNs for N-D time series


In [2]:
from __future__ import print_function
import mxnet as mx
from mxnet import nd, autograd
import numpy as np
from collections import defaultdict
mx.random.seed(1)
# ctx = mx.gpu(0)
ctx = mx.cpu(0)

%matplotlib inline
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
from datetime import datetime
# import mpld3
sns.set_style('whitegrid')
#sns.set_context('notebook')
sns.set_context('poster')
# Make inline plots vector graphics instead of raster graphics
from IPython.display import set_matplotlib_formats
#set_matplotlib_formats('pdf', 'svg')
set_matplotlib_formats('pdf', 'png')

SEQ_LENGTH = 16 + 1  # training sequence length + 1, because the labels are the inputs shifted by one time step
NUM_SAMPLES_TRAINING = 5000 + 1
NUM_SAMPLES_TESTING = 250 + 1
NUM_FEATURES = 2
CREATE_DATA_SETS = True  # set to True if you don't have the data files or want to re-create them

Dataset: "Some time-series"


In [2]:
def gimme_one_random_number():
    return nd.random_uniform(low=0, high=1, shape=(1,1)).asnumpy()[0][0]

def create_one_2D_time_series(seq_length=10):
    freq = (gimme_one_random_number()*0.5) + 0.1  # 0.1 to 0.6
    ampl = gimme_one_random_number() + 0.5  # 0.5 to 1.5
    x1 = np.sin(np.arange(0, seq_length) * freq) * ampl  # feature 0: a random sine wave
    x2 = np.abs((x1-0.5)) // 0.2  # feature 1: a step-like transform of feature 0
    x = [list(i) for i in zip(x1, x2)]  # list of seq_length points, each with NUM_FEATURES values
    return x

def create_3_feature_time_series(seq_length=5):  # not used below; kept for experimenting with 3 features
    freq = (gimme_one_random_number()*0.5) + 0.1  # 0.1 to 0.6
    ampl = gimme_one_random_number() + 0.5  # 0.5 to 1.5
    x1 = np.sin(np.arange(0, seq_length) * freq) * ampl
    x2 = np.abs((x1-0.5)) // 0.2
    x3 = np.abs((x1-0.5)) // 0.1
    return (x1, x2, x3)

def create_batch_2D_time_series(seq_length=10, num_samples=4):
    data = []
    for i in range(0, num_samples):
        data.append(create_one_2D_time_series(seq_length=seq_length))
    return np.array(data)

In [3]:
# Create some time-series
# the fixed seed below makes the generated random numbers reproducible
mx.random.seed(123)
if CREATE_DATA_SETS:
    data_train = create_batch_2D_time_series(seq_length=SEQ_LENGTH, num_samples=NUM_SAMPLES_TRAINING)  
    data_test = create_batch_2D_time_series(seq_length=SEQ_LENGTH, num_samples=NUM_SAMPLES_TESTING)
    #Write data to csv
#     data_train.to_csv("../data/timeseries/train.csv")
#     data_test.to_csv("../data/timeseries/test.csv")
# else: 
#     data_train = pd.read_csv("../data/timeseries/train.csv", index_col=0)
#     data_test = pd.read_csv("../data/timeseries/test.csv", index_col=0)

Check the data real quick


In [4]:
# num_sampling_points = min(SEQ_LENGTH, 400)
# (data_train.sample(4).transpose().iloc[range(0, SEQ_LENGTH, SEQ_LENGTH//num_sampling_points)]).plot()
# print (data_train)
# print(data_train[:, :-1, :]) # inputs
# batch_size = 5
# num_batches_train = data_train.shape[0] // batch_size
# print(num_batches_train)
# print ( nd.array(data_train[:, :-1, :]).reshape((num_batches_train, 5, SEQ_LENGTH-1, NUM_FEATURES)) )
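
The commented snippet above assumes an earlier pandas-based layout; for the NumPy arrays created above, a minimal plotting sketch could look like this:

In [ ]:
# Plot feature 0 of a handful of training sequences as a visual sanity check.
for k in range(4):
    plt.plot(data_train[k, :, 0], label='sequence {}'.format(k))
plt.legend()
plt.title('Feature 0 of a few training sequences')
plt.show()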

Preparing the data for training


In [5]:
batch_size = 64
batch_size_test = 1
seq_length = SEQ_LENGTH - 1

num_batches_train = data_train.shape[0] // batch_size
num_batches_test = data_test.shape[0] // batch_size_test

num_features = NUM_FEATURES  # number of features per time step (the analogue of vocab_size for character models)

# inputs run from t_0 to t_{seq_length-1}; the last point of each sequence is kept only as the label of the preceding point
# keep a whole number of batches so that the reshapes below are exact
num_train_kept = num_batches_train * batch_size
num_test_kept = num_batches_test * batch_size_test
data_train_inputs = data_train[:num_train_kept, :-1, :]
data_train_labels = data_train[:num_train_kept, 1:, :]
data_test_inputs = data_test[:num_test_kept, :-1, :]
data_test_labels = data_test[:num_test_kept, 1:, :]

train_data_inputs = nd.array(data_train_inputs).reshape((num_batches_train, batch_size, seq_length, num_features))
train_data_labels = nd.array(data_train_labels).reshape((num_batches_train, batch_size, seq_length, num_features))
test_data_inputs = nd.array(data_test_inputs).reshape((num_batches_test, batch_size_test, seq_length, num_features))
test_data_labels = nd.array(data_test_labels).reshape((num_batches_test, batch_size_test, seq_length, num_features))

train_data_inputs = nd.swapaxes(train_data_inputs, 1, 2)
train_data_labels = nd.swapaxes(train_data_labels, 1, 2)
test_data_inputs = nd.swapaxes(test_data_inputs, 1, 2)
test_data_labels = nd.swapaxes(test_data_labels, 1, 2)


print('num_samples_training={0} | num_batches_train={1} | batch_size={2} | seq_length={3}'.format(NUM_SAMPLES_TRAINING, num_batches_train, batch_size, seq_length))
print('train_data_inputs shape: ', train_data_inputs.shape)
print('train_data_labels shape: ', train_data_labels.shape)
# print(data_train_inputs.values)
# print(train_data_inputs[0]) # see what one batch looks like


num_samples_training=5001 | num_batches_train=78 | batch_size=64 | seq_length=16
train_data_inputs shape:  (78L, 16L, 64L, 2L)
train_data_labels shape:  (78L, 16L, 64L, 2L)
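
To make the one-step shift between inputs and labels concrete, here is a tiny NumPy-only sketch on a toy 1-D sequence:

In [ ]:
# The label at position t is simply the input at position t+1.
toy = np.arange(5)            # one toy sequence: [0, 1, 2, 3, 4]
toy_inputs = toy[:-1]         # [0, 1, 2, 3]
toy_labels = toy[1:]          # [1, 2, 3, 4]
print(list(zip(toy_inputs, toy_labels)))  # [(0, 1), (1, 2), (2, 3), (3, 4)]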

Long short-term memory (LSTM) RNNs

An LSTM block has mechanisms to enable "memorizing" information for an extended number of time steps. We use the LSTM block with the following transformations that map inputs to outputs across blocks at consecutive layers and consecutive time steps: $\newcommand{\xb}{\mathbf{x}} \newcommand{\RR}{\mathbb{R}}$

$$g_t = \text{tanh}(X_t W_{xg} + h_{t-1} W_{hg} + b_g),$$
$$i_t = \sigma(X_t W_{xi} + h_{t-1} W_{hi} + b_i),$$
$$f_t = \sigma(X_t W_{xf} + h_{t-1} W_{hf} + b_f),$$
$$o_t = \sigma(X_t W_{xo} + h_{t-1} W_{ho} + b_o),$$
$$c_t = f_t \odot c_{t-1} + i_t \odot g_t,$$
$$h_t = o_t \odot \text{tanh}(c_t),$$

where $\odot$ denotes element-wise multiplication, and the two activation functions below are applied element-wise to any vector $\xb = [x_1, x_2, \ldots, x_k]^\top \in \RR^k$:

$$\sigma(\xb) = \left[\frac{1}{1+\exp(-x_1)}, \ldots, \frac{1}{1+\exp(-x_k)}\right]^\top,$$
$$\text{tanh}(\xb) = \left[\frac{1-\exp(-2x_1)}{1+\exp(-2x_1)}, \ldots, \frac{1-\exp(-2x_k)}{1+\exp(-2x_k)}\right]^\top.$$

In the transformations above, the memory cell $c_t$ stores the "long-term" memory in vector form. In other words, the information accumulated and encoded up to time step $t$ is stored in $c_t$ and is passed only along the same layer across time steps.

Given the previous state $h_{t-1}$ and memory cell $c_{t-1}$, the input gate $i_t$ and forget gate $f_t$ decide how much of the memory to overwrite and how much to keep. The output gate $o_t$ then decides how much of the memory to expose as the current state $h_t$, which is passed both to the next layer of the current time step and to the next time step of the current layer. These decisions are governed by the weight matrices $W$ and bias vectors $b$ with the various subscripts; here they are learned from scratch with autograd and a hand-written optimizer rather than with gluon.
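
As a standalone sketch of one step of these equations (toy sizes, throwaway names with a trailing underscore, independent of the parameters allocated in the next cell):

In [ ]:
# One LSTM step, mirroring the equations above (toy sizes, illustration only).
n_in, n_hid, n_batch = 2, 4, 3
rand = lambda *shape: nd.random_normal(shape=shape, ctx=ctx) * .01
Wxg_, Wxi_, Wxf_, Wxo_ = (rand(n_in, n_hid) for _ in range(4))
Whg_, Whi_, Whf_, Who_ = (rand(n_hid, n_hid) for _ in range(4))
bg_, bi_, bf_, bo_ = (rand(n_hid) for _ in range(4))

X_t = rand(n_batch, n_in)                         # inputs at time step t
h_prev = nd.zeros((n_batch, n_hid), ctx=ctx)      # h_{t-1}
c_prev = nd.zeros((n_batch, n_hid), ctx=ctx)      # c_{t-1}

g = nd.tanh(nd.dot(X_t, Wxg_) + nd.dot(h_prev, Whg_) + bg_)
i = nd.sigmoid(nd.dot(X_t, Wxi_) + nd.dot(h_prev, Whi_) + bi_)
f = nd.sigmoid(nd.dot(X_t, Wxf_) + nd.dot(h_prev, Whf_) + bf_)
o = nd.sigmoid(nd.dot(X_t, Wxo_) + nd.dot(h_prev, Who_) + bo_)
c_t = f * c_prev + i * g        # new memory cell
h_t = o * nd.tanh(c_t)          # new hidden state
print(c_t.shape, h_t.shape)     # both (n_batch, n_hid)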

Allocate parameters


In [6]:
num_inputs = NUM_FEATURES  # number of input features per time step (1 for a plain 1-D series, 2 here)
num_outputs = NUM_FEATURES  # we predict the same features one step ahead
num_hidden_units = [6, 4]  # num of hidden units in each hidden LSTM layer
num_hidden_layers = len(num_hidden_units)  # num of hidden LSTM layers
num_units_layers = [num_features] + num_hidden_units

########################
#  Weights connecting the inputs to the hidden layer
########################
Wxg, Wxi, Wxf, Wxo, Whg, Whi, Whf, Who, bg, bi, bf, bo = {}, {}, {}, {}, {}, {}, {}, {}, {}, {}, {}, {} 
for i_layer in range(1, num_hidden_layers+1):
    n_in = num_units_layers[i_layer-1]   # fan-in of this layer (avoid shadowing num_inputs / num_hidden_units above)
    n_hid = num_units_layers[i_layer]    # number of LSTM units in this layer
    Wxg[i_layer] = nd.random_normal(shape=(n_in, n_hid), ctx=ctx) * .01
    Wxi[i_layer] = nd.random_normal(shape=(n_in, n_hid), ctx=ctx) * .01
    Wxf[i_layer] = nd.random_normal(shape=(n_in, n_hid), ctx=ctx) * .01
    Wxo[i_layer] = nd.random_normal(shape=(n_in, n_hid), ctx=ctx) * .01

    ########################
    #  Recurrent weights connecting the hidden layer across time steps
    ########################
    Whg[i_layer] = nd.random_normal(shape=(n_hid, n_hid), ctx=ctx) * .01
    Whi[i_layer] = nd.random_normal(shape=(n_hid, n_hid), ctx=ctx) * .01
    Whf[i_layer] = nd.random_normal(shape=(n_hid, n_hid), ctx=ctx) * .01
    Who[i_layer] = nd.random_normal(shape=(n_hid, n_hid), ctx=ctx) * .01

    ########################
    #  Bias vectors for the hidden layer
    ########################
    bg[i_layer] = nd.random_normal(shape=n_hid, ctx=ctx) * .01
    bi[i_layer] = nd.random_normal(shape=n_hid, ctx=ctx) * .01
    bf[i_layer] = nd.random_normal(shape=n_hid, ctx=ctx) * .01
    bo[i_layer] = nd.random_normal(shape=n_hid, ctx=ctx) * .01

########################
# Weights to the output nodes
########################
Why = nd.random_normal(shape=(num_units_layers[-1], num_outputs), ctx=ctx) * .01
by = nd.random_normal(shape=num_outputs, ctx=ctx) * .01

Attach the gradients


In [7]:
params = []
for i_layer in range(1, num_hidden_layers+1):
    params += [Wxg[i_layer], Wxi[i_layer], Wxf[i_layer], Wxo[i_layer], Whg[i_layer], Whi[i_layer], Whf[i_layer], Who[i_layer], bg[i_layer], bi[i_layer], bf[i_layer], bo[i_layer]]

params += [Why, by]  # add the output layer

for param in params:
    param.attach_grad()
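
As a quick sanity check of the layer sizes chosen above, the sketch below counts the allocated parameters (NDArray's size attribute gives the number of scalars in each array):

In [ ]:
# Count the parameter arrays and the total number of scalar parameters.
print('number of parameter arrays:', len(params))
print('total scalar parameters:', sum(p.size for p in params))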

Softmax Activation


In [8]:
def softmax(y_linear, temperature=1.0):
    lin = (y_linear-nd.max(y_linear)) / temperature
    exp = nd.exp(lin)
    partition = nd.sum(exp, axis=0, exclude=True).reshape((-1,1))
    return exp / partition
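
A toy check that each row of the softmax output sums to 1 (the regression below ends up using the unbounded linear output instead, see the model definition):

In [ ]:
# Each row of the softmax output should sum to 1.
y_lin = nd.array([[1.0, 2.0, 3.0], [0.0, 0.0, 0.0]], ctx=ctx)
probs = softmax(y_lin, temperature=1.0)
print(probs)
print(nd.sum(probs, axis=1))  # approximately [1. 1.]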

Loss functions: cross-entropy and RMSE


In [9]:
def cross_entropy(yhat, y):
    return - nd.mean(nd.sum(y * nd.log(yhat), axis=0, exclude=True))

def rmse(yhat, y):
    # per-sample Euclidean norm of the error, averaged over the batch (this is the loss used for the regression below)
    return nd.mean(nd.sqrt(nd.sum(nd.power(y - yhat, 2), axis=0, exclude=True)))

Averaging the loss over the sequence


In [10]:
def average_ce_loss(outputs, labels):
    assert(len(outputs) == len(labels))
    total_loss = 0.
    for (output, label) in zip(outputs,labels):
        total_loss = total_loss + cross_entropy(output, label)
    return total_loss / len(outputs)

def average_rmse_loss(outputs, labels):
    assert(len(outputs) == len(labels))
    total_loss = 0.
    for (output, label) in zip(outputs,labels):
        total_loss = total_loss + rmse(output, label)
    return total_loss / len(outputs)
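
A toy check of the sequence-averaged RMSE-style loss, using two fake time steps with a batch of two samples each (throwaway names):

In [ ]:
# Two "time steps", each a (batch=2, features=2) prediction/label pair.
toy_outputs = [nd.array([[0., 0.], [1., 1.]], ctx=ctx),
               nd.array([[1., 0.], [0., 1.]], ctx=ctx)]
toy_labels = [nd.array([[0., 0.], [1., 1.]], ctx=ctx),
              nd.array([[1., 1.], [0., 0.]], ctx=ctx)]
# step 1 contributes 0, step 2 contributes a per-sample error norm of 1 -> average 0.5
print(average_rmse_loss(toy_outputs, toy_labels))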

Optimizer


In [11]:

def SGD(params, learning_rate):
    for param in params:
        param[:] = param - learning_rate * param.grad

def adam(params, learning_rate, M, R, index_adam_call, beta1, beta2, eps):
    k = -1
    for param in params:
        k += 1
        M[k] = beta1 * M[k] + (1. - beta1) * param.grad
        R[k] = beta2 * R[k] + (1. - beta2) * (param.grad)**2
        # bias correction: M & R are initialized to zeros, so they are biased towards zero during the first iterations
        m_k_hat = M[k] / (1. - beta1**index_adam_call)
        r_k_hat = R[k] / (1. - beta2**index_adam_call)
        if np.isnan(M[k].asnumpy()).any() or np.isnan(R[k].asnumpy()).any():
            raise ValueError('NaNs in the Adam moment estimates')
        param[:] = param - learning_rate * m_k_hat / (nd.sqrt(r_k_hat) + eps)
    return params, M, R
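
A minimal single-step sketch of the adam() helper on one throwaway parameter, just to illustrate the call signature used in the training loop below:

In [ ]:
# One Adam step on a single toy parameter with a toy quadratic loss.
w = nd.array([1.0, -2.0], ctx=ctx)
w.attach_grad()
with autograd.record():
    toy_loss = nd.sum(w * w)   # gradient is 2*w
toy_loss.backward()
toy_params, toy_M, toy_R = [w], {0: nd.zeros_like(w)}, {0: nd.zeros_like(w)}
toy_params, toy_M, toy_R = adam(toy_params, 0.001, toy_M, toy_R,
                                index_adam_call=1, beta1=.9, beta2=.999, eps=1e-8)
print(w)  # each entry moves by roughly the learning rate towards zero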

Define the model


In [12]:
def single_lstm_unit_calcs(X, c, Wxg, h, Whg, bg, Wxi, Whi, bi, Wxf, Whf, bf, Wxo, Who, bo):
    g = nd.tanh(nd.dot(X, Wxg) + nd.dot(h, Whg) + bg)
    i = nd.sigmoid(nd.dot(X, Wxi) + nd.dot(h, Whi) + bi)
    f = nd.sigmoid(nd.dot(X, Wxf) + nd.dot(h, Whf) + bf)
    o = nd.sigmoid(nd.dot(X, Wxo) + nd.dot(h, Who) + bo)
    #######################
    c = f * c + i * g
    h = o * nd.tanh(c)
    return c, h

def deep_lstm_rnn(inputs, h, c, temperature=1.0):
    """
        h: dict of nd.arrays keyed by hidden-layer index (1 to num_hidden_layers);
        key 0, if present, holds the current input to the first hidden layer.
    """
    outputs = []
    # inputs is one batch of sequences in time-major layout: shape (seq_length, batch_size, num_features)
    # (the feature dimension is 1 for a single time series, vocab_size for characters, n for n parallel series)
    for X in inputs:
        # X is the batch at one time step: e.g. with a batch of 37 sequences, the first X holds the first value of each of those 37 sequences.
        # Each loop iteration therefore advances one time step, processed for all sequences in the batch at once.
        h[0] = X # the first hidden layer takes the input X as input 
        for i_layer in range(1, num_hidden_layers+1):
            # lstm units now have the 2 following inputs: 
            # i) h_t from the previous layer (equivalent to the input X for a non-deep lstm net), 
            # ii) h_t-1 from the current layer (same as for non-deep lstm nets)
            c[i_layer], h[i_layer] = single_lstm_unit_calcs(h[i_layer-1], c[i_layer], Wxg[i_layer], h[i_layer], Whg[i_layer], bg[i_layer], Wxi[i_layer], Whi[i_layer], bi[i_layer], Wxf[i_layer], Whf[i_layer], bf[i_layer], Wxo[i_layer], Who[i_layer], bo[i_layer])
        yhat_linear = nd.dot(h[num_hidden_layers], Why) + by
        # yhat is the batch of predictions for this time step; collected over the loop, the outputs
        # form the predicted sequence, which is the input sequence shifted forward by one step
#         yhat = softmax(yhat_linear, temperature=temperature)
#         yhat = nd.sigmoid(yhat_linear)
#         yhat = nd.tanh(yhat_linear)
        yhat = yhat_linear  # we can't use a 1.0-bounded activation function since amplitudes can exceed 1.0
        outputs.append(yhat)  # outputs has the same layout as inputs: a list of per-time-step batches
    return (outputs, h, c)
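
A quick forward-pass shape check, running the model once on the first training batch with zero initial states (throwaway names h0/c0):

In [ ]:
# Run the forward pass once and check the output layout.
h0, c0 = {}, {}
for i_layer in range(1, num_hidden_layers+1):
    h0[i_layer] = nd.zeros(shape=(batch_size, num_units_layers[i_layer]), ctx=ctx)
    c0[i_layer] = nd.zeros(shape=(batch_size, num_units_layers[i_layer]), ctx=ctx)
outputs0, h0, c0 = deep_lstm_rnn(train_data_inputs[0], h0, c0)
print(len(outputs0), outputs0[0].shape)  # seq_length outputs, each of shape (batch_size, num_features)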

Test and visualize predictions


In [13]:
INDEX_TARGET_VALUE = 0
def test_prediction(one_input_seq, one_label_seq, temperature=1.0):
    # we assume the first value of the feature vector is the variable of interest
    #####################################
    # Set the initial state of the hidden representation ($h_0$) to the zero vector
    #####################################  # some better initialization needed??
    h, c = {}, {}
    for i_layer in range(1, num_hidden_layers+1):
        h[i_layer] = nd.zeros(shape=(batch_size_test, num_units_layers[i_layer]), ctx=ctx)
        c[i_layer] = nd.zeros(shape=(batch_size_test, num_units_layers[i_layer]), ctx=ctx)
    
    outputs, h, c = deep_lstm_rnn(one_input_seq, h, c, temperature=temperature)

    return outputs[-1][0].asnumpy()[INDEX_TARGET_VALUE], one_label_seq.asnumpy()[-1].flatten()[INDEX_TARGET_VALUE], outputs, one_label_seq

def check_prediction(index):
    o, label, outputs, labels = test_prediction(test_data_inputs[index], test_data_labels[index], temperature=1.0)
    prediction = round(o, 3)
    true_label = round(label, 3)
    outputs = [float(i.asnumpy().flatten()[INDEX_TARGET_VALUE]) for i in outputs]  # float() works here because batch_size_test = 1
    true_labels = list(test_data_labels[index].asnumpy()[:, :, INDEX_TARGET_VALUE].flatten())
    # print(outputs, '\n----\n', true_labels)
    df = pd.DataFrame([outputs, true_labels]).transpose()
    df.columns = ['predicted', 'true']
    # print(df)
    rel_error = round(100. * (prediction / true_label - 1.0), 2)
#     print('\nprediction = {0} | actual_value = {1} | rel_error = {2}'.format(prediction, true_label, rel_error))
    return df
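
For example (even before training, with the randomly initialized parameters), a single test sequence can be inspected like this:

In [ ]:
# Compare predicted vs. true values for one test sequence.
df_check = check_prediction(index=0)
print(df_check.tail())
df_check.plot(title='Predicted vs. true (feature {})'.format(INDEX_TARGET_VALUE))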

In [14]:
epochs = 80  # at some point, some nans appear in M, R matrices of Adam. TODO investigate why
moving_loss = 0.
learning_rate = 0.001  # 0.1 works for a [8, 8] after about 70 epochs of 32-sized batches

# Adam Optimizer stuff
beta1 = .9
beta2 = .999
index_adam_call = 0
# M & R arrays to keep track of momenta in adam optimizer. params is a list that contains all ndarrays of parameters
M = {k: nd.zeros_like(v) for k, v in enumerate(params)}
R = {k: nd.zeros_like(v) for k, v in enumerate(params)}

df_moving_loss = pd.DataFrame(columns=['Loss', 'Error'])
df_moving_loss.index.name = 'Epoch'

# needed to update plots on the fly
%matplotlib notebook
fig, axes_fig1 = plt.subplots(1,1, figsize=(6,3))
fig2, axes_fig2 = plt.subplots(1,1, figsize=(6,3))

for e in range(epochs):
    ############################
    # Attenuate the learning rate by a factor of 2 every 1000 epochs (never triggered here since epochs = 80)
    ############################
    if ((e+1) % 1000 == 0):
        learning_rate = learning_rate / 2.0  # TODO check whether it's OK to adjust learning_rate when using the Adam optimizer
    h, c = {}, {}
    for i_layer in range(1, num_hidden_layers+1):
        h[i_layer] = nd.zeros(shape=(batch_size, num_units_layers[i_layer]), ctx=ctx)
        c[i_layer] = nd.zeros(shape=(batch_size, num_units_layers[i_layer]), ctx=ctx)

    for i in range(num_batches_train):
        data_one_hot = train_data_inputs[i]
        label_one_hot = train_data_labels[i]
        with autograd.record():
            outputs, h, c = deep_lstm_rnn(data_one_hot, h, c)
            loss = average_rmse_loss(outputs, label_one_hot)
            loss.backward()
#         SGD(params, learning_rate)
        index_adam_call += 1  # needed for bias correction in Adam optimizer
        params, M, R = adam(params, learning_rate, M, R, index_adam_call, beta1, beta2, 1e-8)
        
        ##########################
        #  Keep a moving average of the losses
        ##########################
        if (i == 0) and (e == 0):
            moving_loss = nd.mean(loss).asscalar()
        else:
            moving_loss = .99 * moving_loss + .01 * nd.mean(loss).asscalar()
        df_moving_loss.loc[e] = round(moving_loss, 4)

    ############################
    #  Predictions and plots
    ############################
    data_prediction_df = check_prediction(index=e)
#     print (data_prediction_df)
#     outputs, h, c = deep_lstm_rnn(test_data_inputs[0], h, c, temperature=1.0)
#     outputs, one_label_seq, loss = check_prediction(index=e)

    axes_fig1.clear()
    data_prediction_df.plot(ax=axes_fig1)
    fig.canvas.draw()
    prediction = round(data_prediction_df.tail(1)['predicted'].values.flatten()[-1], 3)
    true_label = round(data_prediction_df.tail(1)['true'].values.flatten()[-1], 3)
    rel_error = round(100. * np.abs(prediction / true_label - 1.0), 2)
    print("Epoch = {0} | Loss = {1} | Prediction = {2} True = {3} Error = {4}".format(e, moving_loss, prediction, true_label, rel_error ))
    axes_fig2.clear()
    if e == 0:
        moving_rel_error = rel_error
    else:
        moving_rel_error = .9 * moving_rel_error + .1 * rel_error

    df_moving_loss.loc[e, ['Error']] = moving_rel_error
    axes_loss_plot = df_moving_loss.plot(ax=axes_fig2, secondary_y='Loss', color=['r','b'])
    axes_loss_plot.right_ax.grid(False)
#     axes_loss_plot.right_ax.set_yscale('log')
    fig2.canvas.draw()
    
%matplotlib inline


Epoch = 0 | Loss = 2.4588569466 | Prediction = 0.298 True = -1.094 Error = 127.24
Epoch = 1 | Loss = 2.2807719775 | Prediction = 0.527 True = 0.86 Error = 38.72
Epoch = 2 | Loss = 2.12875277364 | Prediction = 0.503 True = 0.458 Error = 9.83
Epoch = 3 | Loss = 2.04752333478 | Prediction = 0.498 True = 0.498 Error = 0.0
Epoch = 4 | Loss = 2.00373157132 | Prediction = 0.511 True = 0.185 Error = 176.22
Epoch = 5 | Loss = 1.9685147595 | Prediction = 0.533 True = 1.004 Error = 46.91
Epoch = 6 | Loss = 1.93045426418 | Prediction = 0.619 True = -1.471 Error = 142.08
Epoch = 7 | Loss = 1.87459083229 | Prediction = 0.416 True = 0.072 Error = 477.78
Epoch = 8 | Loss = 1.78361060171 | Prediction = 0.345 True = 0.094 Error = 267.02
Epoch = 9 | Loss = 1.67992441298 | Prediction = 0.402 True = 0.821 Error = 51.04
Epoch = 10 | Loss = 1.58621683655 | Prediction = 0.558 True = -0.348 Error = 260.34
Epoch = 11 | Loss = 1.49579923502 | Prediction = 0.429 True = -0.399 Error = 207.52
Epoch = 12 | Loss = 1.40297516462 | Prediction = 0.479 True = 0.895 Error = 46.48
Epoch = 13 | Loss = 1.30704417223 | Prediction = 0.049 True = -0.417 Error = 111.75
Epoch = 14 | Loss = 1.19848408387 | Prediction = 0.475 True = 0.953 Error = 50.16
Epoch = 15 | Loss = 1.09523787243 | Prediction = 0.477 True = 1.008 Error = 52.68
Epoch = 16 | Loss = 1.00851292505 | Prediction = -0.62 True = -0.981 Error = 36.8
Epoch = 17 | Loss = 0.941485468335 | Prediction = 0.462 True = 0.503 Error = 8.15
Epoch = 18 | Loss = 0.887871681565 | Prediction = 0.487 True = 0.866 Error = 43.76
Epoch = 19 | Loss = 0.842963130697 | Prediction = 0.95 True = 1.287 Error = 26.18
Epoch = 20 | Loss = 0.803581538255 | Prediction = 0.69 True = 1.146 Error = 39.79
Epoch = 21 | Loss = 0.768530744616 | Prediction = 1.014 True = 0.961 Error = 5.52
Epoch = 22 | Loss = 0.73914425791 | Prediction = 0.486 True = 0.655 Error = 25.8
Epoch = 23 | Loss = 0.715639217814 | Prediction = 0.445 True = 0.064 Error = 595.31
Epoch = 24 | Loss = 0.696676386474 | Prediction = 0.919 True = 0.655 Error = 40.31
Epoch = 25 | Loss = 0.680508933195 | Prediction = -0.625 True = -0.635 Error = 1.57
Epoch = 26 | Loss = 0.666013658896 | Prediction = -0.304 True = -0.401 Error = 24.19
Epoch = 27 | Loss = 0.652857802312 | Prediction = 0.195 True = 0.145 Error = 34.48
Epoch = 28 | Loss = 0.641101068536 | Prediction = 0.418 True = 0.49 Error = 14.69
Epoch = 29 | Loss = 0.630650542103 | Prediction = 0.412 True = 0.39 Error = 5.64
Epoch = 30 | Loss = 0.621352946438 | Prediction = 0.985 True = 1.222 Error = 19.39
Epoch = 31 | Loss = 0.612961316316 | Prediction = 0.465 True = 0.849 Error = 45.23
Epoch = 32 | Loss = 0.605280692797 | Prediction = 0.837 True = 0.788 Error = 6.22
Epoch = 33 | Loss = 0.59815205112 | Prediction = -0.162 True = -0.135 Error = 20.0
Epoch = 34 | Loss = 0.591422785659 | Prediction = -0.674 True = -0.964 Error = 30.08
Epoch = 35 | Loss = 0.585029555249 | Prediction = 0.602 True = 0.494 Error = 21.86
Epoch = 36 | Loss = 0.57894677886 | Prediction = 1.179 True = 1.475 Error = 20.07
Epoch = 37 | Loss = 0.573107839559 | Prediction = -0.542 True = -0.763 Error = 28.96
Epoch = 38 | Loss = 0.567454019775 | Prediction = -0.759 True = -0.789 Error = 3.8
Epoch = 39 | Loss = 0.561950598375 | Prediction = -0.614 True = -0.471 Error = 30.36
Epoch = 40 | Loss = 0.556605449448 | Prediction = 0.648 True = 0.623 Error = 4.01
Epoch = 41 | Loss = 0.551449269873 | Prediction = -0.14 True = -0.272 Error = 48.53
Epoch = 42 | Loss = 0.546463514305 | Prediction = 0.41 True = 0.265 Error = 54.72
Epoch = 43 | Loss = 0.54168154519 | Prediction = 0.346 True = 0.132 Error = 162.12
Epoch = 44 | Loss = 0.53711674231 | Prediction = -0.611 True = -0.819 Error = 25.4
Epoch = 45 | Loss = 0.532787626039 | Prediction = -0.927 True = -0.901 Error = 2.89
Epoch = 46 | Loss = 0.528671676685 | Prediction = 0.137 True = 0.018 Error = 661.11
Epoch = 47 | Loss = 0.524750928792 | Prediction = -0.328 True = -0.505 Error = 35.05
Epoch = 48 | Loss = 0.521019539694 | Prediction = 0.371 True = 0.348 Error = 6.61
Epoch = 49 | Loss = 0.51746958148 | Prediction = 0.207 True = -0.026 Error = 896.15
Epoch = 50 | Loss = 0.514088672005 | Prediction = 0.774 True = 0.721 Error = 7.35
Epoch = 51 | Loss = 0.510840631107 | Prediction = -1.036 True = -1.183 Error = 12.43
Epoch = 52 | Loss = 0.507714054794 | Prediction = 1.087 True = 1.057 Error = 2.84
Epoch = 53 | Loss = 0.504680946308 | Prediction = -0.568 True = -0.588 Error = 3.4
Epoch = 54 | Loss = 0.501721005896 | Prediction = -0.752 True = -0.741 Error = 1.48
Epoch = 55 | Loss = 0.498815467589 | Prediction = 0.776 True = 0.877 Error = 11.52
Epoch = 56 | Loss = 0.49595865225 | Prediction = -0.77 True = -0.929 Error = 17.12
Epoch = 57 | Loss = 0.493134441645 | Prediction = 0.537 True = -0.026 Error = 2165.38
Epoch = 58 | Loss = 0.49033715061 | Prediction = 0.961 True = 0.969 Error = 0.83
Epoch = 59 | Loss = 0.487560863749 | Prediction = 0.651 True = 0.552 Error = 17.93
Epoch = 60 | Loss = 0.484815261675 | Prediction = 0.413 True = 0.391 Error = 5.63
Epoch = 61 | Loss = 0.482098450733 | Prediction = 0.288 True = 0.026 Error = 1007.69
Epoch = 62 | Loss = 0.479412318473 | Prediction = 1.151 True = 1.131 Error = 1.77
Epoch = 63 | Loss = 0.476758283022 | Prediction = 1.047 True = 1.201 Error = 12.82
Epoch = 64 | Loss = 0.474145147048 | Prediction = 0.667 True = 0.861 Error = 22.53
Epoch = 65 | Loss = 0.471575020908 | Prediction = 0.58 True = 0.434 Error = 33.64
Epoch = 66 | Loss = 0.469053999464 | Prediction = 0.631 True = 0.871 Error = 27.55
Epoch = 67 | Loss = 0.466590675509 | Prediction = 0.155 True = 0.187 Error = 17.11
Epoch = 68 | Loss = 0.464191761522 | Prediction = -0.529 True = -0.674 Error = 21.51
Epoch = 69 | Loss = 0.461861603355 | Prediction = 0.057 True = 0.101 Error = 43.56
Epoch = 70 | Loss = 0.459602119736 | Prediction = 0.67 True = 0.998 Error = 32.87
Epoch = 71 | Loss = 0.457424326199 | Prediction = -0.196 True = -0.368 Error = 46.74
Epoch = 72 | Loss = 0.455323522641 | Prediction = 0.546 True = 0.057 Error = 857.89
Epoch = 73 | Loss = 0.453309978858 | Prediction = 0.495 True = 0.478 Error = 3.56
Epoch = 74 | Loss = 0.451389961282 | Prediction = 0.474 True = 0.438 Error = 8.22
Epoch = 75 | Loss = 0.449552763631 | Prediction = 0.739 True = 0.766 Error = 3.52
Epoch = 76 | Loss = 0.447796639897 | Prediction = 1.19 True = 1.184 Error = 0.51
Epoch = 77 | Loss = 0.446115830393 | Prediction = -0.568 True = -0.714 Error = 20.45
Epoch = 78 | Loss = 0.444491640159 | Prediction = 1.227 True = 1.233 Error = 0.49
Epoch = 79 | Loss = 0.442899209121 | Prediction = -0.655 True = -0.571 Error = 14.71


Conclusions