Deep LSTM RNNs for N-D time series


In [1]:
from __future__ import print_function
import mxnet as mx
from mxnet import nd, autograd
import numpy as np
from collections import defaultdict
mx.random.seed(1)
# ctx = mx.gpu(0)
ctx = mx.cpu(0)

%matplotlib inline
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import scipy.fftpack
from pandas.tools import plotting
from pandas.tools.plotting import autocorrelation_plot
from datetime import datetime
sns.set_style('whitegrid')
#sns.set_context('notebook')
sns.set_context('poster')
# Make inline plots vector graphics instead of raster graphics
from IPython.display import set_matplotlib_formats
set_matplotlib_formats('pdf', 'png')



Dataset: "Some time-series"


In [2]:
def get_list_unique_block_indices(len_data=100, seq_length=5, n_samples=10):
    """ returns a list of unique random int that serve as index of the first element of a block of data
        args:
            len_data (int): length of the data set
            seq_length (int): length of the blocks to extract
            n_blocks (int): # of blocks to extract
    """
    set1 = set(np.random.randint(len_data // seq_length, size=n_samples)*seq_length)
    full_set = set1
    while len(full_set) < n_samples:
        set2 = set(np.random.randint(len_data // seq_length, size=n_samples)*seq_length)
        full_set = full_set | set2
    returned_list = list(full_set)[0:n_samples]
    assert(len(returned_list) == n_samples)
    return returned_list

def extract_random_sequence(data, seq_length=5, block_start_index=None):
    columns_subset = ['car.count', 'day_of_week_int', 'cloudy_or_not_cloudy', 'weather', 'current_month']
    if block_start_index is None:
        block_start_index = np.random.randint(len(data)-seq_length)
    data_subset = data.reset_index().loc[block_start_index:block_start_index+seq_length-1, columns_subset]
    assert(len(data_subset) == (seq_length))
    out_data = [list(i) for i in data_subset.values]
    return out_data

def create_batch_ND_time_series(full_data, seq_length=10, num_samples=4):
    out_data = []
    # get a list of non-overlapping random sequence start indices
    all_samples_start_indices = get_list_unique_block_indices(len(full_data), seq_length, num_samples)
    assert(len(all_samples_start_indices) == num_samples)
    for one_random_start_index in all_samples_start_indices:
        out_data.append(extract_random_sequence(full_data, seq_length, one_random_start_index))
        assert(len(out_data[-1]) == (seq_length))
    return out_data
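
A quick illustration of the block sampling above (a toy call, not part of the pipeline; the exact indices depend on the random state): start indices are always multiples of seq_length, so the extracted blocks never overlap.

toy_starts = get_list_unique_block_indices(len_data=20, seq_length=5, n_samples=3)
print(toy_starts)  # e.g. [0, 10, 15] -- unique multiples of 5, all < 20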

In [3]:
# OI data
original_data = pd.read_csv("../data/timeseries/data.csv", index_col=0)
dict_days_to_int = {'Monday': 1, 'Tuesday': 2, 'Wednesday': 3, 'Thursday': 4, 'Friday': 5, 'Saturday': 6, 'Sunday': 7}
original_data['date_']=original_data.index
original_data['current_month'] = original_data['date_'].apply(lambda x: pd.Timestamp(x).month)
original_data['day_of_week_int'] = original_data['day.of.week'].apply(lambda x: dict_days_to_int[x])
original_data['cloudy_or_not_cloudy'] = original_data['cloud.indicator'].apply(lambda x: 0 if x=='clear' else 1)
full_data = pd.DataFrame()

#############################
# OPTIONAL: TAKE JUST A RANDOM SAMPLE FOR NOW   /!\ careful: .sample() shuffles the time order!
#############################
full_data = original_data#.sample(1500)
# filter out cloudy data!
full_data = full_data[full_data['cloudy_or_not_cloudy']==0]

SEQ_LENGTH = 3
NUM_FEATURES = 5

# let's divide the data into train (75%), dev (15%), test (10%) sets,
# in non-overlapping sequences of SEQ_LENGTH days
full_data_length = len(full_data)
# the actual length of extracted sequence is SEQ_LENGTH + 1 so that we can do the shift of +1 for labels
total_num_of_sequences = full_data_length // (SEQ_LENGTH+1) - 1

# extract sequences of length SEQ_LENGTH+1: the extra point provides the shifted labels
all_random_sequences = create_batch_ND_time_series(full_data, seq_length=SEQ_LENGTH+1, num_samples=total_num_of_sequences)

n_seq_train = int(total_num_of_sequences*0.75)
n_seq_dev = int(total_num_of_sequences*0.9) - int(total_num_of_sequences*0.75)
n_seq_test = len(all_random_sequences) - int(total_num_of_sequences*0.9)

data_train = np.array(all_random_sequences[0:n_seq_train])
data_dev = np.array(all_random_sequences[n_seq_train:n_seq_train+n_seq_dev])
data_test = np.array(all_random_sequences[n_seq_train+n_seq_dev:])

print('SHAPES of ALL, TRAIN, DEV, TEST:') 
print(np.array(all_random_sequences).shape)
print(np.array(data_train).shape)
print(np.array(data_dev).shape)
print(np.array(data_test).shape)

assert(data_train.shape == (n_seq_train, SEQ_LENGTH+1, NUM_FEATURES))
assert(data_dev.shape == (n_seq_dev, SEQ_LENGTH+1, NUM_FEATURES))
assert(data_test.shape == (n_seq_test, SEQ_LENGTH+1, NUM_FEATURES))


SHAPES of ALL, TRAIN, DEV, TEST:
(592, 4, 5)
(444, 4, 5)
(88, 4, 5)
(60, 4, 5)

Check the data real quick


In [4]:
# num_sampling_points = min(SEQ_LENGTH, 400)
# (data_train.sample(4).transpose().iloc[range(0, SEQ_LENGTH, SEQ_LENGTH//num_sampling_points)]).plot()
# print (data_train)
# print(data_train[:, :-1, :]) # inputs
# batch_size = 5
# num_batches_train = data_train.shape[0] // batch_size
# print(num_batches_train)
# print ( nd.array(data_train[:, :-1, :]).reshape((num_batches_train, 5, SEQ_LENGTH-1, NUM_FEATURES)) )

Preparing the data for training


In [5]:
batch_size = 32
batch_size_test = 1
seq_length = SEQ_LENGTH

num_batches_train = data_train.shape[0] // batch_size
num_batches_test = data_test.shape[0] // batch_size_test

num_features = NUM_FEATURES  # number of parallel series per time step; analogous to vocab_size in a character model (1 for a single 1-D series)

# inputs run from t_0 to t_{seq_length-1}; the last point of each extracted sequence is kept
# only as the output ("label") of the penultimate point
data_train_inputs = data_train[:, :-1, :]
data_train_labels = data_train[:, 1:, :]
data_test_inputs = data_test[:, :-1, :]
data_test_labels = data_test[:, 1:, :]

train_data_inputs = nd.array(data_train_inputs).reshape((num_batches_train, batch_size, seq_length, num_features))
train_data_labels = nd.array(data_train_labels).reshape((num_batches_train, batch_size, seq_length, num_features))
test_data_inputs = nd.array(data_test_inputs).reshape((num_batches_test, batch_size_test, seq_length, num_features))
test_data_labels = nd.array(data_test_labels).reshape((num_batches_test, batch_size_test, seq_length, num_features))

train_data_inputs = nd.swapaxes(train_data_inputs, 1, 2)
train_data_labels = nd.swapaxes(train_data_labels, 1, 2)
test_data_inputs = nd.swapaxes(test_data_inputs, 1, 2)
test_data_labels = nd.swapaxes(test_data_labels, 1, 2)


print('num_mini-batches_train={0} | seq_length={2} | mini-batch_size={1} | num_features={3}'.format(num_batches_train, batch_size, seq_length, num_features))
print('train_data_inputs shape: ', train_data_inputs.shape)
print('train_data_labels shape: ', train_data_labels.shape)
# print(data_train_inputs.values)
# print(train_data_inputs[0]) # see what one batch looks like


num_mini-batches_train=13 | seq_length=3 | mini-batch_size=32 | num_features=5
train_data_inputs shape:  (13L, 3L, 32L, 5L)
train_data_labels shape:  (13L, 3L, 32L, 5L)
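
As a quick sanity check of the +1 shift between inputs and labels (a standalone toy sketch, not part of the pipeline above): each label at time t is simply the value of the series at time t+1.

toy = np.arange(6).reshape((1, 6, 1))  # 1 sequence, 6 time steps, 1 feature
print(toy[:, :-1, :].flatten())        # inputs: [0 1 2 3 4]
print(toy[:, 1:, :].flatten())         # labels: [1 2 3 4 5]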

Long short-term memory (LSTM) RNNs

An LSTM block has mechanisms to enable "memorizing" information for an extended number of time steps. We use the LSTM block with the following transformations that map inputs to outputs across blocks at consecutive layers and consecutive time steps: $\newcommand{\xb}{\mathbf{x}} \newcommand{\RR}{\mathbb{R}}$

$$g_t = \text{tanh}(X_t W_{xg} + h_{t-1} W_{hg} + b_g),$$
$$i_t = \sigma(X_t W_{xi} + h_{t-1} W_{hi} + b_i),$$
$$f_t = \sigma(X_t W_{xf} + h_{t-1} W_{hf} + b_f),$$
$$o_t = \sigma(X_t W_{xo} + h_{t-1} W_{ho} + b_o),$$
$$c_t = f_t \odot c_{t-1} + i_t \odot g_t,$$
$$h_t = o_t \odot \text{tanh}(c_t),$$

where $\odot$ denotes element-wise multiplication and, for any $\xb = [x_1, x_2, \ldots, x_k]^\top \in \RR^k$, the two activation functions are applied element-wise:

$$\sigma(\xb) = \left[\frac{1}{1+\exp(-x_1)}, \ldots, \frac{1}{1+\exp(-x_k)}\right]^\top,$$
$$\text{tanh}(\xb) = \left[\frac{1-\exp(-2x_1)}{1+\exp(-2x_1)}, \ldots, \frac{1-\exp(-2x_k)}{1+\exp(-2x_k)}\right]^\top.$$

In the transformations above, the memory cell $c_t$ stores the "long-term" memory in vector form: the information accumulated and encoded up to time step $t$ is kept in $c_t$ and is passed along the same layer across time steps.

Given the previous states $c_{t-1}$ and $h_{t-1}$, the input gate $i_t$ and the forget gate $f_t$ decide how much of the memory to overwrite and how much to keep. The output gate $o_t$ then controls how the memory is read out to produce the current state $h_t$, which is passed both to the next layer at the current time step and to the next time step within the current layer. These decisions are made through the hidden-layer parameters $W$ and $b$ (with different subscripts), which are learned during the training phase below.
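
Before allocating the real parameters, the transformations above can be checked on toy shapes (a minimal standalone sketch; the toy_* names are illustrative and unrelated to the model built below):

toy_batch, toy_in, toy_hid = 4, 5, 8
X_t    = nd.random_normal(shape=(toy_batch, toy_in), ctx=ctx)
h_prev = nd.zeros((toy_batch, toy_hid), ctx=ctx)
c_prev = nd.zeros((toy_batch, toy_hid), ctx=ctx)
W_x = {k: nd.random_normal(shape=(toy_in, toy_hid), ctx=ctx) * .01 for k in 'gifo'}
W_h = {k: nd.random_normal(shape=(toy_hid, toy_hid), ctx=ctx) * .01 for k in 'gifo'}
b   = {k: nd.zeros(toy_hid, ctx=ctx) for k in 'gifo'}
g = nd.tanh(   nd.dot(X_t, W_x['g']) + nd.dot(h_prev, W_h['g']) + b['g'])
i = nd.sigmoid(nd.dot(X_t, W_x['i']) + nd.dot(h_prev, W_h['i']) + b['i'])
f = nd.sigmoid(nd.dot(X_t, W_x['f']) + nd.dot(h_prev, W_h['f']) + b['f'])
o = nd.sigmoid(nd.dot(X_t, W_x['o']) + nd.dot(h_prev, W_h['o']) + b['o'])
c_t = f * c_prev + i * g     # new memory cell
h_t = o * nd.tanh(c_t)       # new hidden state
print(c_t.shape, h_t.shape)  # both (4, 8)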

Allocate parameters


In [6]:
num_inputs = NUM_FEATURES  # dimensionality of each time step (1 for a single 1-D series; here, the number of features)
num_outputs = NUM_FEATURES  # we predict all features one step ahead
num_hidden_units = [64]  # num of hidden units in each hidden LSTM layer
num_hidden_layers = len(num_hidden_units)  # num of hidden LSTM layers
num_units_layers = [num_features] + num_hidden_units

########################
#  Weights connecting the inputs to the hidden layer
########################
Wxg, Wxi, Wxf, Wxo, Whg, Whi, Whf, Who, bg, bi, bf, bo = {}, {}, {}, {}, {}, {}, {}, {}, {}, {}, {}, {} 
for i_layer in range(1, num_hidden_layers+1):
    num_inputs = num_units_layers[i_layer-1]       # input size of this layer (re-binds the name defined above)
    num_hidden_units = num_units_layers[i_layer]   # hidden size of this layer
    Wxg[i_layer] = nd.random_normal(shape=(num_inputs,num_hidden_units), ctx=ctx) * .01
    Wxi[i_layer] = nd.random_normal(shape=(num_inputs,num_hidden_units), ctx=ctx) * .01
    Wxf[i_layer] = nd.random_normal(shape=(num_inputs,num_hidden_units), ctx=ctx) * .01
    Wxo[i_layer] = nd.random_normal(shape=(num_inputs,num_hidden_units), ctx=ctx) * .01

    ########################
    #  Recurrent weights connecting the hidden layer across time steps
    ########################
    Whg[i_layer] = nd.random_normal(shape=(num_hidden_units, num_hidden_units), ctx=ctx) * .01
    Whi[i_layer] = nd.random_normal(shape=(num_hidden_units, num_hidden_units), ctx=ctx) * .01
    Whf[i_layer] = nd.random_normal(shape=(num_hidden_units, num_hidden_units), ctx=ctx) * .01
    Who[i_layer] = nd.random_normal(shape=(num_hidden_units, num_hidden_units), ctx=ctx) * .01

    ########################
    #  Bias vector for hidden layer
    ########################
    bg[i_layer] = nd.random_normal(shape=num_hidden_units, ctx=ctx) * .01
    bi[i_layer] = nd.random_normal(shape=num_hidden_units, ctx=ctx) * .01
    bf[i_layer] = nd.random_normal(shape=num_hidden_units, ctx=ctx) * .01
    bo[i_layer] = nd.random_normal(shape=num_hidden_units, ctx=ctx) * .01

########################
# Weights to the output nodes
########################
Why = nd.random_normal(shape=(num_units_layers[-1], num_outputs), ctx=ctx) * .01
by = nd.random_normal(shape=num_outputs, ctx=ctx) * .01

Attach the gradients


In [7]:
params = []
for i_layer in range(1, num_hidden_layers+1):
    params += [Wxg[i_layer], Wxi[i_layer], Wxf[i_layer], Wxo[i_layer], Whg[i_layer], Whi[i_layer], Whf[i_layer], Who[i_layer], bg[i_layer], bi[i_layer], bf[i_layer], bo[i_layer]]

params += [Why, by]  # add the output layer

for param in params:
    param.attach_grad()

Softmax Activation


In [8]:
def softmax(y_linear, temperature=1.0):
    # subtract the max before exponentiating for numerical stability
    lin = (y_linear - nd.max(y_linear)) / temperature
    exp = nd.exp(lin)
    partition = nd.sum(exp, axis=0, exclude=True).reshape((-1,1))
    return exp / partition

Cross-entropy loss function


In [9]:
def cross_entropy(yhat, y):
    return - nd.mean(nd.sum(y * nd.log(yhat), axis=0, exclude=True))

def rmse(yhat, y):
    # per-sample L2 norm of the error, averaged over the batch
    return nd.mean(nd.sqrt(nd.sum(nd.power(y - yhat, 2), axis=0, exclude=True)))

Averaging the loss over the sequence


In [10]:
def average_ce_loss(outputs, labels):
    assert(len(outputs) == len(labels))
    total_loss = 0.
    for (output, label) in zip(outputs,labels):
        total_loss = total_loss + cross_entropy(output, label)
    return total_loss / len(outputs)

def average_rmse_loss(outputs, labels):
    assert(len(outputs) == len(labels))
    total_loss = 0.
    for (output, label) in zip(outputs,labels):
        total_loss = total_loss + rmse(output, label)
    return total_loss / len(outputs)

Optimizer


In [11]:

def SGD(params, learning_rate):
    for param in params:
#         print('grrrrr: ', param.grad)
        param[:] = param - learning_rate * param.grad

def adam(params, learning_rate, M , R, index_adam_call, beta1, beta2, eps):
    for k, param in enumerate(params):
        M[k] = beta1 * M[k] + (1. - beta1) * param.grad
        R[k] = beta2 * R[k] + (1. - beta2) * (param.grad)**2
        # bias correction: M & R are initialized to zeros, so they are biased toward zero on the first few iterations
        m_k_hat = M[k] / (1. - beta1**(index_adam_call))
        r_k_hat = R[k] / (1. - beta2**(index_adam_call))
        if((np.isnan(M[k].asnumpy())).any() or (np.isnan(R[k].asnumpy())).any()):
#             print('GRRRRRR ', M, K)
            raise(ValueError('Nans!!'))
#         print('grrrrr: ', param.grad)
        param[:] = param - learning_rate * m_k_hat / (nd.sqrt(r_k_hat) + eps)
#     print('m_k_hat r_k_hat', m_k_hat, r_k_hat)
    return params, M, R
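
As a quick check of the bias correction above: on the very first call (index_adam_call = 1) the first moment is M[k] = (1 - beta1) * grad = 0.1 * grad, and dividing by 1 - beta1**1 = 0.1 recovers m_k_hat = grad exactly instead of an estimate biased toward zero; the same reasoning applies to R[k] with beta2.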

Define the model


In [12]:
def single_lstm_unit_calcs(X, c, Wxg, h, Whg, bg, Wxi, Whi, bi, Wxf, Whf, bf, Wxo, Who, bo):
    g = nd.tanh(nd.dot(X, Wxg) + nd.dot(h, Whg) + bg)
    i = nd.sigmoid(nd.dot(X, Wxi) + nd.dot(h, Whi) + bi)
    f = nd.sigmoid(nd.dot(X, Wxf) + nd.dot(h, Whf) + bf)
    o = nd.sigmoid(nd.dot(X, Wxo) + nd.dot(h, Who) + bo)
    #######################
    c = f * c + i * g
    h = o * nd.tanh(c)
    return c, h

def deep_lstm_rnn(inputs, h, c, temperature=1.0):
    """
        h: dict of nd.arrays; each key is the index of a hidden layer (from 1 to num_hidden_layers).
        Key 0, if present, holds the current input.
    """
    outputs = []
    # inputs is one mini-batch; after the swapaxes done during data preparation its shape is
    # (seq_length, batch_size, num_features), so iterating over it walks through the time steps
    # (num_features is 1 for a single series, vocab_size for characters, n for n parallel series)
    for X in inputs:
        # X is the batch for one time step: if each mini-batch holds 32 sequences, X holds the current
        # value of each of those 32 sequences, so each iteration over inputs advances by one time step
        h[0] = X # the first hidden layer takes the input X as input 
        for i_layer in range(1, num_hidden_layers+1):
            # lstm units now have the 2 following inputs: 
            # i) h_t from the previous layer (equivalent to the input X for a non-deep lstm net), 
            # ii) h_t-1 from the current layer (same as for non-deep lstm nets)
            c[i_layer], h[i_layer] = single_lstm_unit_calcs(h[i_layer-1], c[i_layer], Wxg[i_layer], h[i_layer], Whg[i_layer], bg[i_layer], Wxi[i_layer], Whi[i_layer], bi[i_layer], Wxf[i_layer], Whf[i_layer], bf[i_layer], Wxo[i_layer], Who[i_layer], bo[i_layer])
        yhat_linear = nd.dot(h[num_hidden_layers], Why) + by
        # yhat is a batch of predictions for the current time step; collected over the loop, the
        # predicted sequence is the input sequence shifted forward by one point (value or character)
#         yhat = softmax(yhat_linear, temperature=temperature)
#         yhat = nd.sigmoid(yhat_linear)
#         yhat = nd.tanh(yhat_linear)
        yhat = yhat_linear # we can't use an activation bounded at 1.0 since the target amplitudes can exceed 1.0
        outputs.append(yhat) # outputs has same shape as inputs, i.e. a list of batches of data points.
#     print('some shapes... yhat outputs', yhat.shape, len(outputs) )
    return (outputs, h, c)
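
Before training, the network can be sanity-checked on a single mini-batch (a quick sketch reusing the parameters and data defined above; h_check/c_check are just throwaway names): the result should be a list of seq_length arrays, one prediction per sequence in the batch.

h_check, c_check = {}, {}
for i_layer in range(1, num_hidden_layers+1):
    h_check[i_layer] = nd.zeros(shape=(batch_size, num_units_layers[i_layer]), ctx=ctx)
    c_check[i_layer] = nd.zeros(shape=(batch_size, num_units_layers[i_layer]), ctx=ctx)
outs, h_check, c_check = deep_lstm_rnn(train_data_inputs[0], h_check, c_check)
print(len(outs), outs[0].shape)  # seq_length outputs, each of shape (batch_size, num_features)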

Test and visualize predictions


In [13]:
INDEX_TARGET_VALUE = 0
def test_prediction(one_input_seq, one_label_seq, temperature=1.0):
    # WE ASSUME the first value in the input vector is the variable of interest
    #####################################
    # Set the initial state of the hidden representation ($h_0$) to the zero vector
    #####################################  # some better initialization needed??
    h, c = {}, {}
    for i_layer in range(1, num_hidden_layers+1):
        h[i_layer] = nd.zeros(shape=(batch_size_test, num_units_layers[i_layer]), ctx=ctx)
        c[i_layer] = nd.zeros(shape=(batch_size_test, num_units_layers[i_layer]), ctx=ctx)
    
    outputs, h, c = deep_lstm_rnn(one_input_seq, h, c, temperature=temperature)

    return outputs[-1][0].asnumpy()[INDEX_TARGET_VALUE], one_label_seq.asnumpy()[-1].flatten()[INDEX_TARGET_VALUE], outputs, one_label_seq

def check_prediction(index):
    if index >= len(test_data_inputs):
        index = np.random.randint(len(test_data_inputs))
        
    o, label, outputs, labels = test_prediction(test_data_inputs[index], test_data_labels[index], temperature=1.0)    

    prediction = round(o, 3)
    true_label = round(label, 3)
    outputs = [float(i.asnumpy().flatten()[INDEX_TARGET_VALUE]) for i in outputs]  # valid because batch_size_test = 1; with larger test batches this would only grab the first sequence's value
    true_labels = list(test_data_labels[index].asnumpy()[:,:,INDEX_TARGET_VALUE].flatten())
    
    df = pd.DataFrame([outputs, true_labels]).transpose()
    df.columns = ['predicted', 'true']

    if true_label != 0:
        rel_error = round(100. * (prediction / (true_label+1e-5) - 1.0), 2)
    else:
        rel_error = 100.
#     print('\nprediction = {0} | actual_value = {1} | rel_error = {2}'.format(prediction, true_label, rel_error))
    return df

In [14]:
epochs = 10000  # one epoch is one pass over the entire training set
moving_loss = 0.
moving_rel_error = 100.  # initialized so the fallback at epoch 0 is well defined even if true_label == 0
learning_rate = 0.001  # 0.1 works for a [8, 8] net after about 70 epochs of 32-sized batches

# Adam Optimizer stuff
beta1 = .9
beta2 = .999
index_adam_call = 0
# M & R arrays to keep track of momenta in adam optimizer. params is a list that contains all ndarrays of parameters
M = {k: nd.zeros_like(v) for k, v in enumerate(params)}
R = {k: nd.zeros_like(v) for k, v in enumerate(params)}

df_moving_loss = pd.DataFrame(columns=['Loss', 'Error'])
df_moving_loss.index.name = 'Epoch'

# needed to update plots on the fly
%matplotlib notebook
fig, axes_fig1 = plt.subplots(1,1, figsize=(6,3))
fig2, axes_fig2 = plt.subplots(1,1, figsize=(6,3))


for e in range(epochs):
    ############################
    # Attenuate the learning rate by a factor of 2 every 1000 epochs
    ############################
    if ((e+1) % 1000 == 0):
        learning_rate = learning_rate / 2.0  # TODO: check whether it's OK to decay learning_rate manually when using the Adam optimizer
    h, c = {}, {}
    for i_layer in range(1, num_hidden_layers+1):
        h[i_layer] = nd.zeros(shape=(batch_size, num_units_layers[i_layer]), ctx=ctx)
        c[i_layer] = nd.zeros(shape=(batch_size, num_units_layers[i_layer]), ctx=ctx)

    for i in range(num_batches_train):
        data_one_hot = train_data_inputs[i]
        label_one_hot = train_data_labels[i]
        with autograd.record():
            outputs, h, c = deep_lstm_rnn(data_one_hot, h, c)
            loss = average_rmse_loss(outputs, label_one_hot)
            loss.backward()
#         SGD(params, learning_rate)
        index_adam_call += 1  # needed for bias correction in Adam optimizer
        params, M, R = adam(params, learning_rate, M, R, index_adam_call, beta1, beta2, 1e-8)
        
        ##########################
        #  Keep a moving average of the losses
        ##########################
        if (i == 0) and (e == 0):
            moving_loss = nd.mean(loss).asscalar()
        else:
            moving_loss = .99 * moving_loss + .01 * nd.mean(loss).asscalar()
        df_moving_loss.loc[e] = round(moving_loss, 4)

    ############################
    #  Predictions and plots
    ############################
    data_prediction_df = check_prediction(index=e)
    
    if not (e%50):
        axes_fig1.clear()
        data_prediction_df.plot(ax=axes_fig1)
        fig.canvas.draw()
    prediction = round(data_prediction_df.tail(1)['predicted'].values.flatten()[-1], 3)
    true_label = round(data_prediction_df.tail(1)['true'].values.flatten()[-1], 3)
    if true_label != 0:
        rel_error = round(100. * np.abs(prediction / (true_label+1e-5) - 1.0), 2)
    else:
        rel_error = moving_rel_error
    if not (e%50):
        print("Epoch = {0} | Loss = {1} | Prediction = {2} True = {3} Error = {4}".format(e, moving_loss, prediction, true_label, rel_error ))
    if not (e%50):
        axes_fig2.clear()
    if e == 0:
        moving_rel_error = rel_error
    else:
        moving_rel_error = .99 * moving_rel_error + .01 * rel_error

    df_moving_loss.loc[e, ['Error']] = moving_rel_error
    if not (e%50):
        axes_loss_plot = df_moving_loss.plot(ax=axes_fig2, secondary_y='Loss', color=['r','b'])
        axes_loss_plot.right_ax.grid(False)
#     axes_loss_plot.right_ax.set_yscale('log')
        fig2.canvas.draw()
    
%matplotlib inline


Epoch = 0 | Loss = 90.4195212589 | Prediction = 1.251 True = 104.0 Error = 98.8
Epoch = 50 | Loss = 63.5509483776 | Prediction = 78.374 True = 102.0 Error = 23.16
Epoch = 100 | Loss = 43.3374257251 | Prediction = 119.007 True = 15.0 Error = 693.38
Epoch = 150 | Loss = 42.769435475 | Prediction = 116.039 True = 100.0 Error = 16.04
Epoch = 200 | Loss = 41.351768018 | Prediction = 113.742 True = 67.0 Error = 69.76
Epoch = 250 | Loss = 39.3556160145 | Prediction = 99.18 True = 100.0 Error = 0.82
Epoch = 300 | Loss = 38.2838466638 | Prediction = 93.743 True = 112.0 Error = 16.3
Epoch = 350 | Loss = 37.4492007919 | Prediction = 114.738 True = 60.0 Error = 91.23
Epoch = 400 | Loss = 36.5580613772 | Prediction = 92.622 True = 5.0 Error = 1752.44
Epoch = 450 | Loss = 35.6170624096 | Prediction = 93.374 True = 84.0 Error = 11.16
Epoch = 500 | Loss = 34.5398781922 | Prediction = 98.356 True = 4.0 Error = 2358.89
Epoch = 550 | Loss = 33.4545468116 | Prediction = 83.799 True = 120.0 Error = 30.17
Epoch = 600 | Loss = 32.4191180225 | Prediction = 94.84 True = 84.0 Error = 12.9
Epoch = 650 | Loss = 31.4382775243 | Prediction = 87.044 True = 107.0 Error = 18.65
Epoch = 700 | Loss = 30.2074917727 | Prediction = 91.677 True = 123.0 Error = 25.47
Epoch = 750 | Loss = 29.0804917816 | Prediction = 54.541 True = 84.0 Error = 35.07
Epoch = 800 | Loss = 27.9675334184 | Prediction = 109.112 True = 113.0 Error = 3.44
Epoch = 850 | Loss = 27.1998839413 | Prediction = 103.292 True = 113.0 Error = 8.59
Epoch = 900 | Loss = 26.1971362753 | Prediction = 76.142 True = 115.0 Error = 33.79
Epoch = 950 | Loss = 24.713831021 | Prediction = 111.525 True = 114.0 Error = 2.17
Epoch = 1000 | Loss = 23.8570173218 | Prediction = 46.093 True = 66.0 Error = 30.16
Epoch = 1050 | Loss = 22.7657340349 | Prediction = 112.258 True = 115.0 Error = 2.38
Epoch = 1100 | Loss = 21.9677115881 | Prediction = 62.767 True = 107.0 Error = 41.34
Epoch = 1150 | Loss = 21.5274821104 | Prediction = 124.572 True = 123.0 Error = 1.28
Epoch = 1200 | Loss = 21.1102524074 | Prediction = 102.089 True = 51.0 Error = 100.17
Epoch = 1250 | Loss = 20.8769301047 | Prediction = 27.801 True = 110.0 Error = 74.73
Epoch = 1300 | Loss = 20.6268728305 | Prediction = 110.712 True = 27.0 Error = 310.04
Epoch = 1350 | Loss = 20.3048467329 | Prediction = 108.795 True = 21.0 Error = 418.07
Epoch = 1400 | Loss = 19.9752429758 | Prediction = 109.149 True = 142.0 Error = 23.13
Epoch = 1450 | Loss = 19.6160965714 | Prediction = 112.657 True = 63.0 Error = 78.82
Epoch = 1500 | Loss = 19.9488020958 | Prediction = 121.02 True = 112.0 Error = 8.05
Epoch = 1550 | Loss = 19.0848039935 | Prediction = 99.943 True = 128.0 Error = 21.92
Epoch = 1600 | Loss = 18.3784891231 | Prediction = 102.199 True = 113.0 Error = 9.56
Epoch = 1650 | Loss = 18.5643984713 | Prediction = 93.142 True = 1.0 Error = 9214.11
Epoch = 1700 | Loss = 18.4995417406 | Prediction = 73.712 True = 29.0 Error = 154.18
Epoch = 1750 | Loss = 18.5354765444 | Prediction = 89.319 True = 104.0 Error = 14.12
Epoch = 1800 | Loss = 17.8465526695 | Prediction = 110.848 True = 128.0 Error = 13.4
Epoch = 1850 | Loss = 17.400083044 | Prediction = 65.031 True = 98.0 Error = 33.64
Epoch = 1900 | Loss = 17.0216465135 | Prediction = 99.322 True = 117.0 Error = 15.11
Epoch = 1950 | Loss = 16.9688231999 | Prediction = 107.632 True = 148.0 Error = 27.28
Epoch = 2000 | Loss = 16.1771419825 | Prediction = 124.94 True = 84.0 Error = 48.74
Epoch = 2050 | Loss = 14.9550154339 | Prediction = 133.193 True = 127.0 Error = 4.88
Epoch = 2100 | Loss = 14.7943185004 | Prediction = 63.721 True = 100.0 Error = 36.28
Epoch = 2150 | Loss = 14.6587808591 | Prediction = 61.962 True = 128.0 Error = 51.59
Epoch = 2200 | Loss = 14.7835274748 | Prediction = 67.533 True = 119.0 Error = 43.25
Epoch = 2250 | Loss = 14.7463619153 | Prediction = 136.051 True = 127.0 Error = 7.13
Epoch = 2300 | Loss = 14.4550964374 | Prediction = 107.948 True = 27.0 Error = 299.81
Epoch = 2350 | Loss = 14.2365115537 | Prediction = 134.787 True = 158.0 Error = 14.69
Epoch = 2400 | Loss = 14.1470061001 | Prediction = 130.802 True = 135.0 Error = 3.11
Epoch = 2450 | Loss = 13.9158515153 | Prediction = 30.491 True = 107.0 Error = 71.5
Epoch = 2500 | Loss = 14.407407794 | Prediction = 114.89 True = 117.0 Error = 1.8
Epoch = 2550 | Loss = 14.5285168704 | Prediction = 83.108 True = 109.0 Error = 23.75
Epoch = 2600 | Loss = 14.2993511792 | Prediction = 125.273 True = 9.0 Error = 1291.92
Epoch = 2650 | Loss = 14.0576951273 | Prediction = 122.077 True = 114.0 Error = 7.09
Epoch = 2700 | Loss = 13.9934303895 | Prediction = 8.391 True = 15.0 Error = 44.06
Epoch = 2750 | Loss = 13.958653951 | Prediction = 42.995 True = 128.0 Error = 66.41
Epoch = 2800 | Loss = 13.8161519746 | Prediction = 117.89 True = 117.0 Error = 0.76
Epoch = 2850 | Loss = 13.4040323862 | Prediction = 49.479 True = 102.0 Error = 51.49
Epoch = 2900 | Loss = 13.3946423551 | Prediction = 112.342 True = 27.0 Error = 316.08
Epoch = 2950 | Loss = 13.4743798554 | Prediction = 35.317 True = 110.0 Error = 67.89
Epoch = 3000 | Loss = 13.9533871408 | Prediction = 113.275 True = 5.0 Error = 2165.5
Epoch = 3050 | Loss = 13.0243946052 | Prediction = 118.918 True = 130.0 Error = 8.52
Epoch = 3100 | Loss = 12.9316450997 | Prediction = 122.127 True = 4.0 Error = 2953.17
Epoch = 3150 | Loss = 13.0145069611 | Prediction = 139.251 True = 127.0 Error = 9.65
Epoch = 3200 | Loss = 13.0009316396 | Prediction = 140.224 True = 101.0 Error = 38.84
Epoch = 3250 | Loss = 12.8040470137 | Prediction = 122.271 True = 60.0 Error = 103.78
Epoch = 3300 | Loss = 12.4892697216 | Prediction = 115.266 True = 121.0 Error = 4.74
Epoch = 3350 | Loss = 12.7517491107 | Prediction = 99.225 True = 148.0 Error = 32.96
Epoch = 3400 | Loss = 12.7906603835 | Prediction = 23.323 True = 138.0 Error = 83.1
Epoch = 3450 | Loss = 12.6528871224 | Prediction = 49.565 True = 196.0 Error = 74.71
Epoch = 3500 | Loss = 12.8868158171 | Prediction = 50.843 True = 104.0 Error = 51.11
Epoch = 3550 | Loss = 13.2313567789 | Prediction = -1.527 True = 151.0 Error = 101.01
Epoch = 3600 | Loss = 12.8865009017 | Prediction = 158.819 True = 118.0 Error = 34.59
Epoch = 3650 | Loss = 12.6046641145 | Prediction = 115.05 True = 5.0 Error = 2201.0
Epoch = 3700 | Loss = 12.4589927638 | Prediction = 130.525 True = 4.0 Error = 3163.12
Epoch = 3750 | Loss = 12.5249461757 | Prediction = 140.131 True = 101.0 Error = 38.74
Epoch = 3800 | Loss = 12.6733780985 | Prediction = 125.023 True = 121.0 Error = 3.32
Epoch = 3850 | Loss = 12.3049375565 | Prediction = 71.378 True = 51.0 Error = 39.96
Epoch = 3900 | Loss = 12.4761137984 | Prediction = 148.589 True = 110.0 Error = 35.08
Epoch = 3950 | Loss = 12.2778209252 | Prediction = 51.275 True = 196.0 Error = 73.84
Epoch = 4000 | Loss = 12.294243613 | Prediction = 103.089 True = 98.0 Error = 5.19
Epoch = 4050 | Loss = 11.9422618019 | Prediction = 144.014 True = 127.0 Error = 13.4
Epoch = 4100 | Loss = 11.8270136145 | Prediction = 87.242 True = 125.0 Error = 30.21
Epoch = 4150 | Loss = 11.7765378391 | Prediction = 121.536 True = 121.0 Error = 0.44
Epoch = 4200 | Loss = 11.4546758605 | Prediction = 86.434 True = 115.0 Error = 24.84
Epoch = 4250 | Loss = 11.3178835157 | Prediction = 91.756 True = 109.0 Error = 15.82
Epoch = 4300 | Loss = 11.4860414712 | Prediction = 136.356 True = 84.0 Error = 62.33
Epoch = 4350 | Loss = 11.563016488 | Prediction = 69.044 True = 128.0 Error = 46.06
Epoch = 4400 | Loss = 11.762776757 | Prediction = 141.879 True = 127.0 Error = 11.72
Epoch = 4450 | Loss = 11.7237854462 | Prediction = 94.41 True = 29.0 Error = 225.55
Epoch = 4500 | Loss = 11.9470235499 | Prediction = 10.179 True = 15.0 Error = 32.14
Epoch = 4550 | Loss = 11.7346896901 | Prediction = 109.331 True = 117.0 Error = 6.55
Epoch = 4600 | Loss = 11.6561997111 | Prediction = 156.002 True = 95.0 Error = 64.21
Epoch = 4650 | Loss = 11.6002039192 | Prediction = 138.774 True = 64.0 Error = 116.83
Epoch = 4700 | Loss = 11.6635987122 | Prediction = 56.742 True = 102.0 Error = 44.37
Epoch = 4750 | Loss = 11.3440091861 | Prediction = 45.038 True = 76.0 Error = 40.74
Epoch = 4800 | Loss = 11.4734281961 | Prediction = 91.864 True = 67.0 Error = 37.11
Epoch = 4850 | Loss = 11.4937789253 | Prediction = 143.647 True = 112.0 Error = 28.26
Epoch = 4900 | Loss = 11.4351572737 | Prediction = 90.601 True = 109.0 Error = 16.88
Epoch = 4950 | Loss = 11.5471731531 | Prediction = 53.871 True = 104.0 Error = 48.2
Epoch = 5000 | Loss = 11.8138794329 | Prediction = 117.404 True = 5.0 Error = 2248.08
Epoch = 5050 | Loss = 11.6593943651 | Prediction = 89.846 True = 109.0 Error = 17.57
Epoch = 5100 | Loss = 11.6564754697 | Prediction = 102.195 True = 128.0 Error = 20.16
Epoch = 5150 | Loss = 11.6680648523 | Prediction = 86.112 True = 67.0 Error = 28.53
Epoch = 5200 | Loss = 11.5869334575 | Prediction = 34.736 True = 107.0 Error = 67.54
Epoch = 5250 | Loss = 11.2785363047 | Prediction = 90.949 True = 21.0 Error = 333.09
Epoch = 5300 | Loss = 11.2675427954 | Prediction = 136.775 True = 101.0 Error = 35.42
Epoch = 5350 | Loss = 11.4578113726 | Prediction = 90.299 True = 21.0 Error = 330.0
Epoch = 5400 | Loss = 11.5344703722 | Prediction = 124.292 True = 27.0 Error = 360.34
Epoch = 5450 | Loss = 11.6491109125 | Prediction = 161.61 True = 118.0 Error = 36.96
Epoch = 5500 | Loss = 11.4439867312 | Prediction = 45.73 True = 102.0 Error = 55.17
Epoch = 5550 | Loss = 11.6691364275 | Prediction = 140.141 True = 84.0 Error = 66.83
Epoch = 5600 | Loss = 12.2740273579 | Prediction = 126.589 True = 4.0 Error = 3064.72
Epoch = 5650 | Loss = 11.3682313643 | Prediction = 101.248 True = 112.0 Error = 9.6
Epoch = 5700 | Loss = 11.6127670595 | Prediction = 56.106 True = 66.0 Error = 14.99
Epoch = 5750 | Loss = 11.8511040031 | Prediction = 162.835 True = 118.0 Error = 38.0
Epoch = 5800 | Loss = 12.1208135561 | Prediction = 99.029 True = 112.0 Error = 11.58
Epoch = 5850 | Loss = 12.2036964216 | Prediction = 10.179 True = 15.0 Error = 32.14
Epoch = 5900 | Loss = 11.34590629 | Prediction = 10.094 True = 15.0 Error = 32.71
Epoch = 5950 | Loss = 11.6763316877 | Prediction = 149.662 True = 127.0 Error = 17.84
Epoch = 6000 | Loss = 11.5956953571 | Prediction = 126.676 True = 60.0 Error = 111.13
Epoch = 6050 | Loss = 11.3928141618 | Prediction = 104.529 True = 63.0 Error = 65.92
Epoch = 6100 | Loss = 11.2905095317 | Prediction = 95.851 True = 67.0 Error = 43.06
Epoch = 6150 | Loss = 11.1506316435 | Prediction = 18.309 True = 138.0 Error = 86.73
Epoch = 6200 | Loss = 10.9887163981 | Prediction = 91.057 True = 125.0 Error = 27.15
Epoch = 6250 | Loss = 10.9158398862 | Prediction = 58.199 True = 196.0 Error = 70.31
Epoch = 6300 | Loss = 10.8097249305 | Prediction = 140.02 True = 135.0 Error = 3.72
Epoch = 6350 | Loss = 10.8193681094 | Prediction = 143.488 True = 51.0 Error = 181.35
Epoch = 6400 | Loss = 10.8842073049 | Prediction = 7.898 True = 15.0 Error = 47.35
Epoch = 6450 | Loss = 11.0217982088 | Prediction = 101.43 True = 112.0 Error = 9.44
Epoch = 6500 | Loss = 10.9870619931 | Prediction = 67.762 True = 128.0 Error = 47.06
Epoch = 6550 | Loss = 10.9953464126 | Prediction = 143.078 True = 84.0 Error = 70.33
Epoch = 6600 | Loss = 11.1471179932 | Prediction = 105.418 True = 120.0 Error = 12.15
Epoch = 6650 | Loss = 11.0811758941 | Prediction = 110.814 True = 117.0 Error = 5.29
Epoch = 6700 | Loss = 11.188373728 | Prediction = 151.334 True = 111.0 Error = 36.34
Epoch = 6750 | Loss = 11.0663528659 | Prediction = 81.514 True = 119.0 Error = 31.5
Epoch = 6800 | Loss = 11.0325781906 | Prediction = 8.874 True = 15.0 Error = 40.84
Epoch = 6850 | Loss = 11.2567021879 | Prediction = 141.132 True = 64.0 Error = 120.52
Epoch = 6900 | Loss = 11.5496261954 | Prediction = 103.246 True = 104.0 Error = 0.73
Epoch = 6950 | Loss = 11.4773783594 | Prediction = 108.813 True = 120.0 Error = 9.32
Epoch = 7000 | Loss = 11.2473684377 | Prediction = 59.61 True = 196.0 Error = 69.59
Epoch = 7050 | Loss = 11.2050935217 | Prediction = 32.818 True = 107.0 Error = 69.33
Epoch = 7100 | Loss = 11.3992800672 | Prediction = 124.994 True = 114.0 Error = 9.64
Epoch = 7150 | Loss = 11.5324363806 | Prediction = 148.717 True = 112.0 Error = 32.78
Epoch = 7200 | Loss = 11.4812030535 | Prediction = 151.93 True = 111.0 Error = 36.87
Epoch = 7250 | Loss = 11.6647015568 | Prediction = 89.87 True = 143.0 Error = 37.15
Epoch = 7300 | Loss = 11.9134170856 | Prediction = 144.949 True = 113.0 Error = 28.27
Epoch = 7350 | Loss = 12.0526404295 | Prediction = 91.219 True = 21.0 Error = 334.38
Epoch = 7400 | Loss = 12.0087663862 | Prediction = 137.346 True = 101.0 Error = 35.99
Epoch = 7450 | Loss = 11.8775698689 | Prediction = 90.358 True = 21.0 Error = 330.28
Epoch = 7500 | Loss = 11.8097558559 | Prediction = 86.978 True = 99.0 Error = 12.14
Epoch = 7550 | Loss = 11.8121928917 | Prediction = 118.016 True = 5.0 Error = 2260.32
Epoch = 7600 | Loss = 11.6873176836 | Prediction = 88.048 True = 142.0 Error = 37.99
Epoch = 7650 | Loss = 11.6361698991 | Prediction = 116.269 True = 158.0 Error = 26.41
Epoch = 7700 | Loss = 11.7217162106 | Prediction = 116.74 True = 148.0 Error = 21.12
Epoch = 7750 | Loss = 11.8964469016 | Prediction = 102.099 True = 113.0 Error = 9.65
Epoch = 7800 | Loss = 11.7282216018 | Prediction = 63.082 True = 104.0 Error = 39.34
Epoch = 7850 | Loss = 11.543311817 | Prediction = 142.344 True = 84.0 Error = 69.46
Epoch = 7900 | Loss = 11.9807126852 | Prediction = 59.56 True = 196.0 Error = 69.61
Epoch = 7950 | Loss = 11.5294592895 | Prediction = 134.392 True = 101.0 Error = 33.06
Epoch = 8000 | Loss = 11.3918870037 | Prediction = 118.898 True = 1.0 Error = 11789.68
Epoch = 8050 | Loss = 11.4578328227 | Prediction = 127.043 True = 114.0 Error = 11.44
Epoch = 8100 | Loss = 11.4666136464 | Prediction = 119.762 True = 1.0 Error = 11876.08
Epoch = 8150 | Loss = 11.3954913585 | Prediction = -0.882 True = 151.0 Error = 100.58
Epoch = 8200 | Loss = 11.4417014644 | Prediction = 78.171 True = 142.0 Error = 44.95
Epoch = 8250 | Loss = 11.4170797228 | Prediction = 127.181 True = 9.0 Error = 1313.12
Epoch = 8300 | Loss = 11.3054711077 | Prediction = 23.01 True = 138.0 Error = 83.33
Epoch = 8350 | Loss = 11.2190185698 | Prediction = 109.251 True = 117.0 Error = 6.62
Epoch = 8400 | Loss = 11.1036245204 | Prediction = 79.56 True = 51.0 Error = 56.0
Epoch = 8450 | Loss = 11.0800143222 | Prediction = 108.8 True = 120.0 Error = 9.33
Epoch = 8500 | Loss = 11.0433155133 | Prediction = 88.457 True = 125.0 Error = 29.23
Epoch = 8550 | Loss = 11.0291315249 | Prediction = 90.558 True = 99.0 Error = 8.53
Epoch = 8600 | Loss = 11.1565106611 | Prediction = 98.135 True = 67.0 Error = 46.47
Epoch = 8650 | Loss = 11.2143772471 | Prediction = 126.563 True = 9.0 Error = 1306.25
Epoch = 8700 | Loss = 11.2022920416 | Prediction = 144.878 True = 113.0 Error = 28.21
Epoch = 8750 | Loss = 11.2767862755 | Prediction = 49.886 True = 76.0 Error = 34.36
Epoch = 8800 | Loss = 11.2786532038 | Prediction = 78.849 True = 109.0 Error = 27.66
Epoch = 8850 | Loss = 11.3052738707 | Prediction = 116.762 True = 148.0 Error = 21.11
Epoch = 8900 | Loss = 11.20204408 | Prediction = 130.766 True = 101.0 Error = 29.47
Epoch = 8950 | Loss = 11.1416765953 | Prediction = 123.565 True = 158.0 Error = 21.79
Epoch = 9000 | Loss = 11.1402867052 | Prediction = 96.139 True = 143.0 Error = 32.77
Epoch = 9050 | Loss = 11.1766112327 | Prediction = 56.968 True = 110.0 Error = 48.21
Epoch = 9100 | Loss = 11.1557205756 | Prediction = 96.474 True = 63.0 Error = 53.13
Epoch = 9150 | Loss = 11.1170775447 | Prediction = 17.7 True = 138.0 Error = 87.17
Epoch = 9200 | Loss = 11.1092157024 | Prediction = 154.052 True = 111.0 Error = 38.79
Epoch = 9250 | Loss = 11.1075507329 | Prediction = 86.697 True = 99.0 Error = 12.43
Epoch = 9300 | Loss = 11.0717529464 | Prediction = 116.88 True = 5.0 Error = 2237.6
Epoch = 9350 | Loss = 10.9955827625 | Prediction = 120.061 True = 27.0 Error = 344.67
Epoch = 9400 | Loss = 10.9358465992 | Prediction = 77.804 True = 109.0 Error = 28.62
Epoch = 9450 | Loss = 10.9421564652 | Prediction = 158.272 True = 95.0 Error = 66.6
Epoch = 9500 | Loss = 10.9709656772 | Prediction = 146.075 True = 110.0 Error = 32.8
Epoch = 9550 | Loss = 10.972768172 | Prediction = 153.411 True = 111.0 Error = 38.21
Epoch = 9600 | Loss = 10.934043782 | Prediction = 64.995 True = 104.0 Error = 37.5
Epoch = 9650 | Loss = 10.8668740386 | Prediction = 147.347 True = 127.0 Error = 16.02
Epoch = 9700 | Loss = 10.8970722135 | Prediction = 2.568 True = 151.0 Error = 98.3
Epoch = 9750 | Loss = 10.9906052818 | Prediction = 142.584 True = 84.0 Error = 69.74
Epoch = 9800 | Loss = 11.0039909214 | Prediction = 61.964 True = 66.0 Error = 6.12
Epoch = 9850 | Loss = 11.0297774342 | Prediction = 32.703 True = 107.0 Error = 69.44
Epoch = 9900 | Loss = 11.0341778408 | Prediction = 158.492 True = 95.0 Error = 66.83
Epoch = 9950 | Loss = 10.9908553478 | Prediction = 94.873 True = 63.0 Error = 50.59

In [15]:
# print(outputs[0].asnumpy()[0].flatten())
# print(test_data_labels[0].asnumpy()[:,:,0].flatten())
# [float(i.asnumpy().flatten()) for i in outputs]
# print([i.asnumpy() for i in outputs])
# one_label_seq = test_data_labels[0]
# print(outputs[-1][0])
# print(one_label_seq)
# print(rmse(outputs[-1][0], one_label_seq))
# print(test_data_inputs[0].asnumpy()[-1].flatten()[0])
# print([i.asnumpy().flatten() for i in outputs])
# print(float(outputs[0].asnumpy()[:, 0].flatten()))

Conclusions