Deep LSTM RNNs for N-D Time Series Analysis


In [1]:
import warnings
warnings.filterwarnings('ignore', category=DeprecationWarning, module='.*/IPython/.*')

In [2]:
####################
# necessary imports. !! pip install mxnet if mxnet not installed !!
####################
from __future__ import print_function
import mxnet as mx
from mxnet import nd, autograd, ndarray
from mxnet.ndarray import round as nd_round
import numpy as np
from collections import defaultdict
import copy
mx.random.seed(1234)
# ctx = mx.gpu(0)
ctx = mx.cpu(0)

%matplotlib inline
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import scipy.fftpack
from scipy import stats
try:
    from pandas.plotting import autocorrelation_plot  # pandas >= 0.20
except ImportError:
    from pandas.tools.plotting import autocorrelation_plot  # older pandas
try:
    from sklearn.model_selection import train_test_split
except ImportError:
    from sklearn.cross_validation import train_test_split
from numpy import polyfit
from datetime import datetime
sns.set_style('whitegrid')
#sns.set_context('notebook')
sns.set_context('poster')
# Register PDF (vector) and PNG output formats for inline plots
from IPython.display import set_matplotlib_formats
set_matplotlib_formats('pdf', 'png')

Data Loading


In [3]:
######################################
# LOAD DATA
######################################
# columns_subset = ['car.count', 'day_of_week_int', 'cloudy_or_not_cloudy', 'weather', 'current_month']
# original_data = pd.read_csv("../data/timeseries/data.csv", index_col=0)
original_data = pd.read_csv("../data/timeseries/agari_aetna.csv", index_col=0)

print('Columns: ', original_data.columns.tolist(), '\n')

original_data.index = pd.to_datetime(original_data.index)
original_data.sort_index(inplace=True)
full_data = original_data


late_data = full_data.iloc[-len(full_data)//10:]
early_data = full_data.iloc[:-len(full_data)//10]

# full_data = full_data[full_data['strategy_id']==1]
# we group by because we need one vector per day; the result is a list of dataframes, each indexed by unique dates
# early_data = [i[1] for i in list(early_data.groupby(['strategy_id', 'finfolio_account_id']))]
# late_data = [i[1] for i in list(late_data.groupby(['strategy_id', 'finfolio_account_id']))]

print(early_data.head(10))
print('\n -------- \n', late_data.tail(5))


Columns:  ['value'] 

                     value
date                      
2018-01-21 00:00:00      0
2018-01-21 00:01:00      0
2018-01-21 00:02:00      0
2018-01-21 00:03:00      0
2018-01-21 00:04:00      0
2018-01-21 00:05:00      0
2018-01-21 00:06:00      0
2018-01-21 00:07:00      0
2018-01-21 00:08:00      0
2018-01-21 00:09:00      0

 -------- 
                      value
date                      
2018-03-04 23:55:00      0
2018-03-04 23:56:00      0
2018-03-04 23:57:00      0
2018-03-04 23:58:00      0
2018-03-04 23:59:00      0
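
The CSV above is specific to the author's environment. To exercise the rest of the notebook without it, a minimal synthetic stand-in with the same schema (a minute-frequency DatetimeIndex named date and a single integer value column) can be built as below; the Poisson rate and the date range are arbitrary assumptions.

In [ ]:
# OPTIONAL: hypothetical stand-in for agari_aetna.csv, same schema as above
idx = pd.date_range('2018-01-21', periods=60*24*7, freq='T', name='date')  # one week of minutes (assumed range)
synthetic_data = pd.DataFrame({'value': np.random.poisson(0.3, size=len(idx))}, index=idx)
print(synthetic_data.head(3))
# to use it, set: original_data = synthetic_data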

Data Preparation


In [4]:
#############################
# Utility functions for data preparation
#############################
def get_list_unique_block_indices(len_data=100, seq_length=5, n_samples=10):
    """ returns a list of unique random ints that serve as the index of the first element of a block of data;
        indices are multiples of seq_length, so the extracted blocks never overlap
        args:
            len_data (int): length of the data set
            seq_length (int): length of the blocks to extract
            n_samples (int): # of blocks to extract
    """
#     print(len_data // seq_length, n_samples, ' --')
    set1 = set(np.random.randint(len_data // seq_length, size=n_samples)*seq_length)
    full_set = set1
    while len(full_set) < n_samples:
        set2 = set(np.random.randint(len_data // seq_length, size=n_samples)*seq_length)
        full_set = full_set | set2
    returned_list = list(full_set)[0:n_samples]
    assert(len(returned_list) == n_samples)
    return returned_list

def extract_random_sequence(data, seq_length=5, columns_subset=['value'], block_start_index=None):
    if block_start_index is None:
        block_start_index = np.random.randint(len(data)-seq_length)
    data_subset = data.reset_index().loc[block_start_index:block_start_index+seq_length-1, columns_subset]
    assert(len(data_subset) == (seq_length))
    out_data = [list(i) for i in data_subset.values]
    return out_data

def create_batch_ND_time_series(full_data, column_subset=['value'], seq_length=10, num_samples=4):
    out_data = []
    # get a list of non-overlapping random sequence start indices
    all_samples_start_indices = get_list_unique_block_indices(len(full_data), seq_length, num_samples)
    assert(len(all_samples_start_indices) == num_samples)
    for one_random_start_index in all_samples_start_indices:
        out_data.append(extract_random_sequence(full_data, seq_length, column_subset, one_random_start_index))
        assert(len(out_data[-1]) == (seq_length))
    return out_data
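
A quick, minimal sanity check of the sampling utilities on a toy frame (the values are arbitrary): the start indices returned are multiples of seq_length, so the extracted blocks cannot overlap.

In [ ]:
toy = pd.DataFrame({'value': np.arange(20)})
starts = get_list_unique_block_indices(len_data=len(toy), seq_length=5, n_samples=3)
print(starts)  # e.g. [0, 10, 15] -- all multiples of 5
blocks = create_batch_ND_time_series(toy, ['value'], seq_length=5, num_samples=2)
print(blocks)  # 2 non-overlapping blocks of 5 consecutive values each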

In [5]:
#############################
# Data Preparation
#############################
SEQ_LENGTH = 2  # we use the last SEQ_LENGTH points as inputs
NUM_FEATURES = 0  # number of extra input features beyond the target (e.g. underlying_return, strategy_id)
BATCH_SIZE = 64
BATCH_SIZE_TEST = 1

# columns_subset = ['car.count', 'day_of_week_int', 'weather', 'current_month', 'current_year']
columns_subset = ['value']

def create_random_sequences(data, columns_subset, seq_length):
    all_random_sequences = []
    if not isinstance(data, list):
        data = [data]
    n_data_sets = len(data)  # computed after wrapping, so it counts datasets rather than rows
    for i, one_data_batch in enumerate(data):
#         print('--> {0} / {1}'.format(i, n_data_sets))
        one_data_batch_length = len(one_data_batch)
        # each extracted sequence has length seq_length + 1, so that labels can be the inputs shifted by +1
        total_num_of_sequences = one_data_batch_length // (seq_length+1) - 1
        if total_num_of_sequences < 2:
            continue
        one_batch_ND_time_series = create_batch_ND_time_series(one_data_batch, 
                                                               columns_subset, 
                                                               seq_length+1, total_num_of_sequences)
        all_random_sequences += one_batch_ND_time_series

    return all_random_sequences



all_random_sequences = create_random_sequences(early_data, columns_subset, SEQ_LENGTH)
late_data_random_sequences = create_random_sequences(late_data, columns_subset, SEQ_LENGTH)
# actual total number of sequences
total_num_of_sequences = len(all_random_sequences)

percent_train, percent_dev = 0.99, 0.01  # we used dev set to try a few sets of parameters

n_seq_train = int(total_num_of_sequences*percent_train)
n_seq_dev = int(total_num_of_sequences*(percent_train+percent_dev)) - n_seq_train
# n_seq_test = len(all_random_sequences) - int(total_num_of_sequences*percent_dev)
n_seq_test = len(late_data_random_sequences)

data_train = np.array(all_random_sequences[0:n_seq_train])
# data_dev = np.array(all_random_sequences[n_seq_train:n_seq_train+n_seq_dev])
# data_test = np.array(all_random_sequences[n_seq_train+n_seq_dev:])

# we use the late data for testing
data_test = np.array(late_data_random_sequences)

print('SHAPES of ALL, TRAIN, TEST:') 
print(np.array(all_random_sequences).shape)
print(np.array(data_train).shape)
# print(np.array(data_dev).shape)
print(np.array(data_test).shape)

assert(data_train.shape == (n_seq_train, SEQ_LENGTH+1, NUM_FEATURES+1))  # +1 for the target value
# assert(data_dev.shape == (n_seq_dev, SEQ_LENGTH+1, NUM_FEATURES+1))
assert(data_test.shape == (n_seq_test, SEQ_LENGTH+1, NUM_FEATURES+1))

print('REMEMBER to also try a time-ordered (non-shuffled) sequence order for training, to check whether ordering matters. Typically, a deep RNN is trained purely sequentially, one sequence after the other in time order, even if autocorrelation drops quickly.')


SHAPES of ALL, TRAIN, TEST:
(18575, 3, 1)
(18389, 3, 1)
(2063, 3, 1)
REMEMBER to also try a time-ordered (non-shuffled) sequence order for training, to check whether ordering matters. Typically, a deep RNN is trained purely sequentially, one sequence after the other in time order, even if autocorrelation drops quickly.

In [6]:
# print(np.random.rand(*bad_data_test_labels[:, -1, :].shape))

In [7]:
# num_batches_train = data_train.shape[0] // BATCH_SIZE
num_batches_test = data_test.shape[0] // BATCH_SIZE_TEST
dim_inputs = NUM_FEATURES + 1  # dimension of the input vector for a given time stamp
# for labels, we keep only the time-series value itself (index 0); we don't keep the features.
# See remarks below
indices_outputs = [0]
dim_outputs = len(indices_outputs) # same comment

# inputs run from t_0 to t_{seq_length-1}: the last point of each extracted
# sequence is reserved as the output ("label") of the penultimate point


#######################
# DROP ALL-ZERO SEQUENCES FROM THE DATA ******* TEMPORARY, FOR TESTING ONLY /!\ *******
#######################
data_train_non_zero = data_train[np.any(data_train != 0, axis=1).flatten()]
data_train = data_train_non_zero
data_test_non_zero = data_test[np.any(data_test != 0, axis=1).flatten()]
data_test = data_test_non_zero



data_train_inputs = data_train[:, :-1, :]
# select only sequences with at least one non-zero value
# data_train_non_zero_inputs = data_train_inputs[np.any(data_train_inputs != 0, axis=1).flatten()]
# num_batches_non_zero_train = data_train_non_zero_inputs.shape[0] // BATCH_SIZE
data_train_labels = data_train[:, 1:, indices_outputs]
data_test_inputs = data_test[:, :-1, :]
data_test_labels = data_test[:, 1:, indices_outputs]


# the num_batches_* variables must be recomputed, since the data was just filtered
# data_train_inputs = data_train_non_zero_inputs
num_batches_train = data_train_inputs.shape[0] // BATCH_SIZE
num_batches_test = data_test_inputs.shape[0] // BATCH_SIZE_TEST


# creates 'bad' data for testing
# bad_data_test_labels = copy.deepcopy(data_test_labels)
# blur using a random factor between 0 and 4
# blurring_factors = np.random.randint(4, *bad_data_test_labels[:, -1, :].shape)
# bad_data_test_labels[:, -1, :] *= blurring_factors


train_data_inputs = nd.array(data_train_inputs).reshape((num_batches_train, BATCH_SIZE, SEQ_LENGTH, dim_inputs))
# train_data_non_zero_inputs = nd.array(data_train_non_zero_inputs).reshape((num_batches_non_zero_train, BATCH_SIZE, SEQ_LENGTH, dim_inputs))
train_data_labels = nd.array(data_train_labels).reshape((num_batches_train, BATCH_SIZE, SEQ_LENGTH, dim_outputs))
test_data_inputs = nd.array(data_test_inputs).reshape((num_batches_test, BATCH_SIZE_TEST, SEQ_LENGTH, dim_inputs))
test_data_labels = nd.array(data_test_labels).reshape((num_batches_test, BATCH_SIZE_TEST, SEQ_LENGTH, dim_outputs))
# test_bad_data_labels = nd.array(bad_data_test_labels).reshape((num_batches_test, BATCH_SIZE_TEST, SEQ_LENGTH, dim_outputs))

train_data_inputs = nd.swapaxes(train_data_inputs, 1, 2)
# train_data_non_zero_inputs = nd.swapaxes(train_data_non_zero_inputs, 1, 2)
train_data_labels = nd.swapaxes(train_data_labels, 1, 2)
test_data_inputs = nd.swapaxes(test_data_inputs, 1, 2)
test_data_labels = nd.swapaxes(test_data_labels, 1, 2)
# test_bad_data_labels = nd.swapaxes(test_bad_data_labels, 1, 2)


print('num_mini-batches_train={0} | seq_length={2} | mini-batch_size={1} | dim_input={3} | dim_output={4}'.format(num_batches_train, BATCH_SIZE, SEQ_LENGTH, dim_inputs, dim_outputs))
print('\ntrain_data_inputs shape: ', train_data_inputs.shape)
# print('train_data_non_zero_inputs shape: ', train_data_non_zero_inputs.shape)
print('train_data_labels shape: ', train_data_labels.shape)
print('\ntest_data_inputs shape: ', test_data_inputs.shape)
print('test_data_labels shape: ', test_data_labels.shape)
# print(train_data_labels)


num_mini-batches_train=64 | seq_length=2 | mini-batch_size=64 | dim_input=1 | dim_output=1

train_data_inputs shape:  (64L, 2L, 64L, 1L)
train_data_labels shape:  (64L, 2L, 64L, 1L)

test_data_inputs shape:  (478L, 2L, 1L, 1L)
test_data_labels shape:  (478L, 2L, 1L, 1L)
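
To make the slicing and axis-swapping above concrete, here is a minimal sketch on a toy array (the values are arbitrary): labels are the inputs shifted forward by one step, and reshape plus swapaxes converts batch-major storage into the time-major (num_batches, seq_length, batch_size, dim) layout that the RNN loop below iterates over.

In [ ]:
toy = np.arange(9).reshape(3, 3, 1)              # 3 sequences, SEQ_LENGTH+1 = 3 steps, 1 feature
toy_inputs, toy_labels = toy[:, :-1, :], toy[:, 1:, :]
print(toy_inputs.squeeze())  # [[0 1] [3 4] [6 7]]
print(toy_labels.squeeze())  # [[1 2] [4 5] [7 8]] -- each label is the next value of its input
bm = nd.array(toy_inputs).reshape((1, 3, 2, 1))  # (num_batches, batch_size, seq_length, dim)
tm = nd.swapaxes(bm, 1, 2)                       # (num_batches, seq_length, batch_size, dim)
print(bm.shape, tm.shape)                        # (1, 3, 2, 1) (1, 2, 3, 1)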

In [8]:
# select only sequences with at least one non-zero value
# data_train_non_zero_inputs = data_train_inputs[np.any(data_train_inputs != 0, axis=1).flatten()]
# len(data_train_non_zero_inputs)
len(data_test_non_zero)


Out[8]:
478

In [9]:
# Allocate parameters

In [10]:
num_hidden_units = [64, 16]  # num of hidden units in each hidden LSTM layer
num_hidden_layers = len(num_hidden_units)  # num of hidden LSTM layers
num_units_layers = [dim_inputs] + num_hidden_units

norm_factor = 0.005  # normalization factor for weight initialization. set to 0.001 for prediction of small values (<1)

########################
#  Weights connecting the inputs to the hidden layer
########################
Wxg, Wxi, Wxf, Wxo, Whg, Whi, Whf, Who, bg, bi, bf, bo = {}, {}, {}, {}, {}, {}, {}, {}, {}, {}, {}, {}
for i_layer in range(1, num_hidden_layers+1):
    num_inputs = num_units_layers[i_layer-1]
    num_hidden = num_units_layers[i_layer]  # local name, so the num_hidden_units list is not shadowed
    Wxg[i_layer] = nd.random_normal(shape=(num_inputs, num_hidden), ctx=ctx) * norm_factor
    Wxi[i_layer] = nd.random_normal(shape=(num_inputs, num_hidden), ctx=ctx) * norm_factor
    Wxf[i_layer] = nd.random_normal(shape=(num_inputs, num_hidden), ctx=ctx) * norm_factor
    Wxo[i_layer] = nd.random_normal(shape=(num_inputs, num_hidden), ctx=ctx) * norm_factor

    ########################
    #  Recurrent weights connecting the hidden layer across time steps
    ########################
    Whg[i_layer] = nd.random_normal(shape=(num_hidden, num_hidden), ctx=ctx) * norm_factor
    Whi[i_layer] = nd.random_normal(shape=(num_hidden, num_hidden), ctx=ctx) * norm_factor
    Whf[i_layer] = nd.random_normal(shape=(num_hidden, num_hidden), ctx=ctx) * norm_factor
    Who[i_layer] = nd.random_normal(shape=(num_hidden, num_hidden), ctx=ctx) * norm_factor

    ########################
    #  Bias vectors for the hidden layer
    ########################
    bg[i_layer] = nd.random_normal(shape=num_hidden, ctx=ctx) * norm_factor
    bi[i_layer] = nd.random_normal(shape=num_hidden, ctx=ctx) * norm_factor
    bf[i_layer] = nd.random_normal(shape=num_hidden, ctx=ctx) * norm_factor
    bo[i_layer] = nd.random_normal(shape=num_hidden, ctx=ctx) * norm_factor

########################
# Weights to the output nodes
########################
Why = nd.random_normal(shape=(num_units_layers[-1], dim_outputs), ctx=ctx) * norm_factor
by = nd.random_normal(shape=dim_outputs, ctx=ctx) * norm_factor
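
As a quick sanity check on the allocation above, the parameter count of each LSTM layer follows the usual formula 4*(n_in*n_h + n_h^2 + n_h): four gates (g, i, f, o), each with input weights, recurrent weights, and a bias. A minimal sketch:

In [ ]:
total = 0
for l in range(1, num_hidden_layers + 1):
    n_in, n_h = num_units_layers[l-1], num_units_layers[l]
    n_params = 4 * (n_in * n_h + n_h * n_h + n_h)  # 4 gates, each with Wx, Wh and b
    print('layer {0}: {1} parameters'.format(l, n_params))
    total += n_params
total += num_units_layers[-1] * dim_outputs + dim_outputs  # output layer (Why, by)
print('total: {0} parameters'.format(total))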

Attach the gradients


In [11]:
params = []
for i_layer in range(1, num_hidden_layers+1):
    params += [Wxg[i_layer], Wxi[i_layer], Wxf[i_layer], Wxo[i_layer], Whg[i_layer], Whi[i_layer], Whf[i_layer], Who[i_layer], bg[i_layer], bi[i_layer], bf[i_layer], bo[i_layer]]

params += [Why, by]  # add the output layer

for param in params:
    param.attach_grad()

Loss utility functions


In [12]:
def rmse(yhat, y):
    return nd.mean(nd.sqrt(nd.sum(nd.power(y - yhat, 2), axis=0, exclude=True)))

def average_rmse_loss(outputs, labels, last_element_only=False):
#     assert(len(outputs) == len(labels))
    total_loss = 0.
    zipped_elements = list(zip(outputs, labels))  # list() so that indexing with [-1] also works on Python 3
    if last_element_only:
        output, label = zipped_elements[-1]
        total_loss = rmse(output, label)
        return total_loss
    for (output, label) in zipped_elements:
        total_loss = total_loss + rmse(output, label)
    return total_loss / len(outputs)
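
A quick numeric check of rmse: with axis=0 and exclude=True, nd.sum sums the squared errors over every axis except the batch axis, so the function takes a per-sample root-sum-of-squares and then averages across the batch. The numbers below are hand-checkable.

In [ ]:
yhat = nd.array([[1., 2.], [3., 4.]])
y    = nd.array([[1., 4.], [6., 8.]])
# per-sample sums of squared errors: 0+4=4 and 9+16=25 -> sqrt: 2 and 5 -> mean: 3.5
print(rmse(yhat, y))  # [3.5]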

Optimizer

Note: we will use the Adam optimizer. It converges much faster and is much more robust than plain stochastic gradient descent.
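
For reference, the update implemented in the next cell is the standard Adam rule (Kingma & Ba, 2014). With $g_k$ the gradient of parameter $\theta_k$ and $t$ the call index (index_adam_call):

$$M_k \leftarrow \beta_1 M_k + (1-\beta_1)\,g_k, \qquad R_k \leftarrow \beta_2 R_k + (1-\beta_2)\,g_k^2$$

$$\hat{m}_k = \frac{M_k}{1-\beta_1^{\,t}}, \qquad \hat{r}_k = \frac{R_k}{1-\beta_2^{\,t}}, \qquad \theta_k \leftarrow \theta_k - \eta\,\frac{\hat{m}_k}{\sqrt{\hat{r}_k}+\epsilon}$$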


In [13]:

def SGD(params, learning_rate):
    for param in params:
        param[:] = param - learning_rate * param.grad

def adam(params, learning_rate, M , R, index_adam_call, beta1, beta2, eps):
    k = -1
    for param in params:
        k += 1
        M[k] = beta1 * M[k] + (1. - beta1) * param.grad
        R[k] = beta2 * R[k] + (1. - beta2) * (param.grad)**2
        # bias correction: since M & R are initialized to zeros, they are biased toward zero during the first few iterations
        m_k_hat = M[k] / (1. - beta1**(index_adam_call))
        r_k_hat = R[k] / (1. - beta2**(index_adam_call))
        if (np.isnan(M[k].asnumpy())).any() or (np.isnan(R[k].asnumpy())).any():
            raise ValueError('NaNs in the Adam moments!')
        param[:] = param - learning_rate * m_k_hat / (nd.sqrt(r_k_hat) + eps)
    
    return params, M, R

Deep LSTM Net core functions
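
The two functions below implement the standard LSTM cell directly with nd.dot. For one layer, with input $X$ (which, in the deep variant, is the hidden state $h$ of the layer below):

$$g = \tanh(X W_{xg} + h W_{hg} + b_g), \qquad i = \sigma(X W_{xi} + h W_{hi} + b_i)$$

$$f = \sigma(X W_{xf} + h W_{hf} + b_f), \qquad o = \sigma(X W_{xo} + h W_{ho} + b_o)$$

$$c \leftarrow f \odot c + i \odot g, \qquad h \leftarrow o \odot \tanh(c)$$

The readout is linear, $\hat{y} = h^{(L)} W_{hy} + b_y$, since the targets are unbounded counts.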


In [14]:
def single_lstm_unit_calcs(X, c, Wxg, h, Whg, bg, Wxi, Whi, bi, Wxf, Whf, bf, Wxo, Who, bo):
    g = nd.tanh(nd.dot(X, Wxg) + nd.dot(h, Whg) + bg)
    i = nd.sigmoid(nd.dot(X, Wxi) + nd.dot(h, Whi) + bi)
    f = nd.sigmoid(nd.dot(X, Wxf) + nd.dot(h, Whf) + bf)
    o = nd.sigmoid(nd.dot(X, Wxo) + nd.dot(h, Who) + bo)
    #######################
    c = f * c + i * g
    h = o * nd.tanh(c)
    return c, h

def deep_lstm_rnn(inputs, h, c, temperature=1.0):
    """
        h: dict of nd.arrays, each key is the index of a hidden layer (from 1 to whatever). 
        Index 0, if any, is the input layer
    """
    outputs = []
    # inputs is one BATCH of sequences in time-major layout, shape (seq_length, batch_size, features_dim)
    # (the latter is 1 for a single time series, vocab_size for characters, n for n parallel time series)
    for X in inputs:
        # X holds one time stamp across the whole batch: if the batch has 37 sequences, the first X is the set of their 37 first values.
        # Each iteration over X therefore advances one time stamp, processed for all sequences of the batch at once.
        h[0] = X # the first hidden layer takes the input X as input 
        for i_layer in range(1, num_hidden_layers+1):
            # lstm units now have the 2 following inputs: 
            # i) h_t from the previous layer (equivalent to the input X for a non-deep lstm net), 
            # ii) h_t-1 from the current layer (same as for non-deep lstm nets)
            c[i_layer], h[i_layer] = single_lstm_unit_calcs(h[i_layer-1], c[i_layer], Wxg[i_layer], h[i_layer], Whg[i_layer], bg[i_layer], Wxi[i_layer], Whi[i_layer], bi[i_layer], Wxf[i_layer], Whf[i_layer], bf[i_layer], Wxo[i_layer], Who[i_layer], bo[i_layer])
        yhat_linear = nd.dot(h[num_hidden_layers], Why) + by
        # yhat is a batch of several values of the same time stamp
        # this is basically the prediction of the sequence, which overlaps most of the input sequence, plus one point (character or value)
#         yhat = nd.sigmoid(yhat_linear)
        yhat = yhat_linear # we can't use a 1.0-bounded activation function since outputs (e.g. car counts) can exceed 1.0
        outputs.append(yhat) # outputs has same shape as inputs, i.e. a list of batches of data points.

    return (outputs, h, c)

Test and visualize predictions


In [15]:
# inn = ndarray.array([[1],[2],[3],[4]])
# outputs, _, _ = deep_lstm_rnn(inn, h, c, temperature=1.)
# print(outputs[-1])
# outputs = np.array(outputs)
# print(outputs)
# vfunc = np.vectorize(lambda x: ndarray.array([x]))    
# outputs_tmp = vfunc(outputs)
# outputs_tmp = outputs_tmp.tolist()
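
The scratch cell above cannot run as-is because h and c are undefined at this point. A minimal working smoke test of deep_lstm_rnn, with zero-initialized states and a dummy time-major batch (the input values are arbitrary), might look like this:

In [ ]:
h_t, c_t = {}, {}
for i_layer in range(1, num_hidden_layers + 1):
    h_t[i_layer] = nd.zeros(shape=(BATCH_SIZE_TEST, num_units_layers[i_layer]), ctx=ctx)
    c_t[i_layer] = nd.zeros(shape=(BATCH_SIZE_TEST, num_units_layers[i_layer]), ctx=ctx)
dummy = nd.array([[[1.]], [[2.]]])  # (seq_length=2, batch_size=1, dim_inputs=1), time-major
outputs, h_t, c_t = deep_lstm_rnn(dummy, h_t, c_t)
print(outputs[-1])  # predicted value for the time step following the dummy sequence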

In [16]:
INDEX_TARGET_VALUE = 0
def test_prediction(one_input_seq, one_label_seq, temperature=1.0):
    # WE ASSUME the first value in the input vector is the variable of interest
    #####################################
    # Set the initial state of the hidden representation ($h_0$) to the zero vector
    #####################################  # some better initialization needed??
    h, c = {}, {}
    for i_layer in range(1, num_hidden_layers+1):
        h[i_layer] = nd.zeros(shape=(BATCH_SIZE_TEST, num_units_layers[i_layer]), ctx=ctx)
        c[i_layer] = nd.zeros(shape=(BATCH_SIZE_TEST, num_units_layers[i_layer]), ctx=ctx)
    
    outputs, h, c = deep_lstm_rnn(one_input_seq, h, c, temperature=temperature)
    
#     outputs = [nd_round(one_out) for one_out in outputs]
#     print('ZZZ ', outputs[-1])
#     round up values of outputs to check if that improves predictions

#     outputs = np.array(outputs).round()
#     vfunc = np.vectorize(lambda x: ndarray.array([x]))    
#     outputs_tmp = vfunc(outputs)
#     outputs_tmp = outputs_tmp.tolist()
    
    return outputs[-1][0].asnumpy()[INDEX_TARGET_VALUE], one_label_seq.asnumpy()[-1].flatten()[INDEX_TARGET_VALUE], outputs, one_label_seq

def check_prediction(index, test_inputs=None, test_labels=None):
    if test_inputs is None:
        test_inputs = test_data_inputs
    if test_labels is None:
        test_labels = test_data_labels
    if index >= len(test_inputs):
        index = np.random.randint(len(test_inputs))

    o, label, outputs, labels = test_prediction(test_inputs[index], test_labels[index], temperature=1.0)    
    
    # HERE, we're interested in INTEGERS, so round the predictions.
    # (Rounding at the training level was also tried; it doesn't do much apart from slowing things down.)
#     prediction = round(o, 0)
#     true_label = round(label, 3)
    # this float() conversion only works when BATCH_SIZE_TEST == 1
    outputs = [round(float(i.asnumpy().flatten()[INDEX_TARGET_VALUE]), 0) for i in outputs] 
    true_labels = list(test_labels[index].asnumpy()[:,:,INDEX_TARGET_VALUE].flatten())
    
    test_inputs_tmp = list(test_inputs[index].asnumpy()[:,:,INDEX_TARGET_VALUE].flatten())
    
    # the first 'predicted' entry is None since the RNN emits next-time-step predictions
    outputs, true_labels = [None] + outputs, [test_inputs_tmp[0]] + true_labels
    
    df = pd.DataFrame([outputs, true_labels]).transpose()
    df.columns = ['predicted', 'true']

    return df


def compute_test_errors(bias_correction=1., labels=None, test_inputs=None):
    if labels is None:
        labels = test_data_labels
    list_abs_rel_error, list_rel_error, list_abs_error, list_abs_diff = [], [], [], []
    for e in range(0, labels.shape[0]):
        data_prediction_df = check_prediction(index=e, test_inputs=test_inputs, test_labels=labels)
        prediction = round(data_prediction_df.tail(1)['predicted'].values.flatten()[-1] * bias_correction, 3) 
        true_label = round(data_prediction_df.tail(1)['true'].values.flatten()[-1], 3)
        if true_label != 0:
            list_abs_rel_error.append( round(100. * np.abs(prediction / (true_label) - 1.0), 2) )
            list_rel_error.append( round(100. * (prediction / (true_label) - 1.0), 2) )
            list_abs_error.append( round(prediction - true_label, 0) )
            list_abs_diff.append( round(abs(prediction - true_label), 2) )
        else:
#             continue
            if prediction == 0:
                prediction=1e-5
                true_label=1e-5
            else:
                true_label=0.5
            list_abs_rel_error.append( round(100. * np.abs(prediction / (true_label) - 1.0), 2) )
            list_rel_error.append( round(100. * (prediction / (true_label) - 1.0), 2) )
            list_abs_error.append( round(prediction - true_label, 0) )
            list_abs_diff.append( round(abs(prediction - true_label), 2) )

    return list_abs_rel_error, list_rel_error, list_abs_error, list_abs_diff
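
# In symbols: for a prediction yhat and a true label y != 0, the helper above records
#     rel_error     = 100 * (yhat / y - 1)     and     abs_rel_error = 100 * |yhat / y - 1|
# substituting small sentinel values when y == 0 so the division never blows up.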



moving_rel_error = 0.
moving_loss = 0.
learning_rate = 0.002  # no more than 0.002, even with Adam; otherwise convergence will likely saturate
refresh_interval = 40
epochs = 400  # one epoch is one pass over the entire training set

# Adam Optimizer stuff
beta1 = .9
beta2 = .999
index_adam_call = 0
# M & R arrays to keep track of momenta in adam optimizer. params is a list that contains all ndarrays of parameters
M = {k: nd.zeros_like(v) for k, v in enumerate(params)}
R = {k: nd.zeros_like(v) for k, v in enumerate(params)}

In [31]:
# needed to update plots on the fly
%matplotlib notebook
fig, axes_fig1 = plt.subplots(1,1, figsize=(18,9))
fig2, axes_fig2 = plt.subplots(1,1, figsize=(12,6))
df_moving_loss = pd.DataFrame(columns=['Loss', 'Error'])
df_moving_loss.index.name = 'Epoch'


for e in range(epochs):
    ############################
    # Attenuate the learning rate by a factor of 2 every 1000 epochs
    ############################
    if ((e+1) % 1000 == 0):
        learning_rate = learning_rate / 2.0  # TODO: check whether decaying learning_rate is appropriate when using the Adam optimizer
    h, c = {}, {}
    for i_layer in range(1, num_hidden_layers+1):
        h[i_layer] = nd.zeros(shape=(BATCH_SIZE, num_units_layers[i_layer]), ctx=ctx)
        c[i_layer] = nd.zeros(shape=(BATCH_SIZE, num_units_layers[i_layer]), ctx=ctx)

    for i in range(num_batches_train):
        data_one_hot = train_data_inputs[i]
        label_one_hot = train_data_labels[i]
        with autograd.record():
            outputs, h, c = deep_lstm_rnn(data_one_hot, h, c)
            loss = average_rmse_loss(outputs, label_one_hot, last_element_only=True)
        loss.backward()  # called outside the recording scope, as is idiomatic in MXNet
#         SGD(params, learning_rate)
        index_adam_call += 1  # needed for bias correction in Adam optimizer
        try:
            params, M, R = adam(params, learning_rate, M, R, index_adam_call, beta1, beta2, 1e-8)
        except ValueError:
            # NaNs in the Adam moments: reset them and keep training
            M = {k: nd.zeros_like(v) for k, v in enumerate(params)}
            R = {k: nd.zeros_like(v) for k, v in enumerate(params)}
            #df_moving_loss = pd.DataFrame(columns=['Loss', 'Error'])
            #df_moving_loss.index.name = 'Epoch'

        
        ##########################
        #  Keep a moving average of the losses
        ##########################
        if (i == 0) and (e == 0):
            moving_loss = nd.mean(loss).asscalar()
        else:
            moving_loss = .99 * moving_loss + .01 * nd.mean(loss).asscalar()
        df_moving_loss.loc[e] = round((moving_loss), 5)

    ############################
    #  Predictions and plots
    ############################
    data_prediction_df = check_prediction(index=e)
    
    if not (e%refresh_interval):
        axes_fig1.clear()
        data_prediction_df.plot(ax=axes_fig1, style=['--', '-'], marker='o', alpha=0.6)
        fig.canvas.draw()
    prediction = round(1.*data_prediction_df.tail(1)['predicted'].values.flatten()[-1], 3)
    true_label = round(1.*data_prediction_df.tail(1)['true'].values.flatten()[-1], 3)
    if true_label != 0:
        rel_error = round(100. * np.abs(prediction / (true_label+1e-5) - 1.0), 2)
    else:
        rel_error = moving_rel_error
    if not (e%refresh_interval):
        print("Epoch = {0} | Loss = {1} | Prediction = {2} True = {3} Error = {4}".format(e, (moving_loss), prediction, true_label, rel_error ))
    if not (e%refresh_interval):
        axes_fig2.clear()
    if e == 0:
        moving_rel_error = rel_error
    else:
        moving_rel_error = .99 * moving_rel_error + .01 * rel_error

    df_moving_loss.loc[e, ['Error']] = moving_rel_error
    if not (e%refresh_interval):
        axes_loss_plot = df_moving_loss.plot(ax=axes_fig2, secondary_y='Loss', color=['r','b'])
        axes_loss_plot.right_ax.grid(False)
#         axes_loss_plot.right_ax.set_yscale('log')
        fig2.canvas.draw()
    
%matplotlib inline


Epoch = 0 | Loss = 0.24136845885 | Prediction = 1.0 True = 1.0 Error = 0.0
Epoch = 40 | Loss = 0.383494326751 | Prediction = 0.0 True = 1.0 Error = 100.0
Epoch = 80 | Loss = 0.381646839521 | Prediction = 1.0 True = 2.0 Error = 50.0
Epoch = 120 | Loss = 0.387409123133 | Prediction = 0.0 True = 0.0 Error = 13.2544738727
Epoch = 160 | Loss = 0.382538905324 | Prediction = -0.0 True = 4.0 Error = 100.0
Epoch = 200 | Loss = 0.408300721966 | Prediction = 0.0 True = 1.0 Error = 100.0
Epoch = 240 | Loss = 0.383599673158 | Prediction = 0.0 True = 0.0 Error = 28.8101879983
Epoch = 280 | Loss = 0.380294619626 | Prediction = -0.0 True = 0.0 Error = 30.1597670982
Epoch = 320 | Loss = 0.389746173309 | Prediction = 1.0 True = 1.0 Error = 0.0
Epoch = 360 | Loss = 0.392630438458 | Prediction = 1.0 True = 2.0 Error = 50.0
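
Note that the plotted quantities are exponential moving averages, $m_t = 0.99\,m_{t-1} + 0.01\,x_t$, i.e. an effective averaging window on the order of $1/0.01 = 100$ updates; this is why the loss curve looks smooth even though individual batch losses and per-epoch errors are noisy.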

Error Analysis


In [33]:
list_abs_rel_error, list_rel_error, _, _ = compute_test_errors(bias_correction=1.0, 
                                                               labels=test_data_labels)

# bias_correction = 100. / (100. + np.mean(list_rel_error))  # estimate of the bias
bias_correction = 1.
# recompute errors using "de-biased" model
list_abs_rel_error, list_rel_error, list_abs_error, list_abs_diff = compute_test_errors(bias_correction=bias_correction, 
                                                                         labels=test_data_labels)

# recompute errors using "de-biased" model
# bad_data_list_abs_rel_error, bad_data_list_rel_error, bad_data_list_abs_error, bad_data_list_abs_diff = compute_test_errors(
#     bias_correction=bias_correction, 
#     labels=test_bad_data_labels)

In [29]:
# print(list_abs_error)
# print(test_data_labels)

In [34]:
%matplotlib inline
fig_errors, axes_fig_errors = plt.subplots(1,1, figsize=(10,8))
# plt.hist(list_abs_rel_error, 20, alpha=0.4, label='abs percent error')
# plt.hist(list_rel_error, 20, alpha=0.4, label='percent error')

plt.hist(list_abs_error, 17, range=[-8., 8.], alpha=0.5, normed=True, label='error (predicted - true)', color='blue')
# plt.hist(list_abs_diff, 6, range=[0., 6.], alpha=0.5, label='good data', color='blue')
# plt.hist(bad_data_list_abs_diff, 100, range=[0., 5.], alpha=0.5, label='bad data', color='red')

plt.legend(loc='upper right')
plt.xlabel('error')
plt.gca().set_xlim([-8, 8])
plt.show()

# print('Estimate of the Bias: {0}% | Median Abs Rel Error: {1}%'.format(round(bias_correction,1), round(np.median(list_abs_rel_error), 1)))


