In [1]:
# -*- coding: UTF-8 -*-
#%load_ext autoreload
%reload_ext autoreload
%autoreload 2

In [2]:
from __future__ import division
import tensorflow as tf
from os import path
import numpy as np
import pandas as pd
import csv
from sklearn.model_selection import StratifiedShuffleSplit
from time import time
from matplotlib import pyplot as plt
import seaborn as sns
from mylibs.jupyter_notebook_helper import show_graph
from tensorflow.contrib import rnn
from tensorflow.contrib import learn
import shutil
from tensorflow.contrib.learn.python.learn import learn_runner
from mylibs.tf_helper import getDefaultGPUconfig
from sklearn.metrics import r2_score
from mylibs.py_helper import factors
from fastdtw import fastdtw
from scipy.spatial.distance import euclidean
from statsmodels.tsa.stattools import coint
from common import get_or_run_nn
from data_providers.price_history_seq2seq_data_provider import PriceHistorySeq2SeqDataProvider
from models.price_history_seq2seq_native import PriceHistorySeq2SeqNative


/home/studenthp/anaconda2/envs/dis/lib/python2.7/site-packages/statsmodels/compat/pandas.py:56: FutureWarning: The pandas.core.datetools module is deprecated and will be removed in a future version. Please use the pandas.tseries module instead.
  from pandas.core import datetools

In [3]:
# dtype handed to the model constructor further down.
dtype = tf.float32
# Fixed seed for the NumPy generator created on the next line.
seed = 16011984
# Seeded NumPy generator; passed to the model as `rng` further down.
random_state = np.random.RandomState(seed=seed)
# Session config from the project helper (presumably GPU options - confirm in mylibs.tf_helper).
config = getDefaultGPUconfig()
%matplotlib inline

Step 0 - hyperparams

`vocab_size` (all the potential words you could have, i.e. the classes in the translation case) and the max sequence length are the SAME thing

In translation, the decoder RNN hidden units usually have the same size as the encoder RNN hidden units. For our case there does not really seem to be such a relationship, but we can experiment and find out later; it is not a priority right now


In [4]:
# Training length (overridden again in the per-experiment cells below).
num_epochs = 10

# Input feature count and RNN state size.
num_features, num_units = 1, 400

# Length of the input window and of the target window.
input_len, target_len = 60, 30

# 47 divides the 11374 samples reported by the data provider below evenly
# (11374 = 47 * 242), which 50 does not.
batch_size = 47
#batch_size = 50
#trunc_backprop_len = ??

Step 1 - collect data (and/or generate them)


In [5]:
# Path of the .npz dataset handed to the data provider and to model.run.
# The file name suggests 60-step inputs and 30-step targets - confirm against the file.
npz_path = '../price_history_03_dp_60to30_from_fixed_len.npz'
#npz_path = '../data/price_history/price_history_03_dp_60to30_6400_train.npz'

In [6]:
# Build the project data provider for the dataset and show the shapes of the
# full input and target arrays it exposes.
dp = PriceHistorySeq2SeqDataProvider(npz_path=npz_path, batch_size=batch_size)
dp.inputs.shape, dp.targets.shape


Out[6]:
((11374, 60, 1), (11374, 30))

In [7]:
# Pull a single batch to check its shapes.
# NOTE(review): the batch targets have 31 columns while dp.targets has 30;
# presumably the provider appends the EOS step - confirm in the data provider.
aa, bb = dp.next()
aa.shape, bb.shape


Out[7]:
((47, 60, 1), (47, 31))

Step 2 - Build model


In [8]:
# Create the seq2seq model wrapper with the seeded generator, dtype and session
# config defined in the configuration cell.
model = PriceHistorySeq2SeqNative(rng=random_state, dtype=dtype, config=config)

In [9]:
# Build the graph for the chosen sizes; getGraph prints the intermediate tensors
# (see the recorded output below).
# NOTE(review): no rnn_cell is passed here, so getGraph falls back to its default;
# the printed tensor names show a basic RNN cell.
graph = model.getGraph(batch_size=batch_size,
                       num_units=num_units,
                       input_len=input_len,
                       target_len=target_len)


learning rate: 0.001000
60
Tensor("inputs/unstack:0", shape=(47, 1), dtype=float32)

Tensor("encoder_rnn_layer/rnn/basic_rnn_cell_59/Tanh:0", shape=(47, 400), dtype=float32)

BasicDecoderOutput(rnn_output=<tf.Tensor 'decoder_rnn_layer/decoder/transpose:0' shape=(47, ?, 400) dtype=float32>, sample_id=<tf.Tensor 'decoder_rnn_layer/decoder/transpose_1:0' shape=(47, ?) dtype=int32>)

Tensor("decoder_rnn_layer/decoder/transpose:0", shape=(47, ?, 400), dtype=float32)

Tensor("readout_layer/readouts:0", shape=(?, 1), dtype=float32)

Tensor("predictions/Reshape:0", shape=(47, 31), dtype=float32)

Tensor("error/Select:0", shape=(47, 31), dtype=float32)

Tensor("error/Mean:0", shape=(), dtype=float32)


In [10]:
#show_graph(graph)

Conclusion

There is no way this graph makes much sense, but let's give it a try to see how bad it really is

Step 3 training the network

RECALL: the baseline is around 4 for the Huber loss on the current problem; anything above 4 should be considered a major error

Basic RNN cell (EOS 1000)


In [11]:
# Settings for this experiment: basic RNN cell, 10 epochs, EOS marker value 1000.
eos_token = 1e3  # float literal, same value as float(1e3)
num_epochs = 10
rnn_cell = PriceHistorySeq2SeqNative.RNN_CELLS.BASIC_RNN
# Echo the values that define the run.
(num_epochs, num_units, batch_size)


Out[11]:
(10, 400, 47)

In [12]:
def experiment():
    """Run the model with the settings currently held in the notebook globals.

    The globals (npz_path, num_epochs, batch_size, num_units, input_len,
    target_len, rnn_cell, eos_token) are resolved when the function is called,
    not when it is defined.

    Returns:
        Whatever ``model.run`` returns.
    """
    run_kwargs = dict(
        npz_path=npz_path,
        epochs=num_epochs,
        batch_size=batch_size,
        num_units=num_units,
        input_len=input_len,
        target_len=target_len,
        rnn_cell=rnn_cell,
        eos_token=eos_token,
    )
    return model.run(**run_kwargs)

In [13]:
# Obtain the training statistics and the per-sample predictions for this run.
# get_or_run_nn presumably loads results cached under `filename` and only calls
# `experiment` when none exist - confirm in common.py.
dyn_stats, preds_dict = get_or_run_nn(experiment, filename='007_rnn_seq2seq_native_EOS1000_60to30_10epochs')

In [14]:
# Plot the statistics collected for the run (presumably the per-epoch error - confirm in dyn_stats).
dyn_stats.plotStats()
plt.show()



In [15]:
# R^2 of every prediction against its target series.
# The loop variable is deliberately not called `ind`: under Python 2 a
# list-comprehension variable leaks into the notebook namespace, and a stale `ind`
# (the last sample index) would be picked up by the cells that use `ind` for the
# worst-scoring sample if they were run out of order.
r2_scores = [r2_score(y_true=dp.targets[sample_ind], y_pred=preds_dict[sample_ind])
             for sample_ind in range(len(dp.targets))]

In [16]:
# Index of the sample with the lowest R^2 score.
ind = np.argmin(r2_scores)
ind


Out[16]:
3918

In [17]:
# Target series and prediction for the lowest-R^2 sample.
reals = dp.targets[ind]
preds = preds_dict[ind]

In [18]:
# R^2 of the lowest-scoring sample.
# NOTE(review): the recorded value is of order -1e31, which presumably means the
# target series is (almost) constant so the R^2 denominator is near zero - confirm.
r2_score(y_true=reals, y_pred=preds)


Out[18]:
-4.9919848164179885e+31

In [19]:
# Plot the input window of the lowest-R^2 sample. Plain matplotlib is used, as in
# the other plotting cells, because `sns.tsplot` is deprecated and no longer
# exists in newer seaborn releases.
plt.plot(dp.inputs[ind].flatten())
plt.title('inputs of sample %d' % ind)
plt.show()


Out[19]:
<matplotlib.axes._subplots.AxesSubplot at 0x7f505c7fd190>

In [20]:
# Targets (blue) against predictions (green) for the currently selected sample.
fig = plt.figure(figsize=(15, 6))
plt.plot(reals, 'b', label='reals')
plt.plot(preds, 'g', label='preds')
plt.legend()
plt.show()



In [21]:
%%time
# DTW distance between every target series and its prediction; fastdtw returns
# (distance, path) and only the distance is kept.
# The loop variable is deliberately not called `ind`: under Python 2 a
# list-comprehension variable leaks into the notebook namespace and would
# overwrite the worst-sample index `ind` selected earlier.
dtw_scores = [fastdtw(dp.targets[sample_ind], preds_dict[sample_ind])[0]
              for sample_ind in range(len(dp.targets))]


CPU times: user 24.9 s, sys: 104 ms, total: 25 s
Wall time: 24.9 s

In [22]:
# Average DTW distance over all samples.
np.mean(dtw_scores)


Out[22]:
411.17963842603302

In [23]:
# statsmodels cointegration test between prediction and target of the lowest-R^2
# sample (`preds` / `reals` were set from `ind` above). The result is a 3-tuple;
# per the statsmodels docs: test statistic, p-value, critical values.
coint(preds, reals)


Out[23]:
(0.34036394495620331,
 0.99132492506678827,
 array([-4.31395736, -3.55493606, -3.19393252]))

In [24]:
# Inspect one sample. The index is drawn from a generator seeded with the
# notebook-wide `seed`, so a fresh "Restart & Run All" reproduces the figure that
# the conclusion below comments on. The previous np.random.randint call used the
# unseeded global generator and picked a different sample on every run.
# Change the seed passed here to look at another sample.
sample_rng = np.random.RandomState(seed=seed)
cur_ind = sample_rng.randint(len(dp.targets))
reals = dp.targets[cur_ind]
preds = preds_dict[cur_ind]
fig = plt.figure(figsize=(15,6))
plt.plot(reals, 'b')
plt.plot(preds, 'g')
plt.legend(['reals','preds'])
plt.title('sample %d' % cur_ind)
plt.show()


Conclusion

The initial price difference of the predictions is still not as good as we would expect. Perhaps using an EOS token, as is done in machine translation models, is not the best architecture for our case

GRU cell - with EOS = 1000


In [11]:
# Settings for this experiment: GRU cell, 30 epochs, EOS marker value 1000.
eos_token = 1e3  # float literal, same value as float(1e3)
num_epochs = 30
rnn_cell = PriceHistorySeq2SeqNative.RNN_CELLS.GRU
# Echo the values that define the run.
(num_epochs, num_units, batch_size)


Out[11]:
(30, 400, 47)

In [12]:
def experiment():
    """Run the model with the settings currently held in the notebook globals.

    Behaves the same as the `experiment` defined for the basic-RNN run: the
    globals (npz_path, num_epochs, batch_size, num_units, input_len, target_len,
    rnn_cell, eos_token) are resolved at call time, so the GRU settings assigned
    in the preceding cell are the ones used.

    Returns:
        Whatever ``model.run`` returns.
    """
    run_kwargs = dict(
        npz_path=npz_path,
        epochs=num_epochs,
        batch_size=batch_size,
        num_units=num_units,
        input_len=input_len,
        target_len=target_len,
        rnn_cell=rnn_cell,
        eos_token=eos_token,
    )
    return model.run(**run_kwargs)

In [13]:
# NOTE(review): this direct call trains without the get_or_run_nn wrapper and its
# return value is discarded; the recorded run was interrupted by hand (see the
# KeyboardInterrupt traceback below). The results analysed afterwards come from
# the get_or_run_nn(experiment, ...) cell further down.
experiment()


epochs: 30
End Epoch 01 (53.685 secs): err(train) = 6.4089
End Epoch 02 (52.962 secs): err(train) = 4.7552
End Epoch 03 (52.946 secs): err(train) = 4.4936
End Epoch 04 (52.996 secs): err(train) = 4.2464
End Epoch 05 (52.967 secs): err(train) = 4.0295
End Epoch 06 (52.958 secs): err(train) = 3.7900
End Epoch 07 (52.974 secs): err(train) = 3.5612
End Epoch 08 (52.979 secs): err(train) = 3.3881
End Epoch 09 (52.994 secs): err(train) = 3.1615
End Epoch 10 (52.964 secs): err(train) = 2.9864
End Epoch 11 (52.997 secs): err(train) = 2.7971
End Epoch 12 (53.017 secs): err(train) = 2.6431
End Epoch 13 (53.021 secs): err(train) = 2.5256
End Epoch 14 (52.985 secs): err(train) = 2.3730
End Epoch 15 (52.974 secs): err(train) = 2.2432
End Epoch 16 (53.025 secs): err(train) = 2.1385
---------------------------------------------------------------------------
KeyboardInterrupt                         Traceback (most recent call last)
<ipython-input-13-064d5fa82e28> in <module>()
----> 1 experiment()

<ipython-input-12-75e2bef64792> in experiment()
      8         target_len = target_len,
      9         rnn_cell=rnn_cell,
---> 10         eos_token=eos_token,
     11     )

/home/studenthp/pligor.george@gmail.com/msc_Artificial_Intelligence/dissertation/04_time_series_prediction/models/price_history_seq2seq_native.py in run(self, npz_path, epochs, batch_size, num_units, input_len, target_len, eos_token, preds_gather_enabled, cost_func, rnn_cell)
     68 
     69         return self.train_validate(train_data=train_data, valid_data=None, graph=graph, epochs=epochs,
---> 70                                    preds_gather_enabled=preds_gather_enabled, preds_dp=preds_dp, batch_size=batch_size)
     71 
     72     def train_validate(self, train_data, valid_data, **kwargs):

/home/studenthp/pligor.george@gmail.com/msc_Artificial_Intelligence/dissertation/04_time_series_prediction/models/price_history_seq2seq_native.py in train_validate(self, train_data, valid_data, **kwargs)
     88             for epoch in range(epochs):
     89                 train_error, runTime = getRunTime(
---> 90                     lambda:
     91                     self.trainEpoch(
     92                         sess=sess,

/home/studenthp/pligor.george@gmail.com/msc_Artificial_Intelligence/dissertation/04_time_series_prediction/mylibs/jupyter_notebook_helper.pyc in getRunTime(function)
     38 def getRunTime(function):  # a = lambda _ = None : 3 or #a = lambda : 3
     39     run_start_time = time.time()
---> 40     result = function()
     41     run_time = time.time() - run_start_time
     42     return result, run_time

/home/studenthp/pligor.george@gmail.com/msc_Artificial_Intelligence/dissertation/04_time_series_prediction/models/price_history_seq2seq_native.py in <lambda>()
     91                     self.trainEpoch(
     92                         sess=sess,
---> 93                         data_provider=train_data,
     94                     )
     95                 )

/home/studenthp/pligor.george@gmail.com/msc_Artificial_Intelligence/dissertation/04_time_series_prediction/models/price_history_seq2seq_native.py in trainEpoch(self, sess, data_provider, extraFeedDict)
    173                                     }, extraFeedDict)
    174 
--> 175             _, batch_error = sess.run([self.train_step, self.error], feed_dict=feed_dic)
    176 
    177             train_error += batch_error

/home/studenthp/anaconda2/envs/dis/lib/python2.7/site-packages/tensorflow/python/client/session.pyc in run(self, fetches, feed_dict, options, run_metadata)
    776     try:
    777       result = self._run(None, fetches, feed_dict, options_ptr,
--> 778                          run_metadata_ptr)
    779       if run_metadata:
    780         proto_data = tf_session.TF_GetBuffer(run_metadata_ptr)

/home/studenthp/anaconda2/envs/dis/lib/python2.7/site-packages/tensorflow/python/client/session.pyc in _run(self, handle, fetches, feed_dict, options, run_metadata)
    980     if final_fetches or final_targets:
    981       results = self._do_run(handle, final_targets, final_fetches,
--> 982                              feed_dict_string, options, run_metadata)
    983     else:
    984       results = []

/home/studenthp/anaconda2/envs/dis/lib/python2.7/site-packages/tensorflow/python/client/session.pyc in _do_run(self, handle, target_list, fetch_list, feed_dict, options, run_metadata)
   1030     if handle is None:
   1031       return self._do_call(_run_fn, self._session, feed_dict, fetch_list,
-> 1032                            target_list, options, run_metadata)
   1033     else:
   1034       return self._do_call(_prun_fn, self._session, handle, feed_dict,

/home/studenthp/anaconda2/envs/dis/lib/python2.7/site-packages/tensorflow/python/client/session.pyc in _do_call(self, fn, *args)
   1037   def _do_call(self, fn, *args):
   1038     try:
-> 1039       return fn(*args)
   1040     except errors.OpError as e:
   1041       message = compat.as_text(e.message)

/home/studenthp/anaconda2/envs/dis/lib/python2.7/site-packages/tensorflow/python/client/session.pyc in _run_fn(session, feed_dict, fetch_list, target_list, options, run_metadata)
   1019         return tf_session.TF_Run(session, options,
   1020                                  feed_dict, fetch_list, target_list,
-> 1021                                  status, run_metadata)
   1022 
   1023     def _prun_fn(session, handle, feed_dict, fetch_list):

KeyboardInterrupt: 

In [27]:
# Obtain the training statistics and the per-sample predictions for the GRU run.
# get_or_run_nn presumably loads results cached under `filename` and only calls
# `experiment` when none exist - confirm in common.py.
dyn_stats, preds_dict = get_or_run_nn(experiment, filename='007_gru_seq2seq_native_EOS1000_60to30_30epochs')

In [28]:
# Plot the statistics collected for the GRU run (presumably the per-epoch error - confirm in dyn_stats).
dyn_stats.plotStats()
plt.show()


TODO autocorrelation


In [29]:
# R^2 of every GRU prediction against its target series.
# The loop variable is deliberately not called `ind`: under Python 2 a
# list-comprehension variable leaks into the notebook namespace, and a stale `ind`
# (the last sample index) would be picked up by the cells that use `ind` for the
# worst-scoring sample if they were run out of order.
r2_scores = [r2_score(y_true=dp.targets[sample_ind], y_pred=preds_dict[sample_ind])
             for sample_ind in range(len(dp.targets))]

In [30]:
# Index of the sample with the lowest R^2 score for the GRU run.
ind = np.argmin(r2_scores)
ind


Out[30]:
3918

In [31]:
# Target series and GRU prediction for the lowest-R^2 sample.
reals = dp.targets[ind]
preds = preds_dict[ind]

In [32]:
# R^2 of the lowest-scoring sample.
# NOTE(review): the recorded value is of order -1e30, which presumably means the
# target series is (almost) constant so the R^2 denominator is near zero - confirm.
r2_score(y_true=reals, y_pred=preds)


Out[32]:
-3.2110824170500828e+30

In [33]:
# Plot the input window of the lowest-R^2 sample. Plain matplotlib is used, as in
# the other plotting cells, because `sns.tsplot` is deprecated and no longer
# exists in newer seaborn releases.
plt.plot(dp.inputs[ind].flatten())
plt.title('inputs of sample %d' % ind)
plt.show()


Out[33]:
<matplotlib.axes._subplots.AxesSubplot at 0x7f505c5c6e50>

In [34]:
# Targets (blue) against predictions (green) for the currently selected sample.
fig = plt.figure(figsize=(15, 6))
plt.plot(reals, 'b', label='reals')
plt.plot(preds, 'g', label='preds')
plt.legend()
plt.show()



In [35]:
%%time
# DTW distance between every target series and its GRU prediction; fastdtw
# returns (distance, path) and only the distance is kept.
# The loop variable is deliberately not called `ind`: under Python 2 a
# list-comprehension variable leaks into the notebook namespace and would
# overwrite the worst-sample index `ind` selected earlier.
dtw_scores = [fastdtw(dp.targets[sample_ind], preds_dict[sample_ind])[0]
              for sample_ind in range(len(dp.targets))]


CPU times: user 25.9 s, sys: 144 ms, total: 26 s
Wall time: 25.9 s

In [36]:
# Average DTW distance over all samples for the GRU run.
np.mean(dtw_scores)


Out[36]:
51.556931203833273

In [37]:
# statsmodels cointegration test between prediction and target of the lowest-R^2
# sample (`preds` / `reals` were set from `ind` above). The result is a 3-tuple;
# per the statsmodels docs: test statistic, p-value, critical values.
coint(preds, reals)


Out[37]:
(-0.83291084298420559,
 0.93023940691597928,
 array([-4.31395736, -3.55493606, -3.19393252]))

In [48]:
# Inspect one sample. The index is drawn from a generator seeded with the
# notebook-wide `seed`, so a fresh "Restart & Run All" always shows the same
# figure. The previous np.random.randint call used the unseeded global generator
# and picked a different sample on every run.
# Change the seed passed here to look at another sample.
sample_rng = np.random.RandomState(seed=seed)
cur_ind = sample_rng.randint(len(dp.targets))
reals = dp.targets[cur_ind]
preds = preds_dict[cur_ind]
fig = plt.figure(figsize=(15,6))
plt.plot(reals, 'b')
plt.plot(preds, 'g')
plt.legend(['reals','preds'])
plt.title('sample %d' % cur_ind)
plt.show()


Conclusion

???


In [ ]: