In [1]:
# -*- coding: UTF-8 -*-
# %reload_ext is used in place of the commented-out %load_ext below.
#%load_ext autoreload
%reload_ext autoreload
# Mode 2: edits to imported project modules are picked up without restarting the kernel.
%autoreload 2

In [2]:
from __future__ import division
import tensorflow as tf
from os import path
import numpy as np
import pandas as pd
import csv
from sklearn.model_selection import StratifiedShuffleSplit
from time import time
from matplotlib import pyplot as plt
import seaborn as sns
from mylibs.jupyter_notebook_helper import show_graph
from tensorflow.contrib import rnn
from tensorflow.contrib import learn
import shutil
from tensorflow.contrib.learn.python.learn import learn_runner
from IPython.display import Image
from IPython.core.display import HTML
from mylibs.tf_helper import getDefaultGPUconfig
from data_providers.binary_shifter_varlen_data_provider import \
    BinaryShifterVarLenDataProvider
from data_providers.price_history_varlen_data_provider import PriceHistoryVarLenDataProvider
from models.model_05_price_history_rnn_varlen import PriceHistoryRnnVarlen
from sklearn.metrics import r2_score
from mylibs.py_helper import factors
from fastdtw import fastdtw
from scipy.spatial.distance import euclidean
from statsmodels.tsa.stattools import coint


---------------------------------------------------------------------------
ImportError                               Traceback (most recent call last)
<ipython-input-2-6e6c99a120d5> in <module>()
     19 from data_providers.binary_shifter_varlen_data_provider import     BinaryShifterVarLenDataProvider
     20 from data_providers.price_history_varlen_data_provider import PriceHistoryVarLenDataProvider
---> 21 from models.price_history_rnn_varlen import PriceHistoryRnnVarlen
     22 from sklearn.metrics import r2_score
     23 from mylibs.py_helper import factors

ImportError: No module named price_history_rnn_varlen

In [ ]:
# Numeric type handed to the model constructor.
dtype = tf.float32
# Fixed seed so that every use of `random_state` is repeatable across runs.
seed = 16011984
random_state = np.random.RandomState(seed=seed)
# Configuration object from the project helper; passed to the model as `config`.
config = getDefaultGPUconfig()
# Render matplotlib figures inside the notebook.
%matplotlib inline

In [ ]:
from common import get_or_run_nn

Step 0 - Hyperparameters


In [ ]:
# --- training schedule ---
num_epochs = 10
batch_size = 47

# --- sequence geometry: 60 input steps, 30 predicted steps ---
series_max_len = 60
target_len = 30
num_features = 1  # only one: the function being predicted is one-dimensional

# --- model capacity (passed to the model as `state_size`) ---
state_size = 400

Step 1 - collect data (and/or generate them)


In [ ]:
# Raw fixed-width price history (only needed when regenerating the dataset).
csv_in = "../price_history_03a_fixed_width.csv"
# Pre-built 60-to-30 dataset archive consumed by the data provider and by model.run.
npz_path = "../price_history_03_dp_60to30_from_fixed_len.npz"

In [ ]:
# Disabled on purpose: presumably a one-off step that builds the npz archive at `npz_path`
# from `csv_in` - TODO confirm, and uncomment only when the archive has to be regenerated.
# XX, YY, sequence_lens, seq_mask = PriceHistoryVarLenDataProvider.createAndSaveDataset(
#     csv_in=csv_in,
#     npz_out=npz_path,
#     input_seq_len=60, target_seq_len=30)
# XX.shape, YY.shape, sequence_lens.shape, seq_mask.shape

In [ ]:
# Load the dataset archive; the predicate is handed to the provider as `filteringSeqLens`
# (presumably it keeps only series at least `target_len` long - verify in the provider).
dp = PriceHistoryVarLenDataProvider(
    npz_path=npz_path,
    filteringSeqLens=lambda seq_len: seq_len >= target_len)
# Quick sanity check of what was loaded.
dp.inputs.shape, dp.targets.shape, dp.sequence_lengths.shape, dp.sequence_masks.shape

Step 2 - Build model


In [ ]:
# Build the model wrapper from the seeded RNG, the dtype and the config defined earlier.
model = PriceHistoryRnnVarlen(rng=random_state, dtype=dtype, config=config)

In [ ]:
# Ask the model for its graph using the hyperparameters chosen in Step 0.
graph = model.getGraph(
    batch_size=batch_size,
    state_size=state_size,
    target_len=target_len,
    series_max_len=series_max_len)

In [ ]:
# Display the graph with the project's notebook helper.
show_graph(graph)

Step 3 - Train the network


In [ ]:
# Echo the settings that the training run below will use.
num_epochs, state_size, batch_size

In [ ]:
def experiment():
    """Run one training session with the notebook-level hyperparameters.

    Reads `model`, `num_epochs`, `state_size`, `series_max_len`, `target_len`,
    `npz_path` and `batch_size` from the notebook namespace at call time.

    Returns:
        The two values produced by `model.run`, unchanged and in the same order.

    NOTE(review): the traceback recorded in this notebook shows
    `common.get_or_run_nn` unpacking three values (including `targets`) from
    its callback - confirm that this two-value contract still matches it.
    """
    run_kwargs = dict(
        epochs=num_epochs,
        state_size=state_size,
        series_max_len=series_max_len,
        target_len=target_len,
        npz_path=npz_path,
        batch_size=batch_size,
    )
    stats, predictions = model.run(**run_kwargs)
    return stats, predictions

In [ ]:
from os.path import isdir
# Parent of the 'nn_runs' folder that is handed to get_or_run_nn; the path is
# relative, so it depends on the directory the notebook is started from.
data_folder = '../../../../Dropbox/data'
# Fail early and name the missing path instead of raising a bare AssertionError.
assert isdir(data_folder), "data folder not found: %s" % data_folder

In [16]:
# Reuse the cached run stored under <data_folder>/nn_runs if there is one, otherwise train.
# NOTE(review): the recorded traceback shows get_or_run_nn returning three values
# (dyn_stats, preds_dict, targets) - confirm that unpacking two is still correct.
dyn_stats, preds_dict = get_or_run_nn(
    experiment,
    filename='001_plain_rnn_60to30',
    nn_runs_folder=data_folder + '/nn_runs')


---------------------------------------------------------------------------
KeyError                                  Traceback (most recent call last)
<ipython-input-16-dbc4d412c4d6> in <module>()
      1 dyn_stats, preds_dict = get_or_run_nn(experiment,
----> 2                                       filename='001_plain_rnn_60to30', nn_runs_folder= data_folder + '/nn_runs')

/home/student/pligor.george@gmail.com/msc_Artificial_Intelligence/dissertation/04_time_series_prediction/common.pyc in get_or_run_nn(callback, filename, nn_runs_folder)
      9     if isfile(filepath):
     10         arr = np.load(filepath)
---> 11         return arr['dyn_stats'][()], arr['preds_dict'][()], arr['targets'][()]
     12     else:
     13         dyn_stats, preds_dict, targets = callback()

/home/student/anaconda2/envs/dis/lib/python2.7/site-packages/numpy/lib/npyio.pyc in __getitem__(self, key)
    235                 return self.zip.read(key)
    236         else:
--> 237             raise KeyError("%s is not a file in the archive" % key)
    238 
    239     def __iter__(self):

KeyError: 'targets is not a file in the archive'

In [48]:
# Plot the statistics recorded for the run.
dyn_stats.plotStats()
plt.show()



In [49]:
# One R^2 score per series: stored targets versus the model's predictions.
# The loop variable is deliberately not called `ind`: under Python 2 a list-comprehension
# variable leaks into the notebook namespace and would overwrite the `ind` that the
# inspection cells use to pick a series.
r2_scores = [r2_score(y_true=dp.targets[series_ind], y_pred=preds_dict[series_ind])
             for series_ind in range(len(dp.targets))]

In [50]:
# Position of the lowest (i.e. worst) R^2 score; used below to inspect that series.
ind = np.argmin(r2_scores)
ind


Out[50]:
3918

In [51]:
# Plot the input of the selected series, flattened to one dimension.
sns.tsplot(data=dp.inputs[ind].flatten())


Out[51]:
<matplotlib.axes._subplots.AxesSubplot at 0x7f21d50c1590>

In [52]:
# Actual targets and model predictions for the selected series.
reals = dp.targets[ind]
preds = preds_dict[ind]

In [53]:
# R^2 of the selected series only.
# NOTE(review): a magnitude like the recorded ~-1e31 suggests the targets of this series
# have (almost) zero variance, which makes R^2 uninformative here - confirm.
r2_score(y_true=reals, y_pred=preds)


Out[53]:
-2.4957652385534625e+31

In [54]:
# Overlay the actual targets (blue) and the predictions (green) for the selected series.
fig = plt.figure(figsize=(15, 6))
for series, line_style in ((reals, 'b'), (preds, 'g')):
    plt.plot(series, line_style)
plt.legend(['reals', 'preds'])
plt.show()



In [24]:
%%time
# DTW distance between targets and predictions for every series; [0] keeps the first
# element of fastdtw's result (the distance).
# The loop variable is deliberately not called `ind`: under Python 2 it leaks out of the
# comprehension and would leave `ind` pointing at the last series instead of the one
# selected for inspection.
dtw_scores = [fastdtw(dp.targets[series_ind], preds_dict[series_ind])[0]
              for series_ind in range(len(dp.targets))]


CPU times: user 25.7 s, sys: 212 ms, total: 25.9 s
Wall time: 25.7 s

In [25]:
# Average DTW distance over all series.
np.mean(dtw_scores)


Out[25]:
219.4250192073776

In [26]:
# Cointegration test between predictions and targets of the selected series; the recorded
# output is a 3-tuple (presumably test statistic, p-value, critical values - see statsmodels).
coint(preds, reals)


Out[26]:
(-1.0835250967946168,
 0.88582660333674834,
 array([-4.31395736, -3.55493606, -3.19393252]))

Mean Squared Error (instead of Huber loss)


In [55]:
# Use mean squared error as the cost function for the runs in this section.
cost_func = PriceHistoryRnnVarlen.COST_FUNCS.MSE
# Echo the settings as the last expression so that the notebook actually displays them
# (an expression followed by another statement is silently discarded).
num_epochs, state_size, batch_size

In [56]:
def experiment():
    """Run one training session using the notebook-level `cost_func`.

    Reads `model`, `num_epochs`, `cost_func`, `state_size`, `series_max_len`,
    `target_len`, `npz_path` and `batch_size` from the notebook namespace at
    call time.

    Returns:
        The two values produced by `model.run`, unchanged and in the same order.
    """
    run_kwargs = dict(
        epochs=num_epochs,
        cost_func=cost_func,
        state_size=state_size,
        series_max_len=series_max_len,
        target_len=target_len,
        npz_path=npz_path,
        batch_size=batch_size,
    )
    stats, predictions = model.run(**run_kwargs)
    return stats, predictions

In [57]:
# Reuse the cached MSE run if there is one, otherwise train.
# The cache folder is passed explicitly, matching the Huber-loss call earlier in the
# notebook, so both runs are stored under the same <data_folder>/nn_runs directory.
dyn_stats, preds_dict = get_or_run_nn(experiment,
                                      filename='001_plain_rnn_60to30_mse',
                                      nn_runs_folder=data_folder + '/nn_runs')

In [58]:
# Plot the statistics recorded for the MSE run.
dyn_stats.plotStats()
plt.show()



In [59]:
# One R^2 score per series for the MSE run.
# The loop variable is deliberately not called `ind`: under Python 2 a list-comprehension
# variable leaks into the notebook namespace and would overwrite the `ind` that the
# inspection cells use to pick a series.
r2_scores = [r2_score(y_true=dp.targets[series_ind], y_pred=preds_dict[series_ind])
             for series_ind in range(len(dp.targets))]

In [60]:
# Position of the lowest (i.e. worst) R^2 score of the MSE run.
ind = np.argmin(r2_scores)
ind


Out[60]:
3918

In [61]:
# Actual targets and MSE-run predictions for the selected series.
reals = dp.targets[ind]
preds = preds_dict[ind]

In [62]:
# R^2 of the selected series only (see the recorded output for its magnitude).
r2_score(y_true=reals, y_pred=preds)


Out[62]:
-3.9319565891147985e+31

In [63]:
# Plot the input of the selected series, flattened to one dimension.
sns.tsplot(data=dp.inputs[ind].flatten())


Out[63]:
<matplotlib.axes._subplots.AxesSubplot at 0x7f21c87a1d50>

In [64]:
# Overlay the actual targets (blue) and the MSE-run predictions (green).
fig = plt.figure(figsize=(15, 6))
for series, line_style in ((reals, 'b'), (preds, 'g')):
    plt.plot(series, line_style)
plt.legend(['reals', 'preds'])
plt.show()



In [43]:
%%time
# DTW distance between targets and MSE-run predictions for every series; [0] keeps the
# first element of fastdtw's result (the distance).
# The loop variable is deliberately not called `ind`: under Python 2 it leaks out of the
# comprehension and would leave `ind` pointing at the last series instead of the one
# selected for inspection.
dtw_scores = [fastdtw(dp.targets[series_ind], preds_dict[series_ind])[0]
              for series_ind in range(len(dp.targets))]


CPU times: user 25.3 s, sys: 208 ms, total: 25.5 s
Wall time: 25.4 s

In [44]:
# Average DTW distance over all series for the MSE run.
np.mean(dtw_scores)


Out[44]:
238.41310051455611

In [65]:
# Cointegration test between the MSE-run predictions and the targets of the selected series;
# the recorded output is a 3-tuple (presumably test statistic, p-value, critical values).
coint(preds, reals)


Out[65]:
(-1.2081181303105468,
 0.85576237543792955,
 array([-4.31395736, -3.55493606, -3.19393252]))

Conclusion

A first indication is that training with the MSE cost function does not give better results than the Huber loss (mean DTW distance of about 238.4 versus 219.4, where lower is better); it is only slightly faster.


In [72]:


In [ ]: