notebook.community

Edit and run



In [1]:

    
# -*- coding: UTF-8 -*-
# Reload edited project modules automatically before each cell executes.
# %reload_ext is used instead of %load_ext (kept below for reference) so that this cell can
# be re-run when the extension is already loaded.
#%load_ext autoreload
%reload_ext autoreload
%autoreload 2

https://r2rt.com/recurrent-neural-networks-in-tensorflow-iii-variable-length-sequences.html



In [2]:

    
from __future__ import division
import tensorflow as tf
from os import path
import numpy as np
import pandas as pd
import csv
from sklearn.model_selection import StratifiedShuffleSplit
from time import time
from matplotlib import pyplot as plt
import seaborn as sns
from mylibs.jupyter_notebook_helper import show_graph
from tensorflow.contrib import rnn
from tensorflow.contrib import learn
import shutil
from tensorflow.contrib.learn.python.learn import learn_runner
from IPython.display import Image
from IPython.core.display import HTML
from mylibs.tf_helper import getDefaultGPUconfig
from data_providers.binary_shifter_varlen_data_provider import \
    BinaryShifterVarLenDataProvider
from data_providers.price_history_varlen_data_provider import PriceHistoryVarLenDataProvider
from models.model_05_price_history_rnn_varlen import PriceHistoryRnnVarlen
from sklearn.metrics import r2_score
from mylibs.py_helper import factors
from fastdtw import fastdtw
from scipy.spatial.distance import euclidean
from statsmodels.tsa.stattools import coint









    



---------------------------------------------------------------------------
ImportError                               Traceback (most recent call last)
<ipython-input-2-6e6c99a120d5> in <module>()
     19 from data_providers.binary_shifter_varlen_data_provider import     BinaryShifterVarLenDataProvider
     20 from data_providers.price_history_varlen_data_provider import PriceHistoryVarLenDataProvider
---> 21 from models.price_history_rnn_varlen import PriceHistoryRnnVarlen
     22 from sklearn.metrics import r2_score
     23 from mylibs.py_helper import factors

ImportError: No module named price_history_rnn_varlen



In [ ]:

    
dtype = tf.float32
seed = 16011984
random_state = np.random.RandomState(seed=seed)
config = getDefaultGPUconfig()
%matplotlib inline



In [ ]:

    
from common import get_or_run_nn

Step 0 - hyperparams



In [ ]:

    
# Sequence geometry: inputs of up to 60 steps, targets of 30 steps.
series_max_len = 60
target_len = 30
num_features = 1  # just one here: the function we are predicting is one-dimensional

# Network and training settings.
state_size = 400
batch_size = 47
num_epochs = 10

Step 1 - Collect the data (and/or generate it)



In [ ]:

    
# Dataset archive read by the data provider and passed to model.run().
npz_path = '../price_history_03_dp_60to30_from_fixed_len.npz'
# Source CSV; in this notebook it is only referenced by the commented-out
# dataset-creation cell.
csv_in = '../price_history_03a_fixed_width.csv'



In [ ]:

    
# Dataset generation, intentionally left commented out: it would build the archive at
# npz_path from csv_in (60 input steps, 30 target steps).
# NOTE(review): presumably only needs to be re-enabled when the archive must be rebuilt.
# XX, YY, sequence_lens, seq_mask = PriceHistoryVarLenDataProvider.createAndSaveDataset(
#     csv_in=csv_in,
#     npz_out=npz_path,
#     input_seq_len=60, target_seq_len=30)
# XX.shape, YY.shape, sequence_lens.shape, seq_mask.shape



In [ ]:

    
# Load the archive through the provider, passing a length predicate (>= target_len);
# presumably samples failing it are dropped — verify against the provider.
dp = PriceHistoryVarLenDataProvider(
    npz_path=npz_path,
    filteringSeqLens=lambda xx: xx >= target_len,
)
# Shapes of the four arrays the provider exposes.
dp.inputs.shape, dp.targets.shape, dp.sequence_lengths.shape, dp.sequence_masks.shape

Step 2 - Build model



In [ ]:

    
# Instantiate the model with the seeded RandomState, dtype and session config defined above.
model = PriceHistoryRnnVarlen(rng=random_state, dtype=dtype, config=config)



In [ ]:

    
# Build the model graph for the configured batch, state and sequence sizes.
graph = model.getGraph(
    series_max_len=series_max_len,
    target_len=target_len,
    state_size=state_size,
    batch_size=batch_size,
)



In [ ]:

    
# Display the graph built in the previous cell (helper from mylibs.jupyter_notebook_helper).
show_graph(graph)

Step 3 - Train the network



In [ ]:

    
# Echo the settings that the training run below uses.
num_epochs, state_size, batch_size



In [ ]:

    
def experiment():
    """Train/evaluate the model with the notebook-level hyperparameters.

    Reads ``num_epochs``, ``state_size``, ``series_max_len``, ``target_len``,
    ``npz_path`` and ``batch_size`` from the notebook namespace.

    Returns:
        The ``(dynStats, predictions_dict)`` pair produced by ``model.run``.

    NOTE(review): the ``get_or_run_nn`` traceback shown in this notebook unpacks three
    values (``dyn_stats, preds_dict, targets``) from its callback, whereas two are
    returned here - confirm which contract is current.
    """
    run_kwargs = dict(
        epochs=num_epochs,
        state_size=state_size,
        series_max_len=series_max_len,
        target_len=target_len,
        npz_path=npz_path,
        batch_size=batch_size,
    )
    # Unpack explicitly so that anything other than a 2-item result fails right here.
    stats, predictions = model.run(**run_kwargs)
    return stats, predictions



In [ ]:

    
from os.path import isdir

# Folder that holds the cached runs; it sits outside the repository, so fail early with a
# message naming the path when it is not present on this machine.
data_folder = '../../../../Dropbox/data'
assert isdir(data_folder), "data folder not found: %s" % data_folder



In [16]:

    
# Load the cached result for this filename from nn_runs_folder or, when no such file exists,
# run experiment() (see the isfile() branch visible in the traceback below).
# NOTE(review): that traceback shows get_or_run_nn returning three values
# (dyn_stats, preds_dict, targets) while only two are unpacked here, and the cached archive
# has no 'targets' entry - the cache file and this call need to be brought in line; confirm.
dyn_stats, preds_dict = get_or_run_nn(experiment,
                                      filename='001_plain_rnn_60to30', nn_runs_folder= data_folder + '/nn_runs')









    



---------------------------------------------------------------------------
KeyError                                  Traceback (most recent call last)
<ipython-input-16-dbc4d412c4d6> in <module>()
      1 dyn_stats, preds_dict = get_or_run_nn(experiment,
----> 2                                       filename='001_plain_rnn_60to30', nn_runs_folder= data_folder + '/nn_runs')

/home/student/pligor.george@gmail.com/msc_Artificial_Intelligence/dissertation/04_time_series_prediction/common.pyc in get_or_run_nn(callback, filename, nn_runs_folder)
      9     if isfile(filepath):
     10         arr = np.load(filepath)
---> 11         return arr['dyn_stats'][()], arr['preds_dict'][()], arr['targets'][()]
     12     else:
     13         dyn_stats, preds_dict, targets = callback()

/home/student/anaconda2/envs/dis/lib/python2.7/site-packages/numpy/lib/npyio.pyc in __getitem__(self, key)
    235                 return self.zip.read(key)
    236         else:
--> 237             raise KeyError("%s is not a file in the archive" % key)
    238 
    239     def __iter__(self):

KeyError: 'targets is not a file in the archive'



In [48]:

    
# Plot the statistics collected during the run (Huber-loss model).
dyn_stats.plotStats()
plt.show()



In [49]:

    
# R^2 of every prediction against its target series.
# The loop variable is deliberately not named `ind`: under Python 2 a list-comprehension
# variable leaks into the notebook namespace and would overwrite the `ind` used by the
# inspection cells.
r2_scores = [r2_score(y_true=dp.targets[sample_ind], y_pred=preds_dict[sample_ind])
             for sample_ind in range(len(dp.targets))]



In [50]:

    
# Index of the sample with the lowest (i.e. worst) R^2 score.
ind = np.argmin(r2_scores)
ind









    Out[50]:





3918



In [51]:

    
# Plot the (flattened) input series of the selected sample.
# NOTE(review): sns.tsplot is no longer available in recent seaborn releases - confirm the
# pinned seaborn version before upgrading.
sns.tsplot(data=dp.inputs[ind].flatten())









    Out[51]:





<matplotlib.axes._subplots.AxesSubplot at 0x7f21d50c1590>



In [52]:

    
# Target series and prediction of the selected sample, reused by the cells below.
reals = dp.targets[ind]
preds = preds_dict[ind]



In [53]:

    
# R^2 of the selected sample only.
r2_score(y_true=reals, y_pred=preds)









    Out[53]:





-2.4957652385534625e+31



In [54]:

    
# Overlay the target series (blue) and the prediction (green) of the selected sample.
fig, ax = plt.subplots(figsize=(15, 6))
ax.plot(reals, 'b')
ax.plot(preds, 'g')
ax.legend(['reals', 'preds'])
plt.show()



In [24]:

    
%%time
dtw_scores = [fastdtw(dp.targets[ind], preds_dict[ind])[0]
             for ind in range(len(dp.targets))]









    



CPU times: user 25.7 s, sys: 212 ms, total: 25.9 s
Wall time: 25.7 s



In [25]:

    
# Average DTW distance over all samples.
np.mean(dtw_scores)









    Out[25]:





219.4250192073776



In [26]:

    
# Cointegration test between the prediction and the target of the selected sample.
# Per the statsmodels documentation the result is (t-statistic, p-value, critical values at
# 1%/5%/10%) and the null hypothesis is "no cointegration".
coint(preds, reals)









    Out[26]:





(-1.0835250967946168,
 0.88582660333674834,
 array([-4.31395736, -3.55493606, -3.19393252]))

TODO: Cointegration

https://en.wikipedia.org/wiki/Cointegration
https://www.quora.com/What-are-some-methods-to-check-similarities-between-two-time-series-data-sets
https://stackoverflow.com/questions/11362943/efficient-cointegration-test-in-python

Mean Squared Error (instead of huber loss)



In [55]:

    
# Switch the cost function to mean squared error for the second experiment.
cost_func = PriceHistoryRnnVarlen.COST_FUNCS.MSE
# Kept as the last expression so the cell actually displays the settings: a bare expression
# on a non-final line of a cell is evaluated but never shown.
num_epochs, state_size, batch_size



In [56]:

    
def experiment():
    """Train/evaluate the model using the cost function selected in ``cost_func``.

    Same settings as the earlier ``experiment`` cell plus ``cost_func``; note that this
    definition shadows that earlier one in the notebook namespace.

    Returns:
        The ``(dynStats, predictions_dict)`` pair produced by ``model.run``.
    """
    run_kwargs = dict(
        epochs=num_epochs,
        cost_func=cost_func,
        state_size=state_size,
        series_max_len=series_max_len,
        target_len=target_len,
        npz_path=npz_path,
        batch_size=batch_size,
    )
    # Unpack explicitly so that anything other than a 2-item result fails right here.
    stats, predictions = model.run(**run_kwargs)
    return stats, predictions



In [57]:

    
# Load the cached MSE run or, when it is missing, execute experiment().
# nn_runs_folder is passed explicitly, exactly as in the Huber-loss call earlier in this
# notebook, so that both runs are cached in (and loaded from) the same folder.
dyn_stats, preds_dict = get_or_run_nn(experiment,
                                      filename='001_plain_rnn_60to30_mse',
                                      nn_runs_folder=data_folder + '/nn_runs')



In [58]:

    
# Plot the statistics collected during the run (MSE model).
dyn_stats.plotStats()
plt.show()



In [59]:

    
# R^2 of every prediction against its target series.
# The loop variable is deliberately not named `ind`: under Python 2 a list-comprehension
# variable leaks into the notebook namespace and would overwrite the `ind` used by the
# inspection cells.
r2_scores = [r2_score(y_true=dp.targets[sample_ind], y_pred=preds_dict[sample_ind])
             for sample_ind in range(len(dp.targets))]



In [60]:

    
# Index of the sample with the lowest (i.e. worst) R^2 score.
ind = np.argmin(r2_scores)
ind









    Out[60]:





3918



In [61]:

    
# Target series and prediction of the selected sample, reused by the cells below.
reals = dp.targets[ind]
preds = preds_dict[ind]



In [62]:

    
# R^2 of the selected sample only.
r2_score(y_true=reals, y_pred=preds)









    Out[62]:





-3.9319565891147985e+31



In [63]:

    
# Plot the (flattened) input series of the selected sample.
# NOTE(review): sns.tsplot is no longer available in recent seaborn releases - confirm the
# pinned seaborn version before upgrading.
sns.tsplot(data=dp.inputs[ind].flatten())









    Out[63]:





<matplotlib.axes._subplots.AxesSubplot at 0x7f21c87a1d50>



In [64]:

    
# Overlay the target series (blue) and the prediction (green) of the selected sample.
fig, ax = plt.subplots(figsize=(15, 6))
ax.plot(reals, 'b')
ax.plot(preds, 'g')
ax.legend(['reals', 'preds'])
plt.show()



In [43]:

    
%%time
dtw_scores = [fastdtw(dp.targets[ind], preds_dict[ind])[0]
             for ind in range(len(dp.targets))]









    



CPU times: user 25.3 s, sys: 208 ms, total: 25.5 s
Wall time: 25.4 s



In [44]:

    
# Average DTW distance over all samples.
np.mean(dtw_scores)









    Out[44]:





238.41310051455611



In [65]:

    
# Cointegration test between the prediction and the target of the selected sample.
# Per the statsmodels documentation the result is (t-statistic, p-value, critical values at
# 1%/5%/10%) and the null hypothesis is "no cointegration".
coint(preds, reals)









    Out[65]:





(-1.2081181303105468,
 0.85576237543792955,
 array([-4.31395736, -3.55493606, -3.19393252]))

Conclusion

A first indication is that the MSE cost function does not perform better than the Huber loss (mean DTW distance 238.4 for MSE versus 219.4 for Huber; lower is better); it only seems to be slightly faster.



In [72]:



In [ ]: