In [1]:

    
# -*- coding: UTF-8 -*-
#%load_ext autoreload
# reload_ext instead of load_ext: re-running this cell does not complain that
# the extension has already been loaded.
%reload_ext autoreload
# Mode 2: re-import all modules before each cell is executed, so edits to the
# project's own helper modules are picked up without restarting the kernel.
%autoreload 2



In [2]:

    
from __future__ import division
import tensorflow as tf
from os import path, remove
import numpy as np
import pandas as pd
import csv
from sklearn.model_selection import StratifiedShuffleSplit
from time import time
from matplotlib import pyplot as plt
import seaborn as sns
from mylibs.jupyter_notebook_helper import show_graph, renderStatsList, renderStatsCollection, \
    renderStatsListWithLabels, renderStatsCollectionOfCrossValids, plot_res_gp, my_plot_convergence
from tensorflow.contrib import rnn
from tensorflow.contrib import learn
import shutil
from tensorflow.contrib.learn.python.learn import learn_runner
from mylibs.tf_helper import getDefaultGPUconfig
from sklearn.metrics import r2_score
from mylibs.py_helper import factors
from fastdtw import fastdtw
from collections import OrderedDict
from scipy.spatial.distance import euclidean
from statsmodels.tsa.stattools import coint
from common_33 import get_or_run_nn
from skopt.space.space import Integer, Real
from skopt import gp_minimize
from skopt.plots import plot_convergence
import pickle
import inspect
import dill
import sys

from models.model_36_price_history_autoencoder import PriceHistoryAutoencoder
from data_providers.data_provider_33_price_history_autoencoder import PriceHistoryAutoEncDataProvider
#from gp_opt.price_history_27_gp_opt import PriceHistoryGpOpt









    



/home/studenthp/anaconda2/envs/dis/lib/python2.7/site-packages/statsmodels/compat/pandas.py:56: FutureWarning: The pandas.core.datetools module is deprecated and will be removed in a future version. Please use the pandas.tseries module instead.
  from pandas.core import datetools



In [3]:

    
# Notebook-wide configuration.
dtype = tf.float32
seed = 16011984
# Seeded generator; it is passed to the model as `rng` further down.
random_state = np.random.RandomState(seed=seed)
# Session configuration produced by the project helper (contents not shown here).
config = getDefaultGPUconfig()
# NOTE(review): n_jobs is not referenced by any cell visible in this notebook.
n_jobs = 1
%matplotlib inline

Step 0 - hyperparams



In [4]:

    
# Divisors of 689, which is the size of the first axis of the training arrays
# printed further down. Presumably used to choose a batch size that divides the
# dataset evenly (13 and 53 both appear later in this notebook).
factors(689)









    Out[4]:





[1, 53, 13, 689]



In [5]:

    
# Equals the second axis of the training arrays printed further down
# (e.g. inputs is (689, 682, 7)); passed to the model as `ts_len`.
max_seq_len = 682



In [6]:

    
# Root data directory, given relative to the notebook's working directory.
# This is specific to the original machine's layout; adjust it when running elsewhere.
data_path = '../../../../Dropbox/data'



In [7]:

    
# Sub-directory holding the .npz archives used by this notebook.
phae_path = data_path + '/price_hist_autoencoder'



In [8]:

    
# Path of the "full sequences + dates" archive; stop right here if it is absent.
npz_dates = '{}/price_history_full_seqs_dates.npz'.format(phae_path)
assert path.isfile(npz_dates)



In [9]:

    
# Path of the training archive; stop right here if it is absent.
npz_train = '{}/price_history_seqs_dates_normed_train.npz'.format(phae_path)
assert path.isfile(npz_train)



In [10]:

    
# Path of the test archive; stop right here if it is absent.
npz_test = '{}/price_history_seqs_dates_normed_test.npz'.format(phae_path)
assert path.isfile(npz_test)



In [11]:

    
# Common prefix of the train/test archives: the training path with its
# '_train.npz' suffix cut off. Later cells append '_test.npz' to it and hand
# the bare prefix to the model.
npz_path = npz_train[:-len('_train.npz')]



In [12]:

    
# List every array stored in the training archive together with its shape.
# The archive is opened as a context manager so that its file handle is
# released once the listing is done; a bare np.load(...) on an .npz keeps the
# underlying file open. Iterating `.files` loads one array at a time and, like
# the %-formatted print, behaves the same on Python 2 and Python 3.
with np.load(npz_train) as npz_file:
    for key in npz_file.files:
        print("%s , %s" % (key, npz_file[key].shape))









    



inputs , (689, 682, 7)
sku_ids , (689,)
sequence_masks , (689, 682)
extra_inputs , (689, 682, 6)
sequence_lengths , (689,)

Step 1 - collect data



In [13]:

    
# dp = PriceHistoryAutoEncDataProvider(npz_path=npz_path, batch_size=53, with_EOS=False)
# for data in dp.datalist:
#     print data.shape

(689, 682, 7)
(689, 682, 6)
(689,)
(689, 682)



In [14]:

    
# for item in dp.next():
#     print item.shape

Step 2 - Build model



In [15]:

    
# model = PriceHistoryAutoencoder(rng=random_state, dtype=dtype, config=config)
# graph = model.getGraph(batch_size=53,
                       
#                        #the way we have it these two must be equal for now
#                        enc_num_units = 10,
#                        hidden_enc_num_units = 10,
                       
#                        hidden_enc_dim = 12,
#                        hidden_dec_dim = 13,
                       
#                        #the way we have it these two must be equal for now
#                        hidden_dec_num_units = 14,
#                        dec_num_units = 14,
                       
#                        ts_len=max_seq_len)



In [16]:

    
#show_graph(graph)

Step 3 - training the network



In [17]:

    
# Model wrapper used for training below; it receives the seeded generator,
# the tensor dtype and the session config defined at the top of the notebook.
model = PriceHistoryAutoencoder(rng=random_state, dtype=dtype, config=config)



In [18]:

    
# Re-derive the test archive path from the shared prefix. This yields the same
# string that was assigned to npz_test earlier in the notebook.
npz_test = npz_path + '_test.npz'
assert path.isfile(npz_test)
# Displayed as the cell output to show which file is actually resolved.
path.abspath(npz_test)









    Out[18]:





'/home/studenthp/Dropbox/data/price_hist_autoencoder/price_history_seqs_dates_normed_test.npz'



In [19]:

    
def experiment():
    """Run one training session of the autoencoder with fixed hyperparameters.

    Takes no arguments, so it can be handed around as a zero-argument
    callable. The result of ``model.run`` is returned unchanged.
    """
    num_units = 250   # shared by all four *_num_units arguments
    hidden_dim = 101  # shared by hidden_enc_dim and hidden_dec_dim

    hyperparams = {
        'npz_path': npz_path,
        'epochs': 50,
        'batch_size': 13,  # 13 is one of the divisors of 689 listed earlier

        'enc_num_units': num_units,
        'hidden_enc_num_units': num_units,

        'hidden_enc_dim': hidden_dim,
        'hidden_dec_dim': hidden_dim,

        'hidden_dec_num_units': num_units,
        'dec_num_units': num_units,

        'ts_len': max_seq_len,
        'learning_rate': 1e-3,
        'learning_rate_diff': 1e-5,
        'preds_gather_enabled': True,
    }
    return model.run(**hyperparams)



In [20]:

    
#%%time
# dyn_stats_dic, preds_dict, targets, twods = experiment()
# Run the experiment through the project helper instead of calling it directly.
# NOTE(review): judging by its name and arguments, get_or_run_nn presumably
# reuses results stored under nn_runs_folder/filename when they exist and only
# trains otherwise -- confirm in common_33.
dyn_stats_dic, preds_dict, targets, twods = get_or_run_nn(experiment, filename='036_autoencoder_000',
                                                      nn_runs_folder = data_path + "/nn_runs")









    



epochs: 50
dec_num_units: 250
learning_rate: 0.001
enc_num_units: 250
learning_rate_diff: 1e-05
End Epoch 01 (330.771 secs): err(train) = 0.010194, err_diff(train) = 0.000186
End Epoch 02 (328.088 secs): err(train) = 0.004593, err_diff(train) = 0.000159
End Epoch 03 (328.042 secs): err(train) = 0.004937, err_diff(train) = 0.000158
End Epoch 04 (328.102 secs): err(train) = 0.004761, err_diff(train) = 0.000157
End Epoch 05 (334.092 secs): err(train) = 0.004762, err_diff(train) = 0.000156
End Epoch 06 (332.798 secs): err(train) = 0.004744, err_diff(train) = 0.000158
End Epoch 07 (332.767 secs): err(train) = 0.004913, err_diff(train) = 0.000159
End Epoch 08 (332.537 secs): err(train) = 0.004847, err_diff(train) = 0.000157
End Epoch 09 (332.986 secs): err(train) = 0.004791, err_diff(train) = 0.000156
End Epoch 10 (332.988 secs): err(train) = 0.004777, err_diff(train) = 0.000156
End Epoch 11 (332.980 secs): err(train) = 0.004670, err_diff(train) = 0.000157
End Epoch 12 (332.938 secs): err(train) = 0.004852, err_diff(train) = 0.000157
End Epoch 13 (333.061 secs): err(train) = 0.004705, err_diff(train) = 0.000157
End Epoch 14 (332.992 secs): err(train) = 0.004741, err_diff(train) = 0.000158
End Epoch 15 (333.019 secs): err(train) = 0.004912, err_diff(train) = 0.000157
End Epoch 16 (333.035 secs): err(train) = 0.004804, err_diff(train) = 0.000156
End Epoch 17 (333.005 secs): err(train) = 0.004787, err_diff(train) = 0.000156
End Epoch 18 (333.012 secs): err(train) = 0.004727, err_diff(train) = 0.000155
End Epoch 19 (332.998 secs): err(train) = 0.004646, err_diff(train) = 0.000156
End Epoch 20 (333.039 secs): err(train) = 0.004727, err_diff(train) = 0.000157
End Epoch 21 (333.020 secs): err(train) = 0.004799, err_diff(train) = 0.000157
End Epoch 22 (332.877 secs): err(train) = 0.004676, err_diff(train) = 0.000156
End Epoch 23 (332.913 secs): err(train) = 0.004768, err_diff(train) = 0.000156
End Epoch 24 (332.807 secs): err(train) = 0.004652, err_diff(train) = 0.000156
End Epoch 25 (332.920 secs): err(train) = 0.004823, err_diff(train) = 0.000158
End Epoch 26 (332.853 secs): err(train) = 0.004778, err_diff(train) = 0.000157
End Epoch 27 (332.790 secs): err(train) = 0.004717, err_diff(train) = 0.000157
End Epoch 28 (332.547 secs): err(train) = 0.004281, err_diff(train) = 0.000158
End Epoch 29 (332.686 secs): err(train) = 0.003597, err_diff(train) = 0.000157
End Epoch 30 (332.696 secs): err(train) = 0.003277, err_diff(train) = 0.000157
End Epoch 31 (332.648 secs): err(train) = 0.002890, err_diff(train) = 0.000156
End Epoch 32 (332.779 secs): err(train) = 0.002410, err_diff(train) = 0.000156
End Epoch 33 (332.756 secs): err(train) = 0.002785, err_diff(train) = 0.000156
End Epoch 34 (332.666 secs): err(train) = 0.002189, err_diff(train) = 0.000156
End Epoch 35 (332.685 secs): err(train) = 0.002568, err_diff(train) = 0.000156
End Epoch 36 (332.713 secs): err(train) = 0.002466, err_diff(train) = 0.000156
End Epoch 37 (332.673 secs): err(train) = 0.002306, err_diff(train) = 0.000156
End Epoch 38 (332.647 secs): err(train) = 0.001951, err_diff(train) = 0.000155
End Epoch 39 (332.800 secs): err(train) = 0.001870, err_diff(train) = 0.000156
End Epoch 40 (332.752 secs): err(train) = 0.002223, err_diff(train) = 0.000156
End Epoch 41 (332.648 secs): err(train) = 0.001797, err_diff(train) = 0.000156
End Epoch 42 (332.650 secs): err(train) = 0.002142, err_diff(train) = 0.000156
End Epoch 43 (332.633 secs): err(train) = 0.002246, err_diff(train) = 0.000155
End Epoch 44 (332.652 secs): err(train) = 0.001875, err_diff(train) = 0.000156
End Epoch 45 (332.630 secs): err(train) = 0.001692, err_diff(train) = 0.000155
End Epoch 46 (332.603 secs): err(train) = 0.001782, err_diff(train) = 0.000155
End Epoch 47 (332.881 secs): err(train) = 0.001445, err_diff(train) = 0.000155
End Epoch 48 (332.995 secs): err(train) = 0.001631, err_diff(train) = 0.000155
End Epoch 49 (332.910 secs): err(train) = 0.001796, err_diff(train) = 0.000155
End Epoch 50 (332.567 secs): err(train) = 0.001534, err_diff(train) = 0.000155
total test error: 0.00490289210465
total test diff error: 0.000155301308865



In [21]:

    
# Draw the recorded statistics, one figure per tracker, in the original order.
for stats_key in ('dyn_stats', 'dyn_stats_diff'):
    dyn_stats_dic[stats_key].plotStats()
    plt.show()



In [22]:

    
# R^2 between every target series and its reconstruction, in index order.
r2_scores = []
for ind in range(len(targets)):
    r2_scores.append(r2_score(y_true=targets[ind], y_pred=preds_dict[ind]))



In [23]:

    
# Position of the lowest (i.e. worst) R^2 score; the cells that follow inspect
# that particular series.
ind = np.argmin(r2_scores)
ind









    Out[23]:





35



In [24]:

    
# Target and reconstruction of the worst-scoring series selected above.
reals = targets[ind]
preds = preds_dict[ind]



In [25]:

    
# R^2 of the selected series on its own (the minimum of r2_scores).
r2_score(y_true=reals, y_pred=preds)









    Out[25]:





-269853566.37884647



In [26]:

    
#sns.tsplot(data=dp.inputs[ind].flatten())



In [27]:

    
# Overlay the target (blue) and its reconstruction (green) on a single axes.
fig = plt.figure(figsize=(15, 6))
ax = fig.gca()
ax.plot(reals, 'b')
ax.plot(preds, 'g')
ax.legend(['reals', 'preds'])
plt.show()



In [28]:

    
%%time
# DTW distance between every target series and its reconstruction; fastdtw
# returns a pair and only its first element (the distance) is kept.
# The loop variable is deliberately not named `ind`: under Python 2 a list
# comprehension leaks its variable into the notebook namespace, which would
# silently overwrite the worst-R^2 index `ind` chosen in an earlier cell.
dtw_scores = [fastdtw(targets[series_ind], preds_dict[series_ind])[0]
              for series_ind in range(len(targets))]









    



CPU times: user 10.1 s, sys: 64 ms, total: 10.2 s
Wall time: 10.1 s



In [29]:

    
# Average DTW distance over all series.
np.mean(dtw_scores)









    Out[29]:





20.081989750539115



In [30]:

    
# Cointegration test between the reconstruction and the target currently held
# in `preds` / `reals`. Per the statsmodels documentation the result is
# (test statistic, p-value, critical values at the 1%, 5% and 10% levels).
coint(preds, reals)









    Out[30]:





(-0.79621380079257409,
 0.93519951574637317,
 array([-4.21692744, -3.50625177, -3.16106388]))



In [45]:

    
# Plot one randomly chosen series against its reconstruction.
# The index is drawn from the notebook's seeded `random_state` rather than the
# unseeded global `np.random`, so a fresh "Restart & Run All" shows the same
# sample, while re-running just this cell still moves on to another one.
cur_ind = random_state.randint(len(targets))
reals = targets[cur_ind]
preds = preds_dict[cur_ind]
fig = plt.figure(figsize=(15,6))
# Both lines carry their own label so the legend cannot drift out of sync
# with the plotting order.
plt.plot(reals, 'b', label='reals')
plt.plot(preds, 'g', label='preds')
plt.legend()
# Record which series is shown, since the index is random.
plt.title('series index %d' % cur_ind)
plt.show()

Conclusion

The autoencoder is not able to represent our price history time series in a visually obvious way.

TS in Two Dimensions



In [32]:

    
# Stack the per-series two-dimensional codes into a single (n_series, 2) array.
# The values are materialised with list() first: handing a dict *view* (what
# .values() returns on Python 3) straight to np.array produces a 0-d object
# array instead of a 2-D numeric one. On Python 2 the result is unchanged.
twod_arr = np.array(list(twods.values()))
twod_arr.shape









    Out[32]:





(689, 2)



In [33]:

    
# Scatter of the two learned dimensions, one red dot per time series.
fig_2d, ax_2d = plt.subplots(figsize=(16, 7))
ax_2d.plot(twod_arr[:, 0], twod_arr[:, 1], 'r.')
ax_2d.set_title('two dimensional representation of our time series after dimensionality reduction')
ax_2d.set_xlabel('first dimension')
ax_2d.set_ylabel('second dimension')
plt.show()



In [ ]: