In [1]:
# -*- coding: UTF-8 -*-
# Enable IPython's autoreload extension so edits to the project modules
# (mylibs, models, data_providers) are picked up without restarting the kernel.
# `%reload_ext` is used instead of `%load_ext` so re-running this cell is harmless.
#%load_ext autoreload
%reload_ext autoreload
# Mode 2: reload all modules before executing each cell.
%autoreload 2

In [2]:
from __future__ import division
import tensorflow as tf
from os import path, remove
import numpy as np
import pandas as pd
import csv
from sklearn.model_selection import StratifiedShuffleSplit
from time import time
from matplotlib import pyplot as plt
import seaborn as sns
from mylibs.jupyter_notebook_helper import show_graph, renderStatsList, renderStatsCollection, \
    renderStatsListWithLabels, renderStatsCollectionOfCrossValids, plot_res_gp, my_plot_convergence
from tensorflow.contrib import rnn
from tensorflow.contrib import learn
import shutil
from tensorflow.contrib.learn.python.learn import learn_runner
from mylibs.tf_helper import getDefaultGPUconfig
from sklearn.metrics import r2_score
from mylibs.py_helper import factors
from fastdtw import fastdtw
from collections import OrderedDict
from scipy.spatial.distance import euclidean
from statsmodels.tsa.stattools import coint
from common_33 import get_or_run_nn
from skopt.space.space import Integer, Real
from skopt import gp_minimize
from skopt.plots import plot_convergence
import pickle
import inspect
import dill
import sys

from models.model_36_price_history_autoencoder import PriceHistoryAutoencoder
from data_providers.data_provider_33_price_history_autoencoder import PriceHistoryAutoEncDataProvider
#from gp_opt.price_history_27_gp_opt import PriceHistoryGpOpt


/home/studenthp/anaconda2/envs/dis/lib/python2.7/site-packages/statsmodels/compat/pandas.py:56: FutureWarning: The pandas.core.datetools module is deprecated and will be removed in a future version. Please use the pandas.tseries module instead.
  from pandas.core import datetools

In [3]:
dtype = tf.float32
seed = 16011984
random_state = np.random.RandomState(seed=seed)
config = getDefaultGPUconfig()
n_jobs = 1
%matplotlib inline

Step 0 - hyperparams


In [4]:
# 689 is the size of the first axis of the training arrays inspected further
# down; its factors are the candidate batch sizes that divide the data set
# evenly (13 is the one used for training, 53 appears in the commented cells).
factors(689)


Out[4]:
[1, 53, 13, 689]

In [5]:
# Time-series length passed to the model as `ts_len`; it equals the second
# axis of the arrays stored in the training archive.
max_seq_len = 682

In [6]:
# Root data directory, given relative to the notebook's working directory.
data_path = '../../../../Dropbox/data'

In [7]:
# Folder holding the price-history autoencoder archives.
phae_path = '/'.join([data_path, 'price_hist_autoencoder'])

In [8]:
# Archive with the full sequences and their dates; stop early if it is missing.
npz_dates = '/'.join([phae_path, 'price_history_full_seqs_dates.npz'])
assert path.isfile(npz_dates)

In [9]:
# Normalised training archive; stop early if it is missing.
npz_train = '/'.join([phae_path, 'price_history_seqs_dates_normed_train.npz'])
assert path.isfile(npz_train)

In [10]:
# Normalised test archive; stop early if it is missing.
npz_test = '/'.join([phae_path, 'price_history_seqs_dates_normed_test.npz'])
assert path.isfile(npz_test)

In [11]:
# Common prefix of the train/test archives: the training path with its
# '_train.npz' suffix cut off.
npz_path = npz_train[:len(npz_train) - len('_train.npz')]

In [12]:
for key, val in np.load(npz_train).iteritems():
    print key, ",", val.shape


inputs , (689, 682, 7)
sku_ids , (689,)
sequence_masks , (689, 682)
extra_inputs , (689, 682, 6)
sequence_lengths , (689,)

Step 1 - collect data


In [13]:
# dp = PriceHistoryAutoEncDataProvider(npz_path=npz_path, batch_size=53, with_EOS=False)
# for data in dp.datalist:
#     print data.shape

(689, 682, 7)
(689, 682, 6)
(689,)
(689, 682)


In [14]:
# for item in dp.next():
#     print item.shape

Step 2 - Build model


In [15]:
# model = PriceHistoryAutoencoder(rng=random_state, dtype=dtype, config=config)
# graph = model.getGraph(batch_size=53,
                       
#                        #the way we have it these two must be equal for now
#                        enc_num_units = 10,
#                        hidden_enc_num_units = 10,
                       
#                        hidden_enc_dim = 12,
#                        hidden_dec_dim = 13,
                       
#                        #the way we have it these two must be equal for now
#                        hidden_dec_num_units = 14,
#                        dec_num_units = 14,
                       
#                        ts_len=max_seq_len)

In [16]:
#show_graph(graph)

Step 3 - Training the network


In [17]:
# Build the autoencoder wrapper from the seeded generator, dtype and config
# defined in the setup cell; `experiment()` below trains it via `model.run`.
model = PriceHistoryAutoencoder(rng=random_state, dtype=dtype, config=config)

In [18]:
# Rebuild the test-archive path from the shared prefix and check it exists;
# the absolute path is shown as the cell's output.
npz_test = '{}_test.npz'.format(npz_path)
assert path.isfile(npz_test)
path.abspath(npz_test)


Out[18]:
'/home/studenthp/Dropbox/data/price_hist_autoencoder/price_history_seqs_dates_normed_test.npz'

In [19]:
def experiment():
    """Train the price-history autoencoder with this notebook's hyperparameters.

    Forwards a fixed set of keyword arguments to ``model.run`` and returns
    whatever that call returns (the caller unpacks the result into four values).
    """
    # Unit counts are kept as pairs: the commented-out graph cell notes that the
    # encoder pair and the decoder pair must each currently be equal.
    enc_units = 250
    dec_units = 250
    hidden_dim = 101

    hyperparams = dict(
        npz_path=npz_path,
        epochs=50,
        batch_size=13,
        enc_num_units=enc_units,
        hidden_enc_num_units=enc_units,
        hidden_enc_dim=hidden_dim,
        hidden_dec_dim=hidden_dim,
        hidden_dec_num_units=dec_units,
        dec_num_units=dec_units,
        ts_len=max_seq_len,
        learning_rate=1e-3,
        learning_rate_diff=1e-5,
        preds_gather_enabled=True,
    )
    return model.run(**hyperparams)

In [20]:
#%%time
# Direct (uncached) alternative, kept for reference:
# dyn_stats_dic, preds_dict, targets, twods = experiment()
# NOTE(review): get_or_run_nn presumably reuses a stored run under
# `nn_runs_folder` when one exists for `filename` -- confirm in common_33.
dyn_stats_dic, preds_dict, targets, twods = get_or_run_nn(
    experiment,
    filename='036_autoencoder_000',
    nn_runs_folder=data_path + "/nn_runs",
)


epochs: 50
dec_num_units: 250
learning_rate: 0.001
enc_num_units: 250
learning_rate_diff: 1e-05
End Epoch 01 (330.771 secs): err(train) = 0.010194, err_diff(train) = 0.000186
End Epoch 02 (328.088 secs): err(train) = 0.004593, err_diff(train) = 0.000159
End Epoch 03 (328.042 secs): err(train) = 0.004937, err_diff(train) = 0.000158
End Epoch 04 (328.102 secs): err(train) = 0.004761, err_diff(train) = 0.000157
End Epoch 05 (334.092 secs): err(train) = 0.004762, err_diff(train) = 0.000156
End Epoch 06 (332.798 secs): err(train) = 0.004744, err_diff(train) = 0.000158
End Epoch 07 (332.767 secs): err(train) = 0.004913, err_diff(train) = 0.000159
End Epoch 08 (332.537 secs): err(train) = 0.004847, err_diff(train) = 0.000157
End Epoch 09 (332.986 secs): err(train) = 0.004791, err_diff(train) = 0.000156
End Epoch 10 (332.988 secs): err(train) = 0.004777, err_diff(train) = 0.000156
End Epoch 11 (332.980 secs): err(train) = 0.004670, err_diff(train) = 0.000157
End Epoch 12 (332.938 secs): err(train) = 0.004852, err_diff(train) = 0.000157
End Epoch 13 (333.061 secs): err(train) = 0.004705, err_diff(train) = 0.000157
End Epoch 14 (332.992 secs): err(train) = 0.004741, err_diff(train) = 0.000158
End Epoch 15 (333.019 secs): err(train) = 0.004912, err_diff(train) = 0.000157
End Epoch 16 (333.035 secs): err(train) = 0.004804, err_diff(train) = 0.000156
End Epoch 17 (333.005 secs): err(train) = 0.004787, err_diff(train) = 0.000156
End Epoch 18 (333.012 secs): err(train) = 0.004727, err_diff(train) = 0.000155
End Epoch 19 (332.998 secs): err(train) = 0.004646, err_diff(train) = 0.000156
End Epoch 20 (333.039 secs): err(train) = 0.004727, err_diff(train) = 0.000157
End Epoch 21 (333.020 secs): err(train) = 0.004799, err_diff(train) = 0.000157
End Epoch 22 (332.877 secs): err(train) = 0.004676, err_diff(train) = 0.000156
End Epoch 23 (332.913 secs): err(train) = 0.004768, err_diff(train) = 0.000156
End Epoch 24 (332.807 secs): err(train) = 0.004652, err_diff(train) = 0.000156
End Epoch 25 (332.920 secs): err(train) = 0.004823, err_diff(train) = 0.000158
End Epoch 26 (332.853 secs): err(train) = 0.004778, err_diff(train) = 0.000157
End Epoch 27 (332.790 secs): err(train) = 0.004717, err_diff(train) = 0.000157
End Epoch 28 (332.547 secs): err(train) = 0.004281, err_diff(train) = 0.000158
End Epoch 29 (332.686 secs): err(train) = 0.003597, err_diff(train) = 0.000157
End Epoch 30 (332.696 secs): err(train) = 0.003277, err_diff(train) = 0.000157
End Epoch 31 (332.648 secs): err(train) = 0.002890, err_diff(train) = 0.000156
End Epoch 32 (332.779 secs): err(train) = 0.002410, err_diff(train) = 0.000156
End Epoch 33 (332.756 secs): err(train) = 0.002785, err_diff(train) = 0.000156
End Epoch 34 (332.666 secs): err(train) = 0.002189, err_diff(train) = 0.000156
End Epoch 35 (332.685 secs): err(train) = 0.002568, err_diff(train) = 0.000156
End Epoch 36 (332.713 secs): err(train) = 0.002466, err_diff(train) = 0.000156
End Epoch 37 (332.673 secs): err(train) = 0.002306, err_diff(train) = 0.000156
End Epoch 38 (332.647 secs): err(train) = 0.001951, err_diff(train) = 0.000155
End Epoch 39 (332.800 secs): err(train) = 0.001870, err_diff(train) = 0.000156
End Epoch 40 (332.752 secs): err(train) = 0.002223, err_diff(train) = 0.000156
End Epoch 41 (332.648 secs): err(train) = 0.001797, err_diff(train) = 0.000156
End Epoch 42 (332.650 secs): err(train) = 0.002142, err_diff(train) = 0.000156
End Epoch 43 (332.633 secs): err(train) = 0.002246, err_diff(train) = 0.000155
End Epoch 44 (332.652 secs): err(train) = 0.001875, err_diff(train) = 0.000156
End Epoch 45 (332.630 secs): err(train) = 0.001692, err_diff(train) = 0.000155
End Epoch 46 (332.603 secs): err(train) = 0.001782, err_diff(train) = 0.000155
End Epoch 47 (332.881 secs): err(train) = 0.001445, err_diff(train) = 0.000155
End Epoch 48 (332.995 secs): err(train) = 0.001631, err_diff(train) = 0.000155
End Epoch 49 (332.910 secs): err(train) = 0.001796, err_diff(train) = 0.000155
End Epoch 50 (332.567 secs): err(train) = 0.001534, err_diff(train) = 0.000155
total test error: 0.00490289210465
total test diff error: 0.000155301308865


In [21]:
# Draw one figure per collected statistics object: first the plain error
# curves, then the curves of the "diff" error.
for stats_key in ('dyn_stats', 'dyn_stats_diff'):
    dyn_stats_dic[stats_key].plotStats()
    plt.show()



In [22]:
# R^2 of each reconstruction against its target, one score per sequence.
# The loop variable is deliberately not called `ind`: under Python 2 a list
# comprehension leaks its variable into the notebook namespace, and `ind` is
# the name the following cells use for the selected sequence.
r2_scores = [r2_score(y_true=targets[seq_ind], y_pred=preds_dict[seq_ind])
             for seq_ind in range(len(targets))]

In [23]:
# Index of the lowest R^2 score, i.e. the worst reconstructed sequence.
ind = np.asarray(r2_scores).argmin()
ind


Out[23]:
35

In [24]:
# Target and reconstruction of the sequence selected by `ind`.
reals, preds = targets[ind], preds_dict[ind]

In [25]:
# R^2 of the sequence currently held in `reals` / `preds`.
r2_score(y_true=reals, y_pred=preds)


Out[25]:
-269853566.37884647

In [26]:
#sns.tsplot(data=dp.inputs[ind].flatten())

In [27]:
# Overlay the target (blue) and its reconstruction (green) for the selected sequence.
fig = plt.figure(figsize=(15, 6))
plt.plot(reals, 'b', label='reals')
plt.plot(preds, 'g', label='preds')
plt.legend()
plt.show()



In [28]:
%%time
dtw_scores = [fastdtw(targets[ind], preds_dict[ind])[0]
             for ind in range(len(targets))]


CPU times: user 10.1 s, sys: 64 ms, total: 10.2 s
Wall time: 10.1 s

In [29]:
# Average DTW score over all sequences.
np.mean(dtw_scores)


Out[29]:
20.081989750539115

In [30]:
# Cointegration test (statsmodels `coint`) between the reconstruction and the
# target currently held in `preds` / `reals`. The result is a 3-tuple:
# test statistic, p-value, and an array of critical values.
coint(preds, reals)


Out[30]:
(-0.79621380079257409,
 0.93519951574637317,
 array([-4.21692744, -3.50625177, -3.16106388]))

In [45]:
# Plot one pseudo-randomly chosen sequence against its reconstruction.
# The index is drawn from a generator seeded with the notebook's `seed` rather
# than from NumPy's unseeded global generator, so the same sequence is shown on
# every Restart & Run All. Change the seed expression to browse other sequences.
cur_ind = np.random.RandomState(seed=seed).randint(len(targets))
reals = targets[cur_ind]
preds = preds_dict[cur_ind]
fig = plt.figure(figsize=(15,6))
# Both curves carry their own label so the legend cannot drift out of sync
# with the plotting order.
plt.plot(reals, 'b', label='reals')
plt.plot(preds, 'g', label='preds')
plt.legend()
# Record which sequence is displayed so the figure can be reproduced.
plt.title('sequence index: {}'.format(cur_ind))
plt.show()


Conclusion

The autoencoder is not able to reconstruct our price history time series in a visibly convincing way: the predicted curves do not follow the real ones closely.

TS in Two Dimensions


In [32]:
# Stack the values of `twods` into one array and display its shape.
twod_arr = np.array(list(twods.values()))
twod_arr.shape


Out[32]:
(689, 2)

In [33]:
# Plot the two columns of `twod_arr` against each other, one red dot per row.
first_dim, second_dim = twod_arr[:, 0], twod_arr[:, 1]
plt.figure(figsize=(16, 7))
plt.plot(first_dim, second_dim, 'r.')
plt.xlabel('first dimension')
plt.ylabel('second dimension')
plt.title('two dimensional representation of our time series after dimensionality reduction')
plt.show()



In [ ]: