In [1]:
# -*- coding: UTF-8 -*-
#%load_ext autoreload
%reload_ext autoreload
%autoreload 2

In [2]:
from __future__ import division
import tensorflow as tf
from os import path, remove
import numpy as np
import pandas as pd
import csv
from sklearn.model_selection import StratifiedShuffleSplit
from time import time
from matplotlib import pyplot as plt
import seaborn as sns
from mylibs.jupyter_notebook_helper import show_graph, renderStatsList, renderStatsCollection, \
    renderStatsListWithLabels, renderStatsCollectionOfCrossValids, plot_res_gp, my_plot_convergence
from tensorflow.contrib import rnn
from tensorflow.contrib import learn
import shutil
from tensorflow.contrib.learn.python.learn import learn_runner
from mylibs.tf_helper import getDefaultGPUconfig
from sklearn.metrics import r2_score
from mylibs.py_helper import factors
from fastdtw import fastdtw
from collections import OrderedDict
from scipy.spatial.distance import euclidean
from statsmodels.tsa.stattools import coint
from common_33 import get_or_run_nn
from skopt.space.space import Integer, Real
from skopt import gp_minimize
from skopt.plots import plot_convergence
import pickle
import inspect
import dill
import sys

from models.model_33_price_history_autoencoder import PriceHistoryAutoencoder
from data_providers.data_provider_33_price_history_autoencoder import PriceHistoryAutoEncDataProvider
#from gp_opt.price_history_27_gp_opt import PriceHistoryGpOpt


/home/studenthp/anaconda2/envs/dis/lib/python2.7/site-packages/statsmodels/compat/pandas.py:56: FutureWarning: The pandas.core.datetools module is deprecated and will be removed in a future version. Please use the pandas.tseries module instead.
  from pandas.core import datetools

In [3]:
dtype = tf.float32
seed = 16011984
random_state = np.random.RandomState(seed=seed)
config = getDefaultGPUconfig()
n_jobs = 1
%matplotlib inline
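
getDefaultGPUconfig comes from mylibs.tf_helper, which is not shown in this notebook; a minimal sketch, assuming the common allow_growth pattern (a hypothetical reconstruction, not the actual helper):

def getDefaultGPUconfig():
    # let TensorFlow claim GPU memory on demand instead of grabbing it all upfront
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    return config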

Step 0 - hyperparams


In [4]:
factors(689)  # 689 = number of training sequences; its factors (13, 53) are used as batch sizes below


Out[4]:
[1, 53, 13, 689]
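
Using a factor of 689 as the batch size lets every batch divide the dataset evenly. The factors helper lives in mylibs.py_helper and is not shown; a minimal sketch of such a helper by trial division (the real one evidently returns the divisors unsorted):

def factors(n):
    # every divisor of n, found by trial division up to sqrt(n)
    result = set()
    for i in range(1, int(n ** 0.5) + 1):
        if n % i == 0:
            result.update((i, n // i))
    return sorted(result)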

In [5]:
max_seq_len = 682

In [6]:
#full_train_size = 55820
#train_size = 55800
#small_train_size = 6000 #just because of performance reasons, no statistics behind this decision
#test_size = 6200

In [7]:
data_path = '../../../../Dropbox/data'

In [8]:
phae_path = data_path + '/price_hist_autoencoder'

In [9]:
csv_in = '../price_history_03_seq_start_suddens_trimmed.csv'
assert path.isfile(csv_in)

In [10]:
npz_unprocessed = phae_path + '/price_history_full_seqs.npz'
assert path.isfile(npz_unprocessed)

In [11]:
npz_dates = phae_path + '/price_history_full_seqs_dates.npz'
assert path.isfile(npz_dates)

In [12]:
npz_train = phae_path + '/price_history_seqs_dates_normed_train.npz'
assert path.isfile(npz_train)

In [13]:
npz_test = phae_path + '/price_history_seqs_dates_normed_test.npz'
assert path.isfile(npz_test)

In [14]:
npz_path = npz_train[:-len('_train.npz')]  # common path prefix shared by the train/test npz files

In [15]:
for key, val in np.load(npz_train).iteritems():
    print key, ",", val.shape


inputs , (689, 682, 7)
sku_ids , (689,)
sequence_masks , (689, 682)
extra_inputs , (689, 682, 6)
sequence_lengths , (689,)

Step 1 - collect data


In [16]:
# instantiated here only to inspect the arrays the provider will feed to the model
dp = PriceHistoryAutoEncDataProvider(npz_path=npz_path, batch_size=1, with_EOS=False)
for data in dp.datalist:
    print data.shape


(689, 682, 7)
(689, 682, 6)
(689,)
(689, 682)
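
These four arrays mirror the npz contents printed earlier: the full inputs, the six extra channels on their own, a per-series vector (sequence lengths or SKU ids), and the padding masks. A quick consistency check against the file itself, assuming the six extra channels are the date features:

arrs = np.load(npz_train)
# the 7 input channels should be the price plus the 6 date features
assert arrs['inputs'].shape[2] == arrs['extra_inputs'].shape[2] + 1
# one mask entry per timestep of every sequence
assert arrs['inputs'].shape[:2] == arrs['sequence_masks'].shape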

In [17]:
# for item in dp.next():
#     print item.shape

Step 2 - Build model


In [18]:
model = PriceHistoryAutoencoder(rng=random_state, dtype=dtype, config=config)

In [19]:
# graph = model.getGraph(batch_size=batch_size,
#                        enc_num_units = 10,
#                        dec_num_units = 10,
#                        ts_len=max_seq_len)

targets Tensor("data/strided_slice:0", shape=(50, 682), dtype=float32)

Tensor("encoder_rnn_layer/rnn/while/Exit_2:0", shape=(?, 10), dtype=float32)

Tensor("encoder_state_out_process/Elu:0", shape=(?, 2), dtype=float32)

Tensor("decoder_state_in_process/Elu:0", shape=(?, 10), dtype=float32)

Tensor("decoder_rnn_layer/rnn/transpose:0", shape=(50, 682, 10), dtype=float32)

Tensor("decoder_outs/Reshape:0", shape=(34100, 10), dtype=float32)

Tensor("readout_affine/Identity:0", shape=(34100, 1), dtype=float32) Tensor("readout_affine/Reshape:0", shape=(50, 682), dtype=float32)

Tensor("error/mul_1:0", shape=(50, 682), dtype=float32)

Tensor("error/Mean:0", shape=(), dtype=float32) Tensor("error/Mean:0", shape=(), dtype=float32)


In [20]:
#show_graph(graph)

Quick test run


In [21]:
def experiment():
    return model.run(npz_path=npz_path,
                     epochs=100,
                     batch_size = 13,
                     enc_num_units = 400,
                     dec_num_units = 400,
                     ts_len=max_seq_len,
                     learning_rate = 1e-4,
                     preds_gather_enabled = False,
                    )
dyn_stats = experiment()


epochs: 100
dec_num_units: 400
learning_rate: 0.0001
enc_num_units: 400
End Epoch 01 (172.430 secs): err(train) = 0.005301
End Epoch 02 (166.935 secs): err(train) = 0.004788
End Epoch 03 (166.902 secs): err(train) = 0.003232
End Epoch 04 (166.902 secs): err(train) = 0.002585
End Epoch 05 (166.859 secs): err(train) = 0.002257
End Epoch 06 (166.815 secs): err(train) = 0.002584
End Epoch 07 (167.011 secs): err(train) = 0.002258
End Epoch 08 (167.015 secs): err(train) = 0.002260
End Epoch 09 (166.811 secs): err(train) = 0.002352
End Epoch 10 (166.914 secs): err(train) = 0.002306
End Epoch 11 (166.977 secs): err(train) = 0.002516
End Epoch 12 (166.881 secs): err(train) = 0.002208
End Epoch 13 (166.880 secs): err(train) = 0.002184
End Epoch 14 (166.993 secs): err(train) = 0.002175
End Epoch 15 (167.000 secs): err(train) = 0.002197
End Epoch 16 (166.903 secs): err(train) = 0.002008
End Epoch 17 (166.833 secs): err(train) = 0.002040
End Epoch 18 (166.921 secs): err(train) = 0.002187
End Epoch 19 (166.872 secs): err(train) = 0.001902
End Epoch 20 (166.933 secs): err(train) = 0.002023
End Epoch 21 (166.912 secs): err(train) = 0.001857
End Epoch 22 (166.883 secs): err(train) = 0.002100
End Epoch 23 (166.920 secs): err(train) = 0.002539
End Epoch 24 (166.919 secs): err(train) = 0.001932
End Epoch 25 (166.939 secs): err(train) = 0.002119
End Epoch 26 (166.876 secs): err(train) = 0.002195
End Epoch 27 (166.902 secs): err(train) = 0.001917
End Epoch 28 (166.876 secs): err(train) = 0.001952
End Epoch 29 (166.890 secs): err(train) = 0.001881
End Epoch 30 (166.922 secs): err(train) = 0.002071
End Epoch 31 (166.856 secs): err(train) = 0.002016
End Epoch 32 (166.868 secs): err(train) = 0.001701
End Epoch 33 (166.875 secs): err(train) = 0.001852
End Epoch 34 (166.899 secs): err(train) = 0.001813
End Epoch 35 (166.881 secs): err(train) = 0.001756
End Epoch 36 (166.854 secs): err(train) = 0.002028
End Epoch 37 (166.925 secs): err(train) = 0.001930
End Epoch 38 (166.959 secs): err(train) = 0.001996
End Epoch 39 (167.037 secs): err(train) = 0.001754
End Epoch 40 (166.911 secs): err(train) = 0.001900
End Epoch 41 (166.880 secs): err(train) = 0.001753
End Epoch 42 (166.892 secs): err(train) = 0.001593
End Epoch 43 (166.944 secs): err(train) = 0.002103
End Epoch 44 (166.846 secs): err(train) = 0.002011
End Epoch 45 (166.994 secs): err(train) = 0.001657
End Epoch 46 (166.915 secs): err(train) = 0.001831
End Epoch 47 (166.899 secs): err(train) = 0.001463
End Epoch 48 (166.915 secs): err(train) = 0.001746
End Epoch 49 (166.907 secs): err(train) = 0.001885
End Epoch 50 (166.956 secs): err(train) = 0.001803
End Epoch 51 (166.937 secs): err(train) = 0.001727
End Epoch 52 (166.894 secs): err(train) = 0.001626
End Epoch 53 (166.907 secs): err(train) = 0.001723
End Epoch 54 (166.847 secs): err(train) = 0.001732
End Epoch 55 (166.865 secs): err(train) = 0.001447
End Epoch 56 (166.870 secs): err(train) = 0.001452
End Epoch 57 (166.965 secs): err(train) = 0.001623
End Epoch 58 (166.987 secs): err(train) = 0.001635
End Epoch 59 (166.922 secs): err(train) = 0.001518
End Epoch 60 (166.877 secs): err(train) = 0.001692
End Epoch 61 (167.048 secs): err(train) = 0.001811
End Epoch 62 (166.895 secs): err(train) = 0.001574
End Epoch 63 (166.939 secs): err(train) = 0.001462
End Epoch 64 (166.932 secs): err(train) = 0.001682
End Epoch 65 (166.970 secs): err(train) = 0.001382
End Epoch 66 (166.923 secs): err(train) = 0.001398
End Epoch 67 (166.876 secs): err(train) = 0.001341
End Epoch 68 (166.903 secs): err(train) = 0.001542
End Epoch 69 (166.931 secs): err(train) = 0.001445
End Epoch 70 (166.849 secs): err(train) = 0.001631
End Epoch 71 (166.845 secs): err(train) = 0.001720
End Epoch 72 (167.022 secs): err(train) = 0.002437
End Epoch 73 (167.010 secs): err(train) = 0.002112
End Epoch 74 (166.930 secs): err(train) = 0.001751
End Epoch 75 (166.872 secs): err(train) = 0.001598
End Epoch 76 (166.876 secs): err(train) = 0.001589
End Epoch 77 (166.894 secs): err(train) = 0.001414
End Epoch 78 (166.844 secs): err(train) = 0.001607
End Epoch 79 (166.963 secs): err(train) = 0.001353
End Epoch 80 (166.822 secs): err(train) = 0.001456
End Epoch 81 (166.966 secs): err(train) = 0.001497
End Epoch 82 (166.964 secs): err(train) = 0.001458
End Epoch 83 (166.892 secs): err(train) = 0.001258
End Epoch 84 (166.779 secs): err(train) = 0.001353
End Epoch 85 (166.848 secs): err(train) = 0.001430
End Epoch 86 (166.893 secs): err(train) = 0.001505
End Epoch 87 (166.888 secs): err(train) = 0.001518
End Epoch 88 (166.893 secs): err(train) = 0.001432
End Epoch 89 (166.913 secs): err(train) = 0.001295
End Epoch 90 (166.869 secs): err(train) = 0.001217
End Epoch 91 (166.882 secs): err(train) = 0.001261
End Epoch 92 (166.907 secs): err(train) = 0.001251
End Epoch 93 (166.913 secs): err(train) = 0.001427
End Epoch 94 (166.931 secs): err(train) = 0.001282
End Epoch 95 (166.943 secs): err(train) = 0.001471
End Epoch 96 (166.925 secs): err(train) = 0.001268
End Epoch 97 (166.932 secs): err(train) = 0.001293
End Epoch 98 (166.911 secs): err(train) = 0.001327
End Epoch 99 (166.929 secs): err(train) = 0.001320
End Epoch 100 (166.970 secs): err(train) = 0.001520


In [22]:
dyn_stats.plotStats()


Out[22]:
([<matplotlib.figure.Figure at 0x7f9f9c198990>],
 [<matplotlib.axes._subplots.AxesSubplot at 0x7f9f3b789610>])

Conclusion

For one instance, which is the easiest possible case, the model seems to be trainable. Let's get the predicted values to observe what the reconstructions actually look like.

Step 3 - training the network


In [21]:
model = PriceHistoryAutoencoder(rng=random_state, dtype=dtype, config=config)

In [22]:
npz_test = npz_path + '_test.npz'
assert path.isfile(npz_test)
path.abspath(npz_test)


Out[22]:
'/home/studenthp/Dropbox/data/price_hist_autoencoder/price_history_seqs_dates_normed_test.npz'

In [23]:
def experiment():
    return model.run(npz_path=npz_path,
                     epochs=1,
                     batch_size = 53,
                     enc_num_units = 50,
                     dec_num_units = 50,
                     ts_len=max_seq_len,
                     learning_rate = 1e-4,
                     preds_gather_enabled = True,
                    )

In [24]:
#%%time
# twods: the per-series 2-D bottleneck embeddings (cf. the (?, 2) tensor in the graph above)
dyn_stats, preds_dict, targets, twods = experiment()
# dyn_stats, preds_dict, targets, twods = get_or_run_nn(experiment, filename='033_autoencoder_000',
#                                                       nn_runs_folder = data_path + "/nn_runs")


epochs: 1
dec_num_units: 50
learning_rate: 0.0001
enc_num_units: 50
End Epoch 01 (14.173 secs): err(train) = 0.020087
total test error: 0.0132300918922


In [27]:
dyn_stats.plotStats()
plt.show()



In [28]:
r2_scores = [r2_score(y_true=targets[ind], y_pred=preds_dict[ind])
             for ind in range(len(targets))]

In [29]:
ind = np.argmin(r2_scores)
ind


Out[29]:
306

In [30]:
reals = targets[ind]
preds = preds_dict[ind]

In [31]:
r2_score(y_true=reals, y_pred=preds)


Out[31]:
-40526425.395093553
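
An R² this negative means the residual sum of squares dwarfs the target series' own variance, which usually points to a nearly flat target series rather than a uniformly bad model. A robust summary (a hypothetical follow-up) puts the outlier in context:

print np.median(r2_scores)  # unaffected by the extreme outlier above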

In [32]:
#sns.tsplot(data=dp.inputs[ind].flatten())

In [33]:
fig = plt.figure(figsize=(15,6))
plt.plot(reals, 'b')
plt.plot(preds, 'g')
plt.legend(['reals','preds'])
plt.show()



In [34]:
%%time
# fastdtw returns (distance, warp_path); keep only the distance
dtw_scores = [fastdtw(targets[ind], preds_dict[ind])[0]
              for ind in range(len(targets))]


CPU times: user 10.3 s, sys: 68 ms, total: 10.4 s
Wall time: 10.3 s

In [35]:
np.mean(dtw_scores)


Out[35]:
25.161366957432119

In [36]:
coint(preds, reals)


Out[36]:
(-1.056565105771567,
 0.89158322966696035,
 array([-4.22652904, -3.51111966, -3.16435988]))
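
coint from statsmodels returns the t-statistic, the p-value, and the critical values at the 1%, 5% and 10% levels; with a p-value of roughly 0.89, the null hypothesis of no cointegration between preds and reals cannot be rejected here. Unpacking the tuple makes this explicit:

# t-statistic, p-value, critical values at the 1%, 5% and 10% levels
t_stat, p_value, crit_values = coint(preds, reals)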

In [75]:
cur_ind = np.random.randint(len(targets))
reals = targets[cur_ind]
preds = preds_dict[cur_ind]
fig = plt.figure(figsize=(15,6))
plt.plot(reals, 'b', label='reals')
plt.plot(preds, 'g', label='preds')
plt.legend()
plt.show()


Conclusion

So we see visually that the time-series autoencoder, at least in the very easy case of training (super-overfitting!) on only one instance, works pretty well: in the end we have an autoencoder able to produce output that closely resembles the original input.


In [ ]: