In [1]:
# -*- coding: UTF-8 -*-
#%load_ext autoreload
%reload_ext autoreload
%autoreload 2

In [2]:
from __future__ import division
import tensorflow as tf
from os import path, remove
import numpy as np
import pandas as pd
import csv
from sklearn.model_selection import StratifiedShuffleSplit
from time import time
from matplotlib import pyplot as plt
import seaborn as sns
from mylibs.jupyter_notebook_helper import show_graph, renderStatsList, renderStatsCollection, \
    renderStatsListWithLabels, renderStatsCollectionOfCrossValids
from tensorflow.contrib import rnn
from tensorflow.contrib import learn
import shutil
from tensorflow.contrib.learn.python.learn import learn_runner
from mylibs.tf_helper import getDefaultGPUconfig
from sklearn.metrics import r2_score
from mylibs.py_helper import factors
from fastdtw import fastdtw
from collections import OrderedDict
from scipy.spatial.distance import euclidean
from statsmodels.tsa.stattools import coint
from common import get_or_run_nn
from data_providers.price_history_seq2seq_data_provider import PriceHistorySeq2SeqDataProvider
from data_providers.price_history_dataset_generator import PriceHistoryDatasetGenerator
from skopt.space.space import Integer, Real
from skopt import gp_minimize
from skopt.plots import plot_convergence
import pickle
import inspect
import dill
import sys
from models.price_history_seq2seq_raw_dummy import PriceHistorySeq2SeqRawDummy


/home/studenthp/anaconda2/envs/dis/lib/python2.7/site-packages/statsmodels/compat/pandas.py:56: FutureWarning: The pandas.core.datetools module is deprecated and will be removed in a future version. Please use the pandas.tseries module instead.
  from pandas.core import datetools

In [3]:
dtype = tf.float32
seed = 16011984
random_state = np.random.RandomState(seed=seed)
config = getDefaultGPUconfig()
n_jobs = 1
%matplotlib inline

In [4]:
bb = tf.constant(0., dtype=tf.float32)
bb.get_shape()


Out[4]:
TensorShape([])

In [5]:
aa = tf.zeros((40, 2))
aa.get_shape().concatenate(tf.TensorShape([1]))


Out[5]:
TensorShape([Dimension(40), Dimension(2), Dimension(1)])

Step 0 - hyperparams

vocab_size, i.e. all the potential words you could have (the classification targets in the translation case), and the max sequence length are the SAME thing

In translation, the decoder RNN hidden units are usually the same size as the encoder RNN hidden units, but in our case there does not really seem to be such a relationship. We can experiment and find out later; it is not a priority right now.
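
As a minimal sketch of the "same hidden size" convention (using the tf.contrib.rnn API imported above; the actual model class may wire its cells differently, so treat this as an assumption):

from tensorflow.contrib import rnn  # already imported above

# hedged sketch: encoder and decoder GRU cells sharing one state size
encoder_cell = rnn.GRUCell(num_units=400)
decoder_cell = rnn.GRUCell(num_units=400)  # nothing forces this to match the encoder's size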


In [6]:
epochs = 15

num_features = 1
num_units = 400 #state size

input_len = 60
target_len = 30

batch_size = 50 #47
#trunc_backprop_len = ??
rnn_cell = PriceHistorySeq2SeqRawDummy.RNN_CELLS.GRU

with_EOS = False

In [7]:
total_train_size = 57994
train_size = 6400 
test_size = 1282

Generate the data (only needs to be run once)


In [8]:
data_path = '../data/price_history'

In [9]:
#npz_full_train = data_path + '/price_history_03_dp_60to30_train.npz'
#npz_full_train = data_path + '/price_history_60to30_targets_normed_train.npz'

#npz_train = data_path + '/price_history_03_dp_60to30_57980_train.npz'
#npz_train = data_path + '/price_history_03_dp_60to30_6400_train.npz'
npz_train = data_path + '/price_history_60to30_6400_targets_normed_train.npz'

#npz_test = data_path + '/price_history_03_dp_60to30_test.npz'
npz_test = data_path + '/price_history_60to30_targets_normed_test.npz'

In [10]:
# PriceHistoryDatasetGenerator.create_subsampled(inpath=npz_full_train, target_size=6400, outpath=npz_train,
#                                                random_state=random_state)

In [11]:
# %%time
# csv_in = '../price_history_03_seq_start_suddens_trimmed.csv'

# train_sku_ids, train_XX, train_YY, train_sequence_lens, train_seq_mask, test_pack = \
# PriceHistoryDatasetGenerator(random_state=random_state).\
#     createAndSaveDataset(
#         csv_in=csv_in,
#         input_seq_len=input_len,
#         target_seq_len=target_len,
#         allowSmallerSequencesThanWindow=False,
#         #min_date = '2016-11-01',
#         split_fraction = 0.40,
#         #keep_training_fraction = 0.22, #57994 * 0.22 = 12758.68
#         normalize_targets = True,
#         #disable saving for now since we have already created them
#         save_files_dic = {"train": npz_full_train, "test": npz_test,},
# )

In [12]:
# print train_sku_ids.shape, train_XX.shape, train_YY.shape, train_sequence_lens.shape, train_seq_mask.shape
# aa,bb,cc,dd,ee = test_pack.get_data()
# aa.shape,bb.shape,cc.shape,dd.shape,ee.shape

Step 1 - collect data


In [13]:
dp = PriceHistorySeq2SeqDataProvider(npz_path=npz_train, batch_size=batch_size, with_EOS=with_EOS)
dp.inputs.shape, dp.targets.shape


Out[13]:
((6400, 60, 1), (6400, 30))

In [14]:
aa, bb = dp.next()
aa.shape, bb.shape


Out[14]:
((50, 60, 1), (50, 30))

Step 2 - Build model


In [15]:
model = PriceHistorySeq2SeqRawDummy(rng=random_state, dtype=dtype, config=config, with_EOS=with_EOS)

In [16]:
graph = model.getGraph(batch_size=batch_size,
                       num_units=num_units,
                       input_len=input_len,
                       target_len=target_len,
                       rnn_cell=rnn_cell)


60
Tensor("inputs/unstack:0", shape=(50, 1), dtype=float32)

Tensor("encoder_rnn_layer/rnn/gru_cell_59/add:0", shape=(50, 400), dtype=float32)

time
Tensor("decoder_rnn_layer/rnn/while/add:0", shape=(), dtype=int32)

decoder_final_loop_state
Tensor("decoder_rnn_layer/rnn/while/Exit_5:0", shape=(50, 30), dtype=float32)

<tensorflow.python.ops.tensor_array_ops.TensorArray object at 0x7f3e7b92e150>

Tensor("error/Select:0", shape=(50, 30), dtype=float32)

Tensor("error/Mean:0", shape=(), dtype=float32)
Tensor("error/Mean:0", shape=(), dtype=float32)


In [17]:
#show_graph(graph)

Step 3 - train the network

RECALL: the baseline for the Huber loss on the current problem is around 4; anything above 4 should be considered a major error.
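
As a reminder of what the reported err(train) numbers measure, here is a minimal numpy sketch of an elementwise Huber loss; delta=1.0 is an assumption, the delta used inside the model may differ:

import numpy as np

def huber_loss(y_true, y_pred, delta=1.0):
    # quadratic for small residuals, linear for large ones
    residual = np.abs(y_true - y_pred)
    quadratic = 0.5 * residual ** 2
    linear = delta * residual - 0.5 * delta ** 2
    return np.where(residual <= delta, quadratic, linear)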


In [13]:
#rnn_cell = PriceHistorySeq2SeqCV.RNN_CELLS.GRU
#cross_val_n_splits = 5
epochs, num_units, batch_size


Out[13]:
(15, 100, 256)

In [14]:
#set(factors(train_size)).intersection(factors(train_size/5))

In [25]:
best_learning_rate = 1e-3 #0.0026945952539362472

In [26]:
def experiment():
    return model.run(npz_path=npz_train,
                     epochs=10,
                     batch_size=50,
                     num_units=400,
                     input_len=input_len,
                     target_len=target_len,
                     learning_rate=best_learning_rate,
                     preds_gather_enabled=True,
                     #eos_token = float(1e3),
                     rnn_cell=rnn_cell)

In [27]:
dyn_stats, preds_dict = experiment()


learning rate: 0.001000
epochs: 10
End Epoch 01 (29.347 secs): err(train) = 1.787
End Epoch 02 (28.642 secs): err(train) = 1.228
End Epoch 03 (28.666 secs): err(train) = 1.131
End Epoch 04 (28.699 secs): err(train) = 1.085
End Epoch 05 (28.658 secs): err(train) = 1.058
End Epoch 06 (28.659 secs): err(train) = 1.041
End Epoch 07 (28.673 secs): err(train) = 1.037
End Epoch 08 (28.706 secs): err(train) = 1.018
End Epoch 09 (28.668 secs): err(train) = 1.007
End Epoch 10 (28.667 secs): err(train) = 1.006

Recall that without batch normalization, within 10 epochs with num_units 400 and batch_size 64, we reached 4.940, and that was with the decoder inputs NOT being filled from the outputs.
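
For context, "decoder inputs NOT filled from the outputs" refers to teacher forcing versus free-running decoding. A minimal plain-Python sketch of the two strategies, with a hypothetical step_fn standing in for one decoder cell step (state handling omitted):

import numpy as np

def decode(first_input, targets, step_fn, teacher_forcing):
    preds = []
    current_input = first_input
    for t in range(len(targets)):
        pred = step_fn(current_input)
        preds.append(pred)
        # teacher forcing feeds the ground-truth target as the next input;
        # free-running decoding feeds the model's own previous prediction
        current_input = targets[t] if teacher_forcing else pred
    return np.array(preds)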


In [61]:
%%time
dyn_stats, preds_dict = get_or_run_nn(experiment,
                                      filename='017_seq2seq_60to30_epochs{}_learning_rate_{:.4f}'.format(
                                          epochs, best_learning_rate
                                      ))


CPU times: user 732 ms, sys: 4 ms, total: 736 ms
Wall time: 730 ms

In [28]:
dyn_stats.plotStats()
plt.show()



In [29]:
r2_scores = [r2_score(y_true=dp.targets[ind], y_pred=preds_dict[ind])
            for ind in range(len(dp.targets))]

In [30]:
ind = np.argmin(r2_scores)
ind


Out[30]:
4398

In [31]:
reals = dp.targets[ind]
preds = preds_dict[ind]

In [32]:
r2_score(y_true=reals, y_pred=preds)


Out[32]:
-2.8188612051742737e+29

In [33]:
sns.tsplot(data=dp.inputs[ind].flatten())


Out[33]:
<matplotlib.axes._subplots.AxesSubplot at 0x7f3e78b271d0>

In [34]:
fig = plt.figure(figsize=(15,6))
plt.plot(reals, 'b')
plt.plot(preds, 'g')
plt.legend(['reals','preds'])
plt.show()



In [35]:
%%time
dtw_scores = [fastdtw(dp.targets[ind], preds_dict[ind])[0]
             for ind in range(len(dp.targets))]


CPU times: user 12.8 s, sys: 92 ms, total: 12.9 s
Wall time: 12.7 s

In [36]:
np.mean(dtw_scores)


Out[36]:
8.4468486058793957

In [37]:
coint(preds, reals)


Out[37]:
(0.73290943567916722,
 0.9937045145056902,
 array([-4.31395736, -3.55493606, -3.19393252]))
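
For readability, the three values returned by coint (from statsmodels.tsa.stattools, imported above) can be unpacked; a small sketch of how to read the result:

t_stat, p_value, crit_values = coint(preds, reals)  # same call as above
# the p-value is ~0.99 here, so the null hypothesis of "no cointegration"
# between predictions and reals cannot be rejected for this worst-case sample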

In [54]:
cur_ind = np.random.randint(len(dp.targets))
reals = dp.targets[cur_ind]
preds = preds_dict[cur_ind]
fig = plt.figure(figsize=(15,6))
plt.plot(reals, 'b')
plt.plot(preds, 'g')
plt.legend(['reals','preds'])
plt.show()


Conclusion

Within 10 epochs the raw seq2seq model (with dummy decoder inputs) reaches a training Huber error of about 1.006, well below the ~4 baseline. The mean DTW score against the training targets is about 8.45, but the worst-case prediction has an extremely negative R² and shows no cointegration with the real series.


In [ ]: