In [1]:
# -*- coding: UTF-8 -*-
# Enable IPython's autoreload extension so that edits to imported project modules
# are picked up without restarting the kernel (mode 2: reload all modules before
# each cell execution).
# %reload_ext is used rather than %load_ext so that re-running this cell does not
# warn that the extension is already loaded.
#%load_ext autoreload
%reload_ext autoreload
%autoreload 2

In [2]:
from __future__ import division
import tensorflow as tf
from os import path, remove
import numpy as np
import pandas as pd
import csv
from sklearn.model_selection import StratifiedShuffleSplit
from time import time
from matplotlib import pyplot as plt
import seaborn as sns
from mylibs.jupyter_notebook_helper import show_graph, renderStatsList, renderStatsCollection, \
    renderStatsListWithLabels, renderStatsCollectionOfCrossValids
from tensorflow.contrib import rnn
from tensorflow.contrib import learn
import shutil
from tensorflow.contrib.learn.python.learn import learn_runner
from mylibs.tf_helper import getDefaultGPUconfig
from sklearn.metrics import r2_score
from mylibs.py_helper import factors
from fastdtw import fastdtw
from collections import OrderedDict
from scipy.spatial.distance import euclidean
from statsmodels.tsa.stattools import coint
from common import get_or_run_nn
from data_providers.price_history_seq2seq_data_provider import PriceHistorySeq2SeqDataProvider
from data_providers.price_history_27_dataset_generator import PriceHistory27DatasetGenerator
from skopt.space.space import Integer, Real
from skopt import gp_minimize
from skopt.plots import plot_convergence
import pickle
import inspect
import dill
import sys
#from models.price_history_21_seq2seq_dyn_dec_ins import PriceHistorySeq2SeqDynDecIns
from data_providers.PriceHistoryMobileAttrsCombinator import PriceHistoryMobileAttrsCombinator


/home/student/anaconda2/envs/dis/lib/python2.7/site-packages/statsmodels/compat/pandas.py:56: FutureWarning: The pandas.core.datetools module is deprecated and will be removed in a future version. Please use the pandas.tseries module instead.
  from pandas.core import datetools

In [3]:
dtype = tf.float32
seed = 16011984
random_state = np.random.RandomState(seed=seed)
config = getDefaultGPUconfig()
n_jobs = 1
%matplotlib inline

Step 0 - hyperparams

`vocab_size` (all the potential words you could have, i.e. the classes in the translation/classification case) and the max sequence length are the SAME thing.

In translation, the decoder RNN usually has the same number of hidden units as the encoder RNN. In our case there does not really seem to be such a relationship, but we can experiment and find out later; it is not a priority right now.


In [4]:
# RNN state size.
num_units = 400

# Window lengths, passed below as input_seq_len / target_seq_len to the dataset generator.
input_len, target_len = 60, 30

batch_size = 50
with_EOS = False

In [5]:
# NOTE(review): not referenced by any other cell visible in this notebook. 57994 matches
# the size embedded in the older, commented-out dataset file names, not the 63548 / 62020
# rows produced below — confirm whether this constant is still needed.
total_train_size = 57994

Generate with date info


In [6]:
# Directory (relative to the notebook) under which every .npz dataset path below is built.
ph_data_path = '../data/price_history'

In [7]:
# Output file for the full (not yet split) 60 -> 30 price-history dataset.
npz_full = ''.join([ph_data_path, '/price_history_dp_60to30_63548.npz'])

# Paths from an earlier train/test layout, kept commented out for reference:
#npz_train = ph_data_path + '/price_history_dp_60to30_63548_46400_train.npz'
#npz_train_mobattrs = ph_data_path + '/price_history_mobattrs_dp_60to30_57994_train.npz'

# npz_test = ph_data_path + '/price_history_dp_60to30_57994_11584_test.npz'
# npz_test_mobattrs = ph_data_path + '/price_history_mobattrs_dp_60to30_57994_test.npz'

In [9]:
%%time
csv_in = '../price_history_03_seq_start_suddens_trimmed.csv'

train_pack, test_pack = \
    PriceHistory27DatasetGenerator(random_state=random_state).createAndSaveDataset(
            csv_in=csv_in,
            input_seq_len=input_len,
            target_seq_len=target_len,
            allowSmallerSequencesThanWindow=False,
            #min_date = '2016-11-01',
            #split_fraction = None,
            normalize_targets = True,
            do_global_norm_scale = True,
            save_files_dic = {"train": npz_full, "test": None},
    )

for item in train_pack.get_data():
    print item.shape
print
for item in test_pack.get_data():
    print item.shape


(63548,)
(63548, 60)
(63548, 30)
(63548,)
(63548, 60)
(63548, 60)
(63548, 30)

(0,)
(0,)
(0,)
(0,)
(0,)
(0,)
(0,)
CPU times: user 24.7 s, sys: 496 ms, total: 25.2 s
Wall time: 25.3 s

(63548,)
(63548, 60)
(63548, 30)
(63548,)
(63548, 60)
(63548, 60)
(63548, 30)

Useful info that we could exploit

1) Day of the year
2) Day of the month
3) Day of the week
4) Week of the year
5) Month
6) Year

This is taking longer than expected but ok


In [28]:
%%time
# Merge date information into the arrays stored at npz_full. The result is a dict of
# arrays whose shapes are printed in the following cell.
# NOTE: slow — the recorded run took about 5 min 22 s of wall time.
dic = PriceHistory27DatasetGenerator.merge_date_info(npz_path=npz_full)


CPU times: user 5min 21s, sys: 464 ms, total: 5min 22s
Wall time: 5min 22s

In [30]:
# Report the shape of each array in the date-enriched dictionary.
for name, arr in dic.iteritems():
    print(arr.shape)


(63548, 60, 7)
(63548, 60)
(63548,)
(63548, 30, 7)
(63548, 30, 6)
(63548,)
(63548, 30)

In [33]:
# Persist the date-enriched dataset. combinator.combine() below reads this file via
# npz_full_with_date, so both the path and the file must exist on a fresh
# Restart & Run All (with these lines commented out the notebook fails with NameError).
# NOTE: `dic` is rebound by the combine step further down, so run this cell only right
# after merge_date_info; re-running it later would overwrite the file with the combined data.
npz_full_with_date = ph_data_path + '/price_history_dp_60to30_63548_date_info.npz'
np.savez(npz_full_with_date, **dic)

Combine Data


In [31]:
# Project helper used below to combine the date-enriched price history with the mobile
# attributes (presumably keyed per product — inferred from the class name; confirm).
combinator = PriceHistoryMobileAttrsCombinator()

In [34]:
%%time
# Combine the saved date-enriched dataset with the mobile attributes.
# The call returns four values: the combined dict of arrays, `inds`, the number of key
# errors and the key errors themselves.
# NOTE: this rebinds `dic`, which until now held the merge_date_info result.
dic, inds, count_key_errors, key_errors = combinator.combine(npz_in=npz_full_with_date)


CPU times: user 10.4 s, sys: 464 ms, total: 10.9 s
Wall time: 10.8 s

In [36]:
# Report the shape of each array in the combined dictionary.
for name, arr in dic.iteritems():
    print(arr.shape)


(62020, 60, 7)
(62020,)
(62020,)
(62020, 139)
(62020, 30, 7)
(62020, 30, 6)
(62020, 60)
(62020, 30)

In [37]:
# Output file for the combined dataset; 62020 in the name is the row count printed above.
npz_full_mobattrs_date = ph_data_path + '/price_history_mobattrs_date_dp_60to30_62020.npz'

In [38]:
# Save every array of the combined dictionary into one .npz archive, stored under its dict key.
np.savez(npz_full_mobattrs_date, **dic)

In [39]:
# Number of key errors reported by combinator.combine. The recorded value, 1528, equals
# the number of rows lost in the combination (63548 - 62020).
count_key_errors#, key_errors


Out[39]:
1528

Train - Test Split


In [42]:
# Destination files for the train and test parts of the combined dataset.
npz_train_mobattrs_date = ph_data_path + '/price_history_mobattrs_date_dp_60to30_62020_train.npz'
npz_test_mobattrs_date = ph_data_path + '/price_history_mobattrs_date_dp_60to30_62020_test.npz'

In [43]:
# Split the combined dataset into train and test files.
# test_size=6200 is roughly 10% of the 62020 rows (assuming it is an absolute row count — confirm).
# random_state is the shared generator from the config cell, so the split presumably
# depends on how much of its state earlier cells have consumed.
PriceHistory27DatasetGenerator.train_test_split(
    fullpath=npz_full_mobattrs_date,
    test_size=6200,
    train_path=npz_train_mobattrs_date,
    test_path=npz_test_mobattrs_date,
    random_state=random_state,
)

In [50]:
# Destination file for the subsampled training set created in the next cell.
npz_train_mobattrs_date_small = ph_data_path + '/price_history_mobattrs_date_dp_60to30_62020_6000_train.npz'

In [51]:
# Write a smaller training set (target_size=6000) drawn from the training file,
# using the shared random_state from the config cell.
PriceHistory27DatasetGenerator.create_subsampled(
    inpath=npz_train_mobattrs_date,
    target_size=6000,
    outpath=npz_train_mobattrs_date_small,
    random_state=random_state,
)

In [ ]: