In [1]:

    
# -*- coding: UTF-8 -*-
#%load_ext autoreload
%reload_ext autoreload
%autoreload 2



In [2]:

    
from __future__ import division
import tensorflow as tf
from os import path, remove
import numpy as np
import pandas as pd
import csv
from sklearn.model_selection import StratifiedShuffleSplit
from time import time
from matplotlib import pyplot as plt
import seaborn as sns
from mylibs.jupyter_notebook_helper import show_graph, renderStatsList, renderStatsCollection, \
    renderStatsListWithLabels, renderStatsCollectionOfCrossValids
from tensorflow.contrib import rnn
from tensorflow.contrib import learn
import shutil
from tensorflow.contrib.learn.python.learn import learn_runner
from mylibs.tf_helper import getDefaultGPUconfig
from sklearn.metrics import r2_score
from mylibs.py_helper import factors
from fastdtw import fastdtw
from collections import OrderedDict
from scipy.spatial.distance import euclidean
from statsmodels.tsa.stattools import coint
from common import get_or_run_nn
from data_providers.price_history_seq2seq_data_provider import PriceHistorySeq2SeqDataProvider
from data_providers.price_history_27_dataset_generator import PriceHistory27DatasetGenerator
from skopt.space.space import Integer, Real
from skopt import gp_minimize
from skopt.plots import plot_convergence
import pickle
import inspect
import dill
import sys
#from models.price_history_21_seq2seq_dyn_dec_ins import PriceHistorySeq2SeqDynDecIns
from data_providers.PriceHistoryMobileAttrsCombinator import PriceHistoryMobileAttrsCombinator
from sklearn.neighbors import NearestNeighbors
from datetime import datetime
from data_providers.price_hist_with_relevant_deals import PriceHistWithRelevantDeals









    



/home/student/anaconda2/envs/dis/lib/python2.7/site-packages/statsmodels/compat/pandas.py:56: FutureWarning: The pandas.core.datetools module is deprecated and will be removed in a future version. Please use the pandas.tseries module instead.
  from pandas.core import datetools



In [3]:

    
dtype = tf.float32
seed = 16011984
random_state = np.random.RandomState(seed=seed)
config = getDefaultGPUconfig()
n_jobs = 1
%matplotlib inline

Step 0 - hyperparams

vocab_size is all the potential words you could have (classification for translation case) and max sequence length are the SAME thing

decoder RNN hidden units are usually same size as encoder RNN hidden units in translation but for our case it does not seem really to be a relationship there but we can experiment and find out later, not a priority thing right now



In [4]:

    
num_units = 400 #state size

input_len = 60
target_len = 30

batch_size = 50
with_EOS = False



In [5]:

    
total_train_size = 57994

Include relevant deals



In [6]:

    
from time import sleep



In [7]:

    
data_path = '../../../../Dropbox/data'
ph_data_path = data_path + '/price_history'
npz_full = ph_data_path + '/price_history_mobattrs_date_dp_60to30_62020.npz'
assert path.isfile(npz_full)



In [8]:

    
csv_in = '../price_history_03_seq_start_suddens_trimmed.csv'



In [9]:

    
#npz_train = ph_data_path + '/price_history_dp_60to30_63548_46400_train.npz'
#npz_train_mobattrs = ph_data_path + '/price_history_mobattrs_dp_60to30_57994_train.npz'

# npz_test = ph_data_path + '/price_history_dp_60to30_57994_11584_test.npz'
# npz_test_mobattrs = ph_data_path + '/price_history_mobattrs_dp_60to30_57994_test.npz'



In [10]:

    
obj = PriceHistWithRelevantDeals(npz_path=npz_full, price_history_csv_path=csv_in, random_state=random_state,
                                verbose=False)



In [11]:

    
dic = obj.execute(relevancy_count=2)



In [12]:

    
npz_augmented = ph_data_path + '/price_history_mobattrs_date_deals_dp_60to30.npz'



In [13]:

    
dic.keys()









    Out[13]:





['inputs',
 'sequence_masks',
 'mobile_attrs',
 'decoder_inputs',
 'decoder_extra_inputs',
 'sku_ids',
 'targets',
 'sequence_lengths']



In [15]:

    
dic['inputs'][0].shape









    Out[15]:





(60, 9)



In [36]:

    
for key, val in dic.iteritems():
    print key, len(val)









    



inputs 59790
sequence_masks 59790
mobile_attrs 59790
decoder_inputs 59790
decoder_extra_inputs 59790
sku_ids 59790
targets 59790
sequence_lengths 59790



In [22]:

    
len(dic['targets'])









    Out[22]:





59790



In [23]:

    
len(dic['inputs'])









    Out[23]:





59790



In [29]:

    
args = np.argwhere([curin.shape != (60, 9) for curin in dic['inputs']]).flatten()
len(args)









    Out[29]:





403



In [33]:

    
args = list(args)



In [35]:

    
# for cur in dic['inputs'][args]:
#     print cur.shape
for arg in args:
    print dic['inputs'][arg].shape









    



(60, 8)
(60, 8)
(60, 8)
(60, 8)
(60, 8)
(60, 8)
(60, 8)
(60, 8)
(60, 8)
(60, 8)
(60, 8)
(60, 8)
(60, 8)
(60, 8)
(60, 8)
(60, 8)
(60, 8)
(60, 8)
(60, 8)
(60, 8)
(60, 8)
(60, 8)
(60, 8)
(60, 8)
(60, 8)
(60, 8)
(60, 8)
(60, 8)
(60, 8)
(60, 8)
(60, 8)
(60, 8)
(60, 8)
(60, 8)
(60, 8)
(60, 8)
(60, 8)
(60, 8)
(60, 8)
(60, 8)
(60, 8)
(60, 8)
(60, 8)
(60, 8)
(60, 8)
(60, 8)
(60, 8)
(60, 8)
(60, 8)
(60, 8)
(60, 8)
(60, 8)
(60, 8)
(60, 8)
(60, 8)
(60, 8)
(60, 8)
(60, 8)
(60, 8)
(60, 8)
(60, 8)
(60, 8)
(60, 8)
(60, 8)
(60, 8)
(60, 8)
(60, 8)
(60, 8)
(60, 8)
(60, 8)
(60, 8)
(60, 8)
(60, 8)
(60, 8)
(60, 8)
(60, 8)
(60, 8)
(60, 8)
(60, 8)
(60, 8)
(60, 8)
(60, 8)
(60, 8)
(60, 8)
(60, 8)
(60, 8)
(60, 8)
(60, 8)
(60, 8)
(60, 8)
(60, 8)
(60, 8)
(60, 8)
(60, 8)
(60, 8)
(60, 8)
(60, 8)
(60, 8)
(60, 8)
(60, 8)
(60, 8)
(60, 8)
(60, 8)
(60, 8)
(60, 8)
(60, 8)
(60, 8)
(60, 8)
(60, 8)
(60, 8)
(60, 8)
(60, 8)
(60, 8)
(60, 8)
(60, 8)
(60, 8)
(60, 8)
(60, 8)
(60, 8)
(60, 8)
(60, 8)
(60, 8)
(60, 8)
(60, 8)
(60, 8)
(60, 8)
(60, 8)
(60, 8)
(60, 8)
(60, 8)
(60, 8)
(60, 8)
(60, 8)
(60, 8)
(60, 8)
(60, 8)
(60, 8)
(60, 8)
(60, 8)
(60, 8)
(60, 8)
(60, 8)
(60, 8)
(60, 8)
(60, 8)
(60, 8)
(60, 8)
(60, 8)
(60, 8)
(60, 8)
(60, 8)
(60, 8)
(60, 8)
(60, 8)
(60, 8)
(60, 8)
(60, 8)
(60, 8)
(60, 8)
(60, 8)
(60, 8)
(60, 8)
(60, 8)
(60, 8)
(60, 8)
(60, 8)
(60, 8)
(60, 8)
(60, 8)
(60, 8)
(60, 8)
(60, 8)
(60, 8)
(60, 8)
(60, 8)
(60, 8)
(60, 8)
(60, 8)
(60, 8)
(60, 8)
(60, 8)
(60, 8)
(60, 8)
(60, 8)
(60, 8)
(60, 8)
(60, 8)
(60, 8)
(60, 8)
(60, 8)
(60, 8)
(60, 8)
(60, 8)
(60, 8)
(60, 8)
(60, 8)
(60, 8)
(60, 8)
(60, 8)
(60, 8)
(60, 8)
(60, 8)
(60, 8)
(60, 8)
(60, 8)
(60, 8)
(60, 8)
(60, 8)
(60, 8)
(60, 8)
(60, 8)
(60, 8)
(60, 8)
(60, 8)
(60, 8)
(60, 8)
(60, 8)
(60, 8)
(60, 8)
(60, 8)
(60, 8)
(60, 8)
(60, 8)
(60, 8)
(60, 8)
(60, 8)
(60, 8)
(60, 8)
(60, 8)
(60, 8)
(60, 8)
(60, 8)
(60, 8)
(60, 8)
(60, 8)
(60, 8)
(60, 8)
(60, 8)
(60, 8)
(60, 8)
(60, 8)
(60, 8)
(60, 8)
(60, 8)
(60, 8)
(60, 8)
(60, 8)
(60, 8)
(60, 8)
(60, 8)
(60, 8)
(60, 8)
(60, 8)
(60, 8)
(60, 8)
(60, 8)
(60, 8)
(60, 8)
(60, 8)
(60, 8)
(60, 8)
(60, 8)
(60, 8)
(60, 8)
(60, 8)
(60, 8)
(60, 8)
(60, 8)
(60, 8)
(60, 8)
(60, 8)
(60, 8)
(60, 8)
(60, 8)
(60, 8)
(60, 8)
(60, 8)
(60, 8)
(60, 8)
(60, 8)
(60, 8)
(60, 8)
(60, 8)
(60, 8)
(60, 8)
(60, 8)
(60, 8)
(60, 8)
(60, 8)
(60, 8)
(60, 8)
(60, 8)
(60, 8)
(60, 8)
(60, 8)
(60, 8)
(60, 8)
(60, 8)
(60, 8)
(60, 8)
(60, 8)
(60, 8)
(60, 8)
(60, 8)
(60, 8)
(60, 8)
(60, 8)
(60, 8)
(60, 8)
(60, 8)
(60, 8)
(60, 8)
(60, 8)
(60, 8)
(60, 8)
(60, 8)
(60, 8)
(60, 8)
(60, 8)
(60, 8)
(60, 8)
(60, 8)
(60, 8)
(60, 8)
(60, 8)
(60, 8)
(60, 8)
(60, 8)
(60, 8)
(60, 8)
(60, 8)
(60, 8)
(60, 8)
(60, 8)
(60, 8)
(60, 8)
(60, 8)
(60, 8)
(60, 8)
(60, 8)
(60, 8)
(60, 8)
(60, 8)
(60, 8)
(60, 8)
(60, 8)
(60, 8)
(60, 8)
(60, 8)
(60, 8)
(60, 8)
(60, 8)
(60, 8)
(60, 8)
(60, 8)
(60, 8)
(60, 8)
(60, 8)
(60, 8)
(60, 8)
(60, 8)
(60, 8)
(60, 8)
(60, 8)
(60, 8)
(60, 8)
(60, 8)
(60, 8)
(60, 8)
(60, 8)
(60, 8)
(60, 8)
(60, 8)
(60, 8)
(60, 8)
(60, 8)
(60, 8)
(60, 8)
(60, 8)
(60, 8)
(60, 8)
(60, 8)
(60, 8)
(60, 8)
(60, 8)
(60, 8)
(60, 8)
(60, 8)
(60, 8)
(60, 8)
(60, 8)
(60, 8)
(60, 8)
(60, 8)
(60, 8)
(60, 8)
(60, 8)
(60, 8)
(60, 8)
(60, 8)
(60, 8)
(60, 8)
(60, 8)



In [37]:

    
keep_args = set(range(len(dic['inputs']))).difference(args)



In [41]:

    
assert len(keep_args) == len(dic['inputs']) - len(args)
keep_args = list(keep_args)



In [43]:

    
newdic = {
    'inputs': np.array([dic[key][keep_arg] for keep_arg in keep_args])
}
newdic['inputs'].shape









    Out[43]:





(59387, 60, 9)



In [44]:

    
for key, val in dic.iteritems():
    if key == 'inputs':
        continue
    newdic[key] = dic[key][keep_args]
    print newdic[key].shape









    



(59387, 60)
(59387, 139)
(59387, 30, 7)
(59387, 30, 6)
(59387,)
(59387, 30)
(59387,)



In [45]:

    
#dic['inputs'] = np.array(dic['inputs'])
np.array(newdic['inputs']).shape









    Out[45]:





(59387, 60, 9)



In [46]:

    
np.savez(npz_augmented, **newdic)



In [11]:

    
npz = np.load(npz_full)
for key, val in npz.iteritems():
    print key,val.shape









    



inputs (62020, 60, 7)
sequence_masks (62020, 60)
mobile_attrs (62020, 139)
decoder_inputs (62020, 30, 7)
decoder_extra_inputs (62020, 30, 6)
sku_ids (62020,)
targets (62020, 30)
sequence_lengths (62020,)



In [12]:

    
my_current_ind = 100 #just because



In [13]:

    
target_ind = npz['sku_ids'][my_current_ind]
target_ind #this is the SKU ID we are interested in now









    Out[13]:





10084353



In [14]:

    
from relevant_deals import RelevantDeals



In [15]:

    
rd = RelevantDeals()



In [16]:

    
all_deals = rd.getSome(target_ind)



In [17]:

    
relevancy_order = 2 #2 extra sku ids to keep



In [18]:

    
relevant_sku_ids = all_deals[:relevancy_order]
relevant_sku_ids









    Out[18]:





array([9875453, 8720352])



In [19]:

    
#get everything normalized globally
df = PriceHistory27DatasetGenerator(random_state=random_state).global_norm_scale(
    pd.read_csv(csv_in, index_col=0, quoting=csv.QUOTE_ALL, encoding='utf-8')
)



In [20]:

    
yearday = 1
month_ind = 2
weekday_ind = 3
year_ind = 4
yearweek_ind = 5
day_ind = 6

year_ind, month_ind, day_ind









    Out[20]:





(4, 2, 6)



In [21]:

    
print np.unique(npz['inputs'][my_current_ind][:, 1]) #<--- this is year day
print np.unique(npz['inputs'][my_current_ind][:, 2]) #<--- this is month
print np.unique(npz['inputs'][my_current_ind][:, 3]) #<--- this is weekday
print np.unique(npz['inputs'][my_current_ind][:, 4]) #<---- this is the year
print np.unique(npz['inputs'][my_current_ind][:, 5]) #<--- this is year week
print np.unique(npz['inputs'][my_current_ind][:, 6]) #<--- this is month day









    



[  56.   57.   58.   59.   60.   61.   62.   63.   64.   65.   66.   67.
   68.   69.   70.   71.   72.   73.   74.   75.   76.   77.   78.   79.
   80.   81.   82.   83.   84.   85.   86.   87.   88.   89.   90.   91.
   92.   93.   94.   95.   96.   97.   98.   99.  100.  101.  102.  103.
  104.  105.  106.  107.  108.  109.  110.  111.  112.  113.  114.  115.]
[ 2.  3.  4.]
[ 0.  1.  2.  3.  4.  5.  6.]
[ 2017.]
[  8.   9.  10.  11.  12.  13.  14.  15.  16.  17.]
[  1.   2.   3.   4.   5.   6.   7.   8.   9.  10.  11.  12.  13.  14.  15.
  16.  17.  18.  19.  20.  21.  22.  23.  24.  25.  26.  27.  28.  29.  30.
  31.]



In [22]:

    
the_input = npz['inputs'][my_current_ind]



In [23]:

    
start_item = the_input[0].astype(np.int)
start_item.shape









    Out[23]:





(7,)



In [24]:

    
start_date = "{}-{:02d}-{:02d}".format(start_item[year_ind], start_item[month_ind], start_item[day_ind])
#this format is useful because we can compare them as strings without conversion
start_date









    Out[24]:





'2017-02-25'



In [25]:

    
end_item = npz['inputs'][my_current_ind][-1].astype(np.int)
end_date = "{}-{:02d}-{:02d}".format(end_item[year_ind], end_item[month_ind], end_item[day_ind])
end_date









    Out[25]:





'2017-04-25'

So for the same date window if we find data from the relevant deal we are good to go
If we do NOT find them ... we could go and search the next deal within some limit of course
So if we exceed this limit, what could we do?

We could pass zeros is one solution
We could drop this item completely is another solution



In [26]:

    
# for one sku id
cur_sku_id = relevant_sku_ids[0]
cur_sku_id









    Out[26]:





9875453



In [ ]:



In [27]:

    
seq = PriceHistory27DatasetGenerator.extractSequence(df.loc[cur_sku_id])
len(seq)









    Out[27]:





210



In [28]:

    
check = seq.index[0] <= start_date and end_date <= seq.index[-1]
check









    Out[28]:





True



In [29]:

    
#extract the sequence of interest
begin_ind = np.argwhere(seq.index == start_date).flatten()[0]
begin_ind









    Out[29]:





100



In [30]:

    
ending_ind = np.argwhere(seq.index == end_date).flatten()[0]
ending_ind









    Out[30]:





159



In [31]:

    
seq_of_interest = seq[begin_ind:ending_ind+1]
seq_of_interest.shape









    Out[31]:





(60,)



In [32]:

    
the_input.shape









    Out[32]:





(60, 7)



In [33]:

    
sns.tsplot(seq_of_interest)
plt.show()



In [34]:

    
unbiased = PriceHistory27DatasetGenerator.removeBiasFromSeq(seq_of_interest)
sns.tsplot(unbiased)
plt.show()



In [35]:

    
ready_deal = unbiased.values[np.newaxis].T
ready_deal.shape









    Out[35]:





(60, 1)



In [36]:

    
newinput = np.hstack((the_input, ready_deal))
newinput.shape









    Out[36]:





(60, 8)



In [37]:

    
np.array([unbiased, unbiased]).T









    Out[37]:





(60, 2)



In [40]:

    
aa = np.array([unbiased, unbiased]).T



In [42]:

    
ee = np.array([]).T
ee.shape









    Out[42]:





(0,)



In [43]:

    
np.hstack((the_input, ee)).shape









    



---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
<ipython-input-43-becc3f89ac5f> in <module>()
----> 1 np.hstack((the_input, ee)).shape

/home/student/anaconda2/envs/dis/lib/python2.7/site-packages/numpy/core/shape_base.pyc in hstack(tup)
    291         return _nx.concatenate(arrs, 0)
    292     else:
--> 293         return _nx.concatenate(arrs, 1)
    294 
    295 

ValueError: all the input arrays must have same number of dimensions



In [47]:









    Out[47]:





[0, 1, 2]



In [46]:









    Out[46]:





[-3, -2, -1]



In [48]:

    
mylist = []
for ii, jj in zip(range(-3, 0), range(0, 3)):
    mylist.append((ii, jj))



In [55]:

    
mylist









    Out[55]:





[(-3, 0), (-2, 1), (-1, 2)]



In [58]:

    
map(list, zip(*mylist))[1]









    Out[58]:





[0, 1, 2]



In [118]:

    
sns.tsplot(aa[:, 1])









    Out[118]:





<matplotlib.axes._subplots.AxesSubplot at 0x7efbda885510>

This is taking longer than expected but ok



In [28]:

    
%%time
dic = PriceHistory27DatasetGenerator.merge_date_info(npz_path=npz_full)









    



CPU times: user 5min 21s, sys: 464 ms, total: 5min 22s
Wall time: 5min 22s



In [30]:

    
for key, val in dic.iteritems():
    print val.shape









    



(63548, 60, 7)
(63548, 60)
(63548,)
(63548, 30, 7)
(63548, 30, 6)
(63548,)
(63548, 30)



In [33]:

    
# npz_full_with_date = ph_data_path + '/price_history_dp_60to30_63548_date_info.npz'
# np.savez(npz_full_with_date, **dic)

Combine Data



In [31]:

    
combinator = PriceHistoryMobileAttrsCombinator()



In [34]:

    
%%time
dic, inds, count_key_errors, key_errors = combinator.combine(npz_in=npz_full_with_date)









    



CPU times: user 10.4 s, sys: 464 ms, total: 10.9 s
Wall time: 10.8 s



In [36]:

    
for key, val in dic.iteritems():
    print val.shape









    



(62020, 60, 7)
(62020,)
(62020,)
(62020, 139)
(62020, 30, 7)
(62020, 30, 6)
(62020, 60)
(62020, 30)



In [37]:

    
npz_full_mobattrs_date = ph_data_path + '/price_history_mobattrs_date_dp_60to30_62020.npz'



In [38]:

    
np.savez(npz_full_mobattrs_date, **dic)



In [39]:

    
count_key_errors#, key_errors









    Out[39]:





1528

Train - Test Split



In [42]:

    
npz_train_mobattrs_date = ph_data_path + '/price_history_mobattrs_date_dp_60to30_62020_train.npz'
npz_test_mobattrs_date = ph_data_path + '/price_history_mobattrs_date_dp_60to30_62020_test.npz'



In [43]:

    
PriceHistory27DatasetGenerator.train_test_split(fullpath=npz_full_mobattrs_date, test_size=6200,
                                                train_path=npz_train_mobattrs_date,
                                                test_path=npz_test_mobattrs_date, random_state=random_state)



In [50]:

    
npz_train_mobattrs_date_small = ph_data_path + '/price_history_mobattrs_date_dp_60to30_62020_6000_train.npz'



In [51]:

    
PriceHistory27DatasetGenerator.create_subsampled(inpath=npz_train_mobattrs_date, target_size=6000,
                                               outpath=npz_train_mobattrs_date_small, random_state=random_state)



In [ ]: