In [78]:

    
# -*- coding: UTF-8 -*-
# Enable IPython's autoreload extension in mode 2, so that imported modules are
# reloaded before each cell executes and edits to the project's helper modules
# are picked up without restarting the kernel.
#%load_ext autoreload
%reload_ext autoreload
%autoreload 2



In [79]:

    
from __future__ import division
import tensorflow as tf
from os import path, remove
import numpy as np
import pandas as pd
import csv
from sklearn.model_selection import StratifiedShuffleSplit
from time import time
from matplotlib import pyplot as plt
import seaborn as sns
from mylibs.jupyter_notebook_helper import show_graph, renderStatsList, renderStatsCollection, \
    renderStatsListWithLabels, renderStatsCollectionOfCrossValids
from tensorflow.contrib import rnn
from tensorflow.contrib import learn
import shutil
from tensorflow.contrib.learn.python.learn import learn_runner
from mylibs.tf_helper import getDefaultGPUconfig
from sklearn.metrics import r2_score
from mylibs.py_helper import factors
from fastdtw import fastdtw
from collections import OrderedDict
from scipy.spatial.distance import euclidean
from statsmodels.tsa.stattools import coint
from common import get_or_run_nn
from data_providers.price_history_seq2seq_data_provider import PriceHistorySeq2SeqDataProvider
from skopt.space.space import Integer, Real
from skopt import gp_minimize
from skopt.plots import plot_convergence
import pickle
import inspect
import dill
import sys
#from models.price_history_21_seq2seq_dyn_dec_ins import PriceHistorySeq2SeqDynDecIns
from data_providers.PriceHistoryMobileAttrsCombinator import PriceHistoryMobileAttrsCombinator
from sklearn.neighbors import NearestNeighbors
from datetime import datetime
from data_providers.price_hist_with_relevant_deals import PriceHistWithRelevantDeals
from data_providers.price_history_29_dataset_per_mobile_phone import PriceHistoryDatasetPerMobilePhone



In [80]:

    
# Global settings shared by the cells below.
dtype = tf.float32
seed = 16011984
# Seeded NumPy generator; it is handed to the dataset generator further down.
random_state = np.random.RandomState(seed=seed)
# Project helper from mylibs.tf_helper - presumably a TensorFlow session
# configuration for GPU use; confirm in that module.
config = getDefaultGPUconfig()
n_jobs = 1
# Render matplotlib figures inline in the notebook.
%matplotlib inline

Step 0 - hyperparams

vocab_size (all the potential words you could have, i.e. the output classes in the translation case) and max sequence length are the SAME thing.

In translation, the decoder RNN usually has the same number of hidden units as the encoder RNN. In our case there does not really seem to be such a relationship; we can experiment and find out later, but it is not a priority right now.



In [81]:

    
# Sequence hyperparameters. The ARIMA cells below split each 90-column window
# using target_len: the last target_len columns are the targets and the
# remaining 60 columns (matching input_len) are the inputs.
input_len = 60
target_len = 30
# batch_size and with_EOS are not referenced by the ARIMA cells in this notebook.
batch_size = 50
with_EOS = False



In [82]:

    
# Price-history CSV handed to the dataset generator below as csv_in
# (path is relative to the notebook's working directory).
csv_in = '../price_history_03_seq_start_suddens_trimmed.csv'

Actual Run



In [83]:

    
# Root of the shared data folder, relative to the notebook's working directory.
data_path = '../../../../Dropbox/data'
ph_data_path = data_path + '/price_history'
# Fail early if the folder is missing. Because the path is relative, the message
# reports the resolved absolute location to make a wrong working directory obvious.
assert path.isdir(ph_data_path), \
    "price history folder not found: {}".format(path.abspath(ph_data_path))



In [84]:

    
# .npz file inside the price-history folder; passed as npz_out to
# genSaveDictionary below.
npz_full = ph_data_path + '/price_history_per_mobile_phone.npz'



In [85]:

    
# Project dataset generator; it receives the seeded random_state defined above.
dataset_gen = PriceHistoryDatasetPerMobilePhone(random_state=random_state)



In [86]:

    
# Build the per-SKU dictionary from the CSV and store it at npz_full.
# The window length is derived from the hyperparameters instead of being
# hard-coded: later cells split every window into its last `target_len`
# columns (targets) and the columns before them (inputs), so the window must
# be exactly input_len + target_len long (60 + 30 = 90 with current values).
dic = dataset_gen.genSaveDictionary(csv_in=csv_in,
                                    window_len=input_len + target_len,
                                    npz_out=npz_full)
dic.keys()









    Out[86]:





['9820435',
 '8332719',
 '7357394',
 '9351583',
 '8012281',
 '7655259',
 '6253594',
 '8138004',
 '10576161',
 '7408246',
 '7967487',
 '9130370',
 '8779166',
 '7653378',
 '10536998',
 '8669008',
 '9896674',
 '9941958',
 '7321695',
 '10499793',
 '9259167',
 '8221632',
 '9986194',
 '9898913',
 '10129132',
 '9055991',
 '10327862',
 '7364333',
 '8176770',
 '7507905',
 '3656048',
 '10112367',
 '8695009',
 '8735993',
 '10242128',
 '8648639',
 '8414311',
 '9875453',
 '7508833',
 '9426447',
 '6261140',
 '7294741',
 '9674179',
 '9064268',
 '7321837',
 '10409332',
 '10020902',
 '10002261',
 '6870822',
 '7351792',
 '10620877',
 '7620866',
 '9301596',
 '9981614',
 '8087014',
 '9941623',
 '5308163',
 '9469401',
 '8938578',
 '10046764',
 '10468270',
 '9035623',
 '9558425',
 '10340158',
 '7514433',
 '9758209',
 '9757914',
 '9028851',
 '10339856',
 '7509017',
 '6989466',
 '6999080',
 '10340038',
 '10242193',
 '9306016',
 '7401406',
 '8379645',
 '8379646',
 '9783218',
 '9783213',
 '9956199',
 '9268867',
 '9783217',
 '9783216',
 '9783215',
 '6998933',
 '7634031',
 '8999919',
 '6756290',
 '6536691',
 '7448041',
 '9017595',
 '8864711',
 '10550085',
 '8438203',
 '10430833',
 '7504732',
 '10620558',
 '10456126',
 '8130418',
 '9549977',
 '9562308',
 '10251789',
 '7811257',
 '7541851',
 '6808160',
 '9473245',
 '8095786',
 '8770390',
 '10513102',
 '10446143',
 '8797638',
 '8873832',
 '9272109',
 '8909629',
 '6044652',
 '7723431',
 '9287995',
 '9395784',
 '8515393',
 '9561760',
 '9098838',
 '10598138',
 '7129068',
 '9333571',
 '7780575',
 '8972925',
 '10117891',
 '9454972',
 '10620346',
 '10430368',
 '8684161',
 '10263538',
 '10327727',
 '9669402',
 '8820025',
 '8046417',
 '8783760',
 '10021828',
 '7946058',
 '10646927',
 '8735524',
 '9039085',
 '10372691',
 '3136502',
 '9536994',
 '7599025',
 '7621172',
 '6317061',
 '9338643',
 '10529304',
 '6264653',
 '8414880',
 '7335154',
 '8515183',
 '9192579',
 '10537404',
 '10019886',
 '9524580',
 '9383253',
 '8628040',
 '7282995',
 '9730380',
 '8082430',
 '6487622',
 '10000553',
 '9535485',
 '10455691',
 '7957675',
 '9555755',
 '9654138',
 '8136245',
 '9535232',
 '10000323',
 '8791213',
 '8909064',
 '9524579',
 '9524578',
 '8239538',
 '9759446',
 '9815040',
 '9898380',
 '10339621',
 '8153953',
 '7753482',
 '8130646',
 '9845360',
 '9034797',
 '9374090',
 '10327645',
 '9889976',
 '10603019',
 '8913842',
 '9618789',
 '8693332',
 '10093039',
 '8784094',
 '9685830',
 '8340504',
 '8435610',
 '5983545',
 '8317727',
 '9624747',
 '9216668',
 '10106328',
 '9844865',
 '10250011',
 '4763156',
 '10455448',
 '7333160',
 '10351727',
 '10550505',
 '7988857',
 '7050290',
 '10512952',
 '8864706',
 '8934572',
 '9232277',
 '8490920',
 '8286967',
 '9658075',
 '9416983',
 '9468414',
 '5535970',
 '9517024',
 '8742945',
 '10081282',
 '7219275',
 '10603203',
 '9547257',
 '6696391',
 '8597734',
 '8842715',
 '9668098',
 '7751495',
 '9259567',
 '9668091',
 '7288733',
 '5450546',
 '8863487',
 '9132174',
 '10550986',
 '9615391',
 '9925594',
 '7562927',
 '7257310',
 '10455301',
 '10549703',
 '9192081',
 '8116861',
 '7310752',
 '6744233',
 '10620599',
 '6733376',
 '8395434',
 '10351487',
 '9969844',
 '9410037',
 '7180618',
 '10263525',
 '9304030',
 '9995201',
 '10445742',
 '9615498',
 '9615369',
 '8444503',
 '10296471',
 '8349172',
 '8284007',
 '10513434',
 '10445832',
 '7264929',
 '6957383',
 '6724273',
 '9561798',
 '9672674',
 '7630281',
 '10036784',
 '9445080',
 '8384618',
 '9468830',
 '10597333',
 '10569851',
 '8984512',
 '8333136',
 '10430269',
 '9724038',
 '10297324',
 '8515689',
 '7487368',
 '10631640',
 '7423345',
 '6808527',
 '7904356',
 '10020336',
 '10551073',
 '8436609',
 '8913485',
 '8437937',
 '7992726',
 '8153619',
 '9001596',
 '9608073',
 '8874019',
 '8294272',
 '10468510',
 '7259273',
 '9542492',
 '8348233',
 '6666214',
 '9402788',
 '7314178',
 '8460398',
 '10199444',
 '8864161',
 '7344391',
 '6871862',
 '6601918',
 '8720352',
 '10328199',
 '9189255',
 '9970245',
 '6871261',
 '9043554',
 '9198786',
 '10057571',
 '9489172',
 '9192250',
 '8880414',
 '7750303',
 '10643104',
 '7505653',
 '8281639',
 '10617623',
 '7750919',
 '9668397',
 '9664652',
 '8107566',
 '9329087',
 '10350760',
 '8864604',
 '3783654',
 '9520852',
 '6918686',
 '7335955',
 '9501836',
 '10000278',
 '7356761',
 '9595109',
 '10282598',
 '10619619',
 '9360800',
 '9669553',
 '8244331',
 '9409925',
 '8617989',
 '9824249',
 '9332994',
 '10644470',
 '8645920',
 '8758228',
 '10619076',
 '10315304',
 '8874945',
 '8874195',
 '9352876',
 '9409718',
 '8221631',
 '9079935',
 '9107905',
 '5898447',
 '9542574',
 '9898535',
 '9042908',
 '6592480',
 '8311334',
 '8145112',
 '10315447',
 '9815037',
 '10373479',
 '9815039',
 '8435811',
 '8880028',
 '9655436',
 '9500739',
 '9550139',
 '7621093',
 '8864337',
 '8256116',
 '9572084',
 '6798407',
 '7311062',
 '10084353',
 '10001441',
 '9445259',
 '10603460',
 '8669043',
 '9757586',
 '5804541',
 '8995944',
 '5558359',
 '10529171',
 '9195103',
 '9561935',
 '7360931',
 '7945834',
 '9877558',
 '6677566',
 '7945421',
 '8436601',
 '9672740',
 '10085889',
 '10252434',
 '10019997',
 '9188134',
 '10283639',
 '9981788',
 '9412950',
 '7426369',
 '6317107',
 '8873827',
 '9633962',
 '8988217',
 '7697920',
 '9419455',
 '9105758',
 '6487975',
 '9177791',
 '10315407',
 '6933062',
 '9173758',
 '7868248',
 '10065912',
 '9517102',
 '7860520',
 '6260915',
 '8985553']

Arima



In [104]:

    
from arima.arima_estimator import ArimaEstimator
import warnings
from collections import OrderedDict
from mylibs.py_helper import cartesian_coord
from arima.arima_cv import ArimaCV



In [91]:

    
# Search grid for the ARIMA orders. Insertion order matters: the best
# combination is later read back positionally as (p, d, q).
parameters = OrderedDict()
parameters['p_auto_regression_order'] = range(6)  # candidates 0-5
parameters['d_integration_level'] = range(3)  # candidates 0-2
parameters['q_moving_average'] = range(6)  # candidates 0-5



In [93]:

    
# All combinations of the candidate values in `parameters`; the recorded shape
# (108, 3) corresponds to 6 * 3 * 6 combinations of the 3 parameters.
cart = cartesian_coord(*parameters.values())
cart.shape









    Out[93]:





(108, 3)



In [105]:

    
# Pick one SKU entry to experiment with. dict.values() is an indexable list
# only on Python 2; wrapping it in list() gives the same first element there
# and also works with the non-indexable view returned on Python 3.
cur_sku = list(dic.values())[0]
cur_sku.keys()









    Out[105]:





['test', 'train', 'train_dates']



In [110]:

    
# 'train' entry of the chosen SKU; the recorded run shows a (31, 90) matrix
# (presumably one 90-step window per row - confirm in the dataset generator).
full_mat = cur_sku['train']
full_mat.shape









    Out[110]:





(31, 90)



In [111]:

    
# Display the number of trailing columns used as targets in the splits below.
target_len









    Out[111]:





30



In [113]:

    
# Inputs: every column of each row except the last target_len ones.
inputs = full_mat[:, :-target_len]
inputs.shape









    Out[113]:





(31, 60)



In [115]:

    
# Targets: the last target_len columns of each row.
targets = full_mat[:, -target_len:]
targets.shape









    Out[115]:





(31, 30)



In [117]:

    
%%time
with warnings.catch_warnings():
    warnings.filterwarnings("ignore")
    ae = ArimaEstimator(p_auto_regression_order=0, d_integration_level=1, q_moving_average=0, easy_mode=True)
    ae.fit(inputs, targets).score(inputs, targets)









    



CPU times: user 15.4 s, sys: 53.1 s, total: 1min 8s
Wall time: 9.06 s



In [118]:

    
# File handed to ArimaCV.cross_validate below as score_dic_filepath
# (presumably where the score dictionary is persisted - confirm in ArimaCV).
score_dic_filepath = data_path + "/arima/scoredic_testing.npy"



In [119]:

    
# Resolve the relative path to see where the score file actually lives.
path.abspath(score_dic_filepath)









    Out[119]:





'/home/student/Dropbox/data/arima/scoredic_testing.npy'



In [121]:

    
%%time
# Grid search over every parameter combination in `cart`, with warnings
# silenced for the duration of the call. The recorded run took roughly four
# hours of wall time, so re-run this cell deliberately.
with warnings.catch_warnings():
    warnings.filterwarnings("ignore")
    scoredic = ArimaCV.cross_validate(inputs=inputs, targets=targets, cartesian_combinations=cart,
                                      score_dic_filepath=score_dic_filepath, easy_mode=True)









    



CPU times: user 4h 41min 23s, sys: 13h 11min 9s, total: 17h 52min 33s
Wall time: 4h 4min 51s



In [ ]:

    
# 4h 4min 51s / 108 parameter combinations ~= 136 seconds per combination!



In [127]:

    
# Turn the score dictionary into a 2-column array: column 0 holds the keys
# (parameter combinations), column 1 the scores.
# - items() replaces the Python-2-only iteritems() and yields the same pairs.
# - dtype=object is explicit because each row pairs a tuple with a scalar;
#   recent NumPy versions refuse to infer an array from such ragged input.
arr = np.array(list(scoredic.items()), dtype=object)
arr.shape









    Out[127]:





(108, 2)



In [137]:

    
# Display NumPy's NaN constant. The canonical lowercase name is used because
# the np.NaN alias no longer exists in NumPy 2.0 and later.
np.nan









    Out[137]:





nan



In [146]:

    
import math



In [149]:

    
# NaN never compares equal to anything, including another NaN, so this is
# False. np.nan is used because the np.NaN alias was removed in NumPy 2.0.
float('nan') == np.nan









    Out[149]:





False



In [152]:

    
# Drop the combinations whose score is NaN. NaN is the only value that differs
# from itself, so the self-comparison marks the NaN entries of the score
# column without calling np.isnan() on it.
scores = arr[:, 1]
nan_mask = scores != scores
filtered_arr = arr[np.logical_not(nan_mask)]
filtered_arr.shape









    Out[152]:





(81, 2)



In [154]:

    
# Plot the non-NaN scores in the order they appear in filtered_arr.
valid_scores = filtered_arr[:, 1]
plt.plot(valid_scores)









    Out[154]:





[<matplotlib.lines.Line2D at 0x7fd2a3a19cd0>]



In [156]:

    
# Row index of the lowest score among the non-NaN results.
minarg = filtered_arr[:, 1].argmin()
minarg









    Out[156]:





35



In [161]:

    
# Column 0 holds the parameter combination belonging to each score, so this is
# the combination with the lowest score.
best_params = filtered_arr[minarg, 0]
best_params









    Out[161]:





(0, 1, 0)



In [173]:

    
# 'test' entry of the same SKU; everything except the last target_len points
# is used as the input sequence.
test_mat = cur_sku['test']
test_ins = test_mat[:-target_len]
test_ins.shape









    Out[173]:





(60,)



In [171]:

    
# The last target_len points of the test series are the values to predict.
test_tars = test_mat[-target_len:]
test_tars.shape









    Out[171]:





(30,)



In [174]:

    
# Add a leading axis so the single input sequence becomes a one-row 2-D array,
# the same layout as the training inputs.
test_ins_vals = test_ins.values[np.newaxis, :]
test_ins_vals.shape









    Out[174]:





(1, 60)



In [172]:

    
# Add a leading axis so the single target sequence becomes a one-row 2-D array,
# the same layout as the training targets.
test_tars_vals = test_tars.values[np.newaxis, :]
test_tars_vals.shape









    Out[172]:





(1, 30)

Testing with easy mode on



In [176]:

    
%%time
with warnings.catch_warnings():
    warnings.filterwarnings("ignore")
    ae = ArimaEstimator(p_auto_regression_order=best_params[0],
                        d_integration_level=best_params[1],
                        q_moving_average=best_params[2],
                        easy_mode=True)
    score = ae.fit(test_ins_vals, test_tars_vals).score(test_ins_vals, test_tars_vals)









    



CPU times: user 316 ms, sys: 1.2 s, total: 1.51 s
Wall time: 200 ms



In [179]:

    
# Score of the run with easy_mode=True.
score









    Out[179]:





0.0084708222952833978



In [184]:

    
# Overlay the estimator's predictions and the real target values on one axes.
fig, ax = plt.subplots(figsize=(15, 7))
ax.plot(ae.preds.flatten(), label='preds')
test_tars.plot(ax=ax, label='real')
ax.legend()
plt.show()

Testing with easy mode off



In [185]:

    
%%time
with warnings.catch_warnings():
    warnings.filterwarnings("ignore")
    ae = ArimaEstimator(p_auto_regression_order=best_params[0],
                        d_integration_level=best_params[1],
                        q_moving_average=best_params[2],
                        easy_mode=False)
    score = ae.fit(test_ins_vals, test_tars_vals).score(test_ins_vals, test_tars_vals)









    



CPU times: user 276 ms, sys: 956 ms, total: 1.23 s
Wall time: 203 ms



In [186]:

    
# Score of the run with easy_mode=False.
score









    Out[186]:





0.16127036409641349



In [187]:

    
# Overlay the estimator's predictions and the real target values on one axes.
fig, ax = plt.subplots(figsize=(15, 7))
ax.plot(ae.preds.flatten(), label='preds')
test_tars.plot(ax=ax, label='real')
ax.legend()
plt.show()

Conclusion

If you train in easy mode, the model ends up relying only on the previous value to make its predictions. This makes the task much easier, but in reality we might not have this advantage.



In [ ]: