In [1]:
# -*- coding: UTF-8 -*-
#%load_ext autoreload
%reload_ext autoreload
%autoreload 2

In [2]:
from __future__ import division
import tensorflow as tf
from os import path, remove
import numpy as np
import pandas as pd
import csv
from sklearn.model_selection import StratifiedShuffleSplit
from time import time
from matplotlib import pyplot as plt
import seaborn as sns
from mylibs.jupyter_notebook_helper import show_graph, renderStatsList, renderStatsCollection, \
    renderStatsListWithLabels, renderStatsCollectionOfCrossValids
from tensorflow.contrib import rnn
from tensorflow.contrib import learn
import shutil
from tensorflow.contrib.learn.python.learn import learn_runner
from mylibs.tf_helper import getDefaultGPUconfig
from sklearn.metrics import r2_score
from mylibs.py_helper import factors
from fastdtw import fastdtw
from collections import OrderedDict
from scipy.spatial.distance import euclidean
from statsmodels.tsa.stattools import coint
from common import get_or_run_nn
from data_providers.price_history_seq2seq_data_provider import PriceHistorySeq2SeqDataProvider
from skopt.space.space import Integer, Real
from skopt import gp_minimize
from skopt.plots import plot_convergence
import pickle
import inspect
import dill
import sys
#from models.price_history_21_seq2seq_dyn_dec_ins import PriceHistorySeq2SeqDynDecIns
from data_providers.PriceHistoryMobileAttrsCombinator import PriceHistoryMobileAttrsCombinator
from sklearn.neighbors import NearestNeighbors
from datetime import datetime
from data_providers.price_hist_with_relevant_deals import PriceHistWithRelevantDeals
from data_providers.price_history_29_dataset_per_mobile_phone import PriceHistoryDatasetPerMobilePhone
from arima.arima_estimator import ArimaEstimator
import warnings
from mylibs.py_helper import cartesian_coord
from arima.arima_cv import ArimaCV


/home/studenthp/anaconda2/envs/dis/lib/python2.7/site-packages/statsmodels/compat/pandas.py:56: FutureWarning: The pandas.core.datetools module is deprecated and will be removed in a future version. Please use the pandas.tseries module instead.
  from pandas.core import datetools

In [3]:
dtype = tf.float32
seed = 16011984
random_state = np.random.RandomState(seed=seed)
config = getDefaultGPUconfig()
n_jobs = 1
%matplotlib inline

Step 0 - hyperparams

vocab_size (all the potential words you could have, i.e. the classification targets in the translation case) and the max sequence length are the SAME thing

In translation the decoder RNN's hidden units are usually the same size as the encoder RNN's hidden units. For our case there does not really seem to be such a relationship, but we can experiment and find out later; it is not a priority right now.
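As a reminder of that sizing choice, here is a minimal sketch assuming GRU cells from the TF 1.x contrib API imported above (the unit count is a hypothetical value, not one tuned in this notebook):

from tensorflow.contrib import rnn

num_units = 64  # hypothetical size, not tuned here
encoder_cell = rnn.GRUCell(num_units)
decoder_cell = rnn.GRUCell(num_units)  # same size, so the encoder's final state can initialise the decoder directly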


In [4]:
input_len = 60
target_len = 30
batch_size = 50
with_EOS = False

In [5]:
csv_in = '../price_history_03_seq_start_suddens_trimmed.csv'

Actual Run


In [6]:
data_path = '../../../../Dropbox/data'
ph_data_path = data_path + '/price_history'
assert path.isdir(ph_data_path)

In [7]:
npz_full = ph_data_path + '/price_history_per_mobile_phone.npz'

In [8]:
#dataset_gen = PriceHistoryDatasetPerMobilePhone(random_state=random_state)

In [9]:
dic = np.load(npz_full)
dic.keys()[:10]


Out[9]:
['9820435',
 '8332719',
 '7357394',
 '9351583',
 '7655259',
 '6253594',
 '8138004',
 '10576161',
 '7408246',
 '7967487']

ARIMA


In [10]:
parameters = OrderedDict([
    ('p_auto_regression_order', range(6)), #0-5
    ('d_integration_level', range(3)), #0-2
    ('q_moving_average', range(6)), #0-5
])

In [11]:
cart = cartesian_coord(*parameters.values())
cart.shape


Out[11]:
(108, 3)
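For orientation, cartesian_coord is assumed here to return the plain cartesian product of the three parameter ranges; an equivalent sketch with itertools (an illustration, not the helper's actual implementation):

import itertools
import numpy as np

combos = np.array(list(itertools.product(range(6), range(3), range(6))))
print combos.shape  # (108, 3): 6 p-values * 3 d-values * 6 q-values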

In [12]:
cur_key = dic.keys()[0]
cur_key


Out[12]:
'9820435'

In [13]:
cur_sku = dic[cur_key][()]
cur_sku.keys()


Out[13]:
['test', 'train', 'train_dates']

In [14]:
train_mat = cur_sku['train']
train_mat.shape


Out[14]:
(31, 90)

In [15]:
target_len


Out[15]:
30

In [16]:
inputs = train_mat[:, :-target_len]
inputs.shape


Out[16]:
(31, 60)

In [17]:
targets = train_mat[:, -target_len:]
targets.shape


Out[17]:
(31, 30)

In [18]:
easy_mode = False

In [19]:
score_dic_filepath = data_path + "/arima/scoredic_easy_mode_{}_{}.npy".format(easy_mode, cur_key)
path.abspath(score_dic_filepath)


Out[19]:
'/home/studenthp/Dropbox/data/arima/scoredic_easy_mode_False_9820435.npy'

In [20]:
%%time
with warnings.catch_warnings():
    warnings.filterwarnings("ignore")
    scoredic = ArimaCV.cross_validate(inputs=inputs, targets=targets, cartesian_combinations=cart,
                                      score_dic_filepath=score_dic_filepath, easy_mode=easy_mode)


CPU times: user 4 ms, sys: 0 ns, total: 4 ms
Wall time: 2.79 ms
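The 2.79 ms wall time indicates the scores were reloaded from score_dic_filepath rather than recomputed. For orientation, a minimal sketch of what the grid evaluation is assumed to do, with the persistence/resume logic omitted (an assumption, not the actual ArimaCV source):

def cross_validate_sketch(inputs, targets, cartesian_combinations, easy_mode):
    scores = OrderedDict()
    for p, d, q in cartesian_combinations:
        est = ArimaEstimator(p_auto_regression_order=p, d_integration_level=d,
                             q_moving_average=q, easy_mode=easy_mode)
        scores[(p, d, q)] = est.fit(inputs, targets).score(inputs, targets)
    return scores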

In [21]:
# Full grid originally took 4h 4min 51s for 108 cases => ~136 seconds per case

In [22]:
arr = np.array(list(scoredic.iteritems()))
arr.shape


Out[22]:
(108, 2)

In [23]:
# keep only the parameter combinations whose score is not NaN (x != x is True only for NaN)
filtered_arr = arr[np.logical_not(arr[:, 1] != arr[:, 1])]
filtered_arr.shape


Out[23]:
(102, 2)

In [24]:
plt.plot(filtered_arr[:, 1])


Out[24]:
[<matplotlib.lines.Line2D at 0x7fe2d219e110>]

In [25]:
minarg = np.argmin(filtered_arr[:, 1])
minarg


Out[25]:
62

In [26]:
best_params = filtered_arr[minarg, 0]
best_params


Out[26]:
(4, 1, 3)
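The winning order (p, d, q) = (4, 1, 3) means an AR order of 4, first-order differencing and an MA order of 3. A minimal sketch of the bare statsmodels call this corresponds to, on a synthetic series and with a simpler (1, 1, 1) order to keep the example robust (an assumption about the kind of API ArimaEstimator wraps, not its actual code):

import numpy as np
from statsmodels.tsa.arima_model import ARIMA

np.random.seed(0)
eps = np.random.randn(120)
diffs = np.zeros(120)
for t in range(1, 120):
    diffs[t] = 0.6 * diffs[t - 1] + eps[t] + 0.4 * eps[t - 1]  # ARMA(1, 1) dynamics for the differences
series = 100. + np.cumsum(diffs)[-60:]                         # synthetic I(1) 'price' series, 60 points

fitted = ARIMA(series, order=(1, 1, 1)).fit(disp=0)            # best_params would be plugged in the same way
point_forecast = fitted.forecast(steps=30)[0]                  # 30-step-ahead point forecast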

In [27]:
test_mat = cur_sku['test']
test_ins = test_mat[:-target_len]
test_ins.shape


Out[27]:
(60,)

In [28]:
test_tars = test_mat[-target_len:]
test_tars.shape


Out[28]:
(30,)

In [29]:
test_ins_vals = test_ins.values.reshape(1, -1)
test_ins_vals.shape


Out[29]:
(1, 60)

In [30]:
test_tars_vals = test_tars.values.reshape(1, -1)
test_tars_vals.shape


Out[30]:
(1, 30)

Testing with easy mode on


In [31]:
%%time
with warnings.catch_warnings():
    warnings.filterwarnings("ignore")
    ae = ArimaEstimator(p_auto_regression_order=best_params[0],
                        d_integration_level=best_params[1],
                        q_moving_average=best_params[2],
                        easy_mode=True)
    score = ae.fit(test_ins_vals, test_tars_vals).score(test_ins_vals, test_tars_vals)


CPU times: user 6.86 s, sys: 7.2 s, total: 14.1 s
Wall time: 6.02 s

In [32]:
score


Out[32]:
6.567110444670174

In [33]:
plt.figure(figsize=(15,7))
plt.plot(ae.preds.flatten(), label='preds')
test_tars.plot(label='real')
plt.legend()
plt.show()


Testing with easy mode off


In [34]:
%%time
with warnings.catch_warnings():
    warnings.filterwarnings("ignore")
    ae = ArimaEstimator(p_auto_regression_order=best_params[0],
                        d_integration_level=best_params[1],
                        q_moving_average=best_params[2],
                        easy_mode=False)
    score = ae.fit(test_ins_vals, test_tars_vals).score(test_ins_vals, test_tars_vals)


CPU times: user 6.75 s, sys: 7.15 s, total: 13.9 s
Wall time: 5.81 s

In [35]:
score


Out[35]:
72.308500086941933

In [36]:
plt.figure(figsize=(15,7))
plt.plot(ae.preds.flatten(), label='preds')
test_tars.plot(label='real')
plt.legend()
plt.show()


Conclusion

If you train in easy mode, what you get in the end is a model that only relies on the previous true value to make its predictions. This makes the task much easier for every model, but in reality it might not give us any real advantage.
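To make the distinction concrete, here is a rough sketch of the two evaluation modes as they are assumed to work here (fit_one_step_ahead is a hypothetical helper, not part of the arima package used above):

def multi_step_forecast(fit_one_step_ahead, history, horizon, easy_mode, true_future):
    # easy mode: each step is conditioned on the TRUE previous value, so only one-step-ahead errors are made
    # hard mode: each prediction is fed back into the history, so errors compound over the 30-step horizon
    preds = []
    hist = list(history)
    for t in range(horizon):
        nxt = fit_one_step_ahead(hist)  # hypothetical: fit ARIMA on hist and predict one step ahead
        preds.append(nxt)
        hist.append(true_future[t] if easy_mode else nxt)
    return preds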

Trying the top-ranked parameter combinations


In [37]:
args = np.argsort(filtered_arr[:, 1])
args


Out[37]:
array([ 62,  63,  69,   5,  39,  41,  67,  40,  20,  26,  23,   4,  17,
         3,  16,   2,  14,  27,   8,  29,  38,  22,  11,  37,  54,  10,
        21,  42,  48,   9,  36,  61,  59,  28,  34,  68,   6,  12,   7,
        56,  18,  13,  65,  24,   1,  60,  55,  19,  25,  15,   0,  30,
        66,  43,  31,  32,  35,  58,  46,  53,  47,  33,  64,  45,  49,
        50,  52,  57,  44,  51,  86,  78,  94,  87,  85,  88,  80, 101,
        95,  77, 100,  74,  93,  83,  98,  75,  71,  81,  99,  92,  97,
        90,  84,  72,  91,  96,  79,  73,  70,  89,  82,  76])

In [38]:
filtered_arr[args[:10], 0]


Out[38]:
array([(4, 1, 3), (4, 1, 4), (5, 1, 4), (0, 0, 5), (0, 1, 3), (0, 1, 5),
       (5, 1, 2), (0, 1, 4), (3, 0, 2), (4, 0, 2)], dtype=object)

In [39]:
%%time
with warnings.catch_warnings():
    warnings.filterwarnings("ignore")
    ae = ArimaEstimator(p_auto_regression_order=4,
                        d_integration_level=1,
                        q_moving_average=3,
                        easy_mode=False)
    print ae.fit(test_ins_vals, test_tars_vals).score(test_ins_vals, test_tars_vals)

plt.figure(figsize=(15,7))
plt.plot(ae.preds.flatten(), label='preds')
test_tars.plot(label='real')
plt.legend()
plt.show()


72.3085000869
CPU times: user 6.85 s, sys: 7.51 s, total: 14.4 s
Wall time: 5.87 s

All tests


In [44]:
from arima.arima_testing import ArimaTesting

In [49]:
best_params, target_len, npz_full


Out[49]:
((4, 1, 3),
 30,
 '../../../../Dropbox/data/price_history/price_history_per_mobile_phone.npz')

In [53]:
%%time
keys, scores, preds = ArimaTesting.full_testing(best_params=best_params, target_len=target_len,
                                                    npz_full=npz_full)


0 , 9820435
1 , 8332719
2 , 7357394
3 , 9351583
4 , 7655259
5 , 6253594
6 , 8138004
7 , 10576161
8 , 7408246
9 , 7967487
10 , 9130370
11 , 8779166
12 , 7653378
13 , 10536998
14 , 8669008
15 , 9896674
16 , 9941958
17 , 7780575
18 , 10499793
19 , 9259167
20 , 9409718
21 , 9986194
22 , 9898913
23 , 10129132
24 , 9055991
25 , 8515393
26 , 7364333
27 , 8176770
28 , 7507905
29 , 3656048
30 , 10112367
31 , 8695009
32 , 8735993
33 , 10242128
34 , 8414311
35 , 9547257
36 , 7508833
37 , 9426447
38 , 6261140
39 , 7294741
40 , 9674179
41 , 9064268
42 , 9672674
43 , 10409332
44 , 7697920
45 , 10020902
46 , 10002261
47 , 6870822
48 , 7992726
49 , 10620877
50 , 7620866
51 , 9301596
52 , 9981614
53 , 8087014
54 , 9941623
55 , 5308163
56 , 9469401
57 , 8938578
58 , 10046764
59 , 10468270
60 , 9035623
61 , 9558425
62 , 10340158
63 , 7514433
64 , 9758209
65 , 9757914
66 , 9028851
67 , 10339856
68 , 7509017
69 , 6989466
70 , 6999080
71 , 10340038
72 , 10242193
73 , 9306016
74 , 7401406
75 , 8379645
76 , 8379646
77 , 9783218
78 , 9783213
79 , 9956199
80 , 9268867
81 , 9783217
82 , 9783216
83 , 9783215
84 , 6998933
85 , 7634031
86 , 8999919
87 , 6756290
88 , 6536691
89 , 7448041
90 , 9017595
91 , 8864711
92 , 10550085
93 , 7351792
94 , 8438203
95 , 10430833
96 , 7504732
97 , 10620558
98 , 10456126
99 , 8130418
100 , 9549977
101 , 9562308
102 , 10430368
103 , 7811257
104 , 8820025
105 , 6808160
106 , 9730380
107 , 8095786
108 , 8770390
109 , 10513102
110 , 8340504
111 , 8797638
112 , 8873832
113 , 9272109
114 , 8909629
115 , 6044652
116 , 7723431
117 , 9287995
118 , 9395784
119 , 9561760
120 , 9098838
121 , 10598138
122 , 7129068
123 , 9333571
124 , 7321695
125 , 8972925
126 , 10117891
127 , 9454972
128 , 10620346
129 , 10251789
130 , 8684161
131 , 10263538
132 , 10327727
133 , 9669402
134 , 7541851
135 , 8783760
136 , 10021828
137 , 7946058
138 , 10646927
139 , 8735524
140 , 9039085
141 , 10372691
142 , 3136502
143 , 9536994
144 , 7599025
145 , 7621172
146 , 6317061
147 , 9338643
148 , 10529304
149 , 7621093
150 , 8414880
151 , 7335154
152 , 8515183
153 , 9192579
154 , 10537404
155 , 10019886
156 , 9524580
157 , 9383253
158 , 8628040
159 , 7282995
160 , 9473245
161 , 8082430
162 , 6487622
163 , 10000553
164 , 10455691
165 , 7957675
166 , 9555755
167 , 9654138
168 , 8136245
169 , 8116861
170 , 10000323
171 , 8791213
172 , 8909064
173 , 9524579
174 , 9524578
175 , 8239538
176 , 9759446
177 , 9815040
178 , 10620599
179 , 10339621
180 , 8153953
181 , 7753482
182 , 8130646
183 , 9845360
184 , 9034797
185 , 9374090
186 , 10327645
187 , 9889976
188 , 9633962
189 , 10603019
190 , 8913842
191 , 9618789
192 , 8693332
193 , 10093039
194 , 8784094
195 , 9685830
196 , 10446143
197 , 8435610
198 , 5983545
199 , 6918686
200 , 8317727
201 , 9624747
202 , 9216668
203 , 10106328
204 , 9844865
205 , 10250011
206 , 4763156
207 , 10455448
208 , 7333160
209 , 10351727
210 , 10550505
211 , 7988857
212 , 7050290
213 , 10512952
214 , 8864706
215 , 8934572
216 , 9232277
217 , 8286967
218 , 9658075
219 , 9416983
220 , 9468414
221 , 5535970
222 , 9517024
223 , 8742945
224 , 10081282
225 , 7219275
226 , 10327862
227 , 9875453
228 , 6696391
229 , 8597734
230 , 8842715
231 , 9668098
232 , 7751495
233 , 9259567
234 , 9668091
235 , 7288733
236 , 5450546
237 , 8863487
238 , 9132174
239 , 10550986
240 , 9615391
241 , 9925594
242 , 7562927
243 , 7257310
244 , 10455301
245 , 10549703
246 , 9192081
247 , 9535232
248 , 7310752
249 , 6744233
250 , 9898380
251 , 6733376
252 , 9195103
253 , 10351487
254 , 9535485
255 , 9410037
256 , 7180618
257 , 10263525
258 , 9304030
259 , 9995201
260 , 10445742
261 , 9615498
262 , 9615369
263 , 8444503
264 , 10296471
265 , 8349172
266 , 8284007
267 , 10513434
268 , 10445832
269 , 7264929
270 , 6957383
271 , 6724273
272 , 9561798
273 , 7321837
274 , 7630281
275 , 7945834
276 , 9445080
277 , 8384618
278 , 9468830
279 , 10597333
280 , 10569851
281 , 8984512
282 , 8333136
283 , 10430269
284 , 9724038
285 , 10297324
286 , 8515689
287 , 7487368
288 , 10631640
289 , 7423345
290 , 6808527
291 , 7904356
292 , 10020336
293 , 10551073
294 , 9188134
295 , 8913485
296 , 8437937
297 , 8153619
298 , 9001596
299 , 9608073
300 , 8874019
301 , 8294272
302 , 10468510
303 , 7259273
304 , 9542492
305 , 8348233
306 , 6666214
307 , 9402788
308 , 7314178
309 , 8460398
310 , 10199444
311 , 7344391
312 , 6871862
313 , 6601918
314 , 8720352
315 , 10328199
316 , 9189255
317 , 9970245
318 , 6871261
319 , 9043554
320 , 9198786
321 , 10057571
322 , 9489172
323 , 9192250
324 , 8880414
325 , 7750303
326 , 10643104
327 , 7505653
328 , 8281639
329 , 10617623
330 , 7750919
331 , 9668397
332 , 9664652
333 , 8107566
334 , 9329087
335 , 10350760
336 , 8864604
337 , 3783654
338 , 9520852
339 , 8864161
340 , 7335955
341 , 9501836
342 , 10000278
343 , 7356761
344 , 9595109
345 , 10282598
346 , 10619619
347 , 9360800
348 , 9669553
349 , 8244331
350 , 9409925
351 , 8617989
352 , 9824249
353 , 9332994
354 , 10644470
355 , 8645920
356 , 8758228
357 , 10619076
358 , 10315304
359 , 8874945
360 , 8046417
361 , 8648639
362 , 8221632
363 , 8221631
364 , 9079935
365 , 9107905
366 , 5898447
367 , 9542574
368 , 9898535
369 , 9042908
370 , 6592480
371 , 8311334
372 , 8145112
373 , 10315447
374 , 9815037
375 , 9352876
376 , 10373479
377 , 10603203
378 , 9815039
379 , 8435811
380 , 8880028
381 , 9655436
382 , 9500739
383 , 9550139
384 , 6264653
385 , 8864337
386 , 8256116
387 , 9572084
388 , 6798407
389 , 7311062
390 , 10084353
391 , 10001441
392 , 7945421
393 , 9445259
394 , 10603460
395 , 8669043
396 , 9757586
397 , 5804541
398 , 8995944
399 , 5558359
400 , 10529171
401 , 8395434
402 , 9561935
403 , 7360931
404 , 10036784
405 , 9969844
406 , 9877558
407 , 6677566
408 , 8012281
409 , 8436601
410 , 9672740
411 , 10085889
412 , 10252434
413 , 10019997
414 , 8436609
415 , 10283639
416 , 9981788
417 , 9412950
418 , 7426369
419 , 6317107
420 , 8873827
421 , 8874195
422 , 8988217
423 , 8490920
424 , 9419455
425 , 9105758
426 , 6487975
427 , 9177791
428 , 10315407
429 , 6933062
430 , 9173758
431 , 7868248
432 , 10065912
433 , 9517102
434 , 7860520
435 , 6260915
436 , 8985553
CPU times: user 34min 12s, sys: 44min 9s, total: 1h 18min 22s
Wall time: 28min 34s
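For reference, a sketch of the per-SKU loop that ArimaTesting.full_testing is assumed to run, based on the single-SKU cells above (an assumption rather than the library's actual code; easy_mode=False is also assumed):

def full_testing_sketch(best_params, target_len, npz_full):
    data = np.load(npz_full)
    keys, scores, preds = [], [], []
    for idx, key in enumerate(data.keys()):
        print idx, ',', key
        test_series = data[key][()]['test']                      # pandas Series of daily prices per SKU
        ins = test_series[:-target_len].values.reshape(1, -1)    # first 60 days as input
        tars = test_series[-target_len:].values.reshape(1, -1)   # last 30 days as target
        ae = ArimaEstimator(p_auto_regression_order=best_params[0],
                            d_integration_level=best_params[1],
                            q_moving_average=best_params[2],
                            easy_mode=False)
        keys.append(key)
        scores.append(ae.fit(ins, tars).score(ins, tars))
        preds.append(ae.preds)
    return keys, scores, preds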

In [52]:
# render graphs here

In [56]:
score_arr = np.array(scores)

In [59]:
# NaN-safe mean of the per-SKU test scores (equivalent to np.nanmean(score_arr))
np.mean(score_arr[np.logical_not(score_arr != score_arr)])


Out[59]:
21.673050084413859

In [ ]: