In [1]:
# -*- coding: UTF-8 -*-
#%load_ext autoreload
%reload_ext autoreload
%autoreload 2

The idea with ARIMA models is that the final residual should look like white noise; otherwise there is still juice, i.e. information, left in the data to extract.
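
As a quick illustration (not from the original session), one standard way to quantify "looks like white noise" is the Ljung-Box test; `resid` below is a placeholder for the residuals of whichever fit is being checked:

# Hypothetical white-noise check on ARIMA residuals: large p-values mean
# we cannot reject the null hypothesis that the residuals are white noise.
from statsmodels.stats.diagnostic import acorr_ljungbox
lb_stat, p_values = acorr_ljungbox(resid, lags=10)  # `resid` is assumed defined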


In [51]:
from __future__ import division
from os import path, remove
import numpy as np
import pandas as pd
import csv
from sklearn.model_selection import StratifiedShuffleSplit
from time import time
from matplotlib import pyplot as plt
import seaborn as sns
from tensorflow.contrib import rnn
from tensorflow.contrib import learn
import shutil
from tensorflow.contrib.learn.python.learn import learn_runner
from sklearn.metrics import r2_score
from fastdtw import fastdtw
from collections import OrderedDict
from scipy.spatial.distance import euclidean
from statsmodels.tsa.stattools import coint
from skopt.space.space import Integer, Real
from skopt import gp_minimize
from skopt.plots import plot_convergence
import pickle
import inspect
import dill
import sys
from pandas import read_csv
from pandas import datetime
from pandas.plotting import autocorrelation_plot
from statsmodels.graphics.tsaplots import plot_pacf
from statsmodels.tsa.arima_model import ARIMA
from pandas import DataFrame
from sklearn.metrics import mean_squared_error
from price_history import PriceHistory
from arima.arima_estimator import ArimaEstimator
import warnings

In [3]:
%matplotlib inline

p: The number of lag observations included in the model, also called the lag order.

d: The number of times the raw observations are differenced, also called the degree of differencing.

q: The size of the moving-average window, also called the order of the moving average.
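
These map directly onto the `order` tuple of the statsmodels ARIMA class imported above (a minimal sketch; `some_series` is a placeholder name):

# e.g. an ARIMA(5, 1, 0): five autoregressive lags, first-order
# differencing, and no moving-average terms.
example_model = ARIMA(some_series, order=(5, 1, 0))  # `some_series` assumed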


In [4]:
data_path = '../../../../../Dropbox/data'

In [5]:
score_dic_filepath = data_path + "/arima/scoredic_non_stationary.npy"

In [6]:
ph_data_path = data_path + '/price_history'

In [7]:
csv_in = "../price_history_03_seq_start_suddens_trimmed.csv"

In [11]:
ph = PriceHistory(csv_in)

In [12]:
seqs = ph.extractAllSequences()

In [13]:
def custom_transformation(sequence, window):
    #logged_vals = np.log(sequence.values)  # log transform tried and disabled
    logged_vals = sequence.values
    logged_seq = sequence.copy()
    logged_seq[logged_seq.index] = logged_vals
    # subtract the rolling mean; the first window-1 entries become NaN
    return logged_seq - logged_seq.rolling(window=window).mean()

In [43]:
orig_seq = seqs[0]

In [21]:
window = 3

In [22]:
seq_tr = custom_transformation(sequence=seqs[0], window=window)

In [60]:
np.all(seq_tr == seq_tr)  # NaN != NaN, so False here reveals the NaNs left by the rolling mean


Out[60]:
False

In [62]:
seq = seq_tr[window-1:]  # drop the first window-1 NaN entries

In [63]:
np.all(seq == seq)  # True: no NaNs remain


Out[63]:
True

In [64]:
plt.figure(figsize=(15,6))
seq.plot()
plt.show()



In [65]:
ae = ArimaEstimator(p_auto_regression_order=4, d_integration_level=1, q_moving_average=3, easy_mode=False)

In [66]:
target_len = 30

In [67]:
XX = seq[:-target_len].values.reshape((1, -1))
XX.shape


Out[67]:
(1, 178)

In [68]:
YY = seq[-target_len:].values.reshape((1, -1))
YY.shape


Out[68]:
(1, 30)

In [69]:
%%time
with warnings.catch_warnings():
    warnings.filterwarnings("ignore")
    ae.fit(inputs=XX, targets=YY)


CPU times: user 7.16 s, sys: 7.36 s, total: 14.5 s
Wall time: 6.33 s

In [70]:
preds = ae.preds.flatten()
yy = YY.flatten()
yy.shape, preds.shape


Out[70]:
((30,), (30,))

In [71]:
plt.figure(figsize=(15,6))
plt.plot(yy, label='real')
plt.plot(preds, label='preds')
plt.legend()
plt.show()



In [72]:
orig_seq.plot()


Out[72]:
<matplotlib.axes._subplots.AxesSubplot at 0x7f0a259b77d0>

In [99]:
aa = np.zeros((orig_seq.size - window +1, orig_seq.size))
aa.shape


Out[99]:
(208, 210)

In [100]:
for ii, item in enumerate(aa):
    aa[ii, ii:ii+window] = 1/window  # each row averages `window` consecutive values
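
For intuition (a toy illustration, not part of the original analysis): with window=3, every row of such a matrix holds three consecutive 1/3 entries, so the matrix product reproduces the rolling mean:

# Toy banded averaging matrix acting on a length-5 vector.
toy = np.zeros((3, 5))
for i in range(3):
    toy[i, i:i+3] = 1.0/3
x = np.arange(5.0)
assert np.allclose(toy.dot(x), [1.0, 2.0, 3.0])  # toy.dot(x)[i] == x[i:i+3].mean()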

In [101]:
trans1 = aa.dot(orig_seq)  # the rolling mean, computed as a matrix product
trans1.shape


Out[101]:
(208,)

In [102]:
from numpy.linalg import pinv

In [103]:
trans2 = orig_seq.rolling(window=window).mean()[window-1:].values
trans2.shape


Out[103]:
(208,)

In [104]:
np.allclose(trans1, trans2)


Out[104]:
True

In [105]:
pA = pinv(aa)
pA.shape


Out[105]:
(210, 208)

In [107]:
reconstructed = pA.dot(trans1)
reconstructed.shape


Out[107]:
(210,)

In [112]:
plt.plot(reconstructed, label='reconstructed')
plt.plot(orig_seq.values, label='original')
plt.legend()
plt.show()



In [121]:
np.allclose(reconstructed, orig_seq.values, rtol=1e-3)


Out[121]:
True

In [128]:
#np.sqrt(np.mean((reconstructed - orig_seq.values)**2)) #RMSE
np.corrcoef(orig_seq.values, reconstructed)


Out[128]:
array([[ 1.        ,  0.99998796],
       [ 0.99998796,  1.        ]])

Conclusion

Great: we have managed to recover the original series from its moving average. Strictly speaking the recovery is only approximate, since aa is 208x210 and the pseudo-inverse returns the minimum-norm preimage, but here that preimage essentially coincides with the original.
Now let's see whether we can also recover the series after the moving average has been subtracted.


In [129]:
seq.plot()


Out[129]:
<matplotlib.axes._subplots.AxesSubplot at 0x7f0a25685d90>

In [132]:
bb = 1 - aa
print bb.shape
bb


(208, 210)
Out[132]:
array([[ 0.66666667,  0.66666667,  0.66666667, ...,  1.        ,
         1.        ,  1.        ],
       [ 1.        ,  0.66666667,  0.66666667, ...,  1.        ,
         1.        ,  1.        ],
       [ 1.        ,  1.        ,  0.66666667, ...,  1.        ,
         1.        ,  1.        ],
       ..., 
       [ 1.        ,  1.        ,  1.        , ...,  0.66666667,
         1.        ,  1.        ],
       [ 1.        ,  1.        ,  1.        , ...,  0.66666667,
         0.66666667,  1.        ],
       [ 1.        ,  1.        ,  1.        , ...,  0.66666667,
         0.66666667,  0.66666667]])

In [133]:
pOneMinusA = pinv(bb)
pOneMinusA.shape


Out[133]:
(210, 208)

In [135]:
custom_recons = pOneMinusA.dot(seq.values)
custom_recons.shape


Out[135]:
(210,)

In [142]:
np.allclose(custom_recons, orig_seq.values, rtol=1e-3)


Out[142]:
False

In [143]:
plt.plot(custom_recons, label='reconstructed')
plt.plot(orig_seq.values, label='original')
plt.legend()
plt.show()


The reconstruction looks nothing like the original (the correlation below is essentially zero). What is going on?

The culprit is bb = 1 - aa: subtracting the averaging matrix elementwise from a matrix of ones is not the matrix of the transformation we actually applied. "Series minus its rolling mean" corresponds to the operator S - A, where S is the shifted identity that selects x[window-1:]; this is verified below. Moreover, S - A maps every constant series to zero, so the level of the original series is irrecoverably lost by this transformation.


In [144]:
np.corrcoef(orig_seq.values, custom_recons)


Out[144]:
array([[  1.00000000e+00,  -1.00888560e-04],
       [ -1.00888560e-04,   1.00000000e+00]])

In [165]:
aa.shape, orig_seq.values.shape


Out[165]:
((208, 210), (210,))

In [157]:
trans4 = orig_seq.values[window-1:] - aa.dot(orig_seq.values)
np.allclose(trans4, seq.values)


Out[157]:
True
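
To make the actual operator explicit (a sketch, not from the original session):

# S is the shifted identity: S.dot(x) == x[window-1:].  The true
# "series minus rolling mean" operator is therefore S - aa.
S = np.zeros_like(aa)
S[np.arange(aa.shape[0]), np.arange(window - 1, aa.shape[1])] = 1.0
np.allclose((S - aa).dot(orig_seq.values), seq.values)  # True
# (S - aa) maps any constant vector to zero, so it has a nontrivial
# null space and no pseudo-inverse can recover the original level.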

In [166]:
bb = 1 - aa
bb.shape


Out[166]:
(208, 210)

In [171]:
trans3 = bb.dot(orig_seq.values)
trans3.shape


Out[171]:
(208,)

In [172]:
plt.plot(trans3)
#plt.plot(orig_seq.values)


Out[172]:
[<matplotlib.lines.Line2D at 0x7f0a20a593d0>]




In [27]:
parameters = OrderedDict([
    ('p_auto_regression_order', range(6)), #0-5
    ('d_integration_level', range(3)), #0-2
    ('q_moving_average', range(6)), #0-5
])

In [28]:
def cartesian_coord(*arrays):
    # Cartesian product of the parameter ranges:
    # one row per (p, d, q) combination.
    grid = np.meshgrid(*arrays)
    coord_list = [entry.ravel() for entry in grid]
    points = np.vstack(coord_list).T
    return points

In [29]:
cart = cartesian_coord(*parameters.values())
cart.shape


Out[29]:
(108, 3)

In [30]:
108 * 15 / 60 / 2  # in hours: 108 (= 6*3*6) combinations at ~15 min each


Out[30]:
13.5

In [31]:
count_inds=100
random_state = np.random.RandomState(seed=16011984)

In [32]:
# NOTE: judging by the cell numbering, XX/YY here are the full arrays
# (one row per price-history sequence) from an earlier session, not the
# single-sequence XX/YY defined above.
random_inds = random_state.choice(range(len(XX)), count_inds, replace=False)
xx = XX[random_inds]
yy = YY[random_inds]

In [33]:
filepath = 'scoredic.npy'

In [34]:
%%time
with warnings.catch_warnings():
    warnings.filterwarnings("ignore")
    for pp, dd, qq in cart:
        # Reload the score dict on every iteration so the grid search can
        # resume after an interruption; np.save wraps the dict in a 0-d
        # object array, hence the [()] to unwrap it.
        scoredic = np.load(filepath)[()] if path.isfile(filepath) else OrderedDict()
        cur_tuple = (pp, dd, qq)
        if cur_tuple in scoredic:
            continue
        else:
            ae = ArimaEstimator(p_auto_regression_order=pp, d_integration_level=dd, q_moving_average=qq,
                                easy_mode=True)
            scoredic[cur_tuple] = ae.fit(inputs=xx, targets=yy).score(xx, yy)
            np.save(filepath, scoredic)


---------------------------------------------------------------------------
KeyboardInterrupt                         Traceback (most recent call last)
<ipython-input-34-4f044cfdce60> in <module>()
----> 1 get_ipython().run_cell_magic(u'time', u'', u'...')

[traceback truncated: the interrupt landed inside
 arima/arima_estimator.fit -> statsmodels ARIMA.fit -> scipy fmin_l_bfgs_b]

KeyboardInterrupt: 

In [77]:
from sklearn.model_selection import GridSearchCV

In [79]:
# Pick hyperparameters.
# There is NO separation into training and test sets, because no knowledge
# learned during training could transfer to validation: each ARIMA is refit
# from scratch on every sequence.
# Run the ARIMA prediction steps for every instance of the training dataset
# and compute a fastdtw score for each case.
# The mean of those fastdtw scores is the CV score.
#
# Repeat for every possible parameter combination.
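
A minimal sketch of that scoring procedure (assuming the ArimaEstimator interface used above; the helper name is hypothetical):

# Hypothetical helper: fit per-sequence ARIMA forecasts and average the
# fastdtw distances between each target and its prediction.
def arima_cv_score(p, d, q, inputs, targets):
    ae = ArimaEstimator(p_auto_regression_order=p, d_integration_level=d,
                        q_moving_average=q, easy_mode=True)
    preds = ae.fit(inputs=inputs, targets=targets).preds
    dists = [fastdtw(target, pred, dist=euclidean)[0]
             for target, pred in zip(targets, preds)]
    return np.mean(dists)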

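The `series` and `model` objects used below were created in cells not captured in this transcript. Judging by the summary that follows, this is the classic monthly shampoo-sales example; a hedged sketch of the likely setup (the CSV filename and parser are assumptions, not from the original):

# Assumed setup for the cells below (not in the original transcript):
# load the monthly shampoo-sales series and build an ARIMA(5,1,0).
def parser(x):
    return datetime.strptime('190' + x, '%Y-%m')

series = read_csv('shampoo-sales.csv', header=0, parse_dates=[0],
                  index_col=0, squeeze=True, date_parser=parser)
model = ARIMA(series, order=(5, 1, 0))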

In [29]:
model_fit = model.fit(disp=0)

In [30]:
model_fit.summary()


Out[30]:
ARIMA Model Results
Dep. Variable:  D.Sales of shampoo over a three year period
No. Observations: 35
Model:   ARIMA(5, 1, 0)       Log Likelihood        -196.170
Method:  css-mle              S.D. of innovations     64.241
Date:    Wed, 19 Jul 2017     AIC                    406.340
Time:    19:38:55             BIC                    417.227
Sample:  02-01-1901           HQIC                   410.098
         - 12-01-1903

                                                      coef  std err       z  P>|z|  [0.025  0.975]
const                                              12.0649    3.652   3.304  0.003   4.908  19.222
ar.L1.D.Sales of shampoo over a three year period  -1.1082    0.183  -6.063  0.000  -1.466  -0.750
ar.L2.D.Sales of shampoo over a three year period  -0.6203    0.282  -2.203  0.036  -1.172  -0.068
ar.L3.D.Sales of shampoo over a three year period  -0.3606    0.295  -1.222  0.231  -0.939   0.218
ar.L4.D.Sales of shampoo over a three year period  -0.1252    0.280  -0.447  0.658  -0.674   0.424
ar.L5.D.Sales of shampoo over a three year period   0.1289    0.191   0.673  0.506  -0.246   0.504

Roots
         Real  Imaginary  Modulus  Frequency
AR.1  -1.0617   -0.5064j   1.1763    -0.4292
AR.2  -1.0617   +0.5064j   1.1763     0.4292
AR.3   0.0816   -1.3804j   1.3828    -0.2406
AR.4   0.0816   +1.3804j   1.3828     0.2406
AR.5   2.9315   -0.0000j   2.9315    -0.0000

In [31]:
residuals = DataFrame(model_fit.resid)

In [72]:
residuals.plot()
plt.show()



In [71]:
residuals.plot(kind='kde')
plt.show()



In [34]:
residuals.describe()


Out[34]:
0
count 35.000000
mean -5.495203
std 68.132882
min -133.296589
25% -42.477874
50% -7.186603
75% 24.748348
max 133.237993

Train - Test


In [60]:
XX = series.values
XX.shape


Out[60]:
(36,)

In [61]:
#splitting
train_size = int(len(XX)*2/3)
train_size


Out[61]:
24

In [62]:
train_set = XX[:train_size]
test_set = XX[train_size:]
train_set.shape, test_set.shape


Out[62]:
((24,), (12,))

In [63]:
train_set[0]


Out[63]:
266.0

In [64]:
history = list(train_set)

In [65]:
history[0] = 3333  # sanity check: list() copies, so train_set is unaffected

In [66]:
history = list(train_set)  # fresh copy for the forecasting loop
len(history)


Out[66]:
24

In [67]:
predictions = []

In [68]:
for tt in range(len(test_set)):
    output = ARIMA(history, order=(5,1,0)).fit(disp=0).forecast()
    y_hat = output[0]  # 0th element of the forecast tuple is the prediction
    predictions.append(y_hat)

    observation = test_set[tt]
    #history.append(observation)  # walk-forward: feed back the true value
    history.append(y_hat)  # recursive: feed back our own prediction instead

    print "predicted: {}, expected: {}".format(y_hat, observation)


predicted: [ 306.51287642], expected: 339.7
predicted: [ 389.00679733], expected: 440.4
predicted: [ 340.99243533], expected: 315.9
predicted: [ 370.26296951], expected: 439.3
predicted: [ 347.42978471], expected: 401.3
predicted: [ 389.06447038], expected: 437.4
predicted: [ 378.3640104], expected: 575.5
predicted: [ 397.62612576], expected: 407.6
predicted: [ 387.34007046], expected: 682.0
predicted: [ 407.76038113], expected: 475.3
predicted: [ 408.07287879], expected: 581.3
predicted: [ 422.86599266], expected: 646.9
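
For contrast (a sketch under the same setup), the commented-out history.append(observation) line above gives walk-forward evaluation, where each forecast is genuinely one-step-ahead:

# Walk-forward variant: refit on the true observation after each step
# instead of feeding back the model's own predictions.
history_wf = list(train_set)
preds_wf = []
for obs in test_set:
    preds_wf.append(ARIMA(history_wf, order=(5, 1, 0)).fit(disp=0).forecast()[0])
    history_wf.append(obs)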

In [69]:
error = mean_squared_error(predictions, test_set)
error


Out[69]:
18743.644548971621
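
Taking the square root gives an RMSE of about 136.9, so the recursive forecasts are off by roughly 137 units of sales on average, which is unsurprising given that every prediction after the first is built on earlier predictions.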

In [70]:
plt.figure(figsize=(15,6))
plt.plot(predictions, label='preds')
plt.plot(test_set, label='test set')
plt.legend()
plt.show()


