ARIMA=AutoregRessive Integrated Moving Average



In [1]:

    
import scipy
print('scipy: %s' % scipy.__version__)
# numpy
import numpy
print('numpy: %s' % numpy.__version__)
# matplotlib
import matplotlib
print('matplotlib: %s' % matplotlib.__version__)
# pandas
import pandas
print('pandas: %s' % pandas.__version__)
# scikit-learn
import sklearn
print('sklearn: %s' % sklearn.__version__)
# statsmodels
import statsmodels
print('statsmodels: %s' % statsmodels.__version__)









    



scipy: 0.19.1
numpy: 1.13.1
matplotlib: 1.5.3
pandas: 0.20.3
sklearn: 0.18
statsmodels: 0.8.0

Handling error

/home/octo/anaconda2/envs/carnd-term1/lib/python3.5/site-packages/statsmodels/compat/pandas.py:56: FutureWarning: The pandas.core.datetools module is deprecated and will be removed in a future version. Please use the pandas.tseries module instead. from pandas.core import datetools



In [10]:

    
import numpy as np
#import pandas as pd
import pandas.tseries as pd
import math
import warnings
import itertools
import statsmodels.api as sm
import matplotlib.pyplot as plt
plt.style.use('fivethirtyeight')

import csv
import glob

import matplotlib.pyplot as plt
from matplotlib.pylab import rcParams
rcParams['figure.figsize'] = 15, 6
%matplotlib inline
%pylab inline
filename = '/home/octo/Dropbox'+ '/SPY18Dec.csv'









    



Populating the interactive namespace from numpy and matplotlib



In [4]:

    
# loading csv file
def get_csv_pd(path):
    #spy_pd=pd.read_csv('C:\\Users\Michal\Dropbox\IB_data\SPY.csv',sep=' ',names=['askPrice','askSize','bidPrice','bidSize'],index_col=0,parse_dates=True)
    #spy_pd=pd.read_csv(path+'\SPY.csv',sep=',',names=['askPrice','askSize','bidPrice','bidSize'],index_col=0,parse_dates=True)
    spy_pd=pd.read_csv(path,sep=',',dtype={'askPrice':np.float32,'askSize':np.float32,
                                           'bidPrice':np.float32,'bidSize':np.float32},index_col=0,parse_dates=True)
    #spy_pd = pd.read_csv(path, usecols=['askPrice','askSize','bidPrice','bidSize'], engine='python', skipfooter=3)
    return spy_pd

def get_csv_pd_notime(path):
    #spy_pd=pd.read_csv('C:\\Users\Michal\Dropbox\IB_data\SPY.csv',sep=' ',names=['askPrice','askSize','bidPrice','bidSize'],index_col=0,parse_dates=True)
    #spy_pd=pd.read_csv(path+'\SPY.csv',sep=',',names=['askPrice','askSize','bidPrice','bidSize'],index_col=0,parse_dates=True)
    spy_pd = pd.read_csv(path, usecols=['askPrice','askSize','bidPrice','bidSize'], engine='python', skipfooter=3)
    return spy_pd

'''
def preprocessing(df):
    df.bidPrice=df.loc[:,'bidPrice'].replace(to_replace=0, method='ffill')
    df.bidSize=df.loc[:,'bidSize'].replace(to_replace=0, method='ffill')
    df.askPrice=df.loc[:,'askPrice'].replace(to_replace=0, method='ffill')
    df.askSize=df.loc[:,'askSize'].replace(to_replace=0, method='ffill')
    df=df.dropna()
    # to exclude 0
    df=df[df['bidPrice']>df.bidPrice.mean()-df.bidPrice.std()]
    df=df[df['askPrice']>df.askPrice.mean()-df.askPrice.std()]
    df['mid']=(df.askPrice+df.bidPrice)/2
    df['vwap']=((df.loc[:,'bidPrice']*df.loc[:,'bidSize'])+(df.loc[:,'askPrice']*df.loc[:,'askSize']))/(df.loc[:,'bidSize']+df.loc[:,'askSize'])
    df['spread']=df.vwap-(df.askPrice+df.bidPrice)/2
    df['v']=(df.askPrice+df.bidPrice)/2-((df.askPrice+df.bidPrice)/2).shift(60)
    df['return']=(df.askPrice/df.bidPrice.shift(1))-1
    df['sigma']=df.spread.rolling(60).std()
    return df
'''    
def preprocessing(df):
    df=df.dropna()
    # to exclude 0
    df=df[df['bidPrice']>df.bidPrice.mean()-df.bidPrice.std()]
    df=df[df['askPrice']>df.askPrice.mean()-df.askPrice.std()]
    df['mid']=(df.askPrice+df.bidPrice)/2
    df['vwap']=((df.loc[:,'bidPrice']*df.loc[:,'bidSize'])+(df.loc[:,'askPrice']*df.loc[:,'askSize']))/(df.loc[:,'bidSize']+df.loc[:,'askSize'])
    df['spread']=df.vwap-df.mid
    df['v']=(df.mid-df.mid.shift(60))
    df['mom']=np.where(np.logical_and((df.mid-df.mid.shift(12))!=0,df.v!=0),(df.mid-df.mid.shift(12))/df.v,0)
    df['return']=(df.askPrice/df.bidPrice.shift(1))-1
    #df['ret'] = np.log(df.Close/df.Close.shift(1))
    df['sigma']=df.spread.rolling(60).std()
    #df['sigma']=df.Close.rolling(5).std()
    df['high']=df.askPrice.rolling(5).max()
    df['low']=df.bidPrice.rolling(5).min()
    
    #df['mom']=np.where(np.logical_and(df.vel_c==1,df.Close>df.price),1,np.where(np.logical_and(df.vel_c==-1,df.Close<df.price),-1,0))
    #flagD=np.logical_and(np.logical_and(df.Close.shift(10)<df.Close.shift(15),df.Close.shift(15)< df.Close.shift(20)),df.Close< df.Close.shift(10))
    #flagU=np.logical_and(np.logical_and(df.Close.shift(15)>df.Close.shift(20),df.Close.shift(10)> df.Close.shift(15)),df.Close> df.Close.shift(10))
    #df['UD']= np.where(flagU,-1,np.where(flagD,1,0))
    
    #df['P']=(df.High+df.Low+df.Close)/3
    #df['UT']=(pd.rolling_max(df.High,60)+pd.rolling_max(df.P+df.High-df.Low,60))*0.5
    #df['DT']=(pd.rolling_min(df.Low,60)+pd.rolling_min(df.P+df.High-df.Low,60))*0.5
    #df['BA']=np.where(df.Close<=df.DT,-1,np.where(df.Close>=df.UT,1,0))# below or above
    return df
'''
def normalise_min_max(df,window_length=60):
    dfn=df/df/ix[0]
    return dfn

def normalise_min_max(df,window_length=60):
    dfn=(df-df.rolling(window_length).min())/(df.rolling(window_length).max()-df.rolling(window_length).min())
    return dfn

def de_normalise_min_max(data,df,window_length=60):
    dn=(df*(data.rolling(window_length).max()-data.rolling(window_length).min()))+data.rolling(window_length).min()
    return dn
'''    
def normalise(dfn,window_length=60):
    dfn['midN']=dfn.mid/dfn.mid.shift(60)
    dfn['vwapN']=dfn.vwap/dfn.mid.shift(60)
    #dfn['highN']=dfn.high/dfn.high.shift(60)
    #dfn['lowN']=dfn.low/dfn.low.shift(60)
    dfn['HL']=(dfn.high-dfn.low)/dfn.mid.shift(60)
    dfn=dfn[['midN','vwapN','HL','spread','v','mom','return','sigma']]
    return dfn



In [5]:

    
#testing
data=get_csv_pd(filename)
data=preprocessing(data)
data=data.dropna()



In [6]:

    
data.tail()









    Out[6]:







  
    
      
      askPrice
      askSize
      bidPrice
      bidSize
      mid
      vwap
      spread
      v
      mom
      return
      sigma
      high
      low
    
  
  
    
      2017-12-19 09:24:27.494064
      268.380005
      52.0
      268.369995
      132.0
      268.375
      268.372833
      -0.002167
      -0.01001
      1.0
      0.000037
      0.002835
      268.380005
      268.369995
    
    
      2017-12-19 09:24:30.229113
      268.380005
      50.0
      268.369995
      132.0
      268.375
      268.372742
      -0.002258
      -0.01001
      0.5
      0.000037
      0.002810
      268.380005
      268.369995
    
    
      2017-12-19 09:24:33.545713
      268.380005
      50.0
      268.369995
      152.0
      268.375
      268.372467
      -0.002533
      -0.01001
      0.5
      0.000037
      0.002789
      268.380005
      268.369995
    
    
      2017-12-19 09:24:37.270081
      268.380005
      41.0
      268.369995
      152.0
      268.375
      268.372131
      -0.002869
      -0.01001
      0.0
      0.000037
      0.002772
      268.380005
      268.369995
    
    
      2017-12-19 09:24:40.471072
      268.380005
      41.0
      268.369995
      157.0
      268.375
      268.372070
      -0.002930
      -0.01001
      0.0
      0.000037
      0.002745
      268.380005
      268.369995



In [7]:

    
X = data.mid.astype('float32')
ts_log = np.log(X)
plt.plot(ts_log)









    Out[7]:





[<matplotlib.lines.Line2D at 0x7fba16df1b38>]



In [11]:

    
from statsmodels.tsa.arima_model import ARIMA
from statsmodels.tsa.arima_model import ARIMAResults



In [16]:

    
'''
## Log normalization
def arima_processing(df):
    #data=df[['vwap','mid']]
    #http://localhost:8888/notebooks/Dropbox/DataScience/ML_20Sep.ipynb
    #ARIMA-preprocessingdf=df.dropna()
    df['Lvwap']=np.log(df.vwap)
    df['Lmid']=np.log(df.mid)
    df['LDvwap']=df.Lvwap-df.Lvwap.shift(60)
    df['LDmid']=df.Lmid-df.Lmid.shift(60)
    df=df.dropna()
    return df
'''
## Lag normalization
def arima_processing(data_):
    dfn=normalise(data_).dropna()
    return dfn



In [17]:

    
data_=arima_processing(data)



In [18]:

    
#data_=data.tail(10000)



In [19]:

    
data_.tail()









    Out[19]:







  
    
      
      midN
      vwapN
      HL
      spread
      v
      mom
      return
      sigma
    
  
  
    
      2017-12-19 09:24:27.494064
      0.999963
      0.999955
      0.000037
      -0.002167
      -0.01001
      1.0
      0.000037
      0.002835
    
    
      2017-12-19 09:24:30.229113
      0.999963
      0.999954
      0.000037
      -0.002258
      -0.01001
      0.5
      0.000037
      0.002810
    
    
      2017-12-19 09:24:33.545713
      0.999963
      0.999953
      0.000037
      -0.002533
      -0.01001
      0.5
      0.000037
      0.002789
    
    
      2017-12-19 09:24:37.270081
      0.999963
      0.999952
      0.000037
      -0.002869
      -0.01001
      0.0
      0.000037
      0.002772
    
    
      2017-12-19 09:24:40.471072
      0.999963
      0.999952
      0.000037
      -0.002930
      -0.01001
      0.0
      0.000037
      0.002745



In [20]:

    
#Take first difference:
## Log value
#plt.plot(data.LDvwap)
## Lag normalization
plt.plot(data.vwapN)









    Out[20]:





[<matplotlib.lines.Line2D at 0x7fba164af668>]



In [21]:

    
#Take first difference:
plt.plot(data.midN)









    Out[21]:





[<matplotlib.lines.Line2D at 0x7fba16413f98>]



In [23]:

    
from pandas import Series
from statsmodels.graphics.tsaplots import plot_acf,plot_pacf
#from statsmodels.graphics.tsaplots import plot_pacf
#from matplotlib import pyplot as plt
import pandas
import matplotlib.mlab as mlab
import matplotlib.pyplot as plt
import numpy as np
import math
from statsmodels.tsa.stattools import acf, pacf
import statsmodels.tsa.stattools as ts
from statsmodels.tsa.arima_model import ARIMA



In [25]:

    
ret=data_.midN
plt.plot(ret)
plt.show()
acf_1 =  acf(ret)[1:20]
plt.plot(acf_1)
plt.show()
test_df = pandas.DataFrame([acf_1]).T
test_df.columns = ['Pandas Autocorrelation']
test_df.index += 1
test_df.plot(kind='bar')
pacf_1 =  pacf(ret)[1:20]
plt.plot(pacf_1)
plt.show()
test_df = pandas.DataFrame([pacf_1]).T
test_df.columns = ['Pandas Partial Autocorrelation']
test_df.index += 1
test_df.plot(kind='bar')
result = ts.adfuller(ret, 1)
result









    












    












    












    Out[25]:





(-14.5084844169024,
 5.7764051324086374e-27,
 1,
 37878,
 {'1%': -3.4305226528120536,
  '10%': -2.5668106165626257,
  '5%': -2.861616308458943},
 -758606.07332068484)

There are three distinct integers (p, d, q) that are used to parametrize ARIMA models. Because of that, ARIMA models are denoted with the notation ARIMA(p, d, q). Together these three parameters account for seasonality, trend, and noise in datasets:

p is the auto-regressive part of the model. It allows us to incorporate the effect of past values into our model. Intuitively, this would be similar to stating that it is likely to be warm tomorrow if it has been warm the past 3 days. d is the integrated part of the model. This includes terms in the model that incorporate the amount of differencing (i.e. the number of past time points to subtract from the current value) to apply to the time series. Intuitively, this would be similar to stating that it is likely to be same temperature tomorrow if the difference in temperature in the last three days has been very small. q is the moving average part of the model. This allows us to set the error of our model as a linear combination of the error values observed at previous time points in the past.

When dealing with seasonal effects, we make use of the seasonal ARIMA, which is denoted as ARIMA(p,d,q)(P,D,Q)s. Here, (p, d, q) are the non-seasonal parameters described above, while (P, D, Q) follow the same definition but are applied to the seasonal component of the time series. The term s is the periodicity of the time series (4 for quarterly periods, 12 for yearly periods, etc.).

The seasonal ARIMA method can appear daunting because of the multiple tuning parameters involved. In the next section, we will describe how to automate the process of identifying the optimal set of parameters for the seasonal ARIMA time series model.



In [26]:

    
data.tail()









    Out[26]:







  
    
      
      askPrice
      askSize
      bidPrice
      bidSize
      mid
      vwap
      spread
      v
      mom
      return
      sigma
      high
      low
      midN
      vwapN
      HL
    
  
  
    
      2017-12-19 09:24:27.494064
      268.380005
      52.0
      268.369995
      132.0
      268.375
      268.372833
      -0.002167
      -0.01001
      1.0
      0.000037
      0.002835
      268.380005
      268.369995
      0.999963
      0.999955
      0.000037
    
    
      2017-12-19 09:24:30.229113
      268.380005
      50.0
      268.369995
      132.0
      268.375
      268.372742
      -0.002258
      -0.01001
      0.5
      0.000037
      0.002810
      268.380005
      268.369995
      0.999963
      0.999954
      0.000037
    
    
      2017-12-19 09:24:33.545713
      268.380005
      50.0
      268.369995
      152.0
      268.375
      268.372467
      -0.002533
      -0.01001
      0.5
      0.000037
      0.002789
      268.380005
      268.369995
      0.999963
      0.999953
      0.000037
    
    
      2017-12-19 09:24:37.270081
      268.380005
      41.0
      268.369995
      152.0
      268.375
      268.372131
      -0.002869
      -0.01001
      0.0
      0.000037
      0.002772
      268.380005
      268.369995
      0.999963
      0.999952
      0.000037
    
    
      2017-12-19 09:24:40.471072
      268.380005
      41.0
      268.369995
      157.0
      268.375
      268.372070
      -0.002930
      -0.01001
      0.0
      0.000037
      0.002745
      268.380005
      268.369995
      0.999963
      0.999952
      0.000037



In [29]:

    
from matplotlib import pyplot
data[['mid','vwap']].tail(200).plot()
pyplot.show()



In [30]:

    
print(data.describe())









    



           askPrice       askSize      bidPrice       bidSize           mid  \
count  37940.000000  37940.000000  37940.000000  37940.000000  37940.000000   
mean     268.296692     99.633896    268.282867     92.608009    268.294830   
std        0.122090    105.618942      0.124438    100.686211      0.120254   
min      268.149994      1.000000    268.160004      1.000000    268.154999   
25%      268.279999     22.000000    268.269989     22.000000    268.274994   
50%      268.380005     52.000000    268.369995     55.000000    268.375000   
75%      268.459991    169.000000    268.450012    138.000000    268.455017   
max      268.549988    776.000000    268.540009   1371.000000    268.544983   

               vwap        spread             v           mom        return  \
count  37940.000000  37940.000000  37940.000000  37940.000000  37940.000000   
mean     268.294037      0.000097     -0.000182      0.112624      0.000037   
std        0.120689      0.003601      0.019770      9.861146      0.000012   
min      268.150391     -0.019653     -0.130005   -491.000000     -0.000186   
25%      268.279572     -0.003143     -0.010010      0.000000      0.000037   
50%      268.374710      0.000000      0.000000      0.000000      0.000037   
75%      268.454681      0.003357      0.010010      0.400488      0.000037   
max      268.549927      0.034180      0.070007    492.000000      0.000298   

              sigma          high           low          midN         vwapN  \
count  37940.000000  37940.000000  37940.000000  37880.000000  37880.000000   
mean       0.003248    268.370500    268.357931      0.999998      0.999998   
std        0.000776      0.098108      0.098376      0.000074      0.000076   
min        0.000253    268.170013    268.160004      0.999516      0.999498   
25%        0.002824    268.279999    268.269989      0.999963      0.999952   
50%        0.003325    268.380005    268.369995      1.000000      1.000001   
75%        0.003751    268.459991    268.450012      1.000037      1.000052   
max        0.007025    268.549988    268.540009      1.000261      1.000277   

                 HL  
count  37880.000000  
mean       0.000047  
std        0.000019  
min        0.000000  
25%        0.000037  
50%        0.000037  
75%        0.000037  
max        0.000298



In [31]:

    
from pandas import DataFrame
from pandas import TimeGrouper
from matplotlib import pyplot



In [32]:

    
groups = data.v.groupby(np.where(data.spread>0,1,0)+np.where(data.spread<0,-1,0))
years = DataFrame()
pyplot.figure()
i = 1
n_groups = len(groups)
for name, group in groups:
	pyplot.subplot((n_groups*100) + 10 + i)
	i += 1
	pyplot.plot(group)
pyplot.show()

BOX and WHISKER plot



In [33]:

    
from pandas import Series
from pandas import DataFrame
from pandas import TimeGrouper
from matplotlib import pyplot

data[['mid','vwap']].boxplot()
pyplot.show()



In [34]:

    
data[['spread','v']].boxplot()
pyplot.show()

Stationarity is defined using very strict criterion. However, for practical purposes we can assume the series to be stationary if it has constant statistical properties over time, ie. the following:

constant mean constant variance an autocovariance that does not depend on time.



In [41]:

    
from matplotlib.pylab import rcParams
from statsmodels.tsa.stattools import adfuller
import pandas as pd



In [42]:

    
def test_stationarity(timeseries):
    
    #Determing rolling statistics
    rolmean = timeseries.rolling(12).mean()
    rolstd = timeseries.rolling(12).std()

    #Plot rolling statistics:
    orig = plt.plot(timeseries, color='blue',label='Original')
    mean = plt.plot(rolmean, color='red', label='Rolling Mean')
    std = plt.plot(rolstd, color='black', label = 'Rolling Std')
    plt.legend(loc='best')
    plt.title('Rolling Mean & Standard Deviation')
    plt.show(block=False)
    
    #Perform Dickey-Fuller test:
    print ('Results of Dickey-Fuller Test:')
    dftest = adfuller(timeseries, autolag='AIC')
    dfoutput = pd.Series(dftest[0:4], index=['Test Statistic','p-value','#Lags Used','Number of Observations Used'])
    for key,value in dftest[4].items():
        dfoutput['Critical Value (%s)'%key] = value
    print (dfoutput)



In [43]:

    
rcParams['figure.figsize'] = 15, 6
test_stationarity(data_.midN)
plt.show()









    












    



Results of Dickey-Fuller Test:
Test Statistic                   -26.185307
p-value                            0.000000
#Lags Used                        53.000000
Number of Observations Used    37826.000000
Critical Value (1%)               -3.430523
Critical Value (5%)               -2.861616
Critical Value (10%)              -2.566811
dtype: float64



In [44]:

    
rcParams['figure.figsize'] = 15, 6
test_stationarity(data.mid)
plt.show()









    












    



Results of Dickey-Fuller Test:
Test Statistic                    -2.775478
p-value                            0.061863
#Lags Used                        20.000000
Number of Observations Used    37919.000000
Critical Value (1%)               -3.430522
Critical Value (5%)               -2.861616
Critical Value (10%)              -2.566811
dtype: float64



In [47]:

    
from pandas import read_csv
from pandas import datetime
from matplotlib import pyplot

Parsing time index



In [45]:

    
def parser(x):
    return datetime.strptime(x, '%Y-%m-%d %H:%M:%S.%f')



In [58]:

    
series = read_csv('/home/octo/Dropbox/SPY18Dec.csv', header=0, parse_dates=[0], index_col=0, squeeze=True, date_parser=parser)
print(series.tail())









    



                            askPrice  askSize  bidPrice  bidSize
2017-12-19 09:24:27.494064    268.38     52.0    268.37    132.0
2017-12-19 09:24:30.229113    268.38     50.0    268.37    132.0
2017-12-19 09:24:33.545713    268.38     50.0    268.37    152.0
2017-12-19 09:24:37.270081    268.38     41.0    268.37    152.0
2017-12-19 09:24:40.471072    268.38     41.0    268.37    157.0



In [59]:

    
series[['askSize','bidSize']].plot()
pyplot.show()



In [60]:

    
data[['askSize','bidSize']].plot()
pyplot.show()



In [54]:

    
data = data.dropna()
data[['askPrice','bidPrice']].plot()
pyplot.show()



In [61]:

    
from statsmodels.tsa.seasonal import seasonal_decompose
decomposition = seasonal_decompose(data.mid, freq=12)  
fig = plt.figure()  
fig = decomposition.plot()  
fig.set_size_inches(15, 8)









    





<matplotlib.figure.Figure at 0x7fba15f027b8>



In [62]:

    
from statsmodels.tsa.seasonal import seasonal_decompose
decomposition = seasonal_decompose(data_.midN, freq=12)  
fig = plt.figure()  
fig = decomposition.plot()  
fig.set_size_inches(15, 8)









    





<matplotlib.figure.Figure at 0x7fba142b1908>



In [55]:

    
trend = decomposition.trend
seasonal = decomposition.seasonal 
residual = decomposition.resid



In [ ]:

	askPrice	askSize	bidPrice	bidSize	mid	vwap	spread	v	mom	return	sigma	high	low
2017-12-19 09:24:27.494064	268.380005	52.0	268.369995	132.0	268.375	268.372833	-0.002167	-0.01001	1.0	0.000037	0.002835	268.380005	268.369995
2017-12-19 09:24:30.229113	268.380005	50.0	268.369995	132.0	268.375	268.372742	-0.002258	-0.01001	0.5	0.000037	0.002810	268.380005	268.369995
2017-12-19 09:24:33.545713	268.380005	50.0	268.369995	152.0	268.375	268.372467	-0.002533	-0.01001	0.5	0.000037	0.002789	268.380005	268.369995
2017-12-19 09:24:37.270081	268.380005	41.0	268.369995	152.0	268.375	268.372131	-0.002869	-0.01001	0.0	0.000037	0.002772	268.380005	268.369995
2017-12-19 09:24:40.471072	268.380005	41.0	268.369995	157.0	268.375	268.372070	-0.002930	-0.01001	0.0	0.000037	0.002745	268.380005	268.369995

	midN	vwapN	HL	spread	v	mom	return	sigma
2017-12-19 09:24:27.494064	0.999963	0.999955	0.000037	-0.002167	-0.01001	1.0	0.000037	0.002835
2017-12-19 09:24:30.229113	0.999963	0.999954	0.000037	-0.002258	-0.01001	0.5	0.000037	0.002810
2017-12-19 09:24:33.545713	0.999963	0.999953	0.000037	-0.002533	-0.01001	0.5	0.000037	0.002789
2017-12-19 09:24:37.270081	0.999963	0.999952	0.000037	-0.002869	-0.01001	0.0	0.000037	0.002772
2017-12-19 09:24:40.471072	0.999963	0.999952	0.000037	-0.002930	-0.01001	0.0	0.000037	0.002745