Autoregressive Moving Average (ARMA): Sunspots data


In [1]:
%matplotlib inline

In [2]:
import numpy as np
from scipy import stats
import pandas as pd
import matplotlib.pyplot as plt

import statsmodels.api as sm

In [3]:
from statsmodels.graphics.api import qqplot

Sunspots Data


In [4]:
print(sm.datasets.sunspots.NOTE)


::

    Number of Observations - 309 (Annual 1700 - 2008)
    Number of Variables - 1
    Variable name definitions::

        SUNACTIVITY - Number of sunspots for each year

    The data file contains a 'YEAR' variable that is not returned by load.


In [5]:
dta = sm.datasets.sunspots.load_pandas().data

In [6]:
dta.index = pd.Index(sm.tsa.datetools.dates_from_range('1700', '2008'))
del dta["YEAR"]

In [7]:
dta.plot(figsize=(12,8));



In [8]:
fig = plt.figure(figsize=(12,8))
ax1 = fig.add_subplot(211)
fig = sm.graphics.tsa.plot_acf(dta.values.squeeze(), lags=40, ax=ax1)
ax2 = fig.add_subplot(212)
fig = sm.graphics.tsa.plot_pacf(dta, lags=40, ax=ax2)



In [9]:
arma_mod20 = sm.tsa.ARMA(dta, (2,0)).fit(disp=False)
print(arma_mod20.params)


const                49.659326
ar.L1.SUNACTIVITY     1.390656
ar.L2.SUNACTIVITY    -0.688571
dtype: float64
/home/travis/build/statsmodels/statsmodels/statsmodels/tsa/arima_model.py:472: FutureWarning: 
statsmodels.tsa.arima_model.ARMA and statsmodels.tsa.arima_model.ARIMA have
been deprecated in favor of statsmodels.tsa.arima.model.ARIMA (note the .
between arima and model) and
statsmodels.tsa.SARIMAX.

statsmodels.tsa.arima.model.ARIMA makes use of the statespace framework and
is both well tested and maintained.

To silence this warning and continue using ARMA and ARIMA until they are
removed, use:

import warnings
warnings.filterwarnings('ignore', 'statsmodels.tsa.arima_model.ARMA',
                        FutureWarning)
warnings.filterwarnings('ignore', 'statsmodels.tsa.arima_model.ARIMA',
                        FutureWarning)

  warnings.warn(ARIMA_DEPRECATION_WARN, FutureWarning)
/home/travis/build/statsmodels/statsmodels/statsmodels/tsa/base/tsa_model.py:162: ValueWarning: No frequency information was provided, so inferred frequency A-DEC will be used.
  % freq, ValueWarning)

In [10]:
arma_mod30 = sm.tsa.ARMA(dta, (3,0)).fit(disp=False)


/home/travis/build/statsmodels/statsmodels/statsmodels/tsa/base/tsa_model.py:162: ValueWarning: No frequency information was provided, so inferred frequency A-DEC will be used.
  % freq, ValueWarning)

In [11]:
print(arma_mod20.aic, arma_mod20.bic, arma_mod20.hqic)


2622.636338064195 2637.569703171786 2628.606725909441

In [12]:
print(arma_mod30.params)


const                49.750011
ar.L1.SUNACTIVITY     1.300810
ar.L2.SUNACTIVITY    -0.508093
ar.L3.SUNACTIVITY    -0.129649
dtype: float64

In [13]:
print(arma_mod30.aic, arma_mod30.bic, arma_mod30.hqic)


2619.4036286970213 2638.07033508151 2626.8666135035787
  • Does our model obey the theory?

In [14]:
sm.stats.durbin_watson(arma_mod30.resid.values)


Out[14]:
1.9564808574755743

In [15]:
fig = plt.figure(figsize=(12,8))
ax = fig.add_subplot(111)
ax = arma_mod30.resid.plot(ax=ax);



In [16]:
resid = arma_mod30.resid

In [17]:
stats.normaltest(resid)


Out[17]:
NormaltestResult(statistic=49.84500874549996, pvalue=1.5006999763860587e-11)

In [18]:
fig = plt.figure(figsize=(12,8))
ax = fig.add_subplot(111)
fig = qqplot(resid, line='q', ax=ax, fit=True)



In [19]:
fig = plt.figure(figsize=(12,8))
ax1 = fig.add_subplot(211)
fig = sm.graphics.tsa.plot_acf(resid.values.squeeze(), lags=40, ax=ax1)
ax2 = fig.add_subplot(212)
fig = sm.graphics.tsa.plot_pacf(resid, lags=40, ax=ax2)



In [20]:
r,q,p = sm.tsa.acf(resid.values.squeeze(), fft=True, qstat=True)
data = np.c_[range(1,41), r[1:], q, p]
table = pd.DataFrame(data, columns=['lag', "AC", "Q", "Prob(>Q)"])
print(table.set_index('lag'))


            AC          Q      Prob(>Q)
lag                                    
1.0   0.009179   0.026286  8.712035e-01
2.0   0.041793   0.573043  7.508710e-01
3.0  -0.001335   0.573602  9.024480e-01
4.0   0.136089   6.408919  1.706205e-01
5.0   0.092468   9.111822  1.046862e-01
6.0   0.091948  11.793236  6.674364e-02
7.0   0.068748  13.297192  6.519003e-02
8.0  -0.015020  13.369220  9.976160e-02
9.0   0.187592  24.641900  3.393922e-03
10.0  0.213718  39.321985  2.229483e-05
11.0  0.201082  52.361129  2.344958e-07
12.0  0.117182  56.804181  8.574283e-08
13.0 -0.014055  56.868318  1.893908e-07
14.0  0.015398  56.945557  3.997670e-07
15.0 -0.024967  57.149312  7.741489e-07
16.0  0.080916  59.296761  6.872186e-07
17.0  0.041138  59.853729  1.110948e-06
18.0 -0.052021  60.747419  1.548437e-06
19.0  0.062496  62.041683  1.831648e-06
20.0 -0.010301  62.076971  3.381252e-06
21.0  0.074453  63.926646  3.193595e-06
22.0  0.124955  69.154763  8.978379e-07
23.0  0.093162  72.071028  5.799796e-07
24.0 -0.082152  74.346680  4.713027e-07
25.0  0.015695  74.430036  8.289061e-07
26.0 -0.025037  74.642895  1.367287e-06
27.0 -0.125861  80.041143  3.722571e-07
28.0  0.053225  81.009977  4.716284e-07
29.0 -0.038693  81.523803  6.916640e-07
30.0 -0.016904  81.622222  1.151662e-06
31.0 -0.019296  81.750934  1.868768e-06
32.0  0.104990  85.575062  8.927964e-07
33.0  0.040086  86.134564  1.247510e-06
34.0  0.008829  86.161807  2.047826e-06
35.0  0.014588  86.236444  3.263809e-06
36.0 -0.119329  91.248895  1.084455e-06
37.0 -0.036665  91.723864  1.521923e-06
38.0 -0.046193  92.480514  1.938734e-06
39.0 -0.017768  92.592883  2.990679e-06
40.0 -0.006220  92.606705  4.696983e-06
/home/travis/build/statsmodels/statsmodels/statsmodels/tsa/stattools.py:656: FutureWarning: The default number of lags is changing from 40 tomin(int(10 * np.log10(nobs)), nobs - 1) after 0.12is released. Set the number of lags to an integer to  silence this warning.
  FutureWarning,
  • This indicates a lack of fit.
  • In-sample dynamic prediction. How good does our model do?

In [21]:
predict_sunspots = arma_mod30.predict('1990', '2012', dynamic=True)
print(predict_sunspots)


1990-12-31    167.047437
1991-12-31    140.993051
1992-12-31     94.859195
1993-12-31     46.861009
1994-12-31     11.242709
1995-12-31     -4.721168
1996-12-31     -1.166799
1997-12-31     16.185784
1998-12-31     39.021953
1999-12-31     59.449924
2000-12-31     72.170187
2001-12-31     75.376828
2002-12-31     70.436511
2003-12-31     60.731651
2004-12-31     50.201873
2005-12-31     42.076113
2006-12-31     38.114377
2007-12-31     38.454732
2008-12-31     41.963898
2009-12-31     46.869361
2010-12-31     51.423328
2011-12-31     54.399781
2012-12-31     55.321752
Freq: A-DEC, dtype: float64

In [22]:
fig, ax = plt.subplots(figsize=(12, 8))
ax = dta.loc['1950':].plot(ax=ax)
fig = arma_mod30.plot_predict('1990', '2012', dynamic=True, ax=ax, plot_insample=False)



In [23]:
def mean_forecast_err(y, yhat):
    return y.sub(yhat).mean()

In [24]:
mean_forecast_err(dta.SUNACTIVITY, predict_sunspots)


Out[24]:
5.636880847258508

Exercise: Can you obtain a better fit for the Sunspots model? (Hint: sm.tsa.AR has a method select_order)

Simulated ARMA(4,1): Model Identification is Difficult


In [25]:
from statsmodels.tsa.arima_process import ArmaProcess

In [26]:
np.random.seed(1234)
# include zero-th lag
arparams = np.array([1, .75, -.65, -.55, .9])
maparams = np.array([1, .65])

Let's make sure this model is estimable.


In [27]:
arma_t = ArmaProcess(arparams, maparams)

In [28]:
arma_t.isinvertible


Out[28]:
True

In [29]:
arma_t.isstationary


Out[29]:
False
  • What does this mean?

In [30]:
fig = plt.figure(figsize=(12,8))
ax = fig.add_subplot(111)
ax.plot(arma_t.generate_sample(nsample=50));



In [31]:
arparams = np.array([1, .35, -.15, .55, .1])
maparams = np.array([1, .65])
arma_t = ArmaProcess(arparams, maparams)
arma_t.isstationary


Out[31]:
True

In [32]:
arma_rvs = arma_t.generate_sample(nsample=500, burnin=250, scale=2.5)

In [33]:
fig = plt.figure(figsize=(12,8))
ax1 = fig.add_subplot(211)
fig = sm.graphics.tsa.plot_acf(arma_rvs, lags=40, ax=ax1)
ax2 = fig.add_subplot(212)
fig = sm.graphics.tsa.plot_pacf(arma_rvs, lags=40, ax=ax2)


  • For mixed ARMA processes the Autocorrelation function is a mixture of exponentials and damped sine waves after (q-p) lags.
  • The partial autocorrelation function is a mixture of exponentials and dampened sine waves after (p-q) lags.

In [34]:
arma11 = sm.tsa.ARMA(arma_rvs, (1,1)).fit(disp=False)
resid = arma11.resid
r,q,p = sm.tsa.acf(resid, fft=True, qstat=True)
data = np.c_[range(1,41), r[1:], q, p]
table = pd.DataFrame(data, columns=['lag', "AC", "Q", "Prob(>Q)"])
print(table.set_index('lag'))


            AC           Q      Prob(>Q)
lag                                     
1.0   0.254921   32.687698  1.082200e-08
2.0  -0.172416   47.670779  4.450634e-11
3.0  -0.420945  137.159413  1.548450e-29
4.0  -0.046875  138.271325  6.617626e-29
5.0   0.103240  143.675937  2.958679e-29
6.0   0.214864  167.133022  1.823698e-33
7.0  -0.000889  167.133424  1.009194e-32
8.0  -0.045418  168.185777  3.094799e-32
9.0  -0.061445  170.115826  5.837150e-32
10.0  0.034623  170.729877  1.958716e-31
11.0  0.006351  170.750579  8.266965e-31
12.0 -0.012882  170.835932  3.220198e-30
13.0 -0.053959  172.336570  6.181131e-30
14.0 -0.016606  172.478987  2.160192e-29
15.0  0.051742  173.864510  4.089502e-29
16.0  0.078917  177.094304  3.217901e-29
17.0 -0.001834  177.096052  1.093156e-28
18.0 -0.101604  182.471961  3.103789e-29
19.0 -0.057342  184.187796  4.624014e-29
20.0  0.026975  184.568311  1.235657e-28
21.0  0.062359  186.605987  1.530242e-28
22.0 -0.009400  186.652389  4.548145e-28
23.0 -0.068037  189.088210  4.561959e-28
24.0 -0.035566  189.755227  9.900984e-28
25.0  0.095679  194.592647  3.354253e-28
26.0  0.065650  196.874903  3.487583e-28
27.0 -0.018404  197.054639  9.008645e-28
28.0 -0.079244  200.394034  5.773650e-28
29.0  0.008499  200.432526  1.541369e-27
30.0  0.053372  201.953801  2.133168e-27
31.0  0.074816  204.949420  1.550144e-27
32.0 -0.071187  207.667266  1.262275e-27
33.0 -0.088145  211.843182  5.480754e-28
34.0 -0.025283  212.187475  1.215214e-27
35.0  0.125690  220.714923  8.231523e-29
36.0  0.142724  231.734147  1.923058e-30
37.0  0.095768  236.706188  5.937709e-31
38.0 -0.084744  240.607830  2.890852e-31
39.0 -0.150126  252.879015  3.962946e-33
40.0 -0.083767  256.707771  1.996145e-33
/home/travis/build/statsmodels/statsmodels/statsmodels/tsa/arima_model.py:472: FutureWarning: 
statsmodels.tsa.arima_model.ARMA and statsmodels.tsa.arima_model.ARIMA have
been deprecated in favor of statsmodels.tsa.arima.model.ARIMA (note the .
between arima and model) and
statsmodels.tsa.SARIMAX.

statsmodels.tsa.arima.model.ARIMA makes use of the statespace framework and
is both well tested and maintained.

To silence this warning and continue using ARMA and ARIMA until they are
removed, use:

import warnings
warnings.filterwarnings('ignore', 'statsmodels.tsa.arima_model.ARMA',
                        FutureWarning)
warnings.filterwarnings('ignore', 'statsmodels.tsa.arima_model.ARIMA',
                        FutureWarning)

  warnings.warn(ARIMA_DEPRECATION_WARN, FutureWarning)
/home/travis/build/statsmodels/statsmodels/statsmodels/tsa/stattools.py:656: FutureWarning: The default number of lags is changing from 40 tomin(int(10 * np.log10(nobs)), nobs - 1) after 0.12is released. Set the number of lags to an integer to  silence this warning.
  FutureWarning,

In [35]:
arma41 = sm.tsa.ARMA(arma_rvs, (4,1)).fit(disp=False)
resid = arma41.resid
r,q,p = sm.tsa.acf(resid, fft=True, qstat=True)
data = np.c_[range(1,41), r[1:], q, p]
table = pd.DataFrame(data, columns=['lag', "AC", "Q", "Prob(>Q)"])
print(table.set_index('lag'))


/home/travis/build/statsmodels/statsmodels/statsmodels/tsa/arima_model.py:472: FutureWarning: 
statsmodels.tsa.arima_model.ARMA and statsmodels.tsa.arima_model.ARIMA have
been deprecated in favor of statsmodels.tsa.arima.model.ARIMA (note the .
between arima and model) and
statsmodels.tsa.SARIMAX.

statsmodels.tsa.arima.model.ARIMA makes use of the statespace framework and
is both well tested and maintained.

To silence this warning and continue using ARMA and ARIMA until they are
removed, use:

import warnings
warnings.filterwarnings('ignore', 'statsmodels.tsa.arima_model.ARMA',
                        FutureWarning)
warnings.filterwarnings('ignore', 'statsmodels.tsa.arima_model.ARIMA',
                        FutureWarning)

  warnings.warn(ARIMA_DEPRECATION_WARN, FutureWarning)
            AC          Q  Prob(>Q)
lag                                
1.0  -0.007889   0.031303  0.859567
2.0   0.004132   0.039907  0.980244
3.0   0.018103   0.205414  0.976711
4.0  -0.006760   0.228541  0.993948
5.0   0.018120   0.395028  0.995465
6.0   0.050688   1.700454  0.945086
7.0   0.010252   1.753962  0.972196
8.0  -0.011206   1.818025  0.986091
9.0   0.020292   2.028525  0.991008
10.0  0.001029   2.029067  0.996112
11.0 -0.014035   2.130175  0.997984
12.0 -0.023858   2.422932  0.998427
13.0 -0.002108   2.425223  0.999339
14.0 -0.018783   2.607435  0.999590
15.0  0.011316   2.673704  0.999805
16.0  0.042159   3.595425  0.999443
17.0  0.007943   3.628211  0.999734
18.0 -0.074311   6.503860  0.993686
19.0 -0.023379   6.789071  0.995256
20.0  0.002398   6.792078  0.997313
21.0  0.000487   6.792202  0.998516
22.0  0.017953   6.961440  0.999024
23.0 -0.038576   7.744470  0.998744
24.0 -0.029816   8.213253  0.998859
25.0  0.077850  11.415826  0.990675
26.0  0.040408  12.280450  0.989479
27.0 -0.018612  12.464277  0.992262
28.0 -0.014764  12.580188  0.994586
29.0  0.017650  12.746192  0.996111
30.0 -0.005486  12.762265  0.997504
31.0  0.058256  14.578546  0.994614
32.0 -0.040840  15.473086  0.993887
33.0 -0.019493  15.677312  0.995393
34.0  0.037269  16.425468  0.995214
35.0  0.086212  20.437451  0.976296
36.0  0.041271  21.358849  0.974774
37.0  0.078704  24.716882  0.938948
38.0 -0.029729  25.197061  0.944895
39.0 -0.078397  28.543399  0.891178
40.0 -0.014466  28.657589  0.909268
/home/travis/build/statsmodels/statsmodels/statsmodels/tsa/stattools.py:656: FutureWarning: The default number of lags is changing from 40 tomin(int(10 * np.log10(nobs)), nobs - 1) after 0.12is released. Set the number of lags to an integer to  silence this warning.
  FutureWarning,

Exercise: How good of in-sample prediction can you do for another series, say, CPI


In [36]:
macrodta = sm.datasets.macrodata.load_pandas().data
macrodta.index = pd.Index(sm.tsa.datetools.dates_from_range('1959Q1', '2009Q3'))
cpi = macrodta["cpi"]

Hint:


In [37]:
fig = plt.figure(figsize=(12,8))
ax = fig.add_subplot(111)
ax = cpi.plot(ax=ax);
ax.legend();


P-value of the unit-root test, resoundingly rejects the null of a unit-root.


In [38]:
print(sm.tsa.adfuller(cpi)[1])


0.9904328188337421