notebook.community

Edit and run



In [80]:

    
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import statsmodels.api as sm
%matplotlib inline



In [3]:

    
df = sm.datasets.macrodata.load_pandas().data



In [4]:

    
df.head()



In [6]:

    
print(sm.datasets.macrodata.NOTE)









    



::
    Number of Observations - 203

    Number of Variables - 14

    Variable name definitions::

        year      - 1959q1 - 2009q3
        quarter   - 1-4
        realgdp   - Real gross domestic product (Bil. of chained 2005 US$,
                    seasonally adjusted annual rate)
        realcons  - Real personal consumption expenditures (Bil. of chained
                    2005 US$, seasonally adjusted annual rate)
        realinv   - Real gross private domestic investment (Bil. of chained
                    2005 US$, seasonally adjusted annual rate)
        realgovt  - Real federal consumption expenditures & gross investment
                    (Bil. of chained 2005 US$, seasonally adjusted annual rate)
        realdpi   - Real private disposable income (Bil. of chained 2005
                    US$, seasonally adjusted annual rate)
        cpi       - End of the quarter consumer price index for all urban
                    consumers: all items (1982-84 = 100, seasonally adjusted).
        m1        - End of the quarter M1 nominal money stock (Seasonally
                    adjusted)
        tbilrate  - Quarterly monthly average of the monthly 3-month
                    treasury bill: secondary market rate
        unemp     - Seasonally adjusted unemployment rate (%)
        pop       - End of the quarter total population: all ages incl. armed
                    forces over seas
        infl      - Inflation rate (ln(cpi_{t}/cpi_{t-1}) * 400)
        realint   - Real interest rate (tbilrate - infl)



In [10]:

    
index = pd.Index(sm.tsa.datetools.dates_from_range("1959Q1", "2009Q3"))



In [11]:

    
df.index = index



In [12]:

    
df.head()



In [13]:

    
df["realgdp"].plot()









    Out[13]:





<matplotlib.axes._subplots.AxesSubplot at 0x7f9fbcb4df98>

The Hodrick-Prescott filter

Separates a time-seris y_t into a trend T-t and a cyclical component $\tau$ t



In [19]:

    
gdp_cycle, gdp_trend = sm.tsa.filters.hpfilter(df["realgdp"])



In [20]:

    
df["trend"] = gdp_trend



In [21]:

    
df[["realgdp", "trend"]]["2000-03-31":].plot()









    Out[21]:





<matplotlib.axes._subplots.AxesSubplot at 0x7f9fb977eb38>

EWMA: Exponentially Weighted Moving Average



In [23]:

    
airline = pd.read_csv("airline_passengers.csv", index_col="Month")



In [24]:

    
airline.index









    Out[24]:





Index(['1949-01', '1949-02', '1949-03', '1949-04', '1949-05', '1949-06',
       '1949-07', '1949-08', '1949-09', '1949-10',
       ...
       '1960-04', '1960-05', '1960-06', '1960-07', '1960-08', '1960-09',
       '1960-10', '1960-11', '1960-12',
       'International airline passengers: monthly totals in thousands. Jan 49 ? Dec 60'],
      dtype='object', name='Month', length=145)



In [27]:

    
airline.dropna(inplace=True)



In [29]:

    
airline.index = pd.to_datetime(airline.index)



In [30]:

    
airline.head()









    Out[30]:







  
    
      
      Thousands of Passengers
    
    
      Month
      
    
  
  
    
      1949-01-01
      112.0
    
    
      1949-02-01
      118.0
    
    
      1949-03-01
      132.0
    
    
      1949-04-01
      129.0
    
    
      1949-05-01
      121.0



In [31]:

    
airline.index









    Out[31]:





DatetimeIndex(['1949-01-01', '1949-02-01', '1949-03-01', '1949-04-01',
               '1949-05-01', '1949-06-01', '1949-07-01', '1949-08-01',
               '1949-09-01', '1949-10-01',
               ...
               '1960-03-01', '1960-04-01', '1960-05-01', '1960-06-01',
               '1960-07-01', '1960-08-01', '1960-09-01', '1960-10-01',
               '1960-11-01', '1960-12-01'],
              dtype='datetime64[ns]', name='Month', length=144, freq=None)

SMA



In [40]:

    
airline["6-month-SMA"] = airline["Thousands of Passengers"].rolling(window=6).mean()



In [41]:

    
airline["12-month-SMA"] = airline["Thousands of Passengers"].rolling(window=12).mean()



In [42]:

    
airline.plot(figsize=(10, 8))









    Out[42]:





<matplotlib.axes._subplots.AxesSubplot at 0x7f9fb96d05f8>



In [43]:

    
airline["EWMA-12"] = airline["Thousands of Passengers"].ewm(span=12).mean()



In [44]:

    
airline[["Thousands of Passengers", "EWMA-12"]].plot(figsize=(10, 8))









    Out[44]:





<matplotlib.axes._subplots.AxesSubplot at 0x7f9fb936be80>

ETS Decomp



In [45]:

    
airline.head()









    Out[45]:







  
    
      
      Thousands of Passengers
      6-month-SMA
      12-month-SMA
      EWMA-12
    
    
      Month
      
      
      
      
    
  
  
    
      1949-01-01
      112.0
      NaN
      NaN
      112.000000
    
    
      1949-02-01
      118.0
      NaN
      NaN
      115.250000
    
    
      1949-03-01
      132.0
      NaN
      NaN
      121.787529
    
    
      1949-04-01
      129.0
      NaN
      NaN
      124.064224
    
    
      1949-05-01
      121.0
      NaN
      NaN
      123.231685



In [46]:

    
airline["Thousands of Passengers"].plot()









    Out[46]:





<matplotlib.axes._subplots.AxesSubplot at 0x7f9fb92c2898>



In [53]:

    
from statsmodels.tsa.seasonal import seasonal_decompose



In [56]:

    
result = seasonal_decompose(airline["Thousands of Passengers"], model="multiplicative")



In [57]:

    
result.seasonal.plot()









    Out[57]:





<matplotlib.axes._subplots.AxesSubplot at 0x7f9fb80dbba8>



In [58]:

    
result.trend.plot()









    Out[58]:





<matplotlib.axes._subplots.AxesSubplot at 0x7f9fb9307d68>



In [62]:

    
fig = result.plot()

ARIMA (Generally not the greatest for financial data)



In [63]:

    
df = pd.read_csv("monthly-milk-production-pounds-p.csv")

Preparing data...



In [64]:

    
df.head()









    Out[64]:







  
    
      
      Month
      Monthly milk production: pounds per cow. Jan 62 ? Dec 75
    
  
  
    
      0
      1962-01
      589.0
    
    
      1
      1962-02
      561.0
    
    
      2
      1962-03
      640.0
    
    
      3
      1962-04
      656.0
    
    
      4
      1962-05
      727.0



In [65]:

    
df.columns = ["Month", "Milk in Pounds per Cow"]



In [66]:

    
df.head()









    Out[66]:







  
    
      
      Month
      Milk in Pounds per Cow
    
  
  
    
      0
      1962-01
      589.0
    
    
      1
      1962-02
      561.0
    
    
      2
      1962-03
      640.0
    
    
      3
      1962-04
      656.0
    
    
      4
      1962-05
      727.0



In [67]:

    
df.tail()









    Out[67]:







  
    
      
      Month
      Milk in Pounds per Cow
    
  
  
    
      164
      1975-09
      817.0
    
    
      165
      1975-10
      827.0
    
    
      166
      1975-11
      797.0
    
    
      167
      1975-12
      843.0
    
    
      168
      Monthly milk production: pounds per cow. Jan 6...
      NaN



In [68]:

    
df.drop(168, axis=0, inplace=True)



In [69]:

    
df.tail()









    Out[69]:







  
    
      
      Month
      Milk in Pounds per Cow
    
  
  
    
      163
      1975-08
      858.0
    
    
      164
      1975-09
      817.0
    
    
      165
      1975-10
      827.0
    
    
      166
      1975-11
      797.0
    
    
      167
      1975-12
      843.0



In [70]:

    
df["Month"] = pd.to_datetime(df["Month"])



In [71]:

    
df.head()









    Out[71]:







  
    
      
      Month
      Milk in Pounds per Cow
    
  
  
    
      0
      1962-01-01
      589.0
    
    
      1
      1962-02-01
      561.0
    
    
      2
      1962-03-01
      640.0
    
    
      3
      1962-04-01
      656.0
    
    
      4
      1962-05-01
      727.0



In [72]:

    
df.set_index("Month", inplace=True)



In [73]:

    
df.index









    Out[73]:





DatetimeIndex(['1962-01-01', '1962-02-01', '1962-03-01', '1962-04-01',
               '1962-05-01', '1962-06-01', '1962-07-01', '1962-08-01',
               '1962-09-01', '1962-10-01',
               ...
               '1975-03-01', '1975-04-01', '1975-05-01', '1975-06-01',
               '1975-07-01', '1975-08-01', '1975-09-01', '1975-10-01',
               '1975-11-01', '1975-12-01'],
              dtype='datetime64[ns]', name='Month', length=168, freq=None)



In [74]:

    
df.describe().transpose()









    Out[74]:







  
    
      
      count
      mean
      std
      min
      25%
      50%
      75%
      max
    
  
  
    
      Milk in Pounds per Cow
      168.0
      754.708333
      102.204524
      553.0
      677.75
      761.0
      824.5
      969.0

Visualize the data...



In [75]:

    
df.plot()









    Out[75]:





<matplotlib.axes._subplots.AxesSubplot at 0x7f9fb2f67ef0>

The data has a trend and seasonality



In [76]:

    
time_series = df["Milk in Pounds per Cow"]



In [81]:

    
time_series.rolling(12).mean().plot(label="12 Month Rolling Mean")
time_series.rolling(12).std().plot(label="12 Month Rolling Variance")
time_series.plot()
plt.legend()









    Out[81]:





<matplotlib.legend.Legend at 0x7f9fb2e55898>

ETS Decomp



In [82]:

    
decomp = seasonal_decompose(time_series)



In [85]:

    
fig = decomp.plot()
fig.set_size_inches(15, 8)

Stationary?



In [86]:

    
from statsmodels.tsa.stattools import adfuller



In [87]:

    
result = adfuller(df["Milk in Pounds per Cow"])



In [88]:

    
result









    Out[88]:





(-1.3038115874221314,
 0.62742670860303074,
 13,
 154,
 {'1%': -3.4735425281962091,
  '10%': -2.5768780536346769,
  '5%': -2.880497674144038},
 1115.1730447395112)



In [90]:

    
def adf_check(time_series):
    result = adfuller(time_series)
    print(" Augmented Dicky-Fuller Test")
    labels = ["ADF Test Statistic", "p-value", "# of lags", "Num of Observations used"]
    
    for value, label in zip(result, labels):
        print(label + ": " + str(value))
        
    if result[1] <= 0.05:
        print("Strong evidence against null hypothesis")
        print("Reject null hypothesis")
        print("Data has no unit root and is stationary")
    else:
        print("Weak eveidence against null hypothesis")
        print("Fail to reject null hypothesis")
        print("Data has a unit root and is non-stationary")



In [91]:

    
adf_check(df["Milk in Pounds per Cow"])









    



 Augmented Dicky-Fuller Test
ADF Test Statistic: -1.30381158742
p-value: 0.627426708603
# of lags: 13
Num of Observations used: 154
Weak eveidence against null hypothesis
Fail to reject null hypothesis
Data has a unit root and is non-stationary

First Difference



In [92]:

    
df["First Difference"] = df["Milk in Pounds per Cow"] - df["Milk in Pounds per Cow"].shift(1)



In [93]:

    
df["First Difference"].plot()









    Out[93]:





<matplotlib.axes._subplots.AxesSubplot at 0x7f9fb26fd940>



In [94]:

    
adf_check(df["First Difference"].dropna())









    



 Augmented Dicky-Fuller Test
ADF Test Statistic: -3.05499555865
p-value: 0.0300680040018
# of lags: 14
Num of Observations used: 152
Strong evidence against null hypothesis
Reject null hypothesis
Data has no unit root and is stationary

Second Difference



In [95]:

    
df["Milk Second Difference"] = df["First Difference"] - df["First Difference"].shift(1)



In [96]:

    
df["Milk Second Difference"].plot()









    Out[96]:





<matplotlib.axes._subplots.AxesSubplot at 0x7f9fb27514e0>



In [100]:

    
adf_check(df["Milk Second Difference"].dropna())









    



 Augmented Dicky-Fuller Test
ADF Test Statistic: -14.3278736456
p-value: 1.11269893321e-26
# of lags: 11
Num of Observations used: 154
Strong evidence against null hypothesis
Reject null hypothesis
Data has no unit root and is stationary

Seasonal Difference



In [97]:

    
df["Seasonal Difference"] = df["Milk in Pounds per Cow"] - df["Milk in Pounds per Cow"].shift(12)



In [98]:

    
df["Seasonal Difference"].plot()









    Out[98]:





<matplotlib.axes._subplots.AxesSubplot at 0x7f9fb2600240>



In [99]:

    
adf_check(df["Seasonal Difference"].dropna())









    



 Augmented Dicky-Fuller Test
ADF Test Statistic: -2.33541931436
p-value: 0.160798805277
# of lags: 12
Num of Observations used: 143
Weak eveidence against null hypothesis
Fail to reject null hypothesis
Data has a unit root and is non-stationary

Seasonal First Difference



In [101]:

    
df["Seasonal First Differnce"] = df["First Difference"] - df["First Difference"].shift(12)



In [102]:

    
df["Seasonal First Differnce"].plot()









    Out[102]:





<matplotlib.axes._subplots.AxesSubplot at 0x7f9fb3396dd8>



In [103]:

    
adf_check(df["Seasonal First Differnce"].dropna())









    



 Augmented Dicky-Fuller Test
ADF Test Statistic: -5.03800227492
p-value: 1.86542343188e-05
# of lags: 11
Num of Observations used: 143
Strong evidence against null hypothesis
Reject null hypothesis
Data has no unit root and is stationary

Autocorrelation Plots



In [104]:

    
from statsmodels.graphics.tsaplots import plot_acf, plot_pacf



In [105]:

    
fig_first = plot_acf(df["First Difference"].dropna())



In [106]:

    
fig_seasonal_first = plot_acf(df["Seasonal First Differnce"].dropna())

with pandas...



In [107]:

    
from pandas.plotting import autocorrelation_plot



In [108]:

    
autocorrelation_plot(df["Seasonal First Differnce"].dropna())









    Out[108]:





<matplotlib.axes._subplots.AxesSubplot at 0x7f9fb34b7240>

pandas does not do partial autocorrelation..



In [109]:

    
result = plot_pacf(df["Seasonal First Differnce"].dropna())



In [114]:

    
plot_acf(df["Seasonal First Differnce"].dropna())
plot_pacf(df["Seasonal First Differnce"].dropna());

Building a seasonal ARIMA Model



In [116]:

    
from statsmodels.tsa.arima_model import ARIMA # standard ARIMA



In [118]:

    
model = sm.tsa.statespace.SARIMAX(df["Milk in Pounds per Cow"], order=(0,1,0), seasonal_order=(1, 1, 1, 12))



In [119]:

    
results = model.fit()



In [120]:

    
print(results.summary())









    



                                 Statespace Model Results                                 
==========================================================================================
Dep. Variable:             Milk in Pounds per Cow   No. Observations:                  168
Model:             SARIMAX(0, 1, 0)x(1, 1, 1, 12)   Log Likelihood                -534.065
Date:                            Sun, 29 Oct 2017   AIC                           1074.131
Time:                                    16:36:32   BIC                           1083.503
Sample:                                01-01-1962   HQIC                          1077.934
                                     - 12-01-1975                                         
Covariance Type:                              opg                                         
==============================================================================
                 coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------
ar.S.L12      -0.0449   1.49e+06  -3.01e-08      1.000   -2.93e+06    2.93e+06
ma.S.L12      -0.5860   1.57e+07  -3.72e-08      1.000   -3.08e+07    3.08e+07
sigma2        55.5118   7.77e+08   7.14e-08      1.000   -1.52e+09    1.52e+09
===================================================================================
Ljung-Box (Q):                       33.48   Jarque-Bera (JB):                32.04
Prob(Q):                              0.76   Prob(JB):                         0.00
Heteroskedasticity (H):               0.69   Skew:                             0.77
Prob(H) (two-sided):                  0.18   Kurtosis:                         4.60
===================================================================================

Warnings:
[1] Covariance matrix calculated using the outer product of gradients (complex-step).



In [122]:

    
results.resid.plot()









    Out[122]:





<matplotlib.axes._subplots.AxesSubplot at 0x7f9fb30b4e10>



In [123]:

    
results.resid.plot(kind="kde")









    Out[123]:





<matplotlib.axes._subplots.AxesSubplot at 0x7f9fb310d208>

Forcasting



In [126]:

    
df["forecast"] = results.predict(start=150, end=168)
df[["Milk in Pounds per Cow", "forecast"]].plot(figsize=(12, 8))









    Out[126]:





<matplotlib.axes._subplots.AxesSubplot at 0x7f9fb24d0470>

Setting a later end date



In [128]:

    
df.tail()









    Out[128]:







  
    
      
      Milk in Pounds per Cow
      First Difference
      Milk Second Difference
      Seasonal Difference
      Seasonal First Differnce
      forecast
    
    
      Month
      
      
      
      
      
      
    
  
  
    
      1975-08-01
      858.0
      -38.0
      3.0
      -9.0
      3.0
      855.358617
    
    
      1975-09-01
      817.0
      -41.0
      -3.0
      2.0
      11.0
      808.841269
    
    
      1975-10-01
      827.0
      10.0
      51.0
      15.0
      13.0
      819.323183
    
    
      1975-11-01
      797.0
      -30.0
      -40.0
      24.0
      9.0
      790.427500
    
    
      1975-12-01
      843.0
      46.0
      76.0
      30.0
      6.0
      837.063646



In [129]:

    
from pandas.tseries.offsets import DateOffset



In [130]:

    
future_dates = [df.index[-1] + DateOffset(months=x) for x in range(0, 24)]



In [133]:

    
future_df = pd.DataFrame(index=future_dates, columns=df.columns)



In [134]:

    
final_df = pd.concat([df, future_df])



In [136]:

    
final_df.tail()









    Out[136]:







  
    
      
      Milk in Pounds per Cow
      First Difference
      Milk Second Difference
      Seasonal Difference
      Seasonal First Differnce
      forecast
    
  
  
    
      1977-07-01
      NaN
      NaN
      NaN
      NaN
      NaN
      NaN
    
    
      1977-08-01
      NaN
      NaN
      NaN
      NaN
      NaN
      NaN
    
    
      1977-09-01
      NaN
      NaN
      NaN
      NaN
      NaN
      NaN
    
    
      1977-10-01
      NaN
      NaN
      NaN
      NaN
      NaN
      NaN
    
    
      1977-11-01
      NaN
      NaN
      NaN
      NaN
      NaN
      NaN



In [137]:

    
final_df["forecast"] = results.predict(start=168, end = 192)



In [138]:

    
final_df.tail()









    Out[138]:







  
    
      
      Milk in Pounds per Cow
      First Difference
      Milk Second Difference
      Seasonal Difference
      Seasonal First Differnce
      forecast
    
  
  
    
      1977-07-01
      NaN
      NaN
      NaN
      NaN
      NaN
      951.525751
    
    
      1977-08-01
      NaN
      NaN
      NaN
      NaN
      NaN
      911.918840
    
    
      1977-09-01
      NaN
      NaN
      NaN
      NaN
      NaN
      865.881041
    
    
      1977-10-01
      NaN
      NaN
      NaN
      NaN
      NaN
      871.027162
    
    
      1977-11-01
      NaN
      NaN
      NaN
      NaN
      NaN
      836.962873



In [140]:

    
final_df[["Milk in Pounds per Cow", "forecast"]].plot(figsize=(10, 8))









    Out[140]:





<matplotlib.axes._subplots.AxesSubplot at 0x7f9fb05b9e48>

Why are ARIMA models bad for financial data?



In [ ]:

	year	quarter	realgdp	realcons	realinv	realgovt	realdpi	cpi	m1	tbilrate	unemp	pop	infl	realint
0	1959.0	1.0	2710.349	1707.4	286.898	470.045	1886.9	28.98	139.7	2.82	5.8	177.146	0.00	0.00
1	1959.0	2.0	2778.801	1733.7	310.859	481.301	1919.7	29.15	141.7	3.08	5.1	177.830	2.34	0.74
2	1959.0	3.0	2775.488	1751.8	289.226	491.260	1916.4	29.35	140.5	3.82	5.3	178.657	2.74	1.09
3	1959.0	4.0	2785.204	1753.7	299.356	484.052	1931.3	29.37	140.0	4.33	5.6	179.386	0.27	4.06
4	1960.0	1.0	2847.699	1770.5	331.722	462.199	1955.5	29.54	139.6	3.50	5.2	180.007	2.31	1.19

	year	quarter	realgdp	realcons	realinv	realgovt	realdpi	cpi	m1	tbilrate	unemp	pop	infl	realint
1959-03-31	1959.0	1.0	2710.349	1707.4	286.898	470.045	1886.9	28.98	139.7	2.82	5.8	177.146	0.00	0.00
1959-06-30	1959.0	2.0	2778.801	1733.7	310.859	481.301	1919.7	29.15	141.7	3.08	5.1	177.830	2.34	0.74
1959-09-30	1959.0	3.0	2775.488	1751.8	289.226	491.260	1916.4	29.35	140.5	3.82	5.3	178.657	2.74	1.09
1959-12-31	1959.0	4.0	2785.204	1753.7	299.356	484.052	1931.3	29.37	140.0	4.33	5.6	179.386	0.27	4.06
1960-03-31	1960.0	1.0	2847.699	1770.5	331.722	462.199	1955.5	29.54	139.6	3.50	5.2	180.007	2.31	1.19

	Thousands of Passengers
Month
1949-01-01	112.0
1949-02-01	118.0
1949-03-01	132.0
1949-04-01	129.0
1949-05-01	121.0

	Month	Monthly milk production: pounds per cow. Jan 62 ? Dec 75
0	1962-01	589.0
1	1962-02	561.0
2	1962-03	640.0
3	1962-04	656.0
4	1962-05	727.0

	Month	Milk in Pounds per Cow
164	1975-09	817.0
165	1975-10	827.0
166	1975-11	797.0
167	1975-12	843.0
168	Monthly milk production: pounds per cow. Jan 6...	NaN

	Month	Milk in Pounds per Cow
163	1975-08	858.0
164	1975-09	817.0
165	1975-10	827.0
166	1975-11	797.0
167	1975-12	843.0

	Month	Milk in Pounds per Cow
0	1962-01-01	589.0
1	1962-02-01	561.0
2	1962-03-01	640.0
3	1962-04-01	656.0
4	1962-05-01	727.0

	Milk in Pounds per Cow	First Difference	Milk Second Difference	Seasonal Difference	Seasonal First Differnce	forecast
Month
1975-08-01	858.0	-38.0	3.0	-9.0	3.0	855.358617
1975-09-01	817.0	-41.0	-3.0	2.0	11.0	808.841269
1975-10-01	827.0	10.0	51.0	15.0	13.0	819.323183
1975-11-01	797.0	-30.0	-40.0	24.0	9.0	790.427500
1975-12-01	843.0	46.0	76.0	30.0	6.0	837.063646

	Milk in Pounds per Cow	First Difference	Milk Second Difference	Seasonal Difference	Seasonal First Differnce	forecast
1977-07-01	NaN	NaN	NaN	NaN	NaN	NaN
1977-08-01	NaN	NaN	NaN	NaN	NaN	NaN
1977-09-01	NaN	NaN	NaN	NaN	NaN	NaN
1977-10-01	NaN	NaN	NaN	NaN	NaN	NaN
1977-11-01	NaN	NaN	NaN	NaN	NaN	NaN

	Milk in Pounds per Cow	First Difference	Milk Second Difference	Seasonal Difference	Seasonal First Differnce	forecast
1977-07-01	NaN	NaN	NaN	NaN	NaN	951.525751
1977-08-01	NaN	NaN	NaN	NaN	NaN	911.918840
1977-09-01	NaN	NaN	NaN	NaN	NaN	865.881041
1977-10-01	NaN	NaN	NaN	NaN	NaN	871.027162
1977-11-01	NaN	NaN	NaN	NaN	NaN	836.962873