In [80]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import statsmodels.api as sm
%matplotlib inline

In [3]:
df = sm.datasets.macrodata.load_pandas().data

In [4]:
df.head()


Out[4]:
year quarter realgdp realcons realinv realgovt realdpi cpi m1 tbilrate unemp pop infl realint
0 1959.0 1.0 2710.349 1707.4 286.898 470.045 1886.9 28.98 139.7 2.82 5.8 177.146 0.00 0.00
1 1959.0 2.0 2778.801 1733.7 310.859 481.301 1919.7 29.15 141.7 3.08 5.1 177.830 2.34 0.74
2 1959.0 3.0 2775.488 1751.8 289.226 491.260 1916.4 29.35 140.5 3.82 5.3 178.657 2.74 1.09
3 1959.0 4.0 2785.204 1753.7 299.356 484.052 1931.3 29.37 140.0 4.33 5.6 179.386 0.27 4.06
4 1960.0 1.0 2847.699 1770.5 331.722 462.199 1955.5 29.54 139.6 3.50 5.2 180.007 2.31 1.19

In [6]:
print(sm.datasets.macrodata.NOTE)


::
    Number of Observations - 203

    Number of Variables - 14

    Variable name definitions::

        year      - 1959q1 - 2009q3
        quarter   - 1-4
        realgdp   - Real gross domestic product (Bil. of chained 2005 US$,
                    seasonally adjusted annual rate)
        realcons  - Real personal consumption expenditures (Bil. of chained
                    2005 US$, seasonally adjusted annual rate)
        realinv   - Real gross private domestic investment (Bil. of chained
                    2005 US$, seasonally adjusted annual rate)
        realgovt  - Real federal consumption expenditures & gross investment
                    (Bil. of chained 2005 US$, seasonally adjusted annual rate)
        realdpi   - Real private disposable income (Bil. of chained 2005
                    US$, seasonally adjusted annual rate)
        cpi       - End of the quarter consumer price index for all urban
                    consumers: all items (1982-84 = 100, seasonally adjusted).
        m1        - End of the quarter M1 nominal money stock (Seasonally
                    adjusted)
        tbilrate  - Quarterly monthly average of the monthly 3-month
                    treasury bill: secondary market rate
        unemp     - Seasonally adjusted unemployment rate (%)
        pop       - End of the quarter total population: all ages incl. armed
                    forces over seas
        infl      - Inflation rate (ln(cpi_{t}/cpi_{t-1}) * 400)
        realint   - Real interest rate (tbilrate - infl)


In [10]:
index = pd.Index(sm.tsa.datetools.dates_from_range("1959Q1", "2009Q3"))

In [11]:
df.index = index

In [12]:
df.head()


Out[12]:
year quarter realgdp realcons realinv realgovt realdpi cpi m1 tbilrate unemp pop infl realint
1959-03-31 1959.0 1.0 2710.349 1707.4 286.898 470.045 1886.9 28.98 139.7 2.82 5.8 177.146 0.00 0.00
1959-06-30 1959.0 2.0 2778.801 1733.7 310.859 481.301 1919.7 29.15 141.7 3.08 5.1 177.830 2.34 0.74
1959-09-30 1959.0 3.0 2775.488 1751.8 289.226 491.260 1916.4 29.35 140.5 3.82 5.3 178.657 2.74 1.09
1959-12-31 1959.0 4.0 2785.204 1753.7 299.356 484.052 1931.3 29.37 140.0 4.33 5.6 179.386 0.27 4.06
1960-03-31 1960.0 1.0 2847.699 1770.5 331.722 462.199 1955.5 29.54 139.6 3.50 5.2 180.007 2.31 1.19

In [13]:
df["realgdp"].plot()


Out[13]:
<matplotlib.axes._subplots.AxesSubplot at 0x7f9fbcb4df98>
The Hodrick-Prescott filter

Separates a time-seris y_t into a trend T-t and a cyclical component $\tau$ t


In [19]:
gdp_cycle, gdp_trend = sm.tsa.filters.hpfilter(df["realgdp"])

In [20]:
df["trend"] = gdp_trend

In [21]:
df[["realgdp", "trend"]]["2000-03-31":].plot()


Out[21]:
<matplotlib.axes._subplots.AxesSubplot at 0x7f9fb977eb38>

EWMA: Exponentially Weighted Moving Average


In [23]:
airline = pd.read_csv("airline_passengers.csv", index_col="Month")

In [24]:
airline.index


Out[24]:
Index(['1949-01', '1949-02', '1949-03', '1949-04', '1949-05', '1949-06',
       '1949-07', '1949-08', '1949-09', '1949-10',
       ...
       '1960-04', '1960-05', '1960-06', '1960-07', '1960-08', '1960-09',
       '1960-10', '1960-11', '1960-12',
       'International airline passengers: monthly totals in thousands. Jan 49 ? Dec 60'],
      dtype='object', name='Month', length=145)

In [27]:
airline.dropna(inplace=True)

In [29]:
airline.index = pd.to_datetime(airline.index)

In [30]:
airline.head()


Out[30]:
Thousands of Passengers
Month
1949-01-01 112.0
1949-02-01 118.0
1949-03-01 132.0
1949-04-01 129.0
1949-05-01 121.0

In [31]:
airline.index


Out[31]:
DatetimeIndex(['1949-01-01', '1949-02-01', '1949-03-01', '1949-04-01',
               '1949-05-01', '1949-06-01', '1949-07-01', '1949-08-01',
               '1949-09-01', '1949-10-01',
               ...
               '1960-03-01', '1960-04-01', '1960-05-01', '1960-06-01',
               '1960-07-01', '1960-08-01', '1960-09-01', '1960-10-01',
               '1960-11-01', '1960-12-01'],
              dtype='datetime64[ns]', name='Month', length=144, freq=None)
SMA

In [40]:
airline["6-month-SMA"] = airline["Thousands of Passengers"].rolling(window=6).mean()

In [41]:
airline["12-month-SMA"] = airline["Thousands of Passengers"].rolling(window=12).mean()

In [42]:
airline.plot(figsize=(10, 8))


Out[42]:
<matplotlib.axes._subplots.AxesSubplot at 0x7f9fb96d05f8>

In [43]:
airline["EWMA-12"] = airline["Thousands of Passengers"].ewm(span=12).mean()

In [44]:
airline[["Thousands of Passengers", "EWMA-12"]].plot(figsize=(10, 8))


Out[44]:
<matplotlib.axes._subplots.AxesSubplot at 0x7f9fb936be80>

ETS Decomp


In [45]:
airline.head()


Out[45]:
Thousands of Passengers 6-month-SMA 12-month-SMA EWMA-12
Month
1949-01-01 112.0 NaN NaN 112.000000
1949-02-01 118.0 NaN NaN 115.250000
1949-03-01 132.0 NaN NaN 121.787529
1949-04-01 129.0 NaN NaN 124.064224
1949-05-01 121.0 NaN NaN 123.231685

In [46]:
airline["Thousands of Passengers"].plot()


Out[46]:
<matplotlib.axes._subplots.AxesSubplot at 0x7f9fb92c2898>

In [53]:
from statsmodels.tsa.seasonal import seasonal_decompose

In [56]:
result = seasonal_decompose(airline["Thousands of Passengers"], model="multiplicative")

In [57]:
result.seasonal.plot()


Out[57]:
<matplotlib.axes._subplots.AxesSubplot at 0x7f9fb80dbba8>

In [58]:
result.trend.plot()


Out[58]:
<matplotlib.axes._subplots.AxesSubplot at 0x7f9fb9307d68>

In [62]:
fig = result.plot()


ARIMA (Generally not the greatest for financial data)


In [63]:
df = pd.read_csv("monthly-milk-production-pounds-p.csv")
Preparing data...

In [64]:
df.head()


Out[64]:
Month Monthly milk production: pounds per cow. Jan 62 ? Dec 75
0 1962-01 589.0
1 1962-02 561.0
2 1962-03 640.0
3 1962-04 656.0
4 1962-05 727.0

In [65]:
df.columns = ["Month", "Milk in Pounds per Cow"]

In [66]:
df.head()


Out[66]:
Month Milk in Pounds per Cow
0 1962-01 589.0
1 1962-02 561.0
2 1962-03 640.0
3 1962-04 656.0
4 1962-05 727.0

In [67]:
df.tail()


Out[67]:
Month Milk in Pounds per Cow
164 1975-09 817.0
165 1975-10 827.0
166 1975-11 797.0
167 1975-12 843.0
168 Monthly milk production: pounds per cow. Jan 6... NaN

In [68]:
df.drop(168, axis=0, inplace=True)

In [69]:
df.tail()


Out[69]:
Month Milk in Pounds per Cow
163 1975-08 858.0
164 1975-09 817.0
165 1975-10 827.0
166 1975-11 797.0
167 1975-12 843.0

In [70]:
df["Month"] = pd.to_datetime(df["Month"])

In [71]:
df.head()


Out[71]:
Month Milk in Pounds per Cow
0 1962-01-01 589.0
1 1962-02-01 561.0
2 1962-03-01 640.0
3 1962-04-01 656.0
4 1962-05-01 727.0

In [72]:
df.set_index("Month", inplace=True)

In [73]:
df.index


Out[73]:
DatetimeIndex(['1962-01-01', '1962-02-01', '1962-03-01', '1962-04-01',
               '1962-05-01', '1962-06-01', '1962-07-01', '1962-08-01',
               '1962-09-01', '1962-10-01',
               ...
               '1975-03-01', '1975-04-01', '1975-05-01', '1975-06-01',
               '1975-07-01', '1975-08-01', '1975-09-01', '1975-10-01',
               '1975-11-01', '1975-12-01'],
              dtype='datetime64[ns]', name='Month', length=168, freq=None)

In [74]:
df.describe().transpose()


Out[74]:
count mean std min 25% 50% 75% max
Milk in Pounds per Cow 168.0 754.708333 102.204524 553.0 677.75 761.0 824.5 969.0

Visualize the data...


In [75]:
df.plot()


Out[75]:
<matplotlib.axes._subplots.AxesSubplot at 0x7f9fb2f67ef0>

The data has a trend and seasonality


In [76]:
time_series = df["Milk in Pounds per Cow"]

In [81]:
time_series.rolling(12).mean().plot(label="12 Month Rolling Mean")
time_series.rolling(12).std().plot(label="12 Month Rolling Variance")
time_series.plot()
plt.legend()


Out[81]:
<matplotlib.legend.Legend at 0x7f9fb2e55898>
ETS Decomp

In [82]:
decomp = seasonal_decompose(time_series)

In [85]:
fig = decomp.plot()
fig.set_size_inches(15, 8)


Stationary?

In [86]:
from statsmodels.tsa.stattools import adfuller

In [87]:
result = adfuller(df["Milk in Pounds per Cow"])

In [88]:
result


Out[88]:
(-1.3038115874221314,
 0.62742670860303074,
 13,
 154,
 {'1%': -3.4735425281962091,
  '10%': -2.5768780536346769,
  '5%': -2.880497674144038},
 1115.1730447395112)

In [90]:
def adf_check(time_series):
    result = adfuller(time_series)
    print(" Augmented Dicky-Fuller Test")
    labels = ["ADF Test Statistic", "p-value", "# of lags", "Num of Observations used"]
    
    for value, label in zip(result, labels):
        print(label + ": " + str(value))
        
    if result[1] <= 0.05:
        print("Strong evidence against null hypothesis")
        print("Reject null hypothesis")
        print("Data has no unit root and is stationary")
    else:
        print("Weak eveidence against null hypothesis")
        print("Fail to reject null hypothesis")
        print("Data has a unit root and is non-stationary")

In [91]:
adf_check(df["Milk in Pounds per Cow"])


 Augmented Dicky-Fuller Test
ADF Test Statistic: -1.30381158742
p-value: 0.627426708603
# of lags: 13
Num of Observations used: 154
Weak eveidence against null hypothesis
Fail to reject null hypothesis
Data has a unit root and is non-stationary
First Difference

In [92]:
df["First Difference"] = df["Milk in Pounds per Cow"] - df["Milk in Pounds per Cow"].shift(1)

In [93]:
df["First Difference"].plot()


Out[93]:
<matplotlib.axes._subplots.AxesSubplot at 0x7f9fb26fd940>

In [94]:
adf_check(df["First Difference"].dropna())


 Augmented Dicky-Fuller Test
ADF Test Statistic: -3.05499555865
p-value: 0.0300680040018
# of lags: 14
Num of Observations used: 152
Strong evidence against null hypothesis
Reject null hypothesis
Data has no unit root and is stationary
Second Difference

In [95]:
df["Milk Second Difference"] = df["First Difference"] - df["First Difference"].shift(1)

In [96]:
df["Milk Second Difference"].plot()


Out[96]:
<matplotlib.axes._subplots.AxesSubplot at 0x7f9fb27514e0>

In [100]:
adf_check(df["Milk Second Difference"].dropna())


 Augmented Dicky-Fuller Test
ADF Test Statistic: -14.3278736456
p-value: 1.11269893321e-26
# of lags: 11
Num of Observations used: 154
Strong evidence against null hypothesis
Reject null hypothesis
Data has no unit root and is stationary
Seasonal Difference

In [97]:
df["Seasonal Difference"] = df["Milk in Pounds per Cow"] - df["Milk in Pounds per Cow"].shift(12)

In [98]:
df["Seasonal Difference"].plot()


Out[98]:
<matplotlib.axes._subplots.AxesSubplot at 0x7f9fb2600240>

In [99]:
adf_check(df["Seasonal Difference"].dropna())


 Augmented Dicky-Fuller Test
ADF Test Statistic: -2.33541931436
p-value: 0.160798805277
# of lags: 12
Num of Observations used: 143
Weak eveidence against null hypothesis
Fail to reject null hypothesis
Data has a unit root and is non-stationary
Seasonal First Difference

In [101]:
df["Seasonal First Differnce"] = df["First Difference"] - df["First Difference"].shift(12)

In [102]:
df["Seasonal First Differnce"].plot()


Out[102]:
<matplotlib.axes._subplots.AxesSubplot at 0x7f9fb3396dd8>

In [103]:
adf_check(df["Seasonal First Differnce"].dropna())


 Augmented Dicky-Fuller Test
ADF Test Statistic: -5.03800227492
p-value: 1.86542343188e-05
# of lags: 11
Num of Observations used: 143
Strong evidence against null hypothesis
Reject null hypothesis
Data has no unit root and is stationary

Autocorrelation Plots


In [104]:
from statsmodels.graphics.tsaplots import plot_acf, plot_pacf

In [105]:
fig_first = plot_acf(df["First Difference"].dropna())



In [106]:
fig_seasonal_first = plot_acf(df["Seasonal First Differnce"].dropna())


with pandas...


In [107]:
from pandas.plotting import autocorrelation_plot

In [108]:
autocorrelation_plot(df["Seasonal First Differnce"].dropna())


Out[108]:
<matplotlib.axes._subplots.AxesSubplot at 0x7f9fb34b7240>

pandas does not do partial autocorrelation..


In [109]:
result = plot_pacf(df["Seasonal First Differnce"].dropna())



In [114]:
plot_acf(df["Seasonal First Differnce"].dropna())
plot_pacf(df["Seasonal First Differnce"].dropna());


Building a seasonal ARIMA Model

In [116]:
from statsmodels.tsa.arima_model import ARIMA # standard ARIMA

In [118]:
model = sm.tsa.statespace.SARIMAX(df["Milk in Pounds per Cow"], order=(0,1,0), seasonal_order=(1, 1, 1, 12))

In [119]:
results = model.fit()

In [120]:
print(results.summary())


                                 Statespace Model Results                                 
==========================================================================================
Dep. Variable:             Milk in Pounds per Cow   No. Observations:                  168
Model:             SARIMAX(0, 1, 0)x(1, 1, 1, 12)   Log Likelihood                -534.065
Date:                            Sun, 29 Oct 2017   AIC                           1074.131
Time:                                    16:36:32   BIC                           1083.503
Sample:                                01-01-1962   HQIC                          1077.934
                                     - 12-01-1975                                         
Covariance Type:                              opg                                         
==============================================================================
                 coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------
ar.S.L12      -0.0449   1.49e+06  -3.01e-08      1.000   -2.93e+06    2.93e+06
ma.S.L12      -0.5860   1.57e+07  -3.72e-08      1.000   -3.08e+07    3.08e+07
sigma2        55.5118   7.77e+08   7.14e-08      1.000   -1.52e+09    1.52e+09
===================================================================================
Ljung-Box (Q):                       33.48   Jarque-Bera (JB):                32.04
Prob(Q):                              0.76   Prob(JB):                         0.00
Heteroskedasticity (H):               0.69   Skew:                             0.77
Prob(H) (two-sided):                  0.18   Kurtosis:                         4.60
===================================================================================

Warnings:
[1] Covariance matrix calculated using the outer product of gradients (complex-step).

In [122]:
results.resid.plot()


Out[122]:
<matplotlib.axes._subplots.AxesSubplot at 0x7f9fb30b4e10>

In [123]:
results.resid.plot(kind="kde")


Out[123]:
<matplotlib.axes._subplots.AxesSubplot at 0x7f9fb310d208>
Forcasting

In [126]:
df["forecast"] = results.predict(start=150, end=168)
df[["Milk in Pounds per Cow", "forecast"]].plot(figsize=(12, 8))


Out[126]:
<matplotlib.axes._subplots.AxesSubplot at 0x7f9fb24d0470>

Setting a later end date


In [128]:
df.tail()


Out[128]:
Milk in Pounds per Cow First Difference Milk Second Difference Seasonal Difference Seasonal First Differnce forecast
Month
1975-08-01 858.0 -38.0 3.0 -9.0 3.0 855.358617
1975-09-01 817.0 -41.0 -3.0 2.0 11.0 808.841269
1975-10-01 827.0 10.0 51.0 15.0 13.0 819.323183
1975-11-01 797.0 -30.0 -40.0 24.0 9.0 790.427500
1975-12-01 843.0 46.0 76.0 30.0 6.0 837.063646

In [129]:
from pandas.tseries.offsets import DateOffset

In [130]:
future_dates = [df.index[-1] + DateOffset(months=x) for x in range(0, 24)]

In [133]:
future_df = pd.DataFrame(index=future_dates, columns=df.columns)

In [134]:
final_df = pd.concat([df, future_df])

In [136]:
final_df.tail()


Out[136]:
Milk in Pounds per Cow First Difference Milk Second Difference Seasonal Difference Seasonal First Differnce forecast
1977-07-01 NaN NaN NaN NaN NaN NaN
1977-08-01 NaN NaN NaN NaN NaN NaN
1977-09-01 NaN NaN NaN NaN NaN NaN
1977-10-01 NaN NaN NaN NaN NaN NaN
1977-11-01 NaN NaN NaN NaN NaN NaN

In [137]:
final_df["forecast"] = results.predict(start=168, end = 192)

In [138]:
final_df.tail()


Out[138]:
Milk in Pounds per Cow First Difference Milk Second Difference Seasonal Difference Seasonal First Differnce forecast
1977-07-01 NaN NaN NaN NaN NaN 951.525751
1977-08-01 NaN NaN NaN NaN NaN 911.918840
1977-09-01 NaN NaN NaN NaN NaN 865.881041
1977-10-01 NaN NaN NaN NaN NaN 871.027162
1977-11-01 NaN NaN NaN NaN NaN 836.962873

In [140]:
final_df[["Milk in Pounds per Cow", "forecast"]].plot(figsize=(10, 8))


Out[140]:
<matplotlib.axes._subplots.AxesSubplot at 0x7f9fb05b9e48>
Why are ARIMA models bad for financial data?

In [ ]: