In [125]:
import numpy as np
import pandas as pd
import statsmodels.formula.api as smf
import matplotlib.pyplot as plt
import seaborn as sns

%matplotlib inline

(a) Plot the data

(b) Find seasonality estimates for $s_t$ in the decomposition $Y_t=m_t+s_t+X_t$ using method 1 or 2 from Section 1.4

(c) Plot the deseasonalized data $Y_t-\hat{s}_t$

(d) Fit a parabola by least squares to the deseasonalized data and use it as an estimate of the trend $m_t$

(e) Plot the residuals $\hat{X}_t=Y_t-\hat{m}_t-\hat{s}_t$.
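
As a reference for the cells below, here is a sketch of the small-trend estimates, writing the observations as $y_{j,k}$ for year $j=1,\dots,N$ and month $k=1,\dots,12$ (the notation is ours; the trend within each year is approximated by that year's mean):

$$\hat{m}_j = \frac{1}{12}\sum_{k=1}^{12} y_{j,k}, \qquad \hat{s}_k = \frac{1}{N}\sum_{j=1}^{N}\bigl(y_{j,k} - \hat{m}_j\bigr), \qquad \hat{X}_{j,k} = y_{j,k} - \hat{m}_j - \hat{s}_k.$$

By construction $\sum_{k=1}^{12}\hat{s}_k = 0$, which is the identifiability constraint checked further down.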


In [126]:
accidents = pd.read_csv('data/accidents.dat', names=['y'])

# How do we estimate the seasonality from the data?
# The series is monthly with a yearly cycle (as discussed in class), so the period is d = 12
d = 12

accidents.index = pd.period_range(start='01-01-1973', periods=len(accidents), freq='M')

In [127]:
accidents.plot()


Out[127]:
[figure: time series plot of the monthly accidents series]

In [128]:
# The trend looks roughly constant, so we use the small-trend method

# Trend estimate: the average of y over each year
accidents['m'] = accidents['y'].groupby(accidents.index.year).transform(np.mean)

# Seasonality estimate: average the detrended values over each calendar month
s = accidents['y'] - accidents['m']
accidents['s'] = s.groupby(s.index.month).transform(np.mean)

# Additive model
accidents['fitted'] = accidents['m'] + accidents['s']

accidents['residuals'] = accidents['y'] - accidents['fitted']
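
As a cross-check (not part of the assignment), statsmodels ships a classical decomposition routine; a minimal sketch, assuming a recent statsmodels where the keyword is period (older releases call it freq). Its seasonal estimate will not match ours exactly, since it removes the trend with a moving average rather than yearly means.

from statsmodels.tsa.seasonal import seasonal_decompose

# Classical additive decomposition with a 12-month period; the PeriodIndex is
# converted to timestamps, which seasonal_decompose handles more reliably
decomp = seasonal_decompose(accidents['y'].to_timestamp(), model='additive', period=d)
decomp.seasonal.head(12)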

In [129]:
# Verify the identifiability constraint: the seasonal component sums to 0 within each year
accidents['s'].groupby(accidents.index.year).sum()


Out[129]:
1973   -2.515321e-12
1974   -2.515321e-12
1975   -2.515321e-12
1976   -2.515321e-12
1977   -2.515321e-12
1978   -2.515321e-12
Name: s, dtype: float64
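
The sums are zero only up to floating-point roundoff; a quick sketch of an explicit tolerance check (nothing new, just np.allclose on the same grouped sums):

# Seasonal component should sum to zero within each year, up to rounding error
np.allclose(accidents['s'].groupby(accidents.index.year).sum(), 0)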

In [130]:
accidents[['y', 'm', 'fitted']].plot()


Out[130]:
[figure: the series y with the yearly-mean trend m and the fitted values m + s]

In [131]:
g = accidents.groupby(accidents.index.year)

In [132]:
# Check the yearly means of each component: s and the residuals should both average to (numerically) zero within each year

m = g.mean()
m


Out[132]:
y m s fitted residuals
1973 9651.750000 9651.750000 -2.096101e-13 9651.750000 2.273737e-13
1974 8718.500000 8718.500000 -2.096101e-13 8718.500000 3.031649e-13
1975 8585.833333 8585.833333 -2.096101e-13 8585.833333 -2.273737e-13
1976 8396.750000 8396.750000 -2.096101e-13 8396.750000 3.031649e-13
1977 8576.833333 8576.833333 -2.096101e-13 8576.833333 -2.273737e-13
1978 8796.750000 8796.750000 -2.096101e-13 8796.750000 3.031649e-13

In [133]:
accidents['residuals'].plot()


Out[133]:
[figure: time series plot of the residuals]

In [134]:
accidents['residuals'].plot(kind='hist')


Out[134]:
[figure: histogram of the residuals]

In [135]:
# Deseasonalized
accidents['yds'] = accidents['y'] - accidents['s']

# Create an integer time index and its square to regress on
accidents['x'] = np.arange(len(accidents))
accidents['x2'] = accidents['x'] ** 2

# Fit a quadratic (parabolic) trend to the deseasonalized data by least squares
fit = smf.ols('yds ~ x + x2', data=accidents).fit()
accidents['mquad'] = fit.predict()
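
To inspect the fitted coefficients (and, as an aside, patsy's I() lets the quadratic term be written directly in the formula instead of building an x2 column), a small sketch:

# Intercept and coefficients of the fitted quadratic trend
print(fit.params)

# Equivalent fit with the squared term written inside the formula
fit_alt = smf.ols('yds ~ x + I(x ** 2)', data=accidents).fit()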

In [136]:
accidents[['yds', 'mquad']].plot()


Out[136]:
[figure: deseasonalized series yds with the fitted quadratic trend mquad]

In [137]:
# Residuals from quadratic fit
accidents['res2'] = accidents['y'] - accidents['mquad'] - accidents['s']

In [138]:
accidents[['residuals', 'res2']].plot()


Out[138]:
[figure: residuals from the small-trend method and from the quadratic trend fit]

The residuals from the quadratic-trend fit look very similar to those from the small-trend method.
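
A quick way to back that up numerically is to compare the spread of the two residual series; a small sketch:

# Standard deviations of the residuals: small-trend method vs. quadratic trend
accidents[['residuals', 'res2']].std()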


In [139]:
accidents['res2'].plot(kind='kde')


Out[139]:
[figure: kernel density estimate of the quadratic-fit residuals]

In [140]:
# Two-sided moving-average trend estimate for period d = 12:
# a 13-point window with half weights on the two end points

w = 13
weight = np.ones(w)
weight[[0, -1]] = 0.5

print(weight)

# Weighted centered moving average (pd.rolling_apply / pd.rolling_mean were
# removed from pandas, so use the .rolling() interface instead)
accidents['movavg'] = (accidents['y']
                       .rolling(window=w, center=True)
                       .apply(lambda x: np.average(x, weights=weight), raw=True))

# Plain centered 12-term moving average for comparison
accidents['movavg2'] = accidents['y'].rolling(window=w - 1, center=True).mean()


[ 0.5  1.   1.   1.   1.   1.   1.   1.   1.   1.   1.   1.   0.5]

In [141]:
accidents[['movavg', 'movavg2']].plot()


Out[141]:
[figure: the two moving-average trend estimates movavg and movavg2]

Using half-weights on the two end points makes the estimate slightly smoother, and it keeps the filter symmetric: with an even period of 12, a plain 12-term average cannot be centered on an observation, whereas the 13-term window with half end-weights covers exactly one full cycle balanced on each side.
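
To confirm that this window really is the classical two-sided filter for an even period, here is a sketch that reproduces movavg by direct convolution (the kernel is symmetric, so np.convolve is just a sliding weighted average; the names kernel and manual are only for this check):

# The 13-term kernel with half end-weights sums to 12, i.e. each output value
# averages exactly one full seasonal cycle centered on an observation
kernel = weight / weight.sum()
manual = np.convolve(accidents['y'].to_numpy(), kernel, mode='valid')

# Should match the rolling weighted average, ignoring the NaN edges
np.allclose(manual, accidents['movavg'].dropna())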


In [142]:
# Sanity check: the mean of the first 12 observations matches the 1973 trend estimate
accidents['y'][:12].mean()


Out[142]:
9651.75

In [143]:
# Differencing at lag d = 12, done here by grouping on calendar month.
# Note: np.diff returns one fewer value per group, so transform only works
# because older pandas silently recycled values to fill the gap (see below).
accidents['ylag'] = accidents['y'].groupby(accidents.index.month).transform(np.diff)

accidents['ylag'].plot()


Out[143]:
[figure: the lag-12 differenced series ylag]

In [144]:
# So how were the last values computed? Judging by the graph, they were recycled from the first year.

accidents['ylag'][:12] == accidents['ylag'][-12:]


Out[144]:
1973-01    True
1973-02    True
1973-03    True
1973-04    True
1973-05    True
1973-06    True
1973-07    True
1973-08    True
1973-09    True
1973-10    True
1973-11    True
1973-12    True
Freq: M, Name: ylag, dtype: bool
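
For reference, the idiomatic way to difference at lag d is Series.diff, which leaves the first d values as NaN instead of recycling them; a minimal sketch (the column name ydiff is only for illustration):

# Lag-12 differencing: y_t minus y_{t-12}; the first 12 entries are NaN
accidents['ydiff'] = accidents['y'].diff(d)
accidents['ydiff'].plot()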
