In [125]:
import numpy as np
import pandas as pd
import statsmodels.formula.api as smf
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
(a) Plot the data
(b) Find seasonality estimates for $s_t$ in the decomposition $Y_t=m_t+s_t+X_t$ using method 1 or 2 from Section 1.4
(c) Plot the deseasonalized data $Y_t-\hat{s}_t$
(d) Fit a parabola by least squares to the deseasonalized data and use it as estimate of the trend $m_t$
(e) Plot the residuals $\hat{X}_t=Y_t-\hat{m}_t-\hat{s}_t$.
In [126]:
accidents = pd.read_csv('data/accidents.dat', names=['y'])
# So how to estimate the seasonality given the data?
# Expecting months / years, given discussion from class
d = 12
accidents.index = pd.period_range(start='01-01-1973', periods=len(accidents), freq='M')
#accidents.index = accidents.index.to_datetime()
In [127]:
accidents.plot()
Out[127]:
In [128]:
# Trend looks pretty constant so we'll use the small trend method
# Take the average over each year
accidents['m'] = accidents['y'].groupby(accidents.index.year).transform(np.mean)
# Seasonality
s = accidents['y'] - accidents['m']
accidents['s'] = s.groupby(s.index.month).transform(np.mean)
# Additive model
accidents['fitted'] = accidents['m'] + accidents['s']
accidents['residuals'] = accidents['y'] - accidents['fitted']
In [129]:
# Verify identifiability constraint that seasonality sums to 0
accidents['s'].groupby(accidents.index.year).sum()
Out[129]:
In [130]:
accidents[['y', 'm', 'fitted']].plot()
Out[130]:
In [131]:
g = accidents.groupby(accidents.index.year)
In [132]:
# We can verify some other model assumptions here:
m = g.mean()
m
Out[132]:
In [133]:
accidents['residuals'].plot()
Out[133]:
In [134]:
accidents['residuals'].plot(kind='hist')
Out[134]:
In [135]:
# Deseasonalized
accidents['yds'] = accidents['y'] - accidents['s']
# Make some integers to regress on
accidents['x'] = np.arange(len(accidents))
accidents['x2'] = accidents['x'] ** 2
# Fit a quadratic curve to deseasonalized data
fit = smf.ols('yds ~ x + x2', data=accidents).fit()
accidents['mquad'] = fit.predict()
In [136]:
accidents[['yds', 'mquad']].plot()
Out[136]:
In [137]:
# Residuals from quadratic fit
accidents['res2'] = accidents['y'] - accidents['mquad'] - accidents['s']
In [138]:
accidents[['residuals', 'res2']].plot()
Out[138]:
So they look a lot like the residuals from before.
In [139]:
accidents['res2'].plot(kind='kde')
Out[139]:
In [140]:
# Moving average
w = 13
weight = np.ones(w)
weight[[0, -1]] = 0.5
print(weight)
accidents['movavg'] = pd.rolling_apply(accidents['y'], window=w, func=np.average,
center=True, kwargs={'weights': weight})
accidents['movavg2'] = pd.rolling_mean(accidents['y'], window=w - 1, center=True)
In [141]:
accidents[['movavg', 'movavg2']].plot()
Out[141]:
We observe that using the more complicated weighted ends makes it slightly smoother. It also avoids the trouble of having an unbalanced number on each side.
In [142]:
accidents['y'][:12].mean()
Out[142]:
In [143]:
# Differencing at lag d
accidents['ylag'] = accidents['y'].groupby(accidents.index.month).transform(np.diff)
accidents['ylag'].plot()
Out[143]:
In [144]:
# So how were the last computed? Judging by the graph they were recycled.
accidents['ylag'][:12] == accidents['ylag'][-12:]
Out[144]:
In [144]: