In [125]:
import numpy as np
import pandas as pd
import statsmodels.formula.api as smf
import matplotlib.pyplot as plt
import seaborn as sns

%matplotlib inline

(a) Plot the data

(b) Find seasonality estimates for $s_t$ in the decomposition $Y_t=m_t+s_t+X_t$ using method 1 or 2 from Section 1.4

(c) Plot the deseasonalized data $Y_t-\hat{s}_t$

(d) Fit a parabola by least squares to the deseasonalized data and use it as an estimate of the trend $m_t$

(e) Plot the residuals $\hat{X}_t=Y_t-\hat{m}_t-\hat{s}_t$.
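
As a reference for the cells below, here is a sketch of the small-trend estimates, writing the observations as $y_{j,k}$ for year $j=1,\dots,N$ and month $k=1,\dots,12$ (the notation is ours; the trend within each year is approximated by that year's mean):

$$\hat{m}_j = \frac{1}{12}\sum_{k=1}^{12} y_{j,k}, \qquad \hat{s}_k = \frac{1}{N}\sum_{j=1}^{N}\bigl(y_{j,k} - \hat{m}_j\bigr), \qquad \hat{X}_{j,k} = y_{j,k} - \hat{m}_j - \hat{s}_k.$$

By construction $\sum_{k=1}^{12}\hat{s}_k = 0$, which is the identifiability constraint checked further down.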


In [126]:
accidents = pd.read_csv('data/accidents.dat', names=['y'])

# How do we estimate the seasonality from the data?
# The series is monthly with a yearly cycle (as discussed in class), so the period is d = 12
d = 12

accidents.index = pd.period_range(start='01-01-1973', periods=len(accidents), freq='M')

In [127]:
accidents.plot()


Out[127]:
[figure: time series plot of the monthly accidents series]

In [128]:
# The trend looks roughly constant, so we use the small-trend method

# Trend estimate: the average of y over each year
accidents['m'] = accidents['y'].groupby(accidents.index.year).transform(np.mean)

# Seasonality estimate: average the detrended values over each calendar month
s = accidents['y'] - accidents['m']
accidents['s'] = s.groupby(s.index.month).transform(np.mean)

# Additive model
accidents['fitted'] = accidents['m'] + accidents['s']

accidents['residuals'] = accidents['y'] - accidents['fitted']
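
As a cross-check (not part of the assignment), statsmodels ships a classical decomposition routine; a minimal sketch, assuming a recent statsmodels where the keyword is period (older releases call it freq). Its seasonal estimate will not match ours exactly, since it removes the trend with a moving average rather than yearly means.

from statsmodels.tsa.seasonal import seasonal_decompose

# Classical additive decomposition with a 12-month period; the PeriodIndex is
# converted to timestamps, which seasonal_decompose handles more reliably
decomp = seasonal_decompose(accidents['y'].to_timestamp(), model='additive', period=d)
decomp.seasonal.head(12)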

In [129]:
# Verify the identifiability constraint: the seasonal component sums to 0 within each year
accidents['s'].groupby(accidents.index.year).sum()


Out[129]:
1973   -2.515321e-12
1974   -2.515321e-12
1975   -2.515321e-12
1976   -2.515321e-12
1977   -2.515321e-12
1978   -2.515321e-12
Name: s, dtype: float64
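
The sums are zero only up to floating-point roundoff; a quick sketch of an explicit tolerance check (nothing new, just np.allclose on the same grouped sums):

# Seasonal component should sum to zero within each year, up to rounding error
np.allclose(accidents['s'].groupby(accidents.index.year).sum(), 0)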

In [130]:
accidents[['y', 'm', 'fitted']].plot()


Out[130]:
[figure: the series y with the yearly-mean trend m and the fitted values m + s]

In [131]:
g = accidents.groupby(accidents.index.year)

In [132]:
# Check the yearly means of each component: s and the residuals should both average to (numerically) zero within each year

m = g.mean()
m


Out[132]:
y m s fitted residuals
1973 9651.750000 9651.750000 -2.096101e-13 9651.750000 2.273737e-13
1974 8718.500000 8718.500000 -2.096101e-13 8718.500000 3.031649e-13
1975 8585.833333 8585.833333 -2.096101e-13 8585.833333 -2.273737e-13
1976 8396.750000 8396.750000 -2.096101e-13 8396.750000 3.031649e-13
1977 8576.833333 8576.833333 -2.096101e-13 8576.833333 -2.273737e-13
1978 8796.750000 8796.750000 -2.096101e-13 8796.750000 3.031649e-13

In [133]:
accidents['residuals'].plot()


Out[133]:
[figure: time series plot of the residuals]

In [134]:
accidents['residuals'].plot(kind='hist')


Out[134]:
[figure: histogram of the residuals]

In [135]:
# Deseasonalized
accidents['yds'] = accidents['y'] - accidents['s']

# Create an integer time index and its square to regress on
accidents['x'] = np.arange(len(accidents))
accidents['x2'] = accidents['x'] ** 2

# Fit a quadratic (parabolic) trend to the deseasonalized data by least squares
fit = smf.ols('yds ~ x + x2', data=accidents).fit()
accidents['mquad'] = fit.predict()
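
To inspect the fitted coefficients (and, as an aside, patsy's I() lets the quadratic term be written directly in the formula instead of building an x2 column), a small sketch:

# Intercept and coefficients of the fitted quadratic trend
print(fit.params)

# Equivalent fit with the squared term written inside the formula
fit_alt = smf.ols('yds ~ x + I(x ** 2)', data=accidents).fit()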

In [136]:
accidents[['yds', 'mquad']].plot()


Out[136]:
[figure: deseasonalized series yds with the fitted quadratic trend mquad]

In [137]:
# Residuals from quadratic fit
accidents['res2'] = accidents['y'] - accidents['mquad'] - accidents['s']

In [138]:
accidents[['residuals', 'res2']].plot()


Out[138]:
[figure: residuals from the small-trend method and from the quadratic trend fit]

The residuals from the quadratic-trend fit look very similar to those from the small-trend method.
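
A quick way to back that up numerically is to compare the spread of the two residual series; a small sketch:

# Standard deviations of the residuals: small-trend method vs. quadratic trend
accidents[['residuals', 'res2']].std()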


In [139]:
accidents['res2'].plot(kind='kde')


Out[139]:
[figure: kernel density estimate of the quadratic-fit residuals]

In [140]:
# Two-sided moving-average trend estimate for period d = 12:
# a 13-point window with half weights on the two end points

w = 13
weight = np.ones(w)
weight[[0, -1]] = 0.5

print(weight)

# Weighted centered moving average (pd.rolling_apply / pd.rolling_mean were
# removed from pandas, so use the .rolling() interface instead)
accidents['movavg'] = (accidents['y']
                       .rolling(window=w, center=True)
                       .apply(lambda x: np.average(x, weights=weight), raw=True))

# Plain centered 12-term moving average for comparison
accidents['movavg2'] = accidents['y'].rolling(window=w - 1, center=True).mean()


[ 0.5  1.   1.   1.   1.   1.   1.   1.   1.   1.   1.   1.   0.5]

In [141]:
accidents[['movavg', 'movavg2']].plot()


Out[141]:
[figure: the two moving-average trend estimates movavg and movavg2]

Using half-weights on the two end points makes the estimate slightly smoother, and it keeps the filter symmetric: with an even period of 12, a plain 12-term average cannot be centered on an observation, whereas the 13-term window with half end-weights covers exactly one full cycle balanced on each side.
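
To confirm that this window really is the classical two-sided filter for an even period, here is a sketch that reproduces movavg by direct convolution (the kernel is symmetric, so np.convolve is just a sliding weighted average; the names kernel and manual are only for this check):

# The 13-term kernel with half end-weights sums to 12, i.e. each output value
# averages exactly one full seasonal cycle centered on an observation
kernel = weight / weight.sum()
manual = np.convolve(accidents['y'].to_numpy(), kernel, mode='valid')

# Should match the rolling weighted average, ignoring the NaN edges
np.allclose(manual, accidents['movavg'].dropna())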


In [142]:
# Sanity check: the mean of the first 12 observations matches the 1973 trend estimate
accidents['y'][:12].mean()


Out[142]:
9651.75

In [143]:
# Differencing at lag d = 12, done here by grouping on calendar month.
# Note: np.diff returns one fewer value per group, so transform only works
# because older pandas silently recycled values to fill the gap (see below).
accidents['ylag'] = accidents['y'].groupby(accidents.index.month).transform(np.diff)

accidents['ylag'].plot()


Out[143]:
[figure: the lag-12 differenced series ylag]

In [144]:
# So how were the last values computed? Judging by the graph, they were recycled from the first year.

accidents['ylag'][:12] == accidents['ylag'][-12:]


Out[144]:
1973-01    True
1973-02    True
1973-03    True
1973-04    True
1973-05    True
1973-06    True
1973-07    True
1973-08    True
1973-09    True
1973-10    True
1973-11    True
1973-12    True
Freq: M, Name: ylag, dtype: bool
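
For reference, the idiomatic way to difference at lag d is Series.diff, which leaves the first d values as NaN instead of recycling them; a minimal sketch (the column name ydiff is only for illustration):

# Lag-12 differencing: y_t minus y_{t-12}; the first 12 entries are NaN
accidents['ydiff'] = accidents['y'].diff(d)
accidents['ydiff'].plot()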
