Introductory Time Series with R $\rightarrow$ converted to Python

Starting with Chapter 1: Time Series Data

$\S$ 1.4 Plots, trends and seasonal variation

Load the dataset of historical air passengers



In [6]:

    
import pandas as pd



In [7]:

    
ap = pd.read_csv('../datasets/AirPassengers.csv')



In [8]:

    
ap.head()









    Out[8]:







  
    
      
      Month
      #Passengers
    
  
  
    
      0
      1949-01
      112
    
    
      1
      1949-02
      118
    
    
      2
      1949-03
      132
    
    
      3
      1949-04
      129
    
    
      4
      1949-05
      121



In [9]:

    
%matplotlib inline
import matplotlib.pylab as plt



In [10]:

    
ap.plot()









    Out[10]:





<matplotlib.axes._subplots.AxesSubplot at 0x115abad68>



In [16]:

    
ap.iloc[-1].Month[:4]









    Out[16]:





'1960'



In [17]:

    
ts_start = ap.iloc[0].Month[:4]
ts_end = ap.iloc[-1].Month[:4]
ts_freq = 'M'
ts_length = len(ap.index)
my_ts = pd.date_range(str(int(ts_start)),periods=ts_length,freq=ts_freq)
my_ts









    Out[17]:





DatetimeIndex(['1949-01-31', '1949-02-28', '1949-03-31', '1949-04-30',
               '1949-05-31', '1949-06-30', '1949-07-31', '1949-08-31',
               '1949-09-30', '1949-10-31',
               ...
               '1960-03-31', '1960-04-30', '1960-05-31', '1960-06-30',
               '1960-07-31', '1960-08-31', '1960-09-30', '1960-10-31',
               '1960-11-30', '1960-12-31'],
              dtype='datetime64[ns]', length=144, freq='M')



In [18]:

    
ap.index = my_ts
ap.plot()









    Out[18]:





<matplotlib.axes._subplots.AxesSubplot at 0x118ed9ef0>



In [20]:

    
# resample annually
ap.resample('A').apply(sum).plot()









    Out[20]:





<matplotlib.axes._subplots.AxesSubplot at 0x119288898>



In [23]:

    
#Determing rolling statistics
rolmean = ap["#Passengers"].rolling(window=12).mean()
rolstd = ap["#Passengers"].rolling(window=12).std()
print(len(rolmean), len(rolstd))



In [24]:

    
#Plot rolling statistics:
orig = plt.plot(ap["#Passengers"], color='blue',label='Original')
mean = plt.plot(rolmean, color='red', label='Rolling Mean')
std = plt.plot(rolstd, color='black', label = 'Rolling Std')
plt.legend(loc='best')
plt.title('Rolling Mean & Standard Deviation')
plt.show(block=False)



In [25]:

    
#Perform Dickey-Fuller test:
from statsmodels.tsa.stattools import adfuller

print ('Results of Dickey-Fuller Test:')
dftest = adfuller(ap['#Passengers'], autolag='AIC')
dfoutput = pd.Series(dftest[0:4], index=['Test Statistic','p-value','#Lags Used','Number of Observations Used'])
for key,value in dftest[4].items():
    dfoutput['Critical Value (%s)'%key] = value
print(dfoutput)









    



Results of Dickey-Fuller Test:
Test Statistic                   0.815369
p-value                          0.991880
#Lags Used                      13.000000
Number of Observations Used    130.000000
Critical Value (5%)             -2.884042
Critical Value (10%)            -2.578770
Critical Value (1%)             -3.481682
dtype: float64






    



/Users/aj/anaconda/lib/python3.5/site-packages/statsmodels/compat/pandas.py:56: FutureWarning: The pandas.core.datetools module is deprecated and will be removed in a future version. Please use the pandas.tseries module instead.
  from pandas.core import datetools



In [27]:

    
#Estimating trend
import numpy as np
ap_logScale = np.log(ap['#Passengers'])
plt.plot(ap_logScale)









    Out[27]:





[<matplotlib.lines.Line2D at 0x11cbaffd0>]



In [28]:

    
movingAverage = ap_logScale.rolling(window=12).mean()
movingSTD = ap_logScale.rolling(window=12).std()
plt.plot(ap_logScale)
plt.plot(movingAverage, color='red')









    Out[28]:





[<matplotlib.lines.Line2D at 0x11e050ba8>]



In [29]:

    
# Get the difference between the moving average and the actual number of passengers
datasetLogScaleMinusMovingAverage = ap_logScale - movingAverage
datasetLogScaleMinusMovingAverage.head(12)
#Remove Nan Values
datasetLogScaleMinusMovingAverage.dropna(inplace=True)
datasetLogScaleMinusMovingAverage.head(10)









    Out[29]:





1949-12-31   -0.065494
1950-01-31   -0.093449
1950-02-28   -0.007566
1950-03-31    0.099416
1950-04-30    0.052142
1950-05-31   -0.027529
1950-06-30    0.139881
1950-07-31    0.260184
1950-08-31    0.248635
1950-09-30    0.162937
Freq: M, Name: #Passengers, dtype: float64



In [33]:

    
from statsmodels.tsa.stattools import adfuller
def test_stationarity(timeseries):
    
    #Determing rolling statistics
    movingAverage = timeseries.rolling(window=12).mean()
    movingSTD = timeseries.rolling(window=12).std()

    #Plot rolling statistics:
    orig = plt.plot(timeseries, color='blue',label='Original')
    mean = plt.plot(movingAverage, color='red', label='Rolling Mean')
    std = plt.plot(movingSTD, color='black', label = 'Rolling Std')
    plt.legend(loc='best')
    plt.title('Rolling Mean & Standard Deviation')
    plt.show(block=False)
    
    #Perform Dickey-Fuller test:
    print('Results of Dickey-Fuller Test:')
    dftest = adfuller(timeseries, autolag='AIC')
    dfoutput = pd.Series(dftest[0:4], index=['Test Statistic','p-value','#Lags Used','Number of Observations Used'])
    for key,value in dftest[4].items():
        dfoutput['Critical Value (%s)'%key] = value
    print(dfoutput)



In [34]:

    
test_stationarity(datasetLogScaleMinusMovingAverage)









    












    



Results of Dickey-Fuller Test:
Test Statistic                  -3.162908
p-value                          0.022235
#Lags Used                      13.000000
Number of Observations Used    119.000000
Critical Value (5%)             -2.886151
Critical Value (10%)            -2.579896
Critical Value (1%)             -3.486535
dtype: float64



In [35]:

    
exponentialDecayWeightedAverage = ap_logScale.ewm(halflife=12, min_periods=0, adjust=True).mean()
plt.plot(ap_logScale)
plt.plot(exponentialDecayWeightedAverage, color='red')









    Out[35]:





[<matplotlib.lines.Line2D at 0x11e13ecc0>]



In [36]:

    
datasetLogScaleMinusMovingExponentialDecayAverage = ap_logScale - exponentialDecayWeightedAverage
test_stationarity(datasetLogScaleMinusMovingExponentialDecayAverage)









    












    



Results of Dickey-Fuller Test:
Test Statistic                  -3.601262
p-value                          0.005737
#Lags Used                      13.000000
Number of Observations Used    130.000000
Critical Value (5%)             -2.884042
Critical Value (10%)            -2.578770
Critical Value (1%)             -3.481682
dtype: float64



In [37]:

    
datasetLogDiffShifting = ap_logScale - ap_logScale.shift() # just shift by 1 period
plt.plot(datasetLogDiffShifting)









    Out[37]:





[<matplotlib.lines.Line2D at 0x11936d1d0>]



In [40]:

    
datasetLogDiffShifting.dropna(inplace=True)
test_stationarity(datasetLogDiffShifting)









    












    



Results of Dickey-Fuller Test:
Test Statistic                  -2.717131
p-value                          0.071121
#Lags Used                      14.000000
Number of Observations Used    128.000000
Critical Value (5%)             -2.884398
Critical Value (10%)            -2.578960
Critical Value (1%)             -3.482501
dtype: float64



In [43]:

    
from statsmodels.tsa.seasonal import seasonal_decompose
decomposition = seasonal_decompose(ap_logScale)

trend = decomposition.trend
seasonal = decomposition.seasonal
residual = decomposition.resid

plt.subplot(411)
plt.plot(ap_logScale, label='Original')
plt.legend(loc='best')
plt.subplot(412)
plt.plot(trend, label='Trend')
plt.legend(loc='best')
plt.subplot(413)
plt.plot(seasonal,label='Seasonality')
plt.legend(loc='best')
plt.subplot(414)
plt.plot(residual, label='Residuals')
plt.legend(loc='best')
plt.tight_layout()



In [44]:

    
decomposedLogData = residual
decomposedLogData.dropna(inplace=True)
test_stationarity(decomposedLogData)









    












    



Results of Dickey-Fuller Test:
Test Statistic                -6.332387e+00
p-value                        2.885059e-08
#Lags Used                     9.000000e+00
Number of Observations Used    1.220000e+02
Critical Value (5%)           -2.885538e+00
Critical Value (10%)          -2.579569e+00
Critical Value (1%)           -3.485122e+00
dtype: float64



In [45]:

    
#ACF and PACF plots:
from statsmodels.tsa.stattools import acf, pacf

lag_acf = acf(datasetLogDiffShifting, nlags=20)
lag_pacf = pacf(datasetLogDiffShifting, nlags=20, method='ols')


#Plot ACF: 
plt.subplot(121) 
plt.plot(lag_acf)
plt.axhline(y=0,linestyle='--',color='gray')
plt.axhline(y=-1.96/np.sqrt(len(datasetLogDiffShifting)),linestyle='--',color='gray')
plt.axhline(y=1.96/np.sqrt(len(datasetLogDiffShifting)),linestyle='--',color='gray')
plt.title('Autocorrelation Function')

#Plot PACF:
plt.subplot(122)
plt.plot(lag_pacf)
plt.axhline(y=0,linestyle='--',color='gray')
plt.axhline(y=-1.96/np.sqrt(len(datasetLogDiffShifting)),linestyle='--',color='gray')
plt.axhline(y=1.96/np.sqrt(len(datasetLogDiffShifting)),linestyle='--',color='gray')
plt.title('Partial Autocorrelation Function')
plt.tight_layout()



In [46]:

    
from statsmodels.tsa.arima_model import ARIMA

#AR MODEL
model = ARIMA(ap_logScale, order=(2, 1, 0))  
results_AR = model.fit(disp=-1)  
plt.plot(datasetLogDiffShifting)
plt.plot(results_AR.fittedvalues, color='red')
plt.title('RSS: %.4f'% sum((results_AR.fittedvalues-datasetLogDiffShifting)**2))
print('Plotting AR model')









    



Plotting AR model



In [47]:

    
#MA MODEL
model = ARIMA(ap_logScale, order=(0, 1, 2))  
results_MA = model.fit(disp=-1)  
plt.plot(datasetLogDiffShifting)
plt.plot(results_MA.fittedvalues, color='red')
plt.title('RSS: %.4f'% sum((results_MA.fittedvalues-datasetLogDiffShifting)**2))
print('Plotting AR model')









    



Plotting AR model



In [48]:

    
model = ARIMA(ap_logScale, order=(2, 1, 2))  
results_ARIMA = model.fit(disp=-1)  
plt.plot(datasetLogDiffShifting)
plt.plot(results_ARIMA.fittedvalues, color='red')
plt.title('RSS: %.4f'% sum((results_ARIMA.fittedvalues-datasetLogDiffShifting)**2))









    Out[48]:





<matplotlib.text.Text at 0x11e45ffd0>



In [49]:

    
predictions_ARIMA_diff = pd.Series(results_ARIMA.fittedvalues, copy=True)
print (predictions_ARIMA_diff.head())
print(len(predictions_ARIMA_diff))









    



1949-02-28    0.009580
1949-03-31    0.017491
1949-04-30    0.027670
1949-05-31   -0.004521
1949-06-30   -0.023889
Freq: M, dtype: float64
143



In [50]:

    
#Convert to cumulative sum
predictions_ARIMA_diff_cumsum = predictions_ARIMA_diff.cumsum()
print (predictions_ARIMA_diff_cumsum.head())









    



1949-02-28    0.009580
1949-03-31    0.027071
1949-04-30    0.054742
1949-05-31    0.050221
1949-06-30    0.026331
Freq: M, dtype: float64



In [52]:

    
#predictions_ARIMA_log = pd.Series(ts_log.ix[0], index=ts_log.index)
#predictions_ARIMA_log = predictions_ARIMA_log.add(predictions_ARIMA_diff_cumsum,fill_value=0)
#predictions_ARIMA_log.head()

predictions_ARIMA_log = pd.Series(ap_logScale.iloc[0], index=ap_logScale.index)
predictions_ARIMA_log = predictions_ARIMA_log.add(predictions_ARIMA_diff_cumsum,fill_value=0)
predictions_ARIMA_log.head()









    Out[52]:





1949-01-31    4.718499
1949-02-28    4.728079
1949-03-31    4.745570
1949-04-30    4.773241
1949-05-31    4.768720
Freq: M, dtype: float64



In [54]:

    
predictions_ARIMA = np.exp(predictions_ARIMA_log)
plt.plot(ap["#Passengers"])
plt.plot(predictions_ARIMA)
plt.title('RMSE: %.4f'% np.sqrt(sum((predictions_ARIMA-ap["#Passengers"])**2)/len(ap["#Passengers"])))









    Out[54]:





<matplotlib.text.Text at 0x11e2fffd0>



In [ ]: