Introductory Time Series with R $\rightarrow$ converted to Python

Starting with Chapter 1: Time Series Data

Load the dataset of historical air passengers


In [6]:
import pandas as pd

In [7]:
ap = pd.read_csv('../datasets/AirPassengers.csv')

In [8]:
ap.head()


Out[8]:
Month #Passengers
0 1949-01 112
1 1949-02 118
2 1949-03 132
3 1949-04 129
4 1949-05 121

In [9]:
%matplotlib inline
import matplotlib.pylab as plt

In [10]:
ap.plot()


Out[10]:
<matplotlib.axes._subplots.AxesSubplot at 0x115abad68>

In [16]:
ap.iloc[-1].Month[:4]


Out[16]:
'1960'

In [17]:
ts_start = ap.iloc[0].Month[:4]
ts_end = ap.iloc[-1].Month[:4]
ts_freq = 'M'
ts_length = len(ap.index)
my_ts = pd.date_range(str(int(ts_start)),periods=ts_length,freq=ts_freq)
my_ts


Out[17]:
DatetimeIndex(['1949-01-31', '1949-02-28', '1949-03-31', '1949-04-30',
               '1949-05-31', '1949-06-30', '1949-07-31', '1949-08-31',
               '1949-09-30', '1949-10-31',
               ...
               '1960-03-31', '1960-04-30', '1960-05-31', '1960-06-30',
               '1960-07-31', '1960-08-31', '1960-09-30', '1960-10-31',
               '1960-11-30', '1960-12-31'],
              dtype='datetime64[ns]', length=144, freq='M')

In [18]:
ap.index = my_ts
ap.plot()


Out[18]:
<matplotlib.axes._subplots.AxesSubplot at 0x118ed9ef0>

In [20]:
# resample annually
ap.resample('A').apply(sum).plot()


Out[20]:
<matplotlib.axes._subplots.AxesSubplot at 0x119288898>

In [23]:
#Determing rolling statistics
rolmean = ap["#Passengers"].rolling(window=12).mean()
rolstd = ap["#Passengers"].rolling(window=12).std()
print(len(rolmean), len(rolstd))


144 144

In [24]:
#Plot rolling statistics:
orig = plt.plot(ap["#Passengers"], color='blue',label='Original')
mean = plt.plot(rolmean, color='red', label='Rolling Mean')
std = plt.plot(rolstd, color='black', label = 'Rolling Std')
plt.legend(loc='best')
plt.title('Rolling Mean & Standard Deviation')
plt.show(block=False)



In [25]:
#Perform Dickey-Fuller test:
from statsmodels.tsa.stattools import adfuller

print ('Results of Dickey-Fuller Test:')
dftest = adfuller(ap['#Passengers'], autolag='AIC')
dfoutput = pd.Series(dftest[0:4], index=['Test Statistic','p-value','#Lags Used','Number of Observations Used'])
for key,value in dftest[4].items():
    dfoutput['Critical Value (%s)'%key] = value
print(dfoutput)


Results of Dickey-Fuller Test:
Test Statistic                   0.815369
p-value                          0.991880
#Lags Used                      13.000000
Number of Observations Used    130.000000
Critical Value (5%)             -2.884042
Critical Value (10%)            -2.578770
Critical Value (1%)             -3.481682
dtype: float64
/Users/aj/anaconda/lib/python3.5/site-packages/statsmodels/compat/pandas.py:56: FutureWarning: The pandas.core.datetools module is deprecated and will be removed in a future version. Please use the pandas.tseries module instead.
  from pandas.core import datetools

In [27]:
#Estimating trend
import numpy as np
ap_logScale = np.log(ap['#Passengers'])
plt.plot(ap_logScale)


Out[27]:
[<matplotlib.lines.Line2D at 0x11cbaffd0>]

In [28]:
movingAverage = ap_logScale.rolling(window=12).mean()
movingSTD = ap_logScale.rolling(window=12).std()
plt.plot(ap_logScale)
plt.plot(movingAverage, color='red')


Out[28]:
[<matplotlib.lines.Line2D at 0x11e050ba8>]

In [29]:
# Get the difference between the moving average and the actual number of passengers
datasetLogScaleMinusMovingAverage = ap_logScale - movingAverage
datasetLogScaleMinusMovingAverage.head(12)
#Remove Nan Values
datasetLogScaleMinusMovingAverage.dropna(inplace=True)
datasetLogScaleMinusMovingAverage.head(10)


Out[29]:
1949-12-31   -0.065494
1950-01-31   -0.093449
1950-02-28   -0.007566
1950-03-31    0.099416
1950-04-30    0.052142
1950-05-31   -0.027529
1950-06-30    0.139881
1950-07-31    0.260184
1950-08-31    0.248635
1950-09-30    0.162937
Freq: M, Name: #Passengers, dtype: float64

In [33]:
from statsmodels.tsa.stattools import adfuller
def test_stationarity(timeseries):
    
    #Determing rolling statistics
    movingAverage = timeseries.rolling(window=12).mean()
    movingSTD = timeseries.rolling(window=12).std()

    #Plot rolling statistics:
    orig = plt.plot(timeseries, color='blue',label='Original')
    mean = plt.plot(movingAverage, color='red', label='Rolling Mean')
    std = plt.plot(movingSTD, color='black', label = 'Rolling Std')
    plt.legend(loc='best')
    plt.title('Rolling Mean & Standard Deviation')
    plt.show(block=False)
    
    #Perform Dickey-Fuller test:
    print('Results of Dickey-Fuller Test:')
    dftest = adfuller(timeseries, autolag='AIC')
    dfoutput = pd.Series(dftest[0:4], index=['Test Statistic','p-value','#Lags Used','Number of Observations Used'])
    for key,value in dftest[4].items():
        dfoutput['Critical Value (%s)'%key] = value
    print(dfoutput)

In [34]:
test_stationarity(datasetLogScaleMinusMovingAverage)


Results of Dickey-Fuller Test:
Test Statistic                  -3.162908
p-value                          0.022235
#Lags Used                      13.000000
Number of Observations Used    119.000000
Critical Value (5%)             -2.886151
Critical Value (10%)            -2.579896
Critical Value (1%)             -3.486535
dtype: float64

In [35]:
exponentialDecayWeightedAverage = ap_logScale.ewm(halflife=12, min_periods=0, adjust=True).mean()
plt.plot(ap_logScale)
plt.plot(exponentialDecayWeightedAverage, color='red')


Out[35]:
[<matplotlib.lines.Line2D at 0x11e13ecc0>]

In [36]:
datasetLogScaleMinusMovingExponentialDecayAverage = ap_logScale - exponentialDecayWeightedAverage
test_stationarity(datasetLogScaleMinusMovingExponentialDecayAverage)


Results of Dickey-Fuller Test:
Test Statistic                  -3.601262
p-value                          0.005737
#Lags Used                      13.000000
Number of Observations Used    130.000000
Critical Value (5%)             -2.884042
Critical Value (10%)            -2.578770
Critical Value (1%)             -3.481682
dtype: float64

In [37]:
datasetLogDiffShifting = ap_logScale - ap_logScale.shift() # just shift by 1 period
plt.plot(datasetLogDiffShifting)


Out[37]:
[<matplotlib.lines.Line2D at 0x11936d1d0>]

In [40]:
datasetLogDiffShifting.dropna(inplace=True)
test_stationarity(datasetLogDiffShifting)


Results of Dickey-Fuller Test:
Test Statistic                  -2.717131
p-value                          0.071121
#Lags Used                      14.000000
Number of Observations Used    128.000000
Critical Value (5%)             -2.884398
Critical Value (10%)            -2.578960
Critical Value (1%)             -3.482501
dtype: float64

In [43]:
from statsmodels.tsa.seasonal import seasonal_decompose
decomposition = seasonal_decompose(ap_logScale)

trend = decomposition.trend
seasonal = decomposition.seasonal
residual = decomposition.resid

plt.subplot(411)
plt.plot(ap_logScale, label='Original')
plt.legend(loc='best')
plt.subplot(412)
plt.plot(trend, label='Trend')
plt.legend(loc='best')
plt.subplot(413)
plt.plot(seasonal,label='Seasonality')
plt.legend(loc='best')
plt.subplot(414)
plt.plot(residual, label='Residuals')
plt.legend(loc='best')
plt.tight_layout()



In [44]:
decomposedLogData = residual
decomposedLogData.dropna(inplace=True)
test_stationarity(decomposedLogData)


Results of Dickey-Fuller Test:
Test Statistic                -6.332387e+00
p-value                        2.885059e-08
#Lags Used                     9.000000e+00
Number of Observations Used    1.220000e+02
Critical Value (5%)           -2.885538e+00
Critical Value (10%)          -2.579569e+00
Critical Value (1%)           -3.485122e+00
dtype: float64

In [45]:
#ACF and PACF plots:
from statsmodels.tsa.stattools import acf, pacf

lag_acf = acf(datasetLogDiffShifting, nlags=20)
lag_pacf = pacf(datasetLogDiffShifting, nlags=20, method='ols')


#Plot ACF: 
plt.subplot(121) 
plt.plot(lag_acf)
plt.axhline(y=0,linestyle='--',color='gray')
plt.axhline(y=-1.96/np.sqrt(len(datasetLogDiffShifting)),linestyle='--',color='gray')
plt.axhline(y=1.96/np.sqrt(len(datasetLogDiffShifting)),linestyle='--',color='gray')
plt.title('Autocorrelation Function')

#Plot PACF:
plt.subplot(122)
plt.plot(lag_pacf)
plt.axhline(y=0,linestyle='--',color='gray')
plt.axhline(y=-1.96/np.sqrt(len(datasetLogDiffShifting)),linestyle='--',color='gray')
plt.axhline(y=1.96/np.sqrt(len(datasetLogDiffShifting)),linestyle='--',color='gray')
plt.title('Partial Autocorrelation Function')
plt.tight_layout()



In [46]:
from statsmodels.tsa.arima_model import ARIMA

#AR MODEL
model = ARIMA(ap_logScale, order=(2, 1, 0))  
results_AR = model.fit(disp=-1)  
plt.plot(datasetLogDiffShifting)
plt.plot(results_AR.fittedvalues, color='red')
plt.title('RSS: %.4f'% sum((results_AR.fittedvalues-datasetLogDiffShifting)**2))
print('Plotting AR model')


Plotting AR model

In [47]:
#MA MODEL
model = ARIMA(ap_logScale, order=(0, 1, 2))  
results_MA = model.fit(disp=-1)  
plt.plot(datasetLogDiffShifting)
plt.plot(results_MA.fittedvalues, color='red')
plt.title('RSS: %.4f'% sum((results_MA.fittedvalues-datasetLogDiffShifting)**2))
print('Plotting AR model')


Plotting AR model

In [48]:
model = ARIMA(ap_logScale, order=(2, 1, 2))  
results_ARIMA = model.fit(disp=-1)  
plt.plot(datasetLogDiffShifting)
plt.plot(results_ARIMA.fittedvalues, color='red')
plt.title('RSS: %.4f'% sum((results_ARIMA.fittedvalues-datasetLogDiffShifting)**2))


Out[48]:
<matplotlib.text.Text at 0x11e45ffd0>

In [49]:
predictions_ARIMA_diff = pd.Series(results_ARIMA.fittedvalues, copy=True)
print (predictions_ARIMA_diff.head())
print(len(predictions_ARIMA_diff))


1949-02-28    0.009580
1949-03-31    0.017491
1949-04-30    0.027670
1949-05-31   -0.004521
1949-06-30   -0.023889
Freq: M, dtype: float64
143

In [50]:
#Convert to cumulative sum
predictions_ARIMA_diff_cumsum = predictions_ARIMA_diff.cumsum()
print (predictions_ARIMA_diff_cumsum.head())


1949-02-28    0.009580
1949-03-31    0.027071
1949-04-30    0.054742
1949-05-31    0.050221
1949-06-30    0.026331
Freq: M, dtype: float64

In [52]:
#predictions_ARIMA_log = pd.Series(ts_log.ix[0], index=ts_log.index)
#predictions_ARIMA_log = predictions_ARIMA_log.add(predictions_ARIMA_diff_cumsum,fill_value=0)
#predictions_ARIMA_log.head()

predictions_ARIMA_log = pd.Series(ap_logScale.iloc[0], index=ap_logScale.index)
predictions_ARIMA_log = predictions_ARIMA_log.add(predictions_ARIMA_diff_cumsum,fill_value=0)
predictions_ARIMA_log.head()


Out[52]:
1949-01-31    4.718499
1949-02-28    4.728079
1949-03-31    4.745570
1949-04-30    4.773241
1949-05-31    4.768720
Freq: M, dtype: float64

In [54]:
predictions_ARIMA = np.exp(predictions_ARIMA_log)
plt.plot(ap["#Passengers"])
plt.plot(predictions_ARIMA)
plt.title('RMSE: %.4f'% np.sqrt(sum((predictions_ARIMA-ap["#Passengers"])**2)/len(ap["#Passengers"])))


Out[54]:
<matplotlib.text.Text at 0x11e2fffd0>

In [ ]: