In [6]:
import pandas as pd
In [7]:
ap = pd.read_csv('../datasets/AirPassengers.csv')
In [8]:
ap.head()
Out[8]:
In [9]:
%matplotlib inline
import matplotlib.pylab as plt
In [10]:
ap.plot()
Out[10]:
In [16]:
ap.iloc[-1].Month[:4]
Out[16]:
In [17]:
ts_start = ap.iloc[0].Month[:4]
ts_end = ap.iloc[-1].Month[:4]
ts_freq = 'M'
ts_length = len(ap.index)
my_ts = pd.date_range(str(int(ts_start)),periods=ts_length,freq=ts_freq)
my_ts
Out[17]:
In [18]:
ap.index = my_ts
ap.plot()
Out[18]:
In [20]:
# resample annually
ap.resample('A').apply(sum).plot()
Out[20]:
In [23]:
#Determing rolling statistics
rolmean = ap["#Passengers"].rolling(window=12).mean()
rolstd = ap["#Passengers"].rolling(window=12).std()
print(len(rolmean), len(rolstd))
In [24]:
#Plot rolling statistics:
orig = plt.plot(ap["#Passengers"], color='blue',label='Original')
mean = plt.plot(rolmean, color='red', label='Rolling Mean')
std = plt.plot(rolstd, color='black', label = 'Rolling Std')
plt.legend(loc='best')
plt.title('Rolling Mean & Standard Deviation')
plt.show(block=False)
In [25]:
#Perform Dickey-Fuller test:
from statsmodels.tsa.stattools import adfuller
print ('Results of Dickey-Fuller Test:')
dftest = adfuller(ap['#Passengers'], autolag='AIC')
dfoutput = pd.Series(dftest[0:4], index=['Test Statistic','p-value','#Lags Used','Number of Observations Used'])
for key,value in dftest[4].items():
dfoutput['Critical Value (%s)'%key] = value
print(dfoutput)
In [27]:
#Estimating trend
import numpy as np
ap_logScale = np.log(ap['#Passengers'])
plt.plot(ap_logScale)
Out[27]:
In [28]:
movingAverage = ap_logScale.rolling(window=12).mean()
movingSTD = ap_logScale.rolling(window=12).std()
plt.plot(ap_logScale)
plt.plot(movingAverage, color='red')
Out[28]:
In [29]:
# Get the difference between the moving average and the actual number of passengers
datasetLogScaleMinusMovingAverage = ap_logScale - movingAverage
datasetLogScaleMinusMovingAverage.head(12)
#Remove Nan Values
datasetLogScaleMinusMovingAverage.dropna(inplace=True)
datasetLogScaleMinusMovingAverage.head(10)
Out[29]:
In [33]:
from statsmodels.tsa.stattools import adfuller
def test_stationarity(timeseries):
#Determing rolling statistics
movingAverage = timeseries.rolling(window=12).mean()
movingSTD = timeseries.rolling(window=12).std()
#Plot rolling statistics:
orig = plt.plot(timeseries, color='blue',label='Original')
mean = plt.plot(movingAverage, color='red', label='Rolling Mean')
std = plt.plot(movingSTD, color='black', label = 'Rolling Std')
plt.legend(loc='best')
plt.title('Rolling Mean & Standard Deviation')
plt.show(block=False)
#Perform Dickey-Fuller test:
print('Results of Dickey-Fuller Test:')
dftest = adfuller(timeseries, autolag='AIC')
dfoutput = pd.Series(dftest[0:4], index=['Test Statistic','p-value','#Lags Used','Number of Observations Used'])
for key,value in dftest[4].items():
dfoutput['Critical Value (%s)'%key] = value
print(dfoutput)
In [34]:
test_stationarity(datasetLogScaleMinusMovingAverage)
In [35]:
exponentialDecayWeightedAverage = ap_logScale.ewm(halflife=12, min_periods=0, adjust=True).mean()
plt.plot(ap_logScale)
plt.plot(exponentialDecayWeightedAverage, color='red')
Out[35]:
In [36]:
datasetLogScaleMinusMovingExponentialDecayAverage = ap_logScale - exponentialDecayWeightedAverage
test_stationarity(datasetLogScaleMinusMovingExponentialDecayAverage)
In [37]:
datasetLogDiffShifting = ap_logScale - ap_logScale.shift() # just shift by 1 period
plt.plot(datasetLogDiffShifting)
Out[37]:
In [40]:
datasetLogDiffShifting.dropna(inplace=True)
test_stationarity(datasetLogDiffShifting)
In [43]:
from statsmodels.tsa.seasonal import seasonal_decompose
decomposition = seasonal_decompose(ap_logScale)
trend = decomposition.trend
seasonal = decomposition.seasonal
residual = decomposition.resid
plt.subplot(411)
plt.plot(ap_logScale, label='Original')
plt.legend(loc='best')
plt.subplot(412)
plt.plot(trend, label='Trend')
plt.legend(loc='best')
plt.subplot(413)
plt.plot(seasonal,label='Seasonality')
plt.legend(loc='best')
plt.subplot(414)
plt.plot(residual, label='Residuals')
plt.legend(loc='best')
plt.tight_layout()
In [44]:
decomposedLogData = residual
decomposedLogData.dropna(inplace=True)
test_stationarity(decomposedLogData)
In [45]:
#ACF and PACF plots:
from statsmodels.tsa.stattools import acf, pacf
lag_acf = acf(datasetLogDiffShifting, nlags=20)
lag_pacf = pacf(datasetLogDiffShifting, nlags=20, method='ols')
#Plot ACF:
plt.subplot(121)
plt.plot(lag_acf)
plt.axhline(y=0,linestyle='--',color='gray')
plt.axhline(y=-1.96/np.sqrt(len(datasetLogDiffShifting)),linestyle='--',color='gray')
plt.axhline(y=1.96/np.sqrt(len(datasetLogDiffShifting)),linestyle='--',color='gray')
plt.title('Autocorrelation Function')
#Plot PACF:
plt.subplot(122)
plt.plot(lag_pacf)
plt.axhline(y=0,linestyle='--',color='gray')
plt.axhline(y=-1.96/np.sqrt(len(datasetLogDiffShifting)),linestyle='--',color='gray')
plt.axhline(y=1.96/np.sqrt(len(datasetLogDiffShifting)),linestyle='--',color='gray')
plt.title('Partial Autocorrelation Function')
plt.tight_layout()
In [46]:
from statsmodels.tsa.arima_model import ARIMA
#AR MODEL
model = ARIMA(ap_logScale, order=(2, 1, 0))
results_AR = model.fit(disp=-1)
plt.plot(datasetLogDiffShifting)
plt.plot(results_AR.fittedvalues, color='red')
plt.title('RSS: %.4f'% sum((results_AR.fittedvalues-datasetLogDiffShifting)**2))
print('Plotting AR model')
In [47]:
#MA MODEL
model = ARIMA(ap_logScale, order=(0, 1, 2))
results_MA = model.fit(disp=-1)
plt.plot(datasetLogDiffShifting)
plt.plot(results_MA.fittedvalues, color='red')
plt.title('RSS: %.4f'% sum((results_MA.fittedvalues-datasetLogDiffShifting)**2))
print('Plotting AR model')
In [48]:
model = ARIMA(ap_logScale, order=(2, 1, 2))
results_ARIMA = model.fit(disp=-1)
plt.plot(datasetLogDiffShifting)
plt.plot(results_ARIMA.fittedvalues, color='red')
plt.title('RSS: %.4f'% sum((results_ARIMA.fittedvalues-datasetLogDiffShifting)**2))
Out[48]:
In [49]:
predictions_ARIMA_diff = pd.Series(results_ARIMA.fittedvalues, copy=True)
print (predictions_ARIMA_diff.head())
print(len(predictions_ARIMA_diff))
In [50]:
#Convert to cumulative sum
predictions_ARIMA_diff_cumsum = predictions_ARIMA_diff.cumsum()
print (predictions_ARIMA_diff_cumsum.head())
In [52]:
#predictions_ARIMA_log = pd.Series(ts_log.ix[0], index=ts_log.index)
#predictions_ARIMA_log = predictions_ARIMA_log.add(predictions_ARIMA_diff_cumsum,fill_value=0)
#predictions_ARIMA_log.head()
predictions_ARIMA_log = pd.Series(ap_logScale.iloc[0], index=ap_logScale.index)
predictions_ARIMA_log = predictions_ARIMA_log.add(predictions_ARIMA_diff_cumsum,fill_value=0)
predictions_ARIMA_log.head()
Out[52]:
In [54]:
predictions_ARIMA = np.exp(predictions_ARIMA_log)
plt.plot(ap["#Passengers"])
plt.plot(predictions_ARIMA)
plt.title('RMSE: %.4f'% np.sqrt(sum((predictions_ARIMA-ap["#Passengers"])**2)/len(ap["#Passengers"])))
Out[54]:
In [ ]: