In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from pandas.plotting import autocorrelation_plot
from statsmodels.tsa.stattools import kpss
from statsmodels.tsa.stattools import adfuller
In [4]:
# This is the original time series
def parser(x):
return pd.datetime.strptime('190'+x, '%Y-%m')
series = pd.read_csv('shampoo_sales.csv', header=0, parse_dates=[0], index_col=0, squeeze=True, date_parser=parser)
autocorrelation_plot(series)
plt.show()
In [5]:
def test_stationarity(timeseries):
#Determing rolling statistics
rolmean = timeseries.rolling(window=12,center=False).mean()
rolstd = timeseries.rolling(window=12,center=False).std()
rolcov = timeseries.rolling(window=12,center=False).cov()
# Plot rolling statistics:
plt.figure(figsize=(9,7))
orig = plt.plot(timeseries, color='blue',label='Original')
mean = plt.plot(rolmean, color='green', label='Rolling Mean')
std = plt.plot(rolstd, color='red', label = 'Rolling Std')
cov = plt.plot(rolstd, color='purple', label = 'Rolling Cov')
plt.legend(loc='best')
plt.title('Rolling Mean & Standard Deviation')
plt.show(block=False)
# Perform Augmented Dickey-Fuller test:
print 'Results of Augmented Dickey-Fuller (ADF) Test:'
dftest = adfuller(timeseries, autolag='AIC')
dfoutput = pd.Series(dftest[0:4], index=['Test Statistic','p-value','#Lags Used','Number of Observations Used'])
for key,value in dftest[4].items():
dfoutput['Critical Value (%s)'%key] = value
print dfoutput
print
# Perform KPSS
print 'Results of KPSS Test:'
kpsstest = kpss(timeseries, regression='c')
kpss_output = pd.Series(kpsstest[0:3], index=['Test Statistic','p-value','Lags Used'])
for key,value in kpsstest[3].items():
kpss_output['Critical Value (%s)'%key] = value
print kpss_output
In [6]:
ts_log = np.log(series)
plt.figure(figsize=(9,7))
plt.plot(ts_log)
plt.show()
In [7]:
test_stationarity(ts_log)
❣️Notes:
In [8]:
# Change 1 - Differencing
## I still want to try differencing.
ts_log_diff = ts_log - ts_log.shift(3) # I tried 1, 7 steps too
plt.figure(figsize=(9,7))
plt.plot(ts_log_diff)
plt.show()
In [9]:
ts_log_diff.dropna(inplace=True)
test_stationarity(ts_log_diff)
❣️Notes:
In [11]:
# Change 2 - Remove trend with moving average
## As we found above, log series seems to be a trend stationary
moving_avg = ts_log.rolling(window=12,center=False).mean() # taking average of LAST 2 years (36-12) values
plt.figure(figsize=(9,7))
plt.plot(ts_log)
plt.plot(moving_avg, color='orange')
plt.show()
In [14]:
ts_log_moving_avg_diff = ts_log - moving_avg
ts_log_moving_avg_diff.head(12)
Out[14]:
In [15]:
ts_log_moving_avg_diff.dropna(inplace=True)
test_stationarity(ts_log_moving_avg_diff)
❣️Notes:
In [41]:
# Change 3 - Remove trend with weighted moving average
expwighted_avg = ts_log.ewm(alpha=0.9,ignore_na=False,min_periods=0,adjust=True).mean()
plt.figure(figsize=(9,7))
plt.plot(ts_log)
plt.plot(expwighted_avg, color='red')
plt.show()
In [42]:
ts_log_ewma_diff = ts_log - expwighted_avg
test_stationarity(ts_log_ewma_diff)
❣️Notes:
alpha it will determine other params such as halflife, span and com. These params decides how fast the decay in series will happen. When alpha is larger, the decay is slower, and the results will be closer to strict stationry but ADF is always showing non-stationary. It's very easy to get KPSS showing (trend) stationary, which means if we further remove the trend, it will become strict stationary.