In [11]:
from pandas import read_csv
from datetime import datetime
from pandas import DataFrame
from statsmodels.tsa.arima_model import ARIMA
from matplotlib import pyplot
from pandas.plotting import autocorrelation_plot
# download the data here (no login needed):
## https://datamarket.com/data/set/22r0/sales-of-shampoo-over-a-three-year-period#!ds=22r0&display=line
In [3]:
def parser(x):
    return datetime.strptime('190'+x, '%Y-%m')
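In [ ]:
# Quick sanity check (added, not in the original notebook): the dataset stores dates like
# '1-01' (year 1, month 1), so prepending '190' turns them into parseable '1901-01' stamps.
print(parser('1-01'))   # -> 1901-01-01 00:00:00
print(parser('3-12'))   # -> 1903-12-01 00:00:00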
In [105]:
series = read_csv('shampoo_sales.csv', header=0, parse_dates=[0], index_col=0, squeeze=True, date_parser=parser)
autocorrelation_plot(series)
pyplot.show()
# The first 10 ~ 12 lags are positive
# The first 5 lags are above the dashed confidence line
# So an AR term is appropriate, and the AR order (p) can start at 5
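In [ ]:
# Optional cross-check (added, not part of the original walkthrough): statsmodels ships
# plot_acf/plot_pacf, which draw the same correlogram with shaded confidence bands,
# making it easier to count how many lags stay above the band.
from statsmodels.graphics.tsaplots import plot_acf, plot_pacf
plot_acf(series, lags=15)
plot_pacf(series, lags=15)
pyplot.show()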
In [40]:
# Check Stationary
import pandas as pd
import matplotlib.pyplot as plt
from statsmodels.tsa.stattools import adfuller
def test_stationarity(timeseries):
    # Determine rolling statistics
    rolmean = timeseries.rolling(window=12, center=False).mean()
    rolstd = timeseries.rolling(window=12, center=False).std()
    # Plot rolling statistics
    plt.figure(figsize=(9,7))
    orig = plt.plot(timeseries, color='blue', label='Original')
    mean = plt.plot(rolmean, color='green', label='Rolling Mean')
    std = plt.plot(rolstd, color='red', label='Rolling Std')
    plt.legend(loc='best')
    plt.title('Rolling Mean & Standard Deviation')
    plt.show(block=False)
    # Perform Dickey-Fuller test
    print('Results of Dickey-Fuller Test:')
    dftest = adfuller(timeseries, autolag='AIC')
    dfoutput = pd.Series(dftest[0:4], index=['Test Statistic','p-value','#Lags Used','Number of Observations Used'])
    for key, value in dftest[4].items():
        dfoutput['Critical Value (%s)' % key] = value
    print(dfoutput)
In [41]:
test_stationarity(series)
In [25]:
# From the above output, we can see:
## The mean changes over time, although the variance stays fairly constant -> non-stationary
## The test statistic is higher than the critical values -> non-stationary
# So next comes data preprocessing, to make the series stationary
import numpy as np
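In [ ]:
# Convenience check (added): the same conclusion can be read directly off the ADF p-value.
# Under the usual 5% threshold, p >= 0.05 means we cannot reject the unit-root null,
# i.e. the series is treated as non-stationary.
from statsmodels.tsa.stattools import adfuller
adf_stat, p_value = adfuller(series, autolag='AIC')[:2]
print('ADF statistic: %.3f, p-value: %.3f' % (adf_stat, p_value))
print('stationary at the 5% level' if p_value < 0.05 else 'non-stationary at the 5% level')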
In [29]:
# Make It Stationary - First Trick - Log Transform, to reduce the trend
## The original data is all positive, so taking log() is safe
ts_log = np.log(series)
plt.figure(figsize=(9,7))
plt.plot(ts_log)
plt.show()
In [35]:
# Make It Stationary Method 1 - Moving Average [Eliminate Trend Only]
moving_avg = ts_log.rolling(window=12,center=False).mean() # 12-month (1-year) rolling mean
plt.figure(figsize=(9,7))
plt.plot(ts_log)
plt.plot(moving_avg, color='red')
plt.show()
In [36]:
ts_log_moving_avg_diff = ts_log - moving_avg
ts_log_moving_avg_diff.head(12) # the rolling mean is NaN for the first 11 values, since the 12-month window is not yet full
Out[36]:
In [42]:
ts_log_moving_avg_diff.dropna(inplace=True)
test_stationarity(ts_log_moving_avg_diff)
In [45]:
# Make It Stationary Method 2 - Exponentially Weighted Moving Average [Eliminate Trend Only]
exp_weighted_avg = ts_log.ewm(halflife=12,ignore_na=False,min_periods=0,adjust=True).mean()
plt.figure(figsize=(9,7))
plt.plot(ts_log)
plt.plot(exp_weighted_avg, color='red')
plt.show()
In [46]:
ts_log_ewma_diff = ts_log - exp_weighted_avg
test_stationarity(ts_log_ewma_diff)
In [47]:
# Make It Stationary Method 3 [Eliminating Trend and Seasonality]
## Differencing – taking the difference with a particular time lag
ts_log_diff = ts_log - ts_log.shift()
plt.figure(figsize=(9,7))
plt.plot(ts_log_diff)
plt.show()
In [48]:
ts_log_diff.head()
Out[48]:
In [56]:
ts_log_diff.dropna(inplace=True)
test_stationarity(ts_log_diff)
# Based on the plot, it's better than the above 2 methods (for this case)
# Although the test statistic is still higher than all the critical values, it's closer
In [57]:
# Make It Stationary Method 4 [Eliminating Trend and Seasonality]
## Decomposing - trend and seasonality are modeled separately and the remaining part of the series is returned
from statsmodels.tsa.seasonal import seasonal_decompose
# the trend and seasonality are separated out from the data, so we can model the residuals
decomposition = seasonal_decompose(ts_log)
trend = decomposition.trend
seasonal = decomposition.seasonal
residual = decomposition.resid
plt.figure(figsize=(10,10))
plt.subplot(411)
plt.plot(ts_log, label='Original')
plt.legend(loc='best')
plt.subplot(412)
plt.plot(trend, label='Trend')
plt.legend(loc='best')
plt.subplot(413)
plt.plot(seasonal,label='Seasonality')
plt.legend(loc='best')
plt.subplot(414)
plt.plot(residual, label='Residuals')
plt.legend(loc='best')
plt.tight_layout()
plt.show()
In [58]:
ts_log_decompose = residual
ts_log_decompose.dropna(inplace=True)
test_stationarity(ts_log_decompose)
In [ ]:
# None of the above 4 methods made this series fully stationary
# In practice we would probably need more years of data
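In [ ]:
# One more thing worth trying (added, illustrative only): a second-order difference of the
# log series sometimes removes the remaining trend; with only 36 monthly points the ADF
# test has little power either way.
ts_log_diff2 = ts_log.diff().diff().dropna()
test_stationarity(ts_log_diff2)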
In [23]:
# try the ARIMA model on the original data
# order=(5,1,0): p=5 (AR order, from the ACF plot above), d=1 (first-order differencing to make the series stationary), q=0 (no MA term)
series = read_csv('shampoo_sales.csv', header=0, parse_dates=[0], index_col=0, squeeze=True, date_parser=parser)
# fit model
model = ARIMA(series, order=(5,1,0))
model_fit = model.fit(disp=0) # disp=0 turns off the convergence/debug output during fitting
print(model_fit.summary())
# plot residual errors
residuals = DataFrame(model_fit.resid)
residuals.plot()
pyplot.show()
residuals.plot(kind='kde')
pyplot.show()
print(residuals.describe())
In [13]:
# The first plot shows the residual errors over time,
## suggesting that there may still be some trend information not captured by the model.
# The second plot is a density plot of the residual error values,
## suggesting the errors are roughly Gaussian, but may not be centered on zero.
# The last output shows the summary statistics of the residual errors:
## a non-zero mean indicates that there is bias in the predictions
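In [ ]:
# Illustrative sketch (added): if the residual mean is clearly non-zero, a simple correction
# is to subtract it from future forecasts. This assumes the bias is roughly constant,
# which may not hold for such a short series.
bias = residuals.values.mean()
print('residual mean (bias estimate): %.3f' % bias)
# a corrected forecast would then be: raw_forecast - bias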
In [16]:
# forecast
series = read_csv('shampoo_sales.csv', header=0, parse_dates=[0], index_col=0, squeeze=True, date_parser=parser)
series.values
Out[16]:
In [21]:
from sklearn.metrics import mean_squared_error
X = series.values
size = int(len(X) * 0.7)
train, test = X[0:size], X[size:len(X)]
history = [x for x in train]
predictions = list()
for t in range(len(test)):
    model = ARIMA(history, order=(5,1,0))
    model_fit = model.fit(disp=0)
    output = model_fit.forecast()
    y_pred = output[0]
    predictions.append(y_pred)
    obs = test[t]
    history.append(obs) # the latest observation is appended to the history used for the next step
    print('predicted=%f, expected=%f' % (y_pred, obs)) # compare each prediction with its observation
error = mean_squared_error(test, predictions)
print('Test MSE: %.3f' % error)
# plot the forecast
pyplot.plot(test)
pyplot.plot(predictions, color='red')
pyplot.show()
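In [ ]:
# Optional metric (added): RMSE is in the same units as the sales figures, so it is often
# easier to interpret than MSE.
from math import sqrt
print('Test RMSE: %.3f' % sqrt(error))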
In [60]:
# try an ARIMA model on the differenced output, since differencing gave the best result above
from statsmodels.tsa.stattools import acf, pacf
lag_acf = acf(ts_log_diff, nlags=20)
lag_pacf = pacf(ts_log_diff, nlags=20, method='ols')
In [69]:
plt.figure(figsize=(20,10))
#Plot ACF:
## q – The lag value where the ACF chart crosses the upper confidence interval for the first time (in this case q=1)
plt.subplot(121)
plt.plot(lag_acf)
plt.axhline(y=0,linestyle='--',color='gray')
plt.axhline(y=-1.96/np.sqrt(len(ts_log_diff)),linestyle='--',color='gray')
plt.axhline(y=1.96/np.sqrt(len(ts_log_diff)),linestyle='--',color='gray')
plt.title('Autocorrelation Function')
#Plot PACF:
## p – The lag value where the PACF chart crosses the upper confidence interval for the first time (in this case p=1)
plt.subplot(122)
plt.plot(lag_pacf)
plt.axhline(y=0,linestyle='--',color='gray')
plt.axhline(y=-1.96/np.sqrt(len(ts_log_diff)),linestyle='--',color='gray')
plt.axhline(y=1.96/np.sqrt(len(ts_log_diff)),linestyle='--',color='gray')
plt.title('Partial Autocorrelation Function')
plt.tight_layout()
plt.show()
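In [ ]:
# Programmatic cross-check (added, a rough heuristic only): find the first lag where the
# ACF/PACF fall inside the upper confidence band, mirroring the visual reading above.
conf_bound = 1.96 / np.sqrt(len(ts_log_diff))
q = next(lag for lag, v in enumerate(lag_acf) if lag > 0 and v < conf_bound)
p = next(lag for lag, v in enumerate(lag_pacf) if lag > 0 and v < conf_bound)
print('suggested q = %d, suggested p = %d' % (q, p))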
In [80]:
from statsmodels.tsa.arima_model import ARIMA
# RSS below is the residual sum of squares of the fitted values
# AR model
model = ARIMA(ts_log, order=(1, 1, 0))
results_AR = model.fit(disp=-1)
plt.figure(figsize=(20,10))
plt.plot(ts_log_diff)
plt.plot(results_AR.fittedvalues, color='red')
plt.title('RSS: %.4f'% sum((results_AR.fittedvalues-ts_log_diff)**2))
plt.show()
In [79]:
# MA Model
model = ARIMA(ts_log, order=(0, 1, 1))
results_MA = model.fit(disp=-1)
plt.figure(figsize=(20,10))
plt.plot(ts_log_diff)
plt.plot(results_MA.fittedvalues, color='red')
plt.title('RSS: %.4f'% sum((results_MA.fittedvalues-ts_log_diff)**2))
plt.show()
In [100]:
# combined model (AR + MA)
model = ARIMA(ts_log, order=(1, 1, 1))
results_ARIMA = model.fit(disp=-1)
plt.figure(figsize=(20,10))
plt.plot(ts_log_diff)
plt.plot(results_ARIMA.fittedvalues, color='red')
plt.title('RSS: %.4f'% sum((results_ARIMA.fittedvalues-ts_log_diff)**2))
plt.show()
In [101]:
# Based on RSS (the residual sum of squares), the combined ARIMA model fits best
## Scale back to the original values and see how this combined model performs
# Step 1 - store the predicted differences as a separate series
predictions_ARIMA_diff = pd.Series(results_ARIMA.fittedvalues, copy=True)
print(predictions_ARIMA_diff.head())
# it starts at 1901-02-01 because we differenced with lag 1, so the first element has nothing to subtract from
In [102]:
# Step 2 - Convert the differences back to the log scale
## Add these differences consecutively to the base number
## An easy way to do this is to take the cumulative sum at each index and then add it to the base number
predictions_ARIMA_diff_cumsum = predictions_ARIMA_diff.cumsum()
print(predictions_ARIMA_diff_cumsum.head())
In [103]:
predictions_ARIMA_log = pd.Series(ts_log.iloc[0], index=ts_log.index)
predictions_ARIMA_log = predictions_ARIMA_log.add(predictions_ARIMA_diff_cumsum,fill_value=0)
predictions_ARIMA_log.head()
Out[103]:
In [107]:
predictions_ARIMA = np.exp(predictions_ARIMA_log)
plt.figure(figsize=(20,10))
plt.plot(series, color='red')
plt.plot(predictions_ARIMA, color='green')
plt.title('RMSE: %.4f'% np.sqrt(sum((predictions_ARIMA-series)**2)/len(series)))
plt.show()
In [ ]:
# The forecast result is not great; in practice the training data probably needs more years