I found a tutorial and I don't like it. So, finally decided to find an easier solution to do grid search for ARIMA, and to add cross validation in a way that's easier to understand.
Download the data here (no need login): https://datamarket.com/data/set/22r0/sales-of-shampoo-over-a-three-year-period#!ds=22r0&display=line
In [6]:
import pandas as pd
import numpy as np
import statsmodels.api as sm
import itertools
In [2]:
series = pd.read_csv('shampoo_sales.csv', header=0, parse_dates=[0], index_col=0, squeeze=True)
series.head()
Out[2]:
In [3]:
def parser(x):
return pd.datetime.strptime('190'+x, '%Y-%m')
series = pd.read_csv('shampoo_sales.csv', header=0, parse_dates=[0], index_col=0, squeeze=True, date_parser=parser)
series.head()
Out[3]:
In [4]:
series.values
Out[4]:
In [9]:
# Method 1 - Without cross validation, just simply grid search for ARIMA
p = d = q = range(0, 3)
pdq = list(itertools.product(p, d, q))
bestAIC = np.inf
bestParam = None
# grid search
for param in pdq:
try:
model = sm.tsa.statespace.SARIMAX(series.values,
order=param,
enforce_stationarity=False,
enforce_invertibility=False)
results = model.fit()
#if current run of AIC is better than the best one so far, overwrite it
if results.aic < bestAIC:
bestAIC = results.aic
bestParam = param
except:
pass
print('the best AIC and order are:',bestAIC,bestParam)
In [8]:
# Method 2 - Add cross validation with grid search for ARIMA
from sklearn.model_selection import TimeSeriesSplit
tscv = TimeSeriesSplit(n_splits=10)
for train_index, test_index in tscv.split(series.values):
print("TRAIN:", train_index, "TEST:", test_index)
In [10]:
import warnings
warnings.filterwarnings("ignore")
p = d = q = range(0, 3)
pdq = list(itertools.product(p, d, q))
bestAIC = np.inf
best_cvAIC = np.inf
bestParam = None
best_cvParam = None
X = series.values
# grid search
for param in pdq:
for train_index, test_index in tscv.split(series.values):
X_train, X_test = X[train_index], X[test_index]
try:
model = sm.tsa.statespace.SARIMAX(X_train,
order=param,
enforce_stationarity=False,
enforce_invertibility=False)
results = model.fit()
#if current run of AIC is better than the best one so far, overwrite it
if results.aic < bestAIC:
bestAIC = results.aic
bestParam = param
# apply the best order on testig data
best_model = sm.tsa.statespace.SARIMAX(X_test,
order=bestParam,
enforce_stationarity=False,
enforce_invertibility=False)
results = best_smodel.fit()
if results.aic < best_cvAIC:
best_cvAIC = results.aic
best_cvParam = bestParam
except:
pass
print('the best AIC, the best CV AIC and order are:',bestAIC,best_cvAIC,bestParam)
SARIMAX
model