In [3]:
# see http://logics-of-blue.com/python-time-series-analysis/
import numpy as np
import pandas as pd
from scipy import stats
# graph plot
from matplotlib import pylab as plt
import seaborn as sns
%matplotlib inline
# setting graph size
from matplotlib.pylab import rcParams
rcParams['figure.figsize'] = 15,6
# model
import statsmodels.api as sm
In [ ]:
In [ ]:
In [4]:
# read data
dataNormal = pd.read_csv('AirPassengers.csv')
dataNormal.head()
Out[4]:
In [ ]:
In [5]:
# read data as month : data
dateparse = lambda dates: pd.datetime.strptime(dates, '%Y-%m')
data = pd.read_csv('AirPassengers.csv' , index_col='Month',date_parser=dateparse, dtype='float')
data.head()
Out[5]:
In [ ]:
In [6]:
# setting data for plot
ts = data['#Passengers']
ts.head()
Out[6]:
In [10]:
print(ts.index.freq)
ts.index
Out[10]:
In [8]:
# plot data
plt.plot(ts)
Out[8]:
In [9]:
# data-get-method 1
ts['1949-01-01']
Out[9]:
In [10]:
# data-get-method 2
ts['1949']
Out[10]:
In [11]:
# slide data
ts.shift().head()
Out[11]:
In [13]:
# difference
diff = ts - ts.shift()
diff.head()
Out[13]:
In [14]:
# log difference
logDiff = np.log(ts) - np.log(ts.shift())
# remove NaN
logDiff.dropna().head()
Out[14]:
In [20]:
# Autocirrekation
ts_acf = sm.tsa.stattools.acf(ts, nlags=40)
ts_acf
Out[20]:
In [22]:
# Partial autocorrelation
ts_pacf = sm.tsa.stattools.pacf(ts, nlags=40, method='ols')
ts_pacf
Out[22]:
In [15]:
# plot autocorrelation
fig = plt.figure(figsize=(12,8))
ax1 = fig.add_subplot(211)
fig = sm.graphics.tsa.plot_acf(ts, lags=40, ax=ax1)
ax2 = fig.add_subplot(212)
fig = sm.graphics.tsa.plot_pacf(ts, lags=40, ax=ax2)
In [ ]:
In [24]:
# ARMA model prediction ... (This is self thought (not automatically)) ........................ <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
diff = ts - ts.shift()
diff = diff.dropna()
diff.head()
# <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
Out[24]:
In [25]:
# difference plot
plt.plot(diff)
Out[25]:
In [26]:
# automatically ARMA prediction functrion (for difference sequence)
resDiff = sm.tsa.arma_order_select_ic(diff, ic='aic', trend='nc')
resDiff
# search min_value
Out[26]:
In [27]:
# we found P-3, q=2 automatically
from statsmodels.tsa.arima_model import ARIMA
ARIMA_3_1_2 = ARIMA(ts, order=(3, 1, 2)).fit(dist=False) # "3"(ar) 1 (i) "2"(ma)
ARIMA_3_1_2.params
Out[27]:
In [28]:
# check Residual error (... I think this is "White noise")
# this is not Arima ... (Periodicity remained)
resid = ARIMA_3_1_2.resid
fig = plt.figure(figsize=(12,8))
ax1 = fig.add_subplot(211)
fig = sm.graphics.tsa.plot_acf(resid.values.squeeze(), lags=40, ax=ax1)
ax2 = fig.add_subplot(212)
fig = sm.graphics.tsa.plot_pacf(resid, lags=40, ax=ax2)
In [15]:
# predict SARIMA model by myself (not automatically)
import statsmodels.api as sm
SARIMA_3_1_2_111 = sm.tsa.SARIMAX(ts, order=(3,1,2), seasonal_order=(1,1,1,12)).fit()
# order ... from ARIMA model // seasonal_order ... 1 1 1 ... ?
print(SARIMA_3_1_2_111.summary())
# maybe use "Box-Jenkins method" ...
# https://github.com/statsmodels/statsmodels/issues/3620 for error
In [22]:
type(SARIMA_3_1_2_111)
Out[22]:
In [12]:
# check Residual error
residSARIMA = SARIMA_3_1_2_111.resid
fig = plt.figure(figsize=(12,8))
ax1 = fig.add_subplot(211)
fig = sm.graphics.tsa.plot_acf(residSARIMA.values.squeeze(), lags=40, ax=ax1)
ax2 = fig.add_subplot(212)
fig = sm.graphics.tsa.plot_pacf(residSARIMA, lags=40, ax=ax2)
In [25]:
# prediction
pred = SARIMA_3_1_2_111.predict('1960-01-01', '1961-12-01')
print(pred)
type(pred)
Out[25]:
In [14]:
# plot real data and predict data
plt.plot(ts)
plt.plot(pred, "r")
Out[14]:
In [43]:
# SARIMA prediction (automatically) with AIC
max_p = 3
max_q = 3
max_d = 1
max_sp = 1
max_sq = 1
max_sd = 1
pattern = max_p*(max_q + 1)*(max_d + 1)*(max_sp + 1)*(max_sq + 1)*(max_sd + 1)
modelSelection = pd.DataFrame(index=range(pattern), columns=["model", "aic"])
pattern
Out[43]:
In [44]:
# Brute force method
num = 0
for p in range(1, max_p + 1):
for d in range(0, max_d + 1):
for q in range(0, max_q + 1):
for sp in range(0, max_sp + 1):
for sd in range(0, max_sd + 1):
for sq in range(0, max_sq + 1):
sarima = sm.tsa.SARIMAX(
ts, order=(p,d,q),
seasonal_order=(sp,sd,sq,12),
enforce_stationarity = False,
enforce_invertibility = False
).fit()
modelSelection.ix[num]["model"] = "order=(" + str(p) + ","+ str(d) + ","+ str(q) + "), season=("+ str(sp) + ","+ str(sd) + "," + str(sq) + ")"
modelSelection.ix[num]["aic"] = sarima.aic
num = num + 1
In [45]:
modelSelection
Out[45]:
In [46]:
# min_AIC model
modelSelection[modelSelection.aic == min(modelSelection.aic)]
Out[46]:
In [47]:
bestSARIMA = sm.tsa.SARIMAX(ts, order=(3,1,3), seasonal_order=(0,1,1,12), enforce_stationarity = False, enforce_invertibility = False).fit()
In [48]:
print(bestSARIMA.summary())
In [49]:
# check Residual error
residSARIMA = bestSARIMA.resid
fig = plt.figure(figsize=(12,8))
ax1 = fig.add_subplot(211)
fig = sm.graphics.tsa.plot_acf(residSARIMA, lags=40, ax=ax1)
ax2 = fig.add_subplot(212)
fig = sm.graphics.tsa.plot_pacf(residSARIMA, lags=40, ax=ax2)
In [50]:
# prediction
bestPred = bestSARIMA.predict('1960-01-01', '1961-12-01')
# plot real data and predict data
plt.plot(ts)
plt.plot(bestPred, "r")
Out[50]:
In [ ]: