In [1]:
import pandas as pd
import numpy as np
%pylab inline
pylab.style.use('ggplot')
In [2]:
# Remote CSV holding the AirPassengers series used throughout this notebook.
url = (
    'https://vincentarelbundock.github.io/Rdatasets/csv/datasets/AirPassengers.csv'
)
In [3]:
# Download the CSV and parse it into a DataFrame.
passengers = pd.read_csv(filepath_or_buffer=url)
In [4]:
# Preview the first five rows to check the column layout.
passengers.head(n=5)
Out[4]:
In [5]:
# 'Unnamed: 0' is presumably the row counter saved with the CSV (verify
# against the preview above); it carries no information, so discard it.
passengers = passengers.drop(columns=['Unnamed: 0'])
In [6]:
# np.modf returns (fractional part, integral part) of every 'time' value:
# `month` starts out as the fraction of the year, `year` as the whole year.
month, year = np.modf(passengers.loc[:, 'time'])
In [7]:
# Map the year fraction onto a 1-based month number (0.0 -> 1, 11/12 -> 12).
# Rounding absorbs the floating-point error in the decimal-year encoding.
# The built-in int replaces the np.int alias, which NumPy 1.24 removed.
month = np.round(month*12+1).astype(int)
In [8]:
# Convert the integral part of the decimal year to an integer dtype.
# The built-in int replaces the np.int alias, which NumPy 1.24 removed.
year = year.astype(int)
In [9]:
from datetime import date

# One calendar date per observation, pinned to the first day of its month.
dates = list(map(lambda y, m: date(y, m, 1), year, month))
In [10]:
# Convert the dates into monthly periods.
periods = pd.PeriodIndex(data=dates, freq='M')
In [11]:
# Label every row with its monthly period (replaces the existing index in place).
passengers.index = periods
In [12]:
# The period index now carries the timing, so 'time' is redundant; drop it
# and squeeze the remaining frame down to its lowest-dimensional form.
without_time = passengers.drop(columns='time')
passengers = without_time.squeeze()
In [13]:
# Line chart of the raw series.
passengers.plot.line()
Out[13]:
In [14]:
# First difference of the series; the leading entry has no predecessor
# (diff leaves it as NaN), so it is sliced off.
lagged = passengers.diff(periods=1).iloc[1:]
In [15]:
# Line chart of the first-differenced series.
lagged.plot.line()
Out[15]:
In [16]:
# Difference again at lag 12 (same month, previous year); the first twelve
# entries have no counterpart a year earlier and are sliced off.
seasonal_removed = lagged.diff(periods=12).iloc[12:]
In [17]:
# Line chart of the series after both differencing steps.
seasonal_removed.plot.line()
Out[17]:
In [18]:
from statsmodels.tsa.stattools import adfuller

# Augmented Dickey-Fuller test on the twice-differenced series. With default
# arguments adfuller returns a 6-tuple, unpacked here into named results.
adf_output = adfuller(seasonal_removed)
adf, p_val, lag, nobs, critical_vals, icbest = adf_output
In [19]:
# Report the test statistic, its p-value and the lag count used.
adf_report = 'DF test statistic: {:.4f}, p_val: {:.4f}, lag={}'.format(adf, p_val, lag)
print(adf_report)
In [20]:
from statsmodels.tsa.stattools import acf, pacf
In [21]:
# Autocorrelations of the differenced series at lags 0-19, drawn as bars.
acfs = acf(seasonal_removed)[:20]
acf_by_lag = pd.Series(acfs, index=range(20))
acf_by_lag.plot.bar()
Out[21]:
In [22]:
# Partial autocorrelations of the differenced series at lags 0-19, drawn as bars.
pacfs = pacf(seasonal_removed)[:20]
pacf_by_lag = pd.Series(pacfs, index=range(20))
pacf_by_lag.plot.bar()
Out[22]:
In [23]:
from statsmodels.tsa.statespace.sarimax import SARIMAX

# Seasonal ARIMA on the undifferenced series: order (0, 1, 0) with a
# (1, 1, 1) seasonal part of period 12 and no trend term. The model does the
# differencing itself, so the raw `passengers` series is passed in.
model = SARIMAX(
    endog=passengers,
    order=(0, 1, 0),
    seasonal_order=(1, 1, 1, 12),
    trend='n',
)
results = model.fit()
results.summary()
Out[23]:
In [24]:
# Predictions from the fitted model with predict()'s default range and
# non-dynamic mode spelled out.
passengers_predicted = results.predict(start=None, end=None, dynamic=False)
In [25]:
# Put observed and predicted values side by side, one labelled column each.
results_df = pd.concat(
    {'actual': passengers, 'predicted': passengers_predicted},
    axis=1,
)
In [26]:
# Overlay actual and predicted values on one wide line chart.
results_df.plot.line(figsize=(10, 4))
Out[26]: