pmdarima
This example follows a post on Towards Data Science (TDS) and demonstrates how pmdarima can simplify time series analysis.
In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import pmdarima as pm
print(f"Using pmdarima {pm.__version__}")
In [2]:
from pmdarima.datasets.stocks import load_msft
df = load_msft()
df.head()
Out[2]: [table output: the first five rows of the MSFT data frame]
In [3]:
train_len = int(df.shape[0] * 0.8)
train_data, test_data = df[:train_len], df[train_len:]
y_train = train_data['Open'].values
y_test = test_data['Open'].values
print(f"{train_len} train samples")
print(f"{df.shape[0] - train_len} test samples")
In [18]:
from pandas.plotting import lag_plot
fig, axes = plt.subplots(3, 2, figsize=(12, 16))
fig.suptitle('MSFT Autocorrelation plot')

# The axis coordinates for the plots
ax_idcs = [
    (0, 0),
    (0, 1),
    (1, 0),
    (1, 1),
    (2, 0),
    (2, 1)
]

for lag, ax_coords in enumerate(ax_idcs, 1):
    ax_row, ax_col = ax_coords
    axis = axes[ax_row][ax_col]
    lag_plot(df['Open'], lag=lag, ax=axis)
    axis.set_title(f"Lag={lag}")
plt.show()
All six lag plots look fairly linear, which is a good indicator that an auto-regressive model is an appropriate choice. Therefore, we'll allow auto_arima to select the lag term (p) for us, up to a maximum of 6.
First, we can estimate the differencing term, d, with statistical tests, taking the larger of the KPSS and ADF estimates:
In [19]:
from pmdarima.arima import ndiffs
kpss_diffs = ndiffs(y_train, alpha=0.05, test='kpss', max_d=6)
adf_diffs = ndiffs(y_train, alpha=0.05, test='adf', max_d=6)
n_diffs = max(adf_diffs, kpss_diffs)
print(f"Estimated differencing term: {n_diffs}")
Now use auto_arima to fit a model on the training data, fixing the differencing term at the value we just estimated:
In [74]:
auto = pm.auto_arima(y_train, d=n_diffs, seasonal=False, stepwise=True,
                     suppress_warnings=True, error_action="ignore", max_p=6,
                     max_order=None, trace=True)
In [75]:
print(auto.order)
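The order attribute is the (p, d, q) triple selected by the search. For a fuller picture of the chosen model (coefficients, AIC, diagnostics), the fitted estimator also exposes a statsmodels-style summary:
In [ ]:
# Statsmodels-style summary of the selected ARIMA model
auto.summary()
Next, we simulate a production setting: forecast one step ahead, then update the model with each new observation as it "arrives".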
In [76]:
from sklearn.metrics import mean_squared_error
from pmdarima.metrics import smape
model = auto  # seeded with the model we have already fit

def forecast_one_step():
    fc, conf_int = model.predict(n_periods=1, return_conf_int=True)
    return (
        fc.tolist()[0],
        np.asarray(conf_int).tolist()[0])

forecasts = []
confidence_intervals = []

for new_ob in y_test:
    fc, conf = forecast_one_step()
    forecasts.append(fc)
    confidence_intervals.append(conf)

    # Update the existing model with a small number of MLE steps
    model.update(new_ob)
print(f"Mean squared error: {mean_squared_error(y_test, forecasts)}")
print(f"SMAPE: {smape(y_test, forecasts)}")
In [86]:
fig, axes = plt.subplots(2, 1, figsize=(12, 12))

# --------------------- Actual vs. Predicted --------------------------
axes[0].plot(y_train, color='blue', label='Training Data')
axes[0].plot(test_data.index, forecasts, color='green', marker='o',
             label='Predicted Price')
axes[0].plot(test_data.index, y_test, color='red', label='Actual Price')
axes[0].set_title('Microsoft Prices Prediction')
axes[0].set_xlabel('Dates')
axes[0].set_ylabel('Prices')
axes[0].set_xticks(np.arange(0, df.shape[0], 1300).tolist(),
                   df['Date'][::1300].tolist())
axes[0].legend()

# ------------------ Predicted with confidence intervals ----------------
axes[1].plot(y_train, color='blue', label='Training Data')
axes[1].plot(test_data.index, forecasts, color='green',
             label='Predicted Price')
axes[1].set_title('Prices Predictions & Confidence Intervals')
axes[1].set_xlabel('Dates')
axes[1].set_ylabel('Prices')

conf_int = np.asarray(confidence_intervals)
axes[1].fill_between(test_data.index,
                     conf_int[:, 0], conf_int[:, 1],
                     alpha=0.9, color='orange',
                     label="Confidence Intervals")
axes[1].set_xticks(np.arange(0, df.shape[0], 1300).tolist(),
                   df['Date'][::1300].tolist())
axes[1].legend()
Out[86]: [two-panel figure: actual vs. predicted prices, and predictions with confidence intervals]