In [80]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import statsmodels.api as sm
%matplotlib inline
In [3]:
df = sm.datasets.macrodata.load_pandas().data
In [4]:
df.head()
Out[4]:
In [6]:
print(sm.datasets.macrodata.NOTE)
In [10]:
index = pd.Index(sm.tsa.datetools.dates_from_range("1959Q1", "2009Q3"))
In [11]:
df.index = index
In [12]:
df.head()
Out[12]:
In [13]:
df["realgdp"].plot()
Out[13]:
The Hodrick-Prescott filter separates a time series $y_t$ into a trend component $\tau_t$ and a cyclical component $\zeta_t$, so that $y_t = \tau_t + \zeta_t$.
In [19]:
# Hodrick-Prescott decomposition of real GDP; hpfilter returns (cycle, trend).
# lamb=1600 is spelled out for readers and equals the function's default.
gdp_cycle, gdp_trend = sm.tsa.filters.hpfilter(
    df["realgdp"],
    lamb=1600,
)
In [20]:
df["trend"] = gdp_trend
In [21]:
# Zoom in on 2000-03-31 onward. One .loc call selects rows and columns
# together instead of chaining two separate [] lookups.
df.loc["2000-03-31":, ["realgdp", "trend"]].plot()
Out[21]:
In [23]:
airline = pd.read_csv("airline_passengers.csv", index_col="Month")
In [24]:
airline.index
Out[24]:
In [27]:
# Drop rows containing missing values. Rebinding the name (rather than
# inplace=True) avoids mutating a frame that earlier cells already displayed.
airline = airline.dropna()
In [29]:
airline.index = pd.to_datetime(airline.index)
In [30]:
airline.head()
Out[30]:
In [31]:
airline.index
Out[31]:
In [40]:
# Six-month simple moving average; rows stay NaN until a full window is available.
airline["6-month-SMA"] = (
    airline["Thousands of Passengers"].rolling(window=6, min_periods=6).mean()
)
In [41]:
# Twelve-month simple moving average; rows stay NaN until a full window is available.
airline["12-month-SMA"] = (
    airline["Thousands of Passengers"].rolling(window=12, min_periods=12).mean()
)
In [42]:
airline.plot(figsize=(10, 8))
Out[42]:
In [43]:
# Exponentially weighted moving average with a 12-period span.
# adjust=True is the pandas default, written out here for clarity.
airline["EWMA-12"] = (
    airline["Thousands of Passengers"].ewm(span=12, adjust=True).mean()
)
In [44]:
airline[["Thousands of Passengers", "EWMA-12"]].plot(figsize=(10, 8))
Out[44]:
In [45]:
airline.head()
Out[45]:
In [46]:
airline["Thousands of Passengers"].plot()
Out[46]:
In [53]:
from statsmodels.tsa.seasonal import seasonal_decompose
In [56]:
result = seasonal_decompose(airline["Thousands of Passengers"], model="multiplicative")
In [57]:
result.seasonal.plot()
Out[57]:
In [58]:
result.trend.plot()
Out[58]:
In [62]:
fig = result.plot()
In [63]:
df = pd.read_csv("monthly-milk-production-pounds-p.csv")
In [64]:
df.head()
Out[64]:
In [65]:
df.columns = ["Month", "Milk in Pounds per Cow"]
In [66]:
df.head()
Out[66]:
In [67]:
df.tail()
Out[67]:
In [68]:
# Remove the row labelled 168. Rebinding instead of inplace=True keeps the
# cell free of hidden in-place mutation.
# NOTE(review): 168 is presumably a non-data trailer row seen in df.tail() — confirm against the CSV.
df = df.drop(index=168)
In [69]:
df.tail()
Out[69]:
In [70]:
df["Month"] = pd.to_datetime(df["Month"])
In [71]:
df.head()
Out[71]:
In [72]:
# Use the parsed "Month" column as the index; rebinding replaces the
# inplace=True mutation.
df = df.set_index("Month")
In [73]:
df.index
Out[73]:
In [74]:
df.describe().transpose()
Out[74]:
Visualize the data...
In [75]:
df.plot()
Out[75]:
The data has a trend and seasonality
In [76]:
time_series = df["Milk in Pounds per Cow"]
In [81]:
# Overlay the 12-month rolling mean and rolling standard deviation on the raw
# series to eyeball whether the level and spread change over time.
time_series.rolling(12).mean().plot(label="12 Month Rolling Mean")
# .std() is a standard deviation, so the legend must say so (it previously
# claimed "Variance").
time_series.rolling(12).std().plot(label="12 Month Rolling Std")
time_series.plot()
plt.legend()
Out[81]:
In [82]:
decomp = seasonal_decompose(time_series)
In [85]:
fig = decomp.plot()
fig.set_size_inches(15, 8)
In [86]:
from statsmodels.tsa.stattools import adfuller
In [87]:
result = adfuller(df["Milk in Pounds per Cow"])
In [88]:
result
Out[88]:
In [90]:
def adf_check(time_series, alpha=0.05):
    """Run an Augmented Dickey-Fuller test and print a short report.

    Parameters
    ----------
    time_series : array-like
        Observations handed unchanged to ``adfuller``.
    alpha : float, default 0.05
        Significance level the test's p-value is compared against.

    Returns
    -------
    None
        The report is printed rather than returned.
    """
    result = adfuller(time_series)
    print(" Augmented Dickey-Fuller Test")
    labels = ["ADF Test Statistic", "p-value", "# of lags", "Num of Observations used"]

    # zip stops at the shorter sequence, so only the first four entries of
    # the adfuller result are reported.
    for value, label in zip(result, labels):
        print(f"{label}: {value}")

    # result[1] is the entry labelled "p-value" above.
    if result[1] <= alpha:
        print("Strong evidence against null hypothesis")
        print("Reject null hypothesis")
        print("Data has no unit root and is stationary")
    else:
        print("Weak evidence against null hypothesis")
        print("Fail to reject null hypothesis")
        print("Data has a unit root and is non-stationary")
In [91]:
adf_check(df["Milk in Pounds per Cow"])
In [92]:
# First difference: each value minus the previous one (first row is NaN).
df["First Difference"] = df["Milk in Pounds per Cow"].diff(periods=1)
In [93]:
df["First Difference"].plot()
Out[93]:
In [94]:
adf_check(df["First Difference"].dropna())
In [95]:
# Second difference: difference the already-differenced series once more.
df["Milk Second Difference"] = df["First Difference"].diff(periods=1)
In [96]:
df["Milk Second Difference"].plot()
Out[96]:
In [100]:
adf_check(df["Milk Second Difference"].dropna())
In [97]:
# Seasonal difference: each value minus the value 12 rows earlier.
df["Seasonal Difference"] = df["Milk in Pounds per Cow"].diff(periods=12)
In [98]:
df["Seasonal Difference"].plot()
Out[98]:
In [99]:
adf_check(df["Seasonal Difference"].dropna())
In [101]:
# Seasonal difference (lag 12) of the first-differenced series.
# The column key's spelling is kept exactly because other cells look it up by this name.
df["Seasonal First Differnce"] = df["First Difference"].diff(periods=12)
In [102]:
df["Seasonal First Differnce"].plot()
Out[102]:
In [103]:
adf_check(df["Seasonal First Differnce"].dropna())
In [104]:
from statsmodels.graphics.tsaplots import plot_acf, plot_pacf
In [105]:
fig_first = plot_acf(df["First Difference"].dropna())
In [106]:
fig_seasonal_first = plot_acf(df["Seasonal First Differnce"].dropna())
The same autocorrelation plot can also be drawn with pandas:
In [107]:
from pandas.plotting import autocorrelation_plot
In [108]:
autocorrelation_plot(df["Seasonal First Differnce"].dropna())
Out[108]:
pandas does not provide a partial autocorrelation plot, so use statsmodels' `plot_pacf` for that.
In [109]:
result = plot_pacf(df["Seasonal First Differnce"].dropna())
In [114]:
plot_acf(df["Seasonal First Differnce"].dropna())
plot_pacf(df["Seasonal First Differnce"].dropna());
In [116]:
from statsmodels.tsa.arima_model import ARIMA # standard ARIMA
In [118]:
# Seasonal ARIMA on the undifferenced milk series: the differencing is
# requested through the model orders rather than applied beforehand.
# order=(0, 1, 0) is the non-seasonal part; seasonal_order=(1, 1, 1, 12)
# is the seasonal part with a period of 12 rows.
model = sm.tsa.statespace.SARIMAX(df["Milk in Pounds per Cow"], order=(0,1,0), seasonal_order=(1, 1, 1, 12))
In [119]:
results = model.fit()
In [120]:
print(results.summary())
In [122]:
results.resid.plot()
Out[122]:
In [123]:
results.resid.plot(kind="kde")
Out[123]:
In [126]:
# Store the fitted model's predictions for positions 150 through 168 in a
# "forecast" column, then plot them against the observed series.
# NOTE(review): 150 and 168 are hard-coded; 168 is presumably one step past the
# last observed row (the row labelled 168 was dropped earlier) — confirm.
df["forecast"] = results.predict(start=150, end=168)
df[["Milk in Pounds per Cow", "forecast"]].plot(figsize=(12, 8))
Out[126]:
Setting a later end date
In [128]:
df.tail()
Out[128]:
In [129]:
from pandas.tseries.offsets import DateOffset
In [130]:
# Generate 24 month-stamps that follow the last observation. The offset starts
# at 1: starting at 0 would repeat df.index[-1], producing a duplicate index
# entry once these dates are concatenated onto df.
future_dates = [df.index[-1] + DateOffset(months=x) for x in range(1, 25)]
In [133]:
future_df = pd.DataFrame(index=future_dates, columns=df.columns)
In [134]:
final_df = pd.concat([df, future_df])
In [136]:
final_df.tail()
Out[136]:
In [137]:
final_df["forecast"] = results.predict(start=168, end = 192)
In [138]:
final_df.tail()
Out[138]:
In [140]:
final_df[["Milk in Pounds per Cow", "forecast"]].plot(figsize=(10, 8))
Out[140]:
In [ ]: