In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
In [2]:
import statsmodels.api as sm
In [3]:
# Importing built-in datasets in statsmodels
df = sm.datasets.macrodata.load_pandas().data
In [5]:
df.head()
Out[5]:
In [6]:
print(sm.datasets.macrodata.NOTE)
In [7]:
df.head()
Out[7]:
In [8]:
df.tail()
Out[8]:
In [13]:
# statsmodels.timeseriesanalysis.datetools
index = pd.Index(sm.tsa.datetools.dates_from_range('1959Q1', '2009Q3'))
index
Out[13]:
In [14]:
df.index = index
In [15]:
df.head()
Out[15]:
In [16]:
df['realgdp'].plot()
Out[16]:
In [17]:
result = sm.tsa.filters.hpfilter(df['realgdp'])
result
Out[17]:
In [18]:
type(result)
Out[18]:
In [19]:
type(result[0])
Out[19]:
In [20]:
type(result[1])
Out[20]:
In [22]:
gdp_cycle, gdp_trend = result
df['trend'] = gdp_trend
In [24]:
df[['realgdp', 'trend']].plot()
Out[24]:
In [26]:
# zooming in
df[['realgdp', 'trend']]['2000-03-31':].plot()
Out[26]:
In [33]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
In [34]:
airline = pd.read_csv('airline_passengers.csv', index_col = 'Month')
airline.head()
Out[34]:
In [36]:
# this is a normal index
airline.index
Out[36]:
In [37]:
# Get rid of all the missing values in this dataset
airline.dropna(inplace=True)
In [38]:
airline.index = pd.to_datetime(airline.index)
airline.head()
Out[38]:
In [39]:
# now its a DatetimeIndex
airline.index
Out[39]:
In [40]:
# Recap of making the SMA
airline['6-month-SMA'] = airline['Thousands of Passengers'].rolling(window=6).mean()
airline['12-month-SMA'] = airline['Thousands of Passengers'].rolling(window=12).mean()
In [41]:
airline.plot(figsize=(10,8))
Out[41]:
Weakness of SMA
In [42]:
airline['EWMA-12'] = airline['Thousands of Passengers'].ewm(span=12).mean()
In [44]:
airline[['Thousands of Passengers', 'EWMA-12']].plot(figsize=(10,8))
Out[44]:
Full reading on mathematics of EWMA
In [1]:
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
In [4]:
airline = pd.read_csv('airline_passengers.csv', index_col = 'Month')
airline.head()
Out[4]:
In [5]:
airline.plot()
Out[5]:
In [6]:
airline.dropna(inplace = True)
In [7]:
airline.index = pd.to_datetime(airline.index)
airline.head()
Out[7]:
In [8]:
from statsmodels.tsa.seasonal import seasonal_decompose
In [10]:
# additive/ multiplicative models available
# suspected linear trend = use additive
# suspected non-linear trend = multiplicative model
result = seasonal_decompose(airline['Thousands of Passengers'], model='multiplicative')
In [11]:
result.seasonal
Out[11]:
In [12]:
result.seasonal.plot()
Out[12]:
In [13]:
result.trend.plot()
Out[13]:
In [14]:
result.resid.plot()
Out[14]:
In [17]:
fig = result.plot()
Original Data
Time1 | 10 |
Time2 | 12 |
Time3 | 8 |
Time4 | 14 |
Time5 | 7 |
First Difference
Time1 | NA |
Time2 | 2 |
Time3 | -4 |
Time4 | 6 |
Time5 | -7 |
Second Difference
Time1 | NA |
Time2 | NA |
Time3 | -6 |
Time4 | 10 |
Time5 | -13 |
For seasonal data, you can also difference by season. If you had monthly data with yearly seasonality, you could difference by a time unit of 12, instead of just 1.
Another common techinique with seasonal ARIMA models is to combine both methods, taking the seasonal difference of the first difference.
The general process for ARIMA models is the following:
Let's go through these steps! [https://people.duke.edu/~rnau/arimrule.htm]
In [1]:
import numpy as np
import pandas as pd
import statsmodels.api as sm
import matplotlib.pyplot as plt
%matplotlib inline
In [3]:
df = pd.read_csv('monthly-milk-production-pounds-p.csv')
df.head()
Out[3]:
In [4]:
df.columns = ['Month', 'Milk in Pounds per Cow']
df.head()
Out[4]:
In [5]:
df.tail()
Out[5]:
In [6]:
df.drop(168, axis=0, inplace=True)
df.tail()
Out[6]:
In [7]:
df['Month'] = pd.to_datetime(df['Month'])
df.head()
Out[7]:
In [8]:
df.set_index('Month', inplace=True)
df.head()
Out[8]:
In [9]:
df.index
Out[9]:
In [10]:
df.describe()
Out[10]:
In [11]:
df.describe().transpose()
Out[11]:
In [13]:
df.plot();
In [14]:
time_series = df['Milk in Pounds per Cow']
type(time_series)
Out[14]:
In [21]:
time_series.rolling(12).mean().plot(label='12 SMA')
time_series.rolling(12).std().plot(label='12 STD')
time_series.plot()
plt.legend();
In [22]:
from statsmodels.tsa.seasonal import seasonal_decompose
In [26]:
decomp = seasonal_decompose(time_series)
fig = decomp.plot()
fig.set_size_inches(15,8)
We can use the Augmented Dickey-Fuller unit root test.
In statistics and econometrics, an augmented Dickey–Fuller test (ADF) tests the null hypothesis that a unit root is present in a time series sample. The alternative hypothesis is different depending on which version of the test is used, but is usually stationarity or trend-stationarity.
Basically, we are trying to whether to accept the Null Hypothesis H0 (that the time series has a unit root, indicating it is non-stationary) or reject H0 and go with the Alternative Hypothesis (that the time series has no unit root and is stationary).
We end up deciding this based on the p-value return.
A small p-value (typically ≤ 0.05) indicates strong evidence against the null hypothesis, so you reject the null hypothesis.
A large p-value (> 0.05) indicates weak evidence against the null hypothesis, so you fail to reject the null hypothesis.
Let's run the Augmented Dickey-Fuller test on our data:
In [27]:
from statsmodels.tsa.stattools import adfuller
In [30]:
result = adfuller(df['Milk in Pounds per Cow'])
result
Out[30]:
In [33]:
def adf_check(time_series):
result = adfuller(time_series)
print('Augumented Dicky-Fuller Test')
labels = ['ADF Test Statistic', 'p-value', '# of lags', 'Num of Observations used']
for value, label in zip(result, labels):
print(label + ': ' + str(value))
if result[1] < 0.05:
print('Strong evidence against null hypothesis')
print('Rejecting null hypothesis')
print('Data has no unit root! and is stationary')
else:
print('Weak evidence against null hypothesis')
print('Fail to reject null hypothesis')
print('Data has a unit root, it is non-stationary')
In [34]:
adf_check(df['Milk in Pounds per Cow'])
In [37]:
# Now making the data stationary
df['First Difference'] = df['Milk in Pounds per Cow'] - df['Milk in Pounds per Cow'].shift(1)
df['First Difference'].plot()
Out[37]:
In [42]:
# adf_check(df['First Difference']) - THIS RESULTS IN LinAlgError: SVD did not converge ERROR
In [39]:
# Note: we need to drop the first NA value before plotting this
adf_check(df['First Difference'].dropna())
In [41]:
df['Second Difference'] = df['First Difference'] - df['First Difference'].shift(1)
df['Second Difference'].plot();
In [43]:
adf_check(df['Second Difference'].dropna())
In [45]:
# Let's plot seasonal difference
df['Seasonal Difference'] = df['Milk in Pounds per Cow'] - df['Milk in Pounds per Cow'].shift(12)
df['Seasonal Difference'].plot();
In [46]:
adf_check(df['Seasonal Difference'].dropna())
In [48]:
# Plotting 'Seasonal first difference'
df['Seasonal First Difference'] = df['First Difference'] - df['First Difference'].shift(12)
df['Seasonal First Difference'].plot();
In [49]:
adf_check(df['Seasonal First Difference'].dropna())
In [50]:
from statsmodels.graphics.tsaplots import plot_acf, plot_pacf
In [52]:
# Plotting the gradual decline autocorrelation
fig_first = plot_acf(df['First Difference'].dropna())
In [54]:
fig_first = plot_acf(df['First Difference'].dropna(), use_vlines=False)
In [55]:
fig_seasonal_first = plot_acf(df['Seasonal First Difference'].dropna(), use_vlines=False)
In [56]:
fig_seasonal_first_pacf = plot_pacf(df['Seasonal First Difference'].dropna(), use_vlines=False)
In [78]:
plot_acf(df['Seasonal First Difference'].dropna());
plot_pacf(df['Seasonal First Difference'].dropna());
In [64]:
# ARIMA model for non-sesonal data
from statsmodels.tsa.arima_model import ARIMA
In [65]:
# help(ARIMA)
In [66]:
# ARIMA model from seasonal data
# from statsmodels.tsa.statespace import sarimax
Choosing the p, d, q values of the order and seasonal_order tuple is reading task
More information here...
[https://stackoverflow.com/questions/22770352/auto-arima-equivalent-for-python]
[https://stats.stackexchange.com/questions/44992/what-are-the-values-p-d-q-in-arima]
In [68]:
model = sm.tsa.statespace.SARIMAX(df['Milk in Pounds per Cow'], order=(0,1,0), seasonal_order=(1,1,1,12))
In [69]:
results = model.fit()
In [70]:
print(results.summary())
In [75]:
# residual errors of prediction on the original training data
results.resid
Out[75]:
In [76]:
# plot of residual errors of prediction on the original training data
results.resid.plot();
In [77]:
# KDE plot of residual errors of prediction on the original training data
results.resid.plot(kind='kde');
In [82]:
# Creating a column forecast to house the forecasted values for existing values
df['forecast'] = results.predict(start=150, end=168)
df[['Milk in Pounds per Cow', 'forecast']].plot(figsize=(12,8));
In [84]:
# Forecasting for future data
df.tail()
Out[84]:
In [85]:
from pandas.tseries.offsets import DateOffset
In [86]:
future_dates = [df.index[-1] + DateOffset(months=x) for x in range(0,24)]
future_dates
Out[86]:
In [87]:
future_df = pd.DataFrame(index=future_dates, columns=df.columns)
future_df.head()
Out[87]:
In [89]:
final_df = pd.concat([df, future_df])
final_df.head()
Out[89]:
In [90]:
final_df.tail()
Out[90]:
In [92]:
final_df['forecast'] = results.predict(start=168, end=192)
final_df.tail()
Out[92]:
In [93]:
final_df[['Milk in Pounds per Cow', 'forecast']].plot()
Out[93]:
Lot of this stuff assumes that the y-axis value (price) is directly connected to the time (x-axis value) and that the time is really important aspect of the y value.
While that is true for financial series it discounts the external force i.e. traders also able to buy and sell securities outside the market and affect its price.
And because of that often you'll hear stock and securities prices are following some sort of Brownian motion almost like a random walk.
Because of those aspects of the financial and securities data this sort of forecsting method doesn't really work with stock.
In [ ]: