In [1]:
# Imports: stdlib first, then third-party, then local helpers.
from itertools import product
from warnings import filterwarnings

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sbn
# `pandas.tools.plotting` was removed in pandas 0.25; the public location of
# autocorrelation_plot is `pandas.plotting`.
from pandas.plotting import autocorrelation_plot
from scipy.stats import expon, laplace, t
from statsmodels.graphics.tsaplots import plot_pacf
from arch.univariate import HARX, HARCH, GARCH, EGARCH, StudentsT, SkewStudent

from helpers import HarModel, HarchModel
In [2]:
# Notebook-wide configuration: silence library warnings, switch matplotlib to
# the interactive notebook backend, and set the seaborn plotting theme.
filterwarnings('ignore')  # NOTE(review): blanket-ignore also hides deprecation warnings
%matplotlib notebook
sbn.set_style('whitegrid')
In [3]:
# Load the commodity-index sheet: rows 2-5 are sub-headers (skipped), column A
# (via index_col=0) is the date index, columns B:AE hold the index series.
dat_file = './data/CommodityIndices.xlsx'
skiprows = (2, 3, 4, 5)
# `parse_cols` was removed from pandas.read_excel (deprecated 0.21, removed
# 0.24); `usecols` accepts the same 'B:AE' Excel-range string.
raw_data = pd.read_excel(dat_file, skiprows=skiprows, index_col=0, usecols='B:AE')
# Strip the 4-character prefix and 2-character suffix from the raw column
# labels to get clean index names.
raw_data.columns = [column[4:-2].rstrip() for column in raw_data.columns]
# Normalize every series to a common starting value of 100.
raw_data.iloc[0] = 100.0
In [4]:
raw_data.columns.values.tolist()
Out[4]:
In [5]:
# Select one index by name and plot its level over the last 2088 business days.
name = 'Energy'  # Name of index as string; must match a column of raw_data
data = raw_data[name].iloc[-2088:]
fig, ax = plt.subplots(num=name + ' Over the Aeons')
data.plot(ax=ax, label=name)
ax.legend(loc='best', frameon=True, fancybox=True, framealpha=0.8)
ax.set(xlabel='Time', ylabel='Relative index')
fig.tight_layout()
We don't want to do any smoothing or averaging to find a trend in the data because we know little of how to extrapolate such a trend to the future.
In [6]:
winsize = 30 # Window size (in business days) to average over
max_lag = 300 # Maximum lag to consider in (partial) autocorrelation
# Daily returns in percent; pct_change produces a leading NaN, which is dropped.
returns = 100 * data.pct_change(1).dropna()
rtnswin = returns.rolling(winsize, center=True) # or .ewm(com=winsize)
rtnmean = rtnswin.mean()
rtn_std = rtnswin.std()
# Three stacked panels: raw returns with rolling mean +/- one rolling std,
# then the autocorrelation and the partial autocorrelation of the returns.
fig = plt.figure(num='Returns on '+name+' and their (Partial) Autocorrelation', figsize=(9.7, 10))
return_ax = fig.add_subplot(311)
aucorr_ax = fig.add_subplot(312)
pacorr_ax = fig.add_subplot(313)
returns.plot(color='#4c72b0', style=['_'], marker='o', alpha=0.2, ax=return_ax)
rtnmean.plot(color='#4c72b0', ax=return_ax)
(rtnmean + rtn_std).plot(color='#c44e52', linewidth=1.0, ax=return_ax)
(rtnmean - rtn_std).plot(color='#c44e52', linewidth=1.0, ax=return_ax)
return_ax.set_xlabel('Time')
return_ax.set_ylabel('Daily returns')
autocorrelation_plot(returns, linewidth=1.0, ax=aucorr_ax)
aucorr_ax.set_xlim(left=0, right=max_lag)
# alpha=0.01 draws a 99% confidence band around the partial autocorrelations.
plot_pacf(returns, lags=max_lag, alpha=0.01, ax=pacorr_ax, linewidth=0.5)
# Match axis ranges to the autocorrelation panel for direct visual comparison.
pacorr_ax.set_ylim(aucorr_ax.get_ylim())
pacorr_ax.set_xlim(aucorr_ax.get_xlim())
pacorr_ax.set_title('')
pacorr_ax.set_xlabel('Lag')
pacorr_ax.set_ylabel('Partial autocorrelation')
fig.tight_layout()
We note that the raw returns themselves show only weak (partial) autocorrelation, while their magnitude appears to cluster in time (the bullet list that originally followed here seems to have been lost in conversion).
So let's look at the absolute returns and their autocorrelation next.
In [7]:
# Same three-panel layout as above, but for the *absolute* returns
# (a volatility proxy).
# NOTE(review): return_ax/aucorr_ax/pacorr_ax defined here are reused by cell
# In [14] below, which draws additional series onto this figure.
fig = plt.figure(num='Absolute Returns on '+name+' and their (Partial) Autocorrelation', figsize=(9.7, 10))
return_ax = fig.add_subplot(311)
aucorr_ax = fig.add_subplot(312)
pacorr_ax = fig.add_subplot(313)
returns.abs().plot(linewidth=0.5, ax=return_ax, label='Volatility')
return_ax.set_xlabel('Time')
return_ax.set_ylabel('Absolute daily returns')
autocorrelation_plot(returns.abs(), linewidth=0.5, ax=aucorr_ax, label='Volatility');
aucorr_ax.set_xlim(left=0, right=max_lag)
plot_pacf(returns.abs(), lags=max_lag, alpha=0.01, ax=pacorr_ax, linewidth=0.5)
# Match axis ranges to the autocorrelation panel for direct visual comparison.
pacorr_ax.set_ylim(aucorr_ax.get_ylim())
pacorr_ax.set_xlim(aucorr_ax.get_xlim())
pacorr_ax.set_title('')
pacorr_ax.set_xlabel('Lag')
pacorr_ax.set_ylabel('Partial autocorrelation')
fig.tight_layout()
We observe volatility bunching and long-term autocorrelation of the absolute returns. The partial autocorrelation dies off at much shorter lags.
In [8]:
# Histogram of the returns with a fitted Laplace density (left panel) and of
# the absolute returns with a fitted Exponential density (right panel).
fig = plt.figure(num='Distribution of (Absolute) Returns on '+name, figsize=(9.7, 5))
returns_ax = fig.add_subplot(121)
absrtns_ax = fig.add_subplot(122)
# NOTE(review): sbn.distplot is deprecated (slated for removal); its `fit=`
# overlay has no direct histplot/displot equivalent, so migrating requires a
# manual pdf overlay — left as-is here.
sbn.distplot(returns,
             bins=49,
             kde=False,
             fit=laplace,
             fit_kws={'color': '#c44e52', 'label': 'Laplace'},
             ax=returns_ax);
returns_ax.set_title('Daily Returns on '+name)
returns_ax.set_xlabel('Value')
returns_ax.set_ylabel('Frequency')
sbn.distplot(returns.abs(),
             bins=49,
             kde=False,
             fit=expon,
             fit_kws={'color': '#c44e52'},
             color='#55a868',
             ax=absrtns_ax);
absrtns_ax.set_title('Absolute Daily Returns on '+name)
absrtns_ax.set_xlabel('Absolute Value')
absrtns_ax.set_ylabel('Frequency')
x_lims = returns_ax.get_xlim()
x_vals = np.linspace(*x_lims, 200)
# Fit a Student's-t with location fixed at 0, excluding near-zero returns so
# the pronounced spike at zero does not dominate the fit.
params = t.fit(returns[returns.abs() > 0.05], floc=0)
returns_ax.plot(x_vals, t.pdf(x_vals, *params), label='Students-t')
returns_ax.legend(loc='best', frameon=True, fancybox=True, framealpha=0.8)
fig.tight_layout()
The returns follow a Laplace distribution, whereas the absolute returns follow an Exponential (or possibly also a Pareto) distribution. The tails are far too fat for a Gaussian process. Interestingly, a Student's-t distribution appears to be quite a reasonable fit for the returns, if the pronounced peak at zero is neglected.
In [9]:
# Mean-model setup: HAR on the returns with no constant and Student's-t
# innovations. hold_back excludes the first max_lag observations from the fit
# so that different lag choices are compared on the same sample.
params = dict(y=returns,
              constant=False,
              hold_back=max_lag,
              distribution=StudentsT())
har = HarModel(params)
In [10]:
# Greedily grow the HAR lag structure while the BIC keeps improving.
# NOTE(review): refine()/extend() semantics live in helpers.py — presumably
# refine() adjusts the current lags and extend() adds a longer one; confirm there.
while har.bic.gets_smaller():
    har.refine()
    har.extend()
# Fit the selected specification and display the arch summary table.
result = har.model.fit()
result.summary()
Out[10]:
In [11]:
# Given the fitted HAR mean model, select HARCH volatility lags the same way:
# grow the lag structure while the BIC keeps improving.
harch = HarchModel(har.model)
while harch.bic.gets_smaller():
    harch.refine()
    harch.extend()
result = harch.model.fit()
result.summary()
Out[11]:
In [12]:
# Grid search over constant/distribution/volatility combinations, keeping the
# HAR lags found above fixed, and select the specification with the lowest BIC.
fixed = {'y': returns,
         'lags': har.lags,
         'hold_back': max_lag}
tests = {'constant': [False, True],
         'distribution': [StudentsT(), SkewStudent()],
         'volatility': [HARCH(lags=harch.lags),
                        GARCH(p=1, o=0, q=1),
                        GARCH(p=1, o=1, q=1),
                        GARCH(p=1, o=0, q=1, power=1.0),
                        GARCH(p=1, o=1, q=1, power=1.0),
                        EGARCH(p=1, o=0, q=1),
                        EGARCH(p=1, o=1, q=1)]}
fit_kws = {'update_freq': 0,
           'disp': 'off'}
params_grid = (dict(zip(tests.keys(), values)) for values in product(*tests.values()))
# Compare candidates on BIC only. The original bare min() over (bic, params)
# tuples falls through to comparing the params dicts whenever two BICs tie,
# which raises a TypeError; an explicit key avoids that.
min_bic, min_params = min(((HARX(**fixed, **params).fit(**fit_kws).bic, params)
                           for params in params_grid),
                          key=lambda pair: pair[0])
print('Minimum BIC is: ', min_bic)
print('for model with: ', [str(param) for param in min_params.values()])
# Refit the winning specification so `model` and `result` refer to it below.
model = HARX(**fixed, **min_params)
result = model.fit(**fit_kws)
result.summary()
Out[12]:
In [13]:
# Residual diagnostics for the selected mean+volatility model: residuals vs.
# returns, then (partial) autocorrelation of both for comparison.
fig = plt.figure(num='Residuals of HAR+Volatility Model and their (Partial) Autocorrelation', figsize=(9.7, 10))
residual_ax = fig.add_subplot(311)
autocorr_ax = fig.add_subplot(312)
partcorr_ax = fig.add_subplot(313)
returns.plot(ax=residual_ax, linewidth=1, label='Returns')
result.resid.plot(ax=residual_ax, linewidth=0.5, label='Residual')
# Returns minus residuals recovers the fitted (conditional-mean) values.
(returns-result.resid).plot(ax=residual_ax, color='#c44e52', label='Model')
residual_ax.set_xlabel('Time')
residual_ax.set_ylabel('Residual of model')
residual_ax.legend(loc='best', frameon=True, fancybox=True, framealpha=0.8)
autocorrelation_plot(returns, ax=autocorr_ax, linewidth=1, label='Returns')
# resid contains NaNs for the held-back observations; drop them before the ACF.
autocorrelation_plot(result.resid.dropna(), ax=autocorr_ax, linewidth=1, label='Residual')
autocorr_ax.set_xlim(left=0, right=max_lag)
autocorr_ax.legend(loc='best', frameon=True, fancybox=True, framealpha=0.8)
plot_pacf(returns, lags=max_lag, alpha=0.01, ax=partcorr_ax, linewidth=0.5, label='Returns')
plot_pacf(result.resid.dropna(), lags=max_lag, alpha=0.05, ax=partcorr_ax, linewidth=0.5, label='Residual')
# Match axis ranges to the autocorrelation panel for direct visual comparison.
partcorr_ax.set_ylim(autocorr_ax.get_ylim())
partcorr_ax.set_xlim(autocorr_ax.get_xlim())
partcorr_ax.set_title('')
partcorr_ax.set_xlabel('Lag')
partcorr_ax.set_ylabel('Partial autocorrelation')
fig.tight_layout()
In [14]:
# Volatility residuals: absolute returns minus the model's conditional
# volatility.
# NOTE(review): this cell draws onto return_ax/aucorr_ax/pacorr_ax — the axes
# of the absolute-returns figure created in cell In [7]. It therefore depends
# on that cell having been run first (hidden cross-cell state).
resids = (returns.abs()-result.conditional_volatility).dropna()
resids.plot(linewidth=0.5, ax=return_ax, label='Residual')
result.conditional_volatility.plot(linewidth=1, ax=return_ax, label='Model')
return_ax.legend(loc='best', frameon=True, fancybox=True, framealpha=0.8)
autocorrelation_plot(resids, linewidth=1, ax=aucorr_ax, label='Residual')
aucorr_ax.legend(loc='best', frameon=True, fancybox=True, framealpha=0.8)
plot_pacf(resids, lags=max_lag, alpha=0.05, ax=pacorr_ax, linewidth=0.5)
In [15]:
# Simulation-based forecast of the returns, anchored `past` business days
# before the end of the sample so the forecast overlaps observed data.
n_sims = 1000 # Number of simulations to average over
future = 22 # Number of future business days to provide a forecast for
past = 5 # Number of business days in the past to start our the forecast at
start_d = returns.index[-past-1]
horizon = past + future
forecast = result.forecast(horizon=horizon,
                           start=start_d,
                           align='origin',
                           method='simulation',
                           simulations=n_sims)
# The pd.DatetimeIndex(start=..., periods=..., freq=...) constructor was
# removed in pandas 0.25; pd.date_range is the supported way to build a
# regular business-day index.
fc_index = pd.date_range(start=start_d + pd.tseries.offsets.BDay(1), periods=horizon, freq='B')
# Row at the forecast origin holds the h=1..horizon forecasts (align='origin').
fc_frame = pd.DataFrame(data=forecast.mean.iloc[-past-1:].values[0], index=fc_index, columns=['mean'])
# NOTE(review): these bands are mean +/- *variance*, not mean +/- std; taking
# np.sqrt(forecast.variance...) would give bands in return units — confirm
# which was intended before changing the plot.
fc_frame['+var'] = forecast.variance.iloc[-past-1:].values[0]
fc_frame['-var'] = -forecast.variance.iloc[-past-1:].values[0]
fig, ax = plt.subplots(num='Forecast on Returns of '+name)
returns.iloc[-horizon:].plot(ax=ax)
fc_frame['mean'].plot(ax=ax)
fc_frame['+var'].plot(ax=ax, color='#c44e52')
fc_frame['-var'].plot(ax=ax, color='#c44e52')
ax.fill_between(fc_index, fc_frame['+var'], fc_frame['-var'], color='#c44e52', alpha=0.2)
ax.set_ylabel('Daily returns')
fig.tight_layout()
In [16]:
# Translate the simulated return paths into index-level paths and plot them
# together with an empirical 95% confidence band.
past_pad = max_lag # Number of business days in the past to include into the plot
fig, ax = plt.subplots(num='Forecast on Index of '+name, figsize=(9.5, 7))
data.iloc[-past_pad:].plot(ax=ax)
# Positional access on a DatetimeIndex-ed series must use .iloc; the plain
# data[-past-1] integer-fallback indexing was deprecated and removed (pandas 2.0).
init_val = data.iloc[-past-1]
fc_frame = pd.DataFrame(data=np.zeros((horizon, n_sims)), index=fc_index)
fc_frame.loc[start_d] = init_val
fc_frame.sort_index(inplace=True)
for sim in range(n_sims):
    # Row 0 (after sorting) is the known start value; fill the simulated path
    # into the remaining rows. .loc[1:] on a DatetimeIndex raises a TypeError
    # for integer slices, so address rows positionally with .iloc (the columns
    # are a RangeIndex, hence column label `sim` equals its position).
    fc_frame.iloc[1:, sim] = init_val * (forecast.simulations.values[-past-1][sim] / 100 + 1).cumprod()
    fc_frame[sim].plot(ax=ax, color='#55a868', alpha=0.01, linewidth=5)
# Start row: take two copies of init_val so the band degenerates to a point.
confintvls = [fc_frame.iloc[0, :2].tolist()]
for day in range(1, horizon+1):
    confintvls.append(np.percentile(fc_frame.iloc[day], (2.5, 97.5)).tolist())
ci_frame = pd.DataFrame(confintvls, index=fc_frame.index)
ci_frame.plot(ax=ax, color='#c44e52', legend=False)
# The original called set_ylabel twice; the first call was meant to label x.
ax.set_xlabel('Time')
ax.set_ylabel('Index')
fig.tight_layout()
In [19]:
# Simulate a full-length synthetic index path from the fitted model.
# NOTE(review): execution count jumps from In [16] to In [19] — this notebook
# was not run top-to-bottom; re-run with Restart & Run All before sharing.
max_days = data.size # Number of business days to run the simulation for
init_val = 100. # Value that simulated index starts from
# Seed the simulation from observed history. Integer-fallback indexing on a
# DatetimeIndex-ed series was removed in pandas 2.0, so use .iloc explicitly.
initial_vals = returns.iloc[-max(har.lags)]
initial_vola = result.conditional_volatility.iloc[-max(harch.lags)]
sim_returns = model.simulate(result.params,
                             max_days,
                             burn=1000,
                             initial_value=initial_vals,
                             initial_value_vol=initial_vola)
# Compound the simulated percentage returns into an index level path.
sim_trace = [init_val]
sim_trace.extend(init_val * (sim_returns.data / 100 + 1).cumprod())
fig = plt.figure(num='Simulated Traces for the '+name+' Index and its Volatility', figsize=(9.7, 6))
vals_ax = fig.add_subplot(211)
vola_ax = fig.add_subplot(212)
vals_ax.plot(sim_trace)
vals_ax.set_xlabel('Days')
vals_ax.set_ylabel('Index')
vals_ax.set_xlim(left=0, right=max_days)
sim_returns.volatility.plot(ax=vola_ax, color='#c44e52')
vola_ax.set_xlabel('Days')
vola_ax.set_ylabel('Volatility')
vola_ax.set_xlim(left=0, right=max_days)
fig.tight_layout()
In [ ]: