In [1]:
import operator
from functools import reduce
from itertools import repeat, cycle
import datetime
import calendar
from collections import Counter
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib as mpl
from matplotlib import cm
from mpl_toolkits.mplot3d import Axes3D
from matplotlib.collections import PolyCollection, LineCollection
from matplotlib import colors as mcolors
from vixstructure.data import TermStructure, Expirations
In [2]:
mpl.rcParams["figure.figsize"] = 16, 9
In [3]:
x = np.linspace(-3, 3, 61)
fig = plt.figure(figsize=(3.5,2))
plt.axvline(0, color="black", lw=1)
plt.axhline(0, color="black", lw=1)
ax = fig.add_subplot(1, 1, 1)
ax.spines['left'].set_position('zero')
ax.spines['right'].set_color('none')
ax.spines['bottom'].set_position('zero')
ax.spines['top'].set_color('none')
ax.spines['left'].set_smart_bounds(True)
ax.spines['bottom'].set_smart_bounds(True)
ax.xaxis.set_ticks_position('bottom')
ax.yaxis.set_ticks_position('left')
ax.plot(x, np.maximum(0 ,x), lw=2)
plt.xticks(range(-3, 6), [])
plt.yticks(range(-3, 6), [])
plt.text(-0.1, -0.1, 0, ha="right", va="top", size="large")
plt.ylim(-0.5, 3.5)
plt.xlim(-3.5, 3.5)
plt.savefig("rectifier.pdf", format="pdf", bbox_inches="tight", dpi=300)
plt.show()
In [22]:
def selu(z, alpha=1.6732632423543772848170429916717, lam=1.0507009873554804934193349852946):
return lam * z if z > 0 else alpha * np.exp(z) - alpha
vselu = np.vectorize(selu)
fig = plt.figure(figsize=(2.5,2))
plt.axvline(0, color="black", lw=1)
plt.axhline(0, color="black", lw=1)
ax = fig.add_subplot(1, 1, 1)
ax.spines['left'].set_position('zero')
ax.spines['right'].set_color('none')
ax.spines['bottom'].set_position('zero')
ax.spines['top'].set_color('none')
ax.spines['left'].set_smart_bounds(True)
ax.spines['bottom'].set_smart_bounds(True)
ax.xaxis.set_ticks_position('bottom')
ax.yaxis.set_ticks_position('left')
ax.plot(x, vselu(x), lw=2)
plt.xticks(range(-3, 6), [])
plt.yticks(range(-3, 6), [])
plt.text(-0.1, -0.1, 0, ha="right", va="top", size="large")
plt.ylim(-2.5, 3.5)
plt.xlim(-3.5, 3.5)
plt.show()
In [4]:
# Consider value '0' as NaN.
xm_settle = pd.read_csv("data/8_m_settle.csv", usecols=range(1,10), dtype=np.float32,
parse_dates=[0], header=0, index_col=0, na_values=0)
xm_symbols = pd.read_csv("data/8_m_symbols.csv", usecols=range(1,10), parse_dates=[0],
header=0, index_col=0, na_values=0)
expiration_months = pd.read_csv("data/expiration.months.csv", header=0, usecols=range(1,13), dtype=np.float32)
vix_index = pd.read_csv("data/vix.csv", parse_dates=[0], header=0, index_col=0, na_values="null",
dtype=np.float32)
expirations = pd.read_csv("data/expirations.csv", parse_dates=list(range(0,9)), usecols=range(1,10),
header=0, index_col=0)
In [5]:
vix_index["Adj Close"]["2006":"2016"].plot(figsize=(8,4))
plt.xlabel("")
plt.savefig("vix.pdf", format="pdf", bbox_inches="tight", dpi=300)
plt.show()
In [6]:
# Plot part of the data.
lines = xm_settle.iloc[1000:2000].T
lines.fillna(method='pad').plot(legend=False, figsize=(16,8), colormap=cm.Blues)
plt.grid()
plt.show()
In [7]:
data_line = xm_settle.iloc[1240:3000:252].iloc[1:]
data_line.T.plot(style="-", legend=False, figsize=(4,5))
plt.grid()
plt.xticks(np.arange(8), calendar.month_abbr[3:])
plt.legend(tuple(map(lambda x: x.strftime("%m/%d/%Y"), data_line.index.date)), title=False)
plt.savefig("term-structure.pdf", format="pdf", bbox_inches="tight", dpi=300)
plt.show()
In [8]:
data_line = xm_settle.loc["2008-11-20":"2008-12-17"]
data_line.T.plot(style="-", legend=False, figsize=(4,5), colormap=cm.coolwarm)
plt.grid()
plt.xticks(np.arange(8), calendar.month_abbr[12:] + calendar.month_abbr[1:8])
plt.savefig("term-structure-crisis.pdf", format="pdf", bbox_inches="tight", dpi=300)
plt.show()
In [9]:
data_line = xm_settle.loc["2012-02-29"]
ax1 = data_line.T.plot(style="-o", legend=False, figsize=(8,4), color="green")
ax1.set_ylabel("futures price", color="green")
plt.xticks(np.arange(8), calendar.month_abbr[3:11])
ax1.tick_params(axis='y', colors='green')
data_line_spread = data_line.aggregate(
lambda x: pd.Series([np.nan] + [2*x[i] - x[i-1] - x[i+1] for i in range(1, len(x)-1)] + [np.nan],
index=calendar.month_abbr[3:11]))
ax2 = data_line_spread.plot(secondary_y=True, color="blue", style="-o")
ax2.set_ylabel("spread price", rotation=270, color="blue")
ax2.tick_params(axis='y', colors='blue')
ax1.grid(axis="x")
plt.axhline(0, color="CornflowerBlue", linestyle="--")
plt.savefig("long_spread.pdf", format="pdf", bbox_inches="tight", dpi=300)
plt.show()
In [10]:
mxs = [xm_settle.iloc[2000:3000, i] for i in range(8)]
contangos = [(mxs[i + 1] - mxs[i]) / mxs[i] for i in range(8 - 1)]
contango_labels = ["M{}-M{}-Contango".format(i+1, i) for i in range(1,8)]
plt.figure(figsize=(16,10))
for i in range(len(contangos)):
contangos[i].plot(label=contango_labels[i], legend=True)
plt.axhline(0, color="black")
plt.xlabel("")
plt.grid()
plt.show()
In [11]:
vix = vix_index["Adj Close"] # Only this one is needed for the index.
trainingdata = pd.merge(pd.DataFrame(vix), xm_settle, left_index=True, right_index=True)
The make sure the training error isn't exploding it's best to normalize the data so its in (-1,1)-range. Doing this as simple as possible is preferred. Too much data wrangling beforehand might introduce some unwanted prior. Wenn normalizing the data all NaN values (including 0) aren't considered because they stand for not available data and influence the result too much.
In [12]:
mean = trainingdata.mean()
ptp = trainingdata.max() - trainingdata.min()
normalized = (trainingdata - mean) / ptp
In [13]:
normalized.plot(figsize=(16,10))
plt.grid()
plt.show()
In [14]:
# Create a new data frame (not very efficient)
xm_year = pd.DataFrame(index=xm_settle.index, columns=expiration_months.columns, dtype=np.float32)
def symbol_to_month(symbol):
mapping_dict = {"F":"January", "G":"February", "H":"March", "J":"April", "K":"May", "M":"June",
"N":"July", "Q":"August", "U":"September", "V":"October", "X":"November", "Z":"December"}
return mapping_dict[symbol[0]]
for date, data in xm_year.iterrows():
symbol = xm_symbols.loc[date].dropna().map(symbol_to_month)
settle = xm_settle[symbol.index].loc[date]
data[symbol] = settle
In [15]:
def plot_xm_year(index, scale=None, save=False):
global xm_year
month_to_x = {month:idx for idx, month in enumerate(xm_year.columns)}
data = xm_year.iloc[index].dropna()
x = data.rename(month_to_x).index.values
y = data.values
global xm_symbols
months = xm_symbols.iloc[index].dropna().map(symbol_to_month).map(month_to_x)
months_increasing = all(months[i] < months[i+1] for i in range(len(months) - 1))
plt.figure(figsize=(16,9))
if months_increasing:
plt.plot(x, y, "-ob")
else:
splitindex = len(months) - months.values.argmin()
plt.plot(x[:splitindex], y[:splitindex], "-ob")
plt.plot(x[splitindex:], y[splitindex:], "-ob")
plt.xticks(np.arange(12), xm_year.columns)
if scale:
y_min = xm_year.min().min()
y_max = xm_year.max().max()
plt.ylim(y_min, y_max)
plt.xlim(0, 11)
ydiff = plt.ylim()[1] - plt.ylim()[0]
xdiff = plt.xlim()[1] - plt.xlim()[0]
for idx, idy in zip(x, y):
plt.text(idx + xdiff/200, idy + ydiff/200, idy)
plt.grid(axis="x")
plt.title(xm_year.iloc[index].name.date())
if save:
plt.savefig("img_annual_structure/{:04d}.png".format(index))
plt.close()
else:
plt.show()
plot_xm_year(3000)
In [16]:
# Create a new data frame for spread prices
long_prices = pd.DataFrame(index=xm_year.index, columns=xm_year.columns, dtype=np.float32)
for date, data in xm_year.iterrows():
data = np.concatenate((data[0:1], data, data[-1:]))
# Calculate long prices (buy 2-1-1)
prices = [2*data[i] - data[i-1] - data[i+1] for i in range(1,13)]
long_prices.loc[date] = prices
In [17]:
index = 3000
long_prices.iloc[index].plot(figsize=(16,9))
(xm_year.iloc[index] - xm_year.iloc[index].mean()).plot()
plt.grid()
plt.xticks(np.arange(12), long_prices.columns)
plt.legend(("Long prices", "Centered futures"), loc="upper left")
plt.show()
In [18]:
# There are times when no spread prices can be calculated.
long_prices.dropna(how="all").shape
Out[18]:
In [19]:
def plot_one_date(data_point, ha="left", va="bottom"):
plt.plot(data_point.name, data_point.isnull().sum(), 'rx')
plt.text(data_point.name, data_point.isnull().sum(), data_point.name.date(), ha=ha, va=va, color="r")
xs_null = xm_settle.isnull().sum(axis=1)
xs_null.plot()
plt.ylabel("Number of missing values")
plt.xlabel("")
count_nan = xm_settle.isnull().sum(axis=1)
last_day_with_many_nans = count_nan[count_nan > 2].index[-1]
first_day_with_usable_data = xm_settle.loc[last_day_with_many_nans:].iloc[1]
plot_one_date(first_day_with_usable_data, "right", "top")
plot_one_date(xm_settle.loc[last_day_with_many_nans])
plt.show()
In [20]:
xs_null = xs_null[xs_null > 0]
xs_null.groupby(xs_null.index.year).max().plot.bar(figsize=(4,3), color="r", width=0.8)
xs_full = xm_settle.isnull().sum(axis=1)
xs_full.groupby(xs_full.index.year).mean().plot.bar(stacked=True, width=0.8)
plt.xlim(-0.6,7.5)
plt.legend(("Max", "Mean"))
plt.xlabel("")
plt.ylabel("Number of missing values")
plt.grid(axis="y")
plt.savefig("missing_values.pdf", format="pdf", bbox_inches="tight", dpi=300)
plt.show()
In [21]:
clip_settle = xm_settle.loc[last_day_with_many_nans + datetime.timedelta(days=1):]
clip_year = xm_year.loc[last_day_with_many_nans + datetime.timedelta(days=1):]
assert clip_settle.index.identical(clip_year.index)
print("These are the data points you can actually use:")
len(clip_settle)
Out[21]:
In [22]:
clip_settle.loc["2006-11-15"].plot(figsize=(16,9))
plt.show()
In [23]:
clip_settle.isnull().sum()
Out[23]:
In [24]:
clip_settle.interpolate()[clip_settle.isnull()["M6"] == True].T.plot(figsize=(16,9))
plt.axvspan(5, 8, color=(0.9,0.9,1))
plt.show()
In [25]:
dates = pd.Series(xm_settle.index)
In [26]:
dates_diff = dates.diff()
dates_diff.groupby(dates_diff).count()
Out[26]:
In [27]:
weekdays = dates.map(operator.methodcaller("weekday"))
name_of_weekday = tuple(calendar.day_abbr)
weekdays = weekdays.groupby(weekdays).count()
weekdays.index = weekdays.index.map(lambda x: name_of_weekday[x])
weekdays.plot.bar(figsize=(4,3), width=0.6)
plt.xlabel("")
plt.savefig("weekdays.pdf", format="pdf", bbox_inches="tight", dpi=300)
plt.show()
In [28]:
expirations = expirations.loc["2006-10-23":]
xm_settle = xm_settle.loc["2006-10-23":]
assert expirations.shape == xm_settle.shape
In [29]:
expirations_v1 = expirations["V1"]
In [30]:
until_expiration = pd.Series(expirations_v1.values - expirations_v1.index.values)
assert len(until_expiration) == len(expirations)
until_expiration.index = expirations.index
until_expiration.name = "Expiration"
In [31]:
xm_settle.iloc[-1].plot(figsize=(10,5))
plt.axhline(vix[-1], color="black")
date = xm_settle.iloc[-1].name
plt.title("{} ({} days to expiration)".format(date.date(), until_expiration.loc[date].days))
plt.show()
In [32]:
settle_diff = xm_settle.diff(axis=1).iloc[:,1:] # Because first column only has NaNs
settle_diff = settle_diff.join(xm_settle.iloc[:,0]) # Get original first value in
settle_diff = settle_diff.iloc[:,range(-1,7)] # Get the order right: M1 -> M8
settle_diff
Out[32]:
In [33]:
settle_diff.plot(figsize=(16,9))
print(settle_diff.min().min(), settle_diff.max().max())
plt.title("Differences between spread prices.")
plt.show()
In [34]:
expiration_indices = until_expiration.where(until_expiration == pd.Timedelta(0))
expiration_indices.index = range(len(expiration_indices))
expiration_indices = expiration_indices.dropna().index
expiration_indices
Out[34]:
In [35]:
# First consider the expired futures. One the next day they get value NaN.
spreads = xm_settle.copy()
spreads.iloc[expiration_indices[:-1] + 1] = xm_settle.iloc[expiration_indices[:-1] + 1].iloc[:,1:].assign(M1=lambda x: np.NaN).iloc[:,range(-1,7)]
assert len(spreads[spreads.M1.isnull() == True]) == len(expiration_indices) - 1
def calculate_long_prices(term: pd.Series):
longs = [2*term[i] - term[i-1] - term[i+1] for i in range(1, len(term)-1)]
return pd.Series(longs, term[1:-1].index)
spreads = spreads.apply(calculate_long_prices, axis=1)
In [36]:
xm_settle.shape
Out[36]:
In [37]:
assert spreads.index.identical(settle_diff.index)
In [38]:
xm_settle7 = xm_settle.loc[:,"M1":"M7"]
xm_settle7_test = xm_settle7.loc["2006-11-15"].dropna()
xm_settle7_test
Out[38]:
In [39]:
xm_settle7_test.index = list(range(len(xm_settle7_test.index)))
In [40]:
z = np.polyfit(xm_settle7_test.index, xm_settle7_test, 2)
p = np.poly1d(z)
xm_settle7_test.plot()
xp = np.linspace(0, 5, 100)
plt.plot(xp, p(xp))
plt.show()
In [41]:
xm_settle7[xm_settle7.M7.isnull()].T.plot(figsize=(10,200), legend=False, subplots=True)
plt.show()
In [42]:
spreads_norm = (spreads - spreads.mean()) / (spreads.max() - spreads.min())
print(spreads_norm.min().min(), spreads_norm.max().max())
spreads_norm.plot()
plt.show()
In [43]:
settle_norm = (settle_diff - settle_diff.mean()) / (settle_diff.max() - settle_diff.min())
print(settle_norm.min().min(), settle_norm.max().max())
settle_norm.plot()
plt.show()
In [44]:
settle_denorm = settle_norm.values * (settle_diff.max() - settle_diff.min()).values + settle_diff.mean().values
pd.DataFrame(settle_denorm).plot()
plt.show()
In [45]:
settle_test = (settle_diff - settle_denorm)
settle_test.plot()
plt.show()
In [46]:
fig, axes = plt.subplots(nrows=2, sharex=True, figsize=(10,10))
settle_diff.plot(ax=axes[0])
def plot_vlines(ax):
l = len(settle_diff)
s = int(l * 0.15 / 2)
# Validation (red)
ax.axvspan(settle_diff.index[int(l/2-s)], settle_diff.index[int(l / 2)], facecolor="r", alpha=0.3)
ax.axvspan(settle_diff.index[-s], settle_diff.index[-1], facecolor="r", alpha=0.3)
# Test (green)
ax.axvspan(settle_diff.index[int(l/2-2*s)], settle_diff.index[int(l/2)-s], facecolor="g", alpha=0.3)
ax.axvspan(settle_diff.index[-2*s], settle_diff.index[-s], facecolor="g", alpha=0.3)
plot_vlines(axes[0])
axes[0].set_title("Differences of futures' term structures")
spreads.plot(ax=axes[1])
plot_vlines(axes[1])
axes[1].set_title("Butterfly spread prices")
plt.show()
In [47]:
expirations = Expirations("data/expirations.csv")
termStructure = TermStructure("data/8_m_settle.csv", expirations)
In [48]:
naive_prediction_error = termStructure.long_prices[:-1].values - termStructure.long_prices[1:].values
In [49]:
print("Naive MSE. I definitely have to beat this!")
print(np.nanmean(np.square(naive_prediction_error)))
In [50]:
exp_counter = pd.Series(Counter(expirations.days_to_expiration), dtype=int)
exp_counter.index = pd.to_numeric(exp_counter.index, downcast="integer")
exp_counter.plot.bar(figsize=(8,2), width=0.8)
plt.ylabel("Nr. of occurences")
plt.savefig("days_to_expiration.pdf", format="pdf", bbox_inches="tight", dpi=300)
plt.show()
In [51]:
xm_settle.dropna(how="all", axis=(0,1), inplace=True)
xm_settle.describe()
Out[51]:
In [52]:
xm_settle[xm_settle.isnull().sum(axis=1) >= 5]
Out[52]:
In [53]:
#exp_last = expirations.loc[xm_settle.iloc[-1].name]
#exp_last.index = xm_settle.iloc[-1].index
#templol = pd.concat((xm_settle.iloc[-1], exp_last), axis=1).T
#templol.index = ["Value", "Expiration"]
#templol.loc["Expiration"] = templol.loc["Expiration"].apply(lambda x: x.strftime("%b %d"))
#print(templol.to_latex())
In [54]:
xm_settle.iloc[::, ::-1].plot(colormap=cm.winter_r, figsize=(8,4), linewidth=1.0, alpha=0.8)
plt.xlabel("")
plt.savefig("termstructures.pdf", format="pdf", bbox_inches="tight", dpi=300)
plt.show()
In [55]:
fig = plt.figure(figsize=(8, 4))
ax = fig.gca(projection='3d')
xs = np.arange(len(xm_settle))
zs = np.arange(0, 8)
verts = []
xm_settle_3dplot = xm_settle.copy()
xm_settle_3dplot.index = xs
for z in zs:
ys = xm_settle_3dplot.iloc[:,int(z)].fillna(10)
ys.iloc[0] = 10
ys.iloc[-1] = 10
verts.append(list(zip(ys.index.values, ys.values)))
poly = PolyCollection(verts, linewidth=2.0, facecolors=[cm.winter(i, 0.8) for i in np.linspace(0, 1, 8)])
ax.add_collection3d(poly, zs=zs, zdir='y')
ax.set_xlim3d(0, len(xm_settle))
ax.set_xticks([(xm_settle.index.year == year).argmax() for year in xm_settle.index.year.unique()[1::2]])
ax.set_xticklabels(xm_settle.index.year.unique()[1::2])
ax.set_ylim3d(0.0, 7.5)
ax.set_yticklabels(["M1", "M2", "M3", "M4", "M5", "M6", "M7", "M8"])
ax.set_zlim3d(11, 50)
plt.savefig("termstructures.pdf", format="pdf", bbox_inches="tight", dpi=300)
plt.show()
In [56]:
xm_settle.groupby(xm_settle.index.year).count().sum(axis=1).plot.bar(figsize=(4,3), width=0.7)
plt.xlabel("")
plt.ylabel("Number of samples")
plt.grid(axis="y")
plt.savefig("number_of_samples.pdf", format="pdf", bbox_inches="tight", dpi=300)
plt.show()
In [57]:
spreads_descr = pd.concat({key:spreads[key] for key in spreads.columns}).describe()
In [58]:
termstr_descr = pd.concat({key:xm_settle[key] for key in xm_settle.columns}).describe()
In [59]:
print(pd.concat([termstr_descr, spreads_descr], axis=1, keys=["term structure", "spread prices"]).iloc[1:].T.to_latex(float_format='%.2f'))
In [114]:
fig, axes = plt.subplots(nrows=2, sharex=True, figsize=(8,8))
xm_settle.plot(ax=axes[0], cmap=cm.brg, linewidth=0.8)
axes[0].grid(axis="x")
axes[0].set_ylabel("Futures price")
axes[0].legend(loc="upper left")
def plot_vlines(ax):
l = len(xm_settle)
s = int(l * 0.15 / 2)
# Validation (red)
ax.axvspan(xm_settle.index[int(l/2-s)], xm_settle.index[int(l / 2)], facecolor="r", alpha=0.3)
ax.axvspan(xm_settle.index[-s], xm_settle.index[-1], facecolor="r", alpha=0.3)
# Test (green)
ax.axvspan(xm_settle.index[int(l/2-2*s)], xm_settle.index[int(l/2)-s], facecolor="g", alpha=0.3)
ax.axvspan(xm_settle.index[-2*s], xm_settle.index[-s], facecolor="g", alpha=0.3)
plot_vlines(axes[0])
spreads.plot(ax=axes[1], cmap=cm.brg, linewidth=0.8)
axes[1].grid(axis="x")
axes[1].set_ylabel("Spread price")
plot_vlines(axes[1])
plt.xlabel("")
plt.savefig("validation-and-test-set.pdf", format="pdf", bbox_inches="tight", dpi=300)
plt.show()
In [ ]: