In [1]:
import operator
from functools import reduce
from itertools import repeat, cycle
import datetime
import calendar
from collections import Counter

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib as mpl
from matplotlib import cm
from mpl_toolkits.mplot3d import Axes3D
from matplotlib.collections import PolyCollection, LineCollection
from matplotlib import colors as mcolors

from vixstructure.data import TermStructure, Expirations

In [2]:
mpl.rcParams["figure.figsize"] = 16, 9

In [3]:
x = np.linspace(-3, 3, 61)
fig = plt.figure(figsize=(3.5,2))
plt.axvline(0, color="black", lw=1)
plt.axhline(0, color="black", lw=1)
ax = fig.add_subplot(1, 1, 1)
ax.spines['left'].set_position('zero')
ax.spines['right'].set_color('none')
ax.spines['bottom'].set_position('zero')
ax.spines['top'].set_color('none')
ax.spines['left'].set_smart_bounds(True)
ax.spines['bottom'].set_smart_bounds(True)
ax.xaxis.set_ticks_position('bottom')
ax.yaxis.set_ticks_position('left')
ax.plot(x, np.maximum(0, x), lw=2)
plt.xticks(range(-3, 6), [])
plt.yticks(range(-3, 6), [])
plt.text(-0.1, -0.1, 0, ha="right", va="top", size="large")
plt.ylim(-0.5, 3.5)
plt.xlim(-3.5, 3.5)
plt.savefig("rectifier.pdf", format="pdf", bbox_inches="tight", dpi=300)
plt.show()



In [22]:
def selu(z, alpha=1.6732632423543772848170429916717, lam=1.0507009873554804934193349852946):
    # SELU scales both branches by lambda: lam * z for z > 0, lam * alpha * (exp(z) - 1) otherwise.
    return lam * z if z > 0 else lam * (alpha * np.exp(z) - alpha)
vselu = np.vectorize(selu)
fig = plt.figure(figsize=(2.5,2))
plt.axvline(0, color="black", lw=1)
plt.axhline(0, color="black", lw=1)
ax = fig.add_subplot(1, 1, 1)
ax.spines['left'].set_position('zero')
ax.spines['right'].set_color('none')
ax.spines['bottom'].set_position('zero')
ax.spines['top'].set_color('none')
ax.spines['left'].set_smart_bounds(True)
ax.spines['bottom'].set_smart_bounds(True)
ax.xaxis.set_ticks_position('bottom')
ax.yaxis.set_ticks_position('left')
ax.plot(x, vselu(x), lw=2)
plt.xticks(range(-3, 6), [])
plt.yticks(range(-3, 6), [])
plt.text(-0.1, -0.1, 0, ha="right", va="top", size="large")
plt.ylim(-2.5, 3.5)
plt.xlim(-3.5, 3.5)
plt.show()



In [4]:
# Consider value '0' as NaN.
xm_settle = pd.read_csv("data/8_m_settle.csv", usecols=range(1,10), dtype=np.float32,
                        parse_dates=[0], header=0, index_col=0, na_values=0)
xm_symbols = pd.read_csv("data/8_m_symbols.csv", usecols=range(1,10), parse_dates=[0],
                         header=0, index_col=0, na_values=0)
expiration_months = pd.read_csv("data/expiration.months.csv", header=0, usecols=range(1,13), dtype=np.float32)
vix_index = pd.read_csv("data/vix.csv", parse_dates=[0], header=0, index_col=0, na_values="null",
                        dtype=np.float32)
expirations = pd.read_csv("data/expirations.csv", parse_dates=list(range(0,9)), usecols=range(1,10),
                                header=0, index_col=0)

In [5]:
vix_index["Adj Close"]["2006":"2016"].plot(figsize=(8,4))
plt.xlabel("")
plt.savefig("vix.pdf", format="pdf", bbox_inches="tight", dpi=300)
plt.show()


Plot all the term structures over a large range

Helpful to get a general feeling for the data.


In [6]:
# Plot part of the data.
lines = xm_settle.iloc[1000:2000].T
lines.fillna(method='pad').plot(legend=False, figsize=(16,8), colormap=cm.Blues)
plt.grid()
plt.show()


And this is a single term structure

One of the nice and clean ones.


In [7]:
data_line = xm_settle.iloc[1240:3000:252].iloc[1:]
data_line.T.plot(style="-", legend=False, figsize=(4,5))
plt.grid()
plt.xticks(np.arange(8), calendar.month_abbr[3:11])
plt.legend(tuple(map(lambda x: x.strftime("%m/%d/%Y"), data_line.index.date)), title=None)
plt.savefig("term-structure.pdf", format="pdf", bbox_inches="tight", dpi=300)
plt.show()



In [8]:
data_line = xm_settle.loc["2008-11-20":"2008-12-17"]
data_line.T.plot(style="-", legend=False, figsize=(4,5), colormap=cm.coolwarm)
plt.grid()
plt.xticks(np.arange(8), calendar.month_abbr[12:] + calendar.month_abbr[1:8])
plt.savefig("term-structure-crisis.pdf", format="pdf", bbox_inches="tight", dpi=300)
plt.show()



In [9]:
data_line = xm_settle.loc["2012-02-29"]
ax1 = data_line.T.plot(style="-o", legend=False, figsize=(8,4), color="green")
ax1.set_ylabel("futures price", color="green")
plt.xticks(np.arange(8), calendar.month_abbr[3:11])
ax1.tick_params(axis='y', colors='green')
data_line_spread = data_line.aggregate(
    lambda x: pd.Series([np.nan] + [2*x[i] - x[i-1] - x[i+1] for i in range(1, len(x)-1)] + [np.nan],
                        index=calendar.month_abbr[3:11]))
ax2 = data_line_spread.plot(secondary_y=True, color="blue", style="-o")
ax2.set_ylabel("spread price", rotation=270, color="blue")
ax2.tick_params(axis='y', colors='blue')
ax1.grid(axis="x")
plt.axhline(0, color="CornflowerBlue", linestyle="--")
plt.savefig("long_spread.pdf", format="pdf", bbox_inches="tight", dpi=300)
plt.show()


Here I plot the contangos for a large range

Notice how the fluctuation of the M2-M1 contango is the largest, and how it steadily decreases for later expiration months.
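
For reference, the contango between adjacent legs is measured here as a relative difference, matching the computation in the next cell:

    contango_i = (M_{i+1} - M_i) / M_i,   i = 1, ..., 7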


In [10]:
mxs = [xm_settle.iloc[2000:3000, i] for i in range(8)]
contangos = [(mxs[i + 1] - mxs[i]) / mxs[i] for i in range(8 - 1)]
contango_labels = ["M{}-M{}-Contango".format(i+1, i) for i in range(1,8)]
plt.figure(figsize=(16,10))
for i in range(len(contangos)):
    contangos[i].plot(label=contango_labels[i], legend=True)
plt.axhline(0, color="black")
plt.xlabel("")
plt.grid()
plt.show()



In [11]:
vix = vix_index["Adj Close"]   # Only this one is needed for the index.
trainingdata = pd.merge(pd.DataFrame(vix), xm_settle, left_index=True, right_index=True)

Experimenting with data normalization

To make sure the training error isn't exploding, it's best to normalize the data so it lies in the (-1, 1) range. Doing this as simply as possible is preferred; too much data wrangling beforehand might introduce an unwanted prior. When normalizing the data, all NaN values (including the original 0 entries) are not considered, because they stand for unavailable data and would influence the result too much.
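
Concretely, the cell below normalizes each column x as

    x_norm = (x - mean(x)) / (max(x) - min(x))

pandas' mean, max and min skip NaN by default, so the missing values really are excluded from these statistics.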


In [12]:
mean = trainingdata.mean()
ptp = trainingdata.max() - trainingdata.min()
normalized = (trainingdata - mean) / ptp

In [13]:
normalized.plot(figsize=(16,10))
plt.grid()
plt.show()


Mapping data to annual structure

At the moment the x-axis is [M1, ..., M8], but I want it to be [Jan, Feb, ..., Dec]; all points without a given value are NaN. Something like expiration_months, but with dates. The first letter of each futures symbol encodes its expiration month (F = January, G = February, ..., Z = December), so every settle value can be placed in the matching calendar-month column.


In [14]:
# Create a new data frame (not very efficient)
xm_year = pd.DataFrame(index=xm_settle.index, columns=expiration_months.columns, dtype=np.float32)
def symbol_to_month(symbol):
    mapping_dict = {"F":"January", "G":"February", "H":"March", "J":"April", "K":"May", "M":"June",
                    "N":"July", "Q":"August", "U":"September", "V":"October", "X":"November", "Z":"December"}
    return mapping_dict[symbol[0]]
for date, data in xm_year.iterrows():
    symbol = xm_symbols.loc[date].dropna().map(symbol_to_month)
    settle = xm_settle[symbol.index].loc[date]
    data[symbol] = settle
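
A quick sanity check of the month-code mapping above (the symbol strings here are hypothetical; only the leading month letter matters):

    assert symbol_to_month("F7") == "January"    # F = January
    assert symbol_to_month("Z6") == "December"   # Z = December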

In [15]:
def plot_xm_year(index, scale=None, save=False):
    global xm_year
    month_to_x = {month:idx for idx, month in enumerate(xm_year.columns)}
    data = xm_year.iloc[index].dropna()
    x = data.rename(month_to_x).index.values
    y = data.values
    global xm_symbols
    months = xm_symbols.iloc[index].dropna().map(symbol_to_month).map(month_to_x)
    months_increasing = all(months[i] < months[i+1] for i in range(len(months) - 1))
    plt.figure(figsize=(16,9))
    if months_increasing:
        plt.plot(x, y, "-ob")
    else:
        splitindex = len(months) - months.values.argmin()
        plt.plot(x[:splitindex], y[:splitindex], "-ob")
        plt.plot(x[splitindex:], y[splitindex:], "-ob")
    plt.xticks(np.arange(12), xm_year.columns)
    if scale:
        y_min = xm_year.min().min()
        y_max = xm_year.max().max()
        plt.ylim(y_min, y_max)
    plt.xlim(0, 11)
    ydiff = plt.ylim()[1] - plt.ylim()[0]
    xdiff = plt.xlim()[1] - plt.xlim()[0]
    for idx, idy in zip(x, y):
        plt.text(idx + xdiff/200, idy + ydiff/200, idy)
    plt.grid(axis="x")
    plt.title(xm_year.iloc[index].name.date())
    if save:
        plt.savefig("img_annual_structure/{:04d}.png".format(index))
        plt.close()
    else:
        plt.show()
plot_xm_year(3000)



In [16]:
# Create a new data frame for spread prices
long_prices = pd.DataFrame(index=xm_year.index, columns=xm_year.columns, dtype=np.float32)
for date, data in xm_year.iterrows():
    data = np.concatenate((data[0:1], data, data[-1:]))
    # Calculate long prices (buy 2-1-1)
    prices = [2*data[i] - data[i-1] - data[i+1] for i in range(1,13)]
    long_prices.loc[date] = prices

In [17]:
index = 3000
long_prices.iloc[index].plot(figsize=(16,9))
(xm_year.iloc[index] - xm_year.iloc[index].mean()).plot()
plt.grid()
plt.xticks(np.arange(12), long_prices.columns)
plt.legend(("Long prices", "Centered futures"), loc="upper left")
plt.show()


What to do with NaN values?

There is a high number of NaN values, especially in the early data. It is best to remove this first part because it is simply too unstable.


In [18]:
# There are times when no spread prices can be calculated.
long_prices.dropna(how="all").shape


Out[18]:
(2887, 12)

In [19]:
def plot_one_date(data_point, ha="left", va="bottom"):
    plt.plot(data_point.name, data_point.isnull().sum(), 'rx')
    plt.text(data_point.name, data_point.isnull().sum(), data_point.name.date(), ha=ha, va=va, color="r")
xs_null = xm_settle.isnull().sum(axis=1)
xs_null.plot()
plt.ylabel("Number of missing values")
plt.xlabel("")
count_nan = xm_settle.isnull().sum(axis=1)
last_day_with_many_nans = count_nan[count_nan > 2].index[-1]
first_day_with_usable_data = xm_settle.loc[last_day_with_many_nans:].iloc[1]
plot_one_date(first_day_with_usable_data, "right", "top")
plot_one_date(xm_settle.loc[last_day_with_many_nans])
plt.show()



In [20]:
xs_null = xs_null[xs_null > 0]
xs_null.groupby(xs_null.index.year).max().plot.bar(figsize=(4,3), color="r", width=0.8)
xs_full = xm_settle.isnull().sum(axis=1)
xs_full.groupby(xs_full.index.year).mean().plot.bar(stacked=True, width=0.8)
plt.xlim(-0.6,7.5)
plt.legend(("Max", "Mean"))
plt.xlabel("")
plt.ylabel("Number of missing values")
plt.grid(axis="y")
plt.savefig("missing_values.pdf", format="pdf", bbox_inches="tight", dpi=300)
plt.show()



In [21]:
clip_settle = xm_settle.loc[last_day_with_many_nans + datetime.timedelta(days=1):]
clip_year = xm_year.loc[last_day_with_many_nans + datetime.timedelta(days=1):]
assert clip_settle.index.identical(clip_year.index)
print("These are the data points you can actually use:")
len(clip_settle)


These are the data points you can actually use:
Out[21]:
2656

In [22]:
clip_settle.loc["2006-11-15"].plot(figsize=(16,9))
plt.show()



In [23]:
clip_settle.isnull().sum()


Out[23]:
M1      0
M2      0
M3      0
M4      0
M5      0
M6     27
M7    186
M8    408
dtype: int64

In [24]:
clip_settle.interpolate()[clip_settle.isnull()["M6"] == True].T.plot(figsize=(16,9))
plt.axvspan(5, 8, color=(0.9,0.9,1))
plt.show()


How large are the gaps in the data from holidays, weekends etc.?

  • A gap of one day is normal
  • A gap of three days (Friday to Monday) means a weekend, and that is essentially every fifth gap

In [25]:
dates = pd.Series(xm_settle.index)

In [26]:
dates_diff = dates.diff()
dates_diff.groupby(dates_diff).count()


Out[26]:
date
1 days    2592
2 days      27
3 days     594
4 days      89
5 days       2
Name: date, dtype: int64

In [27]:
weekdays = dates.map(operator.methodcaller("weekday"))
name_of_weekday = tuple(calendar.day_abbr)
weekdays = weekdays.groupby(weekdays).count()
weekdays.index = weekdays.index.map(lambda x: name_of_weekday[x])
weekdays.plot.bar(figsize=(4,3), width=0.6)
plt.xlabel("")
plt.savefig("weekdays.pdf", format="pdf", bbox_inches="tight", dpi=300)
plt.show()


Finding the right input data

Note that only data from 2006-10-23 onwards is used (see above).


In [28]:
expirations = expirations.loc["2006-10-23":]
xm_settle = xm_settle.loc["2006-10-23":]
assert expirations.shape == xm_settle.shape

1. Time to expiration


In [29]:
expirations_v1 = expirations["V1"]

In [30]:
until_expiration = pd.Series(expirations_v1.values - expirations_v1.index.values)
assert len(until_expiration) == len(expirations)
until_expiration.index = expirations.index
until_expiration.name = "Expiration"

In [31]:
xm_settle.iloc[-1].plot(figsize=(10,5))
plt.axhline(vix[-1], color="black")
date = xm_settle.iloc[-1].name
plt.title("{} ({} days to expiration)".format(date.date(), until_expiration.loc[date].days))
plt.show()


2. Difference between term structure legs

This is the input data.
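
Concretely, the input vector for a day t is the front-month price plus the seven consecutive leg differences, as built in the next cell:

    (M1_t, M2_t - M1_t, M3_t - M2_t, ..., M8_t - M7_t)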


In [32]:
settle_diff = xm_settle.diff(axis=1).iloc[:,1:] # Because first column only has NaNs
settle_diff = settle_diff.join(xm_settle.iloc[:,0]) # Get original first value in
settle_diff = settle_diff.iloc[:,range(-1,7)] # Get the order right: M1 -> M8
settle_diff


Out[32]:
M1 M2 M3 M4 M5 M6 M7 M8
date
2006-10-23 11.900000 1.250000 0.930000 0.570000 0.380000 0.500000 0.170000 NaN
2006-10-24 11.770000 1.040000 1.250000 0.440000 0.660000 0.250000 0.190001 NaN
2006-10-25 11.490000 1.030001 1.219999 0.730000 0.450000 0.510000 0.170000 NaN
2006-10-26 11.270000 0.950000 1.280000 0.660000 0.800000 0.290000 0.190000 NaN
2006-10-27 11.350000 0.900000 1.240000 0.740000 0.760000 0.200000 0.260000 NaN
2006-10-30 11.300000 0.970000 1.080000 0.799999 0.510000 0.440001 0.365000 NaN
2006-10-31 11.350000 0.799999 1.050000 0.750000 0.660000 0.370000 0.270000 NaN
2006-11-01 11.510000 0.940000 0.790000 0.710000 0.540000 0.390000 0.200000 NaN
2006-11-02 11.660000 0.780000 0.720000 0.640000 0.710000 0.240000 0.300000 NaN
2006-11-03 11.500000 0.980000 0.560000 0.900000 0.520000 0.300000 0.240000 NaN
2006-11-06 11.140000 0.860000 0.950000 0.770000 0.550000 0.419999 0.310000 NaN
2006-11-07 11.210000 0.860000 0.810000 0.809999 0.460000 0.380000 0.460000 NaN
2006-11-08 10.900000 0.920000 0.980000 0.500000 0.639999 0.490001 0.469999 NaN
2006-11-09 10.950000 0.810000 1.040000 0.500000 0.630000 0.520000 0.380000 NaN
2006-11-10 10.990000 0.990000 0.740001 0.440000 0.590000 0.640000 0.389999 NaN
2006-11-13 10.590000 1.360000 0.780000 0.450001 0.520000 0.630000 0.450000 NaN
2006-11-14 10.240000 1.460000 0.820001 0.500000 0.580000 0.670000 0.410000 NaN
2006-11-15 11.600000 0.700000 0.650000 0.600000 0.550000 0.410000 NaN NaN
2006-11-16 11.600000 0.679999 0.640000 0.510000 0.570000 0.530000 NaN NaN
2006-11-17 11.650000 0.700001 0.570000 0.480000 0.560000 0.540000 NaN NaN
2006-11-20 11.400000 0.950001 0.719999 0.280001 0.549999 0.660001 0.889999 NaN
2006-11-21 11.520000 0.980000 0.780000 0.220000 0.450000 0.560000 NaN NaN
2006-11-22 11.600000 1.040000 0.920000 0.160000 0.230000 0.840000 NaN NaN
2006-11-24 11.710000 1.140000 0.690000 0.560000 0.170000 0.629999 NaN NaN
2006-11-27 12.450000 0.990000 0.560000 0.370000 0.220000 0.520000 NaN NaN
2006-11-28 11.950000 1.000000 0.710000 0.440001 0.349999 0.530000 NaN NaN
2006-11-29 11.420000 1.180000 0.860000 0.470000 0.480000 0.540000 NaN NaN
2006-11-30 11.360000 1.170000 0.750000 0.550000 0.550000 0.580000 NaN NaN
2006-12-01 11.600000 1.150000 0.760000 0.510000 0.469999 0.660000 NaN NaN
2006-12-04 11.500000 1.000000 0.680000 0.670000 0.540000 0.660000 NaN NaN
... ... ... ... ... ... ... ... ...
2017-03-30 12.820000 0.530001 0.820000 0.880000 0.440000 0.790001 0.469999 0.250000
2017-03-31 13.280000 0.300000 0.670000 0.750000 0.500000 0.760000 0.440001 0.330000
2017-04-03 13.480000 0.100000 0.690001 0.809999 0.490000 0.770000 0.410000 0.299999
2017-04-04 13.240000 0.130000 0.760000 0.820000 0.450000 0.700001 0.400000 0.320000
2017-04-05 13.900000 -0.049999 0.700000 0.750000 0.380000 0.770000 0.289999 0.260000
2017-04-06 13.600000 -0.050000 0.599999 0.810000 0.390000 0.699999 0.420000 0.310001
2017-04-07 14.050000 -0.190001 0.690001 0.700000 0.350000 0.699999 0.400002 0.250000
2017-04-10 15.050000 -0.720000 0.460000 0.650000 0.320001 0.719999 0.320000 0.270000
2017-04-11 16.000000 -1.470000 0.350000 0.650000 0.340000 0.650001 0.330000 0.299999
2017-04-12 16.299999 -1.349999 0.070001 0.570000 0.309999 0.660000 0.320000 0.300001
2017-04-13 16.299999 -1.099999 0.070001 0.580000 0.250000 0.600000 0.369999 0.270000
2017-04-17 15.070000 -0.770000 0.250000 0.670000 0.330000 0.700000 0.350000 0.250000
2017-04-18 14.700000 -0.380000 0.220000 0.630000 0.320000 0.710001 0.349998 0.340000
2017-04-19 14.580000 -0.050000 0.590000 0.310000 0.740000 0.400000 0.300001 0.080000
2017-04-20 14.320000 0.020000 0.650000 0.390000 0.790000 0.400000 0.300001 0.029999
2017-04-21 14.300000 -0.060000 0.560000 0.349999 0.780001 0.410000 0.360001 0.049999
2017-04-24 12.680000 0.620000 0.889999 0.510000 0.890000 0.430000 0.359999 0.070002
2017-04-25 12.500000 0.600000 0.799999 0.500000 0.800000 0.450000 0.400000 0.130001
2017-04-26 12.350000 0.839999 0.860001 0.500000 0.800000 0.379999 0.360001 0.160000
2017-04-27 12.400000 0.700001 0.969999 0.510000 0.820000 0.400001 0.349999 0.100000
2017-04-28 12.250000 0.860000 0.870000 0.500000 0.780001 0.429999 0.410001 0.109999
2017-05-01 11.850000 0.750000 0.950000 0.570000 0.800000 0.380000 0.410000 0.160000
2017-05-02 11.800000 1.020000 0.930000 0.450000 0.850000 0.349999 0.350000 0.170000
2017-05-03 12.100000 0.849999 0.910000 0.490001 0.750000 0.450000 0.349999 0.150000
2017-05-04 11.730000 0.940001 0.930000 0.509999 0.840000 0.400001 0.450000 0.150000
2017-05-05 11.780000 0.890000 0.990000 0.560000 0.880000 0.450000 0.400000 0.160001
2017-05-08 11.400000 1.100000 1.200000 0.630000 0.850000 0.450000 0.370000 0.200001
2017-05-09 11.400000 1.080000 1.190001 0.680000 0.799999 0.470000 0.380000 0.190001
2017-05-10 11.550000 0.920000 1.210000 0.719999 0.900001 0.429999 0.370001 0.230000
2017-05-11 11.450000 1.160000 1.080000 0.660001 0.950000 0.429999 0.400000 0.170000

2656 rows × 8 columns


In [33]:
settle_diff.plot(figsize=(16,9))
print(settle_diff.min().min(), settle_diff.max().max())
plt.title("Differences between spread prices.")
plt.show()


-21.1 67.9

Get targets (spread prices)
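
The target for leg i is the price of the long butterfly spread (buy two of M_i, sell one each of its neighbours), as computed by calculate_long_prices below:

    P_i = 2*M_i - M_{i-1} - M_{i+1},   i = 2, ..., 7

The outermost legs have no neighbour on one side and therefore drop out.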


In [34]:
expiration_indices = until_expiration.where(until_expiration == pd.Timedelta(0))
expiration_indices.index = range(len(expiration_indices))
expiration_indices = expiration_indices.dropna().index
expiration_indices


Out[34]:
Int64Index([  17,   41,   57,   77,  101,  120,  140,  164,  183,  208,
            ...
            2471, 2495, 2515, 2535, 2559, 2576, 2596, 2620, 2640, 2655],
           dtype='int64', length=127)

In [35]:
# First consider the expired futures. On the day after expiration they get value NaN.
spreads = xm_settle.copy()
# On each day right after an expiration, set M1 to NaN and keep M2-M8 unchanged.
spreads.iloc[expiration_indices[:-1] + 1] = xm_settle.iloc[expiration_indices[:-1] + 1].iloc[:,1:].assign(M1=lambda x: np.NaN).iloc[:,range(-1,7)]
assert len(spreads[spreads.M1.isnull() == True]) == len(expiration_indices) - 1
def calculate_long_prices(term: pd.Series):
    longs = [2*term[i] - term[i-1] - term[i+1] for i in range(1, len(term)-1)]
    return pd.Series(longs, term[1:-1].index)
spreads = spreads.apply(calculate_long_prices, axis=1)

In [36]:
xm_settle.shape


Out[36]:
(2656, 8)

In [37]:
assert spreads.index.identical(settle_diff.index)

In [38]:
xm_settle7 = xm_settle.loc[:,"M1":"M7"]
xm_settle7_test = xm_settle7.loc["2006-11-15"].dropna()
xm_settle7_test


Out[38]:
M1    11.60
M2    12.30
M3    12.95
M4    13.55
M5    14.10
M6    14.51
Name: 2006-11-15 00:00:00, dtype: float32

In [39]:
xm_settle7_test.index = list(range(len(xm_settle7_test.index)))

In [40]:
z = np.polyfit(xm_settle7_test.index, xm_settle7_test, 2)
p = np.poly1d(z)
xm_settle7_test.plot()
xp = np.linspace(0, 5, 100)
plt.plot(xp, p(xp))
plt.show()



In [41]:
xm_settle7[xm_settle7.M7.isnull()].T.plot(figsize=(10,200), legend=False, subplots=True)
plt.show()


Normalization


In [42]:
spreads_norm = (spreads - spreads.mean()) / (spreads.max() - spreads.min())
print(spreads_norm.min().min(), spreads_norm.max().max())
spreads_norm.plot()
plt.show()


-0.79177435516 0.620398106199

In [43]:
settle_norm = (settle_diff - settle_diff.mean()) / (settle_diff.max() - settle_diff.min())
print(settle_norm.min().min(), settle_norm.max().max())
settle_norm.plot()
plt.show()


-0.824046 0.817341

In [44]:
settle_denorm = settle_norm.values * (settle_diff.max() - settle_diff.min()).values + settle_diff.mean().values
pd.DataFrame(settle_denorm).plot()
plt.show()



In [45]:
settle_test = (settle_diff - settle_denorm)
settle_test.plot()
plt.show()


Splitting the data

We need separate sets for training, cross-validation and testing.


In [46]:
fig, axes = plt.subplots(nrows=2, sharex=True, figsize=(10,10))
settle_diff.plot(ax=axes[0])
def plot_vlines(ax):
    l = len(settle_diff)
    s = int(l * 0.15 / 2)
    # Validation (red)
    ax.axvspan(settle_diff.index[int(l/2-s)], settle_diff.index[int(l / 2)], facecolor="r", alpha=0.3)
    ax.axvspan(settle_diff.index[-s], settle_diff.index[-1], facecolor="r", alpha=0.3)
    # Test (green)
    ax.axvspan(settle_diff.index[int(l/2-2*s)], settle_diff.index[int(l/2)-s], facecolor="g", alpha=0.3)
    ax.axvspan(settle_diff.index[-2*s], settle_diff.index[-s], facecolor="g", alpha=0.3)
plot_vlines(axes[0])
axes[0].set_title("Differences of futures' term structures")
spreads.plot(ax=axes[1])
plot_vlines(axes[1])
axes[1].set_title("Butterfly spread prices")
plt.show()
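
This is only a sketch (not from the original notebook) of how the highlighted spans above could be materialized as explicit index sets, assuming the intended proportions are 70% training, 15% validation and 15% test, with validation and test each split into one block around the middle and one at the end of the sample:

    l = len(settle_diff)
    s = int(l * 0.15 / 2)
    # Validation: a block ending at the midpoint and a block at the very end.
    val_idx = np.r_[l // 2 - s : l // 2, l - s : l]
    # Test: the blocks directly preceding the two validation blocks.
    test_idx = np.r_[l // 2 - 2 * s : l // 2 - s, l - 2 * s : l - s]
    # Training: everything else.
    train_idx = np.setdiff1d(np.arange(l), np.concatenate([val_idx, test_idx]))
    x_train, y_train = settle_diff.iloc[train_idx], spreads.iloc[train_idx]

The exact boundary handling (inclusive vs. exclusive endpoints) is a detail of this sketch, not of the plot above.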


Naive prediction

What is the loss when the network always predicts the same values as on the current day? (identity)
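
In other words, the baseline predicts tomorrow's spread prices to be identical to today's, so its error is

    MSE_naive = mean over all days t and legs i of (P[t, i] - P[t+1, i])^2,  ignoring NaNs

computed on the long prices, which is what the next cells do.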


In [47]:
expirations = Expirations("data/expirations.csv")
termStructure = TermStructure("data/8_m_settle.csv", expirations)

In [48]:
naive_prediction_error = termStructure.long_prices[:-1].values - termStructure.long_prices[1:].values

In [49]:
print("Naive MSE. I definitely have to beat this!")
print(np.nanmean(np.square(naive_prediction_error)))


Naive MSE. I definitely have to beat this!
0.170281337234

In [50]:
exp_counter = pd.Series(Counter(expirations.days_to_expiration), dtype=int)
exp_counter.index = pd.to_numeric(exp_counter.index, downcast="integer")
exp_counter.plot.bar(figsize=(8,2), width=0.8)
plt.ylabel("Nr. of occurences")
plt.savefig("days_to_expiration.pdf", format="pdf", bbox_inches="tight", dpi=300)
plt.show()


Playing around with visualizations


In [51]:
xm_settle.dropna(how="all", axis=(0,1), inplace=True)
xm_settle.describe()


Out[51]:
M1 M2 M3 M4 M5 M6 M7 M8
count 2656.000000 2656.000000 2656.000000 2656.000000 2656.000000 2629.000000 2470.000000 2248.000000
mean 20.772100 21.550528 22.051373 22.374050 22.682241 22.983538 23.529346 23.348955
std 8.747683 7.765030 7.115117 6.633684 6.301106 6.037752 5.825076 5.562232
min 10.240000 11.700000 12.520000 13.020000 13.600000 14.100000 14.500000 14.300000
25% 14.877500 15.950000 16.750000 17.400000 17.870001 18.370001 18.922500 19.150000
50% 18.245000 19.450001 20.350000 20.900000 21.400000 21.770000 22.595000 21.850000
75% 24.000000 24.799999 25.299999 25.602500 25.850000 26.400000 27.299999 26.512500
max 67.900002 59.520000 54.619999 50.430000 47.759998 45.990002 44.580002 44.000000

In [52]:
xm_settle[xm_settle.isnull().sum(axis=1) >= 5]


Out[52]:
M1 M2 M3 M4 M5 M6 M7 M8
date

In [53]:
#exp_last = expirations.loc[xm_settle.iloc[-1].name]
#exp_last.index = xm_settle.iloc[-1].index
#templol = pd.concat((xm_settle.iloc[-1], exp_last), axis=1).T
#templol.index = ["Value", "Expiration"]
#templol.loc["Expiration"] = templol.loc["Expiration"].apply(lambda x: x.strftime("%b %d"))
#print(templol.to_latex())

In [54]:
xm_settle.iloc[::, ::-1].plot(colormap=cm.winter_r, figsize=(8,4), linewidth=1.0, alpha=0.8)
plt.xlabel("")
plt.savefig("termstructures.pdf", format="pdf", bbox_inches="tight", dpi=300)
plt.show()



In [55]:
fig = plt.figure(figsize=(8, 4))
ax = fig.gca(projection='3d')
xs = np.arange(len(xm_settle))
zs = np.arange(0, 8)
verts = []
xm_settle_3dplot = xm_settle.copy()
xm_settle_3dplot.index = xs
for z in zs:
    ys = xm_settle_3dplot.iloc[:,int(z)].fillna(10)
    ys.iloc[0] = 10
    ys.iloc[-1] = 10
    verts.append(list(zip(ys.index.values, ys.values)))
poly = PolyCollection(verts, linewidth=2.0, facecolors=[cm.winter(i, 0.8) for i in  np.linspace(0, 1, 8)])
ax.add_collection3d(poly, zs=zs, zdir='y')

ax.set_xlim3d(0, len(xm_settle))
ax.set_xticks([(xm_settle.index.year == year).argmax() for year in xm_settle.index.year.unique()[1::2]])
ax.set_xticklabels(xm_settle.index.year.unique()[1::2])
ax.set_ylim3d(0.0, 7.5)
ax.set_yticklabels(["M1", "M2", "M3", "M4", "M5", "M6", "M7", "M8"])
ax.set_zlim3d(11, 50)

plt.savefig("termstructures.pdf", format="pdf", bbox_inches="tight", dpi=300)
plt.show()



In [56]:
xm_settle.groupby(xm_settle.index.year).count().sum(axis=1).plot.bar(figsize=(4,3), width=0.7)
plt.xlabel("")
plt.ylabel("Number of samples")
plt.grid(axis="y")
plt.savefig("number_of_samples.pdf", format="pdf", bbox_inches="tight", dpi=300)
plt.show()



In [57]:
spreads_descr = pd.concat({key:spreads[key] for key in spreads.columns}).describe()

In [58]:
termstr_descr = pd.concat({key:xm_settle[key] for key in xm_settle.columns}).describe()

In [59]:
print(pd.concat([termstr_descr, spreads_descr], axis=1, keys=["term structure", "spread prices"]).iloc[1:].T.to_latex(float_format='%.2f'))


\begin{tabular}{lrrrrrrr}
\toprule
{} &  mean &  std &    min &   25\% &   50\% &   75\% &  max \\
\midrule
term structure & 22.38 & 6.91 &  10.24 & 17.55 & 20.70 & 25.70 & 67.9 \\
spread prices  &  0.11 & 0.74 & -13.68 & -0.10 &  0.13 &  0.36 &  4.9 \\
\bottomrule
\end{tabular}


In [114]:
fig, axes = plt.subplots(nrows=2, sharex=True, figsize=(8,8))
xm_settle.plot(ax=axes[0], cmap=cm.brg, linewidth=0.8)
axes[0].grid(axis="x")
axes[0].set_ylabel("Futures price")
axes[0].legend(loc="upper left")
def plot_vlines(ax):
    l = len(xm_settle)
    s = int(l * 0.15 / 2)
    # Validation (red)
    ax.axvspan(xm_settle.index[int(l/2-s)], xm_settle.index[int(l / 2)], facecolor="r", alpha=0.3)
    ax.axvspan(xm_settle.index[-s], xm_settle.index[-1], facecolor="r", alpha=0.3)
    # Test (green)
    ax.axvspan(xm_settle.index[int(l/2-2*s)], xm_settle.index[int(l/2)-s], facecolor="g", alpha=0.3)
    ax.axvspan(xm_settle.index[-2*s], xm_settle.index[-s], facecolor="g", alpha=0.3)
plot_vlines(axes[0])
spreads.plot(ax=axes[1], cmap=cm.brg, linewidth=0.8)
axes[1].grid(axis="x")
axes[1].set_ylabel("Spread price")
plot_vlines(axes[1])
plt.xlabel("")
plt.savefig("validation-and-test-set.pdf", format="pdf", bbox_inches="tight", dpi=300)
plt.show()



In [ ]: