In [1]:
import calendar
import operator
from collections import Counter, namedtuple
from math import log
from typing import Tuple, Sequence
from itertools import cycle, islice
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib as mpl
from matplotlib import cm
from mpl_toolkits.mplot3d import Axes3D
from matplotlib.collections import PolyCollection, LineCollection
from matplotlib import colors as mcolors
from scipy.special import erfc
from numpy import square, sqrt
from numpy import square as sq
from numpy import pi as π
from numpy import exp as ℯ
In [2]:
mpl.rcParams["figure.figsize"] = 16, 9
In [3]:
#data = (pd.read_csv("/home/leyht/Downloads/untitled folder/{}.csv".format(k), header=0, index_col=0,
# parse_dates=True) for k in calendar.month_name[1:])
#data = pd.concat(data, keys=range(1,13), names=("month", "date"))
#data.to_hdf("data/futureswise.h5", "data", complevel=9, complib="bzip2")
In [4]:
data = pd.read_hdf("data/futureswise.h5")
expirations = pd.read_csv("data/expirations.csv", header=0, usecols=range(1, 10),
                          parse_dates=list(range(9)), index_col=0)
termstructure = pd.read_csv("data/8_m_settle.csv", usecols=range(1, 10), parse_dates=True,
                            header=0, index_col=0, na_values=0)
symbols = pd.read_csv("data/8_m_symbols.csv", usecols=range(1, 10), parse_dates=True,
                      header=0, index_col=0)
In [5]:
for year in data:
    yint = int(year)
    # This internally raises an exception, but it's just a localization problem;
    # the resulting data looks correct.
    data[year].index = pd.MultiIndex(levels=[data[year].index.levels[0],
                                             data[year].index.levels[1].map(
                                                 lambda x: pd.Timestamp(yint, x.month, x.day))],
                                     labels=data[year].index.labels)
In [6]:
# Concatenate the columns to get a large series of spread prices.
spreads = pd.concat((data[year].dropna() for year in data))
In [7]:
date_mask = spreads.index.droplevel(0).isin(termstructure.index)
filtered_spreads = spreads.where(date_mask).dropna()
In [8]:
# These are the spread prices loaded from file.
filtered_spreads[:,"2017-04-27"]
Out[8]:
In [9]:
# These are spread prices calculated from the term structure.
# Why are these different from each other?
termstructure.loc["2017-04-27"].aggregate(lambda x: [-x[i] + 2*x[i+1] - x[i+2] for i in range(6)])
Out[9]:
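(For reference: both this cell and cell 11 below compute a butterfly across three adjacent contracts, long twice the body and short each wing. Cell 9 writes it as $-F_i + 2F_{i+1} - F_{i+2}$ and cell 11 as $2F_i - F_{i-1} - F_{i+1}$; these are the same spread, just re-centred on the body leg, so up to alignment the two constructions should agree.)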
In [10]:
assert expirations.shape == termstructure.shape
assert expirations.index.equals(termstructure.index)
In [11]:
long_prices = termstructure.apply(lambda x: [np.nan] + [2*x[i] - x[i-1] - x[i+1] for i in range(1, len(x) - 1)] + [np.nan],
                                  axis=1, reduce=False)
long_prices.dropna(axis=(0,1), how="all", inplace=True)
In [12]:
threshold_date = "2006-10-23"
In [13]:
long_prices_thresh = long_prices[threshold_date:]
day_diff = pd.Series(long_prices_thresh.index[1:]- long_prices_thresh.index[:-1])
count_day_diff = Counter(day_diff)
In [14]:
# Consecutive index dates are at most a few days apart,
# so no further adjustment is necessary.
count_day_diff
Out[14]:
In [15]:
termstructure_thresh = termstructure[threshold_date:]
assert len(termstructure_thresh) == len(long_prices_thresh)
In [16]:
days_into_future = 1
In [17]:
# Target: the next day's spread prices, one block per spread column.
y = pd.concat((long_prices_thresh.iloc[days_into_future:][column]
               for column in long_prices_thresh))
# Input: today's term structure, repeated once per spread column.
x = termstructure_thresh.iloc[:-days_into_future]
for i in range(len(long_prices_thresh.columns) - 1):
    x = x.append(termstructure_thresh.iloc[:-days_into_future])
assert len(y) == len(x)
# Keep only the rows where the target is present.
x = x.where(np.tile(y.notnull().values, (len(x.columns), 1)).T).dropna(axis=0, how="all")
y = y.where(y.notnull()).dropna()
assert len(y) == len(x)
In [18]:
#x.to_hdf("data/futureswise_mapping.h5", "x", complevel=9, complib="bzip2")
#y.to_hdf("data/futureswise_mapping.h5", "y", complevel=9, complib="bzip2")
In [19]:
split_between_years = np.append(0, y.index.get_loc("2017-05-11") + 1)
splits = split_between_years
split_between_years
Out[19]:
In [20]:
def split_indices(a: int, b: int, val_split=0.15, test_split=0.15):
    """Split [a, b) into two halves and carve validation and test slices
    out of the end of each half; the rest is training data."""
    half = int((b - a) / 2)
    val_len = int(half * val_split)
    test_len = int(half * test_split)
    val1 = a + half - val_len - test_len
    test1 = a + half - test_len
    data = a + half
    val2 = b - val_len - test_len
    test2 = b - test_len
    return a, val1, test1, data, val2, test2, b
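As a quick sanity check with made-up bounds: split_indices(0, 100) returns (0, 36, 43, 50, 86, 93, 100), i.e. training blocks [0, 36) and [50, 86), validation blocks [36, 43) and [86, 93), and test blocks [43, 50) and [93, 100).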
In [21]:
def splitted_dataset(dataset, splits: Sequence):
    indices = pd.DataFrame((split_indices(splits[i], splits[i+1]) for i in range(len(splits) - 1)),
                           columns=("data1", "val1", "test1", "data2", "val2", "test2", "end"))
    d1, v1, t1, d2, v2, t2 = ([dataset.iloc[a:b] for a, b in zip(indices.iloc[:, i], indices.iloc[:, i+1])]
                              for i in range(len(indices.columns) - 1))
    return pd.concat(d1 + d2), pd.concat(v1 + v2), pd.concat(t1 + t2)
In [22]:
indices = pd.DataFrame((split_indices(splits[i], splits[i+1]) for i in range(len(splits) - 1)),
                       columns=("data1", "val1", "test1", "data2", "val2", "test2", "end"))
indices
Out[22]:
In [23]:
xdata, xval, xtest = splitted_dataset(x, split_between_years)
assert len(xdata) + len(xval) + len(xtest) == len(x)
ydata, yval, ytest = splitted_dataset(y, split_between_years)
assert len(ydata) + len(yval) + len(ytest) == len(y)
It seems a bit difficult to train one large network that has to distinguish the different spread prices by month on its own. So my idea would be to train twelve different networks (one for each calendar month) directly on the different months' prices.
Now I just have to think about how to get these spreads out of the data. Maybe with 8_m_symbols.csv?
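A minimal sketch of that plan, assuming the per-month x-y pairs are assembled like the frames built further below (the model choice, scikit-learn's MLPRegressor, and all hyperparameters are placeholders, not anything run in this notebook):
from sklearn.neural_network import MLPRegressor

def train_monthly_models(x, y):
    # One small regressor per calendar month, mapping today's term structure
    # to the next day's spread price for contracts expiring in that month.
    models = {}
    for month in range(1, 13):
        x_m = x.loc(axis=0)[:, month].fillna(0.0)
        y_m = y.loc(axis=0)[:, month]
        models[month] = MLPRegressor(hidden_layer_sizes=(32, 32),
                                     max_iter=2000).fit(x_m.values, y_m.values)
    return models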
In [24]:
symbols_to_month = {"F":"January",
"G":"February",
"H":"March",
"J":"April",
"K":"May",
"M":"June",
"N":"July",
"Q":"August",
"U":"September",
"V":"October",
"X":"November",
"Z":"December"}
In [25]:
def year_month_repr(self):
    return str((self.year, self.month))

YearMonth = namedtuple("YearMonth", ["year", "month"])
YearMonth.__repr__ = year_month_repr
# x[0] is the futures month code, x[-3:-1] the two-digit year.
symbols_short = symbols.applymap(
    lambda x: YearMonth(int(x[-3:-1]), operator.indexOf(calendar.month_name, symbols_to_month[x[0]]))
    if isinstance(x, str) else x)
In [26]:
count_ym = Counter(symbols_short.values.reshape(symbols_short.size))
In [27]:
def filter_by_yearmonth(dataframe, year, month):
    # Boolean mask: True where the cell is a YearMonth matching (year, month).
    return dataframe.applymap(lambda x: isinstance(x, YearMonth) and x.year == year and x.month == month)
In [28]:
mask_dict = {key: filter_by_yearmonth(symbols_short, key.year, key.month) for key in count_ym
             if isinstance(key, YearMonth)}
In [29]:
long_prices = termstructure.apply(lambda x: [np.nan] + [2*x[i] - x[i-1] - x[i+1] for i in range(1, len(x) - 1)] + [np.nan],
                                  axis=1, reduce=False)
assert long_prices.index.equals(symbols_short.index)
assert long_prices.shape == symbols_short.shape
Up to now I have prepared masks for all spread prices in mask_dict. With these, one can select spreads via the where method.
Next I need all the x-y mappings from the term structure to the next day's spread prices, in an appropriate shape. There will be one such mapping per month.
In [30]:
test_16_05 = long_prices.where(mask_dict[YearMonth(16,5)]).dropna(how="all", axis=(0,1))
In [31]:
y_16_05 = pd.concat((test_16_05.iloc[days_into_future:][col] for col in test_16_05)).dropna().sort_index()
x_16_05 = termstructure.loc[test_16_05.index].iloc[:-days_into_future]
In [32]:
c_16_05 = Counter(y_16_05.index - x_16_05.index)
for key in c_16_05:
    assert key.days <= 5
The code above was just playing around and testing; now let's turn it into a function.
In [33]:
def x_y_mapping(long_prices, termstructure, mask, days_into_future=1, yearmonth=None):
    prices = long_prices.where(mask).dropna(how="all", axis=(0, 1))
    try:
        if yearmonth:
            assert not prices.empty, "There are no spread prices for year {} and month {}.".format(yearmonth.year,
                                                                                                   yearmonth.month)
        else:
            assert not prices.empty
    except AssertionError as e:
        print(e)
        return None
    y = pd.concat((prices.iloc[days_into_future:][column] for column in prices)).dropna().sort_index()
    x = termstructure.loc[prices.index].iloc[:-days_into_future]
    assert len(x) == len(y)
    counter = Counter(y.index - x.index)
    for key in counter:
        try:
            if yearmonth:
                assert key.days <= 5, str(counter) + "\n" + "At year {} and month {}".format(yearmonth.year, yearmonth.month)
            else:
                assert key.days <= 5, str(counter)
        except AssertionError as e:
            print(e)
    return x, y
In [34]:
mapping_dict = {key:x_y_mapping(long_prices, termstructure, mask_dict[key], yearmonth=key) for key in mask_dict}
In [35]:
sorted_keys = [item[0] for item in mapping_dict.items() if item[1]]
sorted_keys.sort()
years = list({key[0] for key in sorted_keys})
years.sort()
[str("%02d" % year) for year in years]
Out[35]:
In [36]:
for key in sorted_keys:
    x, y = mapping_dict[key]
    plt.plot(np.arange(len(y)), y.values)
plt.xlim(0, 130)
plt.grid()
plt.show()
In [37]:
y = pd.concat((mapping_dict[key][1] for key in sorted_keys), keys=sorted_keys, names=["year", "month"])
x = pd.concat((mapping_dict[key][0] for key in sorted_keys), keys=sorted_keys, names=["year", "month"])
assert len(x) == len(y)
In [38]:
x.groupby("year").std().plot()
y.groupby("year").std().plot(secondary_y=True)
plt.xticks(np.arange(4,18), np.arange(2004,2018))
plt.show()
In [39]:
x.groupby("month").std().plot()
y.groupby("month").std().plot(secondary_y=True)
plt.xticks(np.arange(1,13), calendar.month_abbr[1:])
plt.show()
In [40]:
for month_nr in range(1, 13):
    assert len(y.loc(axis=0)[:, month_nr]) == len(x.loc(axis=0)[:, month_nr])
    print(calendar.month_name[month_nr], len(y.loc(axis=0)[:, month_nr]))
In [41]:
#x.to_hdf("data/futures_per_year_and_month.h5", "x", complevel=9, complib="bzip2")
#y.to_hdf("data/futures_per_year_and_month.h5", "y", complevel=9, complib="bzip2")
In [42]:
x = pd.read_hdf("data/futures_per_year_and_month.h5", "x")
y = pd.read_hdf("data/futures_per_year_and_month.h5", "y")
In [43]:
x.loc(axis=0)[:, :, "2017-05-10"]
Out[43]:
In [44]:
y.loc(axis=0)[:, :, "2017-05-5"]
Out[44]:
In [45]:
symbols.loc["2017-05-5"]
Out[45]:
In [46]:
symbols_short.loc["2017-01-20"]
Out[46]:
In [47]:
x.apply(lambda x: [i for i in
                   islice(cycle(range(1, 13)), x.name[1] + 10, x.name[1] + 18)], axis=1)
Out[47]:
In [48]:
# Rebuild, for each date, the calendar months of the 8 quoted contracts:
# cycle 8 consecutive months positioned so that column 3 keeps its observed month.
symbols_month = symbols_short.applymap(lambda x: x if isinstance(x, float) else x[1]).interpolate(axis=1).apply(
    lambda x: [i for i in islice(cycle(range(1, 13)), int(x[3]) + 8, int(x[3]) + 16)], axis=1)
In [49]:
test_x = x.loc(axis=0)[16,1,:].iloc[0]
test_x.index = symbols_month.loc[test_x.name[2]]
symbols_month.loc[test_x.name[2]]
Out[49]:
In [50]:
def month_to_header(x):
    # Re-index each row by the calendar months of its contracts, then
    # spread the prices over a fixed January..December (1..12) header.
    x.index = symbols_month.loc[x.name[2]]
    return pd.Series({k: x.get(k, default=np.nan) for k in range(1, 13)})

x_yearly = x.apply(month_to_header, axis=1)
In [51]:
# Some tests
assert x.index.equals(x_yearly.index)
for xi, xyi in zip(x.isnull().sum(axis=1), x_yearly.isnull().sum(axis=1)):
    assert xi == xyi - 4  # 12 calendar months minus the 8 quoted contracts leaves 4 empty columns
In [52]:
# More tests
for month in range(1, 13):
    assert len(x.loc(axis=0)[:, month]) == len(y.loc(axis=0)[:, month])
    assert len(x_yearly.loc(axis=0)[:, month]) == len(y.loc(axis=0)[:, month])
#x_yearly.to_hdf("data/futures_per_year_and_month.h5", "x_yearly", complevel=9, complib="bzip2")
In [53]:
print(pd.concat([x_yearly]*3, axis=1).loc(axis=0)[16,1].iloc[:,6:18].to_string())
In [54]:
pd.concat([x_yearly.loc(axis=0)[:, 5]] * 3, axis=1).iloc[:, 5+10:5+18]
Out[54]:
In [55]:
x_yearly.loc(axis=0)[:, 5].count()
Out[55]:
In [56]:
# Tile the 12 monthly columns three times so the month axis wraps around
# (December's right neighbour is January) before taking butterflies below.
testblah = pd.concat([x_yearly] * 3, axis=1).iloc[:, 11:25]
In [57]:
x_spreads_yearly = testblah.apply(lambda x: [np.nan] + [2*x.iloc[i] - x.iloc[i-1] - x.iloc[i+1]
                                                        for i in range(1, len(x) - 1)] + [np.nan],
                                  axis=1).iloc[:, 1:13]
In [58]:
x.apply(lambda x: [np.nan] + [2*x[i] - x[i-1] - x[i+1] for i in range(1, len(x) - 1)] + [np.nan], axis=1).iloc[:, 1:7]
Out[58]:
In [59]:
spread_ts = termstructure.apply(lambda x: [np.nan] + [2*x[i] - x[i-1] - x[i+1] for i in range(1, len(x) - 1)] + [np.nan], axis=1).dropna(how="all", axis=(0, 1))
In [73]:
fig = plt.figure(figsize=(10, 15))
for i in range(6):
    ax = fig.add_subplot(6, 1, i + 1)
    y_m = spread_ts.loc["2006-10-23":].iloc[:, i]
    y_m.plot(ax=ax)
    plt.axhline(0, color="black", linewidth=1)
    if i != 5:
        ax.set_xticklabels([])
    plt.xlabel("")
    plt.ylabel("V" + y_m.name[-1])
plt.savefig("spreads-legwise.pdf", format="pdf", dpi=300, bbox_inches="tight")
plt.show()
In [61]:
# Naive persistence baseline (tomorrow's spread = today's) on the term-structure spreads
spread_per_column = pd.concat((spread_ts.loc["2006-10-23":, column] for column in spread_ts), axis=1)
for column in spread_per_column:
    dropped = spread_per_column.loc[:, column].dropna()
    print(column, np.mean(np.square(dropped.iloc[:-1].values - dropped.iloc[1:].values)))
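In other words, the baseline is the one-step persistence forecast, scored per spread column as $\mathrm{MSE} = \frac{1}{T-1}\sum_{t=1}^{T-1}\left(s_{t+1} - s_t\right)^2$.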
In [62]:
spread_per_column.describe()
Out[62]:
In [63]:
y_desc = y.describe()
y_mean = y_desc["mean"]
y_std = y_desc["std"]
y_var = np.square(y_std)
y_desc
Out[63]:
First some tests with zero mean and unit variance; then use the values from the cell above.
From https://arxiv.org/abs/1706.02515, p. 11:
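For reference, the mean mapping as I read it from the paper (the second-moment mapping $\tilde\xi$ is transcribed in the code below, and $\tilde\nu = \tilde\xi - \tilde\mu^2$):
$$\tilde\mu(\mu,\omega,\nu,\tau,\lambda,\alpha) = \frac{\lambda}{2}\left(-(\alpha + \mu\omega)\operatorname{erfc}\!\left(\frac{\mu\omega}{\sqrt{2\nu\tau}}\right) + \alpha\, e^{\mu\omega + \frac{\nu\tau}{2}}\operatorname{erfc}\!\left(\frac{\mu\omega + \nu\tau}{\sqrt{2\nu\tau}}\right) + \sqrt{\frac{2}{\pi}}\sqrt{\nu\tau}\, e^{-\frac{(\mu\omega)^2}{2\nu\tau}} + 2\mu\omega\right)$$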
In [64]:
def μ_tilde(μ, ω, ν, τ, λ, α):
    t1 = (-(α + μ * ω) *
          erfc((μ * ω) / (sqrt(2) * sqrt(ν * τ))))
    t2 = (α * ℯ(μ * ω + (ν * τ) / 2) *
          erfc((μ * ω + ν * τ) / (sqrt(2) * sqrt(ν * τ))))
    t3 = (sqrt(2 / π) * sqrt(ν * τ) *
          ℯ(-(sq(μ) * sq(ω)) / (2 * ν * τ)))
    t4 = 2 * μ * ω
    return (λ / 2) * (t1 + t2 + t3 + t4)

def xi_tilde(μ, ω, ν, τ, λ, α):
    # NOTE: this comes out too large at the fixed point (see the checks below);
    # the paper appears to have (2 - erfc(...)) where t1 uses (erfc(...) + 1).
    t1 = (sq(μ * ω) + ν * τ) * (erfc((μ * ω) / (sqrt(2) * sqrt(ν * τ))) + 1)
    t2 = sq(α) * (-2 * ℯ(μ * ω + (ν * τ) / 2) * erfc((μ * ω + ν * τ) / (sqrt(2) * sqrt(ν * τ))) +
                  ℯ(2 * (μ * ω + ν * τ)) * erfc((μ * ω + 2 * ν * τ) / (sqrt(2) * sqrt(ν * τ))) +
                  erfc((μ * ω) / (sqrt(2) * sqrt(ν * τ))))
    t3 = sqrt(2 * π) * (μ * ω) * sqrt(ν * τ) * ℯ(-(sq(μ * ω)) / (2 * ν * τ))
    return (sq(λ) / 2) * (t1 + t2 + t3)

def ν_tilde(μ, ω, ν, τ, λ, α):
    return xi_tilde(μ, ω, ν, τ, λ, α) - sq(μ_tilde(μ, ω, ν, τ, λ, α))
In [65]:
assert np.isclose(μ_tilde(0, 0, 1, 1, 1.0507, 1.67326), 0, atol=1e-06), "This should be close to 0 but isn't."
assert np.isclose(ν_tilde(0, 0, 1, 1, 1.0507, 1.67326), 1, atol=1e-06), "This should be close to 1 but isn't."
In [ ]:
print(μ_tilde(0, 0, 1, 1, 1.0507, 1.67326))
print(ν_tilde(0, 0, 1, 1, 1.0507, 1.67326))  # 1.55 != 1, so the ξ transcription is off
In [ ]:
((y - y.mean()) / y.std()).describe()
In [74]:
fig = plt.figure(figsize=(10, 15))
for month in range(1, 13):
    ax = fig.add_subplot(12, 1, month)
    y_m = y.loc(axis=0)[:, month]
    y_m.plot(ax=ax)
    plt.axhline(0, color="black", linewidth=1)
    plt.xticks(np.arange(0, len(y_m), 200), [])
    plt.xlabel("")
    plt.ylabel(calendar.month_name[month])
plt.xticks(np.arange(0, len(y_m), 200), (y_m.index.droplevel([1, 2]) + 2000)[::200])
plt.savefig("spreads-monthwise.pdf", format="pdf", dpi=300, bbox_inches="tight")
plt.show()
In [67]:
y_by_month = pd.concat((y.loc[:, i] for i in range(1, 13)), axis=1, keys=calendar.month_abbr[1:])
In [68]:
print(y_by_month.describe().T.applymap(lambda x: '{:.2f}'.format(x)).to_latex())
In [69]:
sp_ts_d = spread_ts.describe().T
sp_ts_d.index = ["V" + str(i) for i in range(2, 8)]
In [70]:
print(sp_ts_d.to_latex(float_format="%.2f"))
In [71]:
spread_ts
Out[71]:
In [72]:
fig = plt.figure(figsize=(8, 4))
ax = fig.gca(projection='3d')
xs = np.arange(len(spread_ts))
zs = np.arange(6)
verts = []
xm_settle_3dplot = spread_ts.copy()
xm_settle_3dplot.index = xs
for z in zs:
    ys = xm_settle_3dplot.iloc[:, int(z)].fillna(0)
    # Pin the first and last point to zero so each polygon closes cleanly.
    ys.iloc[0] = 0
    ys.iloc[-1] = 0
    verts.append(list(zip(ys.index.values, ys.values)))
poly = PolyCollection(verts, linewidth=2.0, facecolors=[cm.winter(i, 0.8) for i in np.linspace(0, 1, 6)])
ax.add_collection3d(poly, zs=zs, zdir='y')
ax.set_xlim3d(0, len(spread_ts))
#ax.set_xticks([(spread_ts.index.year == year).argmax() for year in spread_ts.index.year.unique()[1::2]])
ax.set_xticklabels(spread_ts.index.year.unique()[1::2])
ax.set_ylim3d(0.0, 5.5)
ax.set_yticklabels(["V2", "V3", "V4", "V5", "V6", "V7"])
ax.set_zlim3d(-1, 2)
#plt.savefig("termstructures.pdf", format="pdf", bbox_inches="tight", dpi=300)
plt.show()