In [1]:
import calendar
import operator
from collections import Counter, namedtuple
from math import log
from typing import Tuple, Sequence
from itertools import cycle, islice

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib as mpl
from matplotlib import cm
from mpl_toolkits.mplot3d import Axes3D
from matplotlib.collections import PolyCollection, LineCollection
from matplotlib import colors as mcolors
from scipy.special import erfc
from numpy import square, sqrt
from numpy import square as sq
from numpy import pi as π
from numpy import exp as 

In [2]:
mpl.rcParams["figure.figsize"] = 16, 9

In [3]:
#data = (pd.read_csv("/home/leyht/Downloads/untitled folder/{}.csv".format(k), header=0, index_col=0,
#                    parse_dates=True) for k in calendar.month_name[1:])
#data = pd.concat(data, keys=range(1,13), names=("month", "date"))
#data.to_hdf("data/futureswise.h5", "data", complevel=9, complib="bzip2")

In [4]:
data = pd.read_hdf("data/futureswise.h5")
expirations = pd.read_csv("data/expirations.csv", header=0, usecols=range(1,10),
                          parse_dates=list(range(9)), index_col=0)
termstructure = pd.read_csv("data/8_m_settle.csv", usecols=range(1,10), parse_dates=True,
                            header=0, index_col=0, na_values=0)
symbols = pd.read_csv("data/8_m_symbols.csv", usecols=range(1,10), parse_dates=True,
                      header=0, index_col=0)

In [5]:
for year in data:
    yint = int(year)
    # This internally raises an exception but it's just some localization problem.
    # The resulting data seems correct.
    data[year].index = pd.MultiIndex(levels=[data[year].index.levels[0],
                                             data[year].index.levels[1].map(lambda x: pd.Timestamp(yint, x.month, x.day))],
                                     labels=data[year].index.labels)


Exception ignored in: 'pandas._libs.tslib._localize_tso'
Traceback (most recent call last):
  File "pandas/_libs/tslib.pyx", line 4332, in pandas._libs.tslib._get_dst_info (pandas/_libs/tslib.c:72500)
  File "pandas/_libs/tslib.pyx", line 1425, in pandas._libs.tslib._get_utcoffset (pandas/_libs/tslib.c:27137)
AttributeError: 'Int64Index' object has no attribute 'utcoffset'
Exception ignored in: 'pandas._libs.tslib._localize_tso'
Traceback (most recent call last):
  File "pandas/_libs/tslib.pyx", line 4332, in pandas._libs.tslib._get_dst_info (pandas/_libs/tslib.c:72500)
  File "pandas/_libs/tslib.pyx", line 1425, in pandas._libs.tslib._get_utcoffset (pandas/_libs/tslib.c:27137)
AttributeError: 'Int64Index' object has no attribute 'utcoffset'
Exception ignored in: 'pandas._libs.tslib._localize_tso'
Traceback (most recent call last):
  File "pandas/_libs/tslib.pyx", line 4332, in pandas._libs.tslib._get_dst_info (pandas/_libs/tslib.c:72500)
  File "pandas/_libs/tslib.pyx", line 1425, in pandas._libs.tslib._get_utcoffset (pandas/_libs/tslib.c:27137)
AttributeError: 'Int64Index' object has no attribute 'utcoffset'
Exception ignored in: 'pandas._libs.tslib._localize_tso'
Traceback (most recent call last):
  File "pandas/_libs/tslib.pyx", line 4332, in pandas._libs.tslib._get_dst_info (pandas/_libs/tslib.c:72500)
  File "pandas/_libs/tslib.pyx", line 1425, in pandas._libs.tslib._get_utcoffset (pandas/_libs/tslib.c:27137)
AttributeError: 'Int64Index' object has no attribute 'utcoffset'
Exception ignored in: 'pandas._libs.tslib._localize_tso'
Traceback (most recent call last):
  File "pandas/_libs/tslib.pyx", line 4332, in pandas._libs.tslib._get_dst_info (pandas/_libs/tslib.c:72500)
  File "pandas/_libs/tslib.pyx", line 1425, in pandas._libs.tslib._get_utcoffset (pandas/_libs/tslib.c:27137)
AttributeError: 'Int64Index' object has no attribute 'utcoffset'
Exception ignored in: 'pandas._libs.tslib._localize_tso'
Traceback (most recent call last):
  File "pandas/_libs/tslib.pyx", line 4332, in pandas._libs.tslib._get_dst_info (pandas/_libs/tslib.c:72500)
  File "pandas/_libs/tslib.pyx", line 1425, in pandas._libs.tslib._get_utcoffset (pandas/_libs/tslib.c:27137)
AttributeError: 'Int64Index' object has no attribute 'utcoffset'
Exception ignored in: 'pandas._libs.tslib._localize_tso'
Traceback (most recent call last):
  File "pandas/_libs/tslib.pyx", line 4332, in pandas._libs.tslib._get_dst_info (pandas/_libs/tslib.c:72500)
  File "pandas/_libs/tslib.pyx", line 1425, in pandas._libs.tslib._get_utcoffset (pandas/_libs/tslib.c:27137)
AttributeError: 'Int64Index' object has no attribute 'utcoffset'
Exception ignored in: 'pandas._libs.tslib._localize_tso'
Traceback (most recent call last):
  File "pandas/_libs/tslib.pyx", line 4332, in pandas._libs.tslib._get_dst_info (pandas/_libs/tslib.c:72500)
  File "pandas/_libs/tslib.pyx", line 1425, in pandas._libs.tslib._get_utcoffset (pandas/_libs/tslib.c:27137)
AttributeError: 'Int64Index' object has no attribute 'utcoffset'
Exception ignored in: 'pandas._libs.tslib._localize_tso'
Traceback (most recent call last):
  File "pandas/_libs/tslib.pyx", line 4332, in pandas._libs.tslib._get_dst_info (pandas/_libs/tslib.c:72500)
  File "pandas/_libs/tslib.pyx", line 1425, in pandas._libs.tslib._get_utcoffset (pandas/_libs/tslib.c:27137)
AttributeError: 'Int64Index' object has no attribute 'utcoffset'
Exception ignored in: 'pandas._libs.tslib._localize_tso'
Traceback (most recent call last):
  File "pandas/_libs/tslib.pyx", line 4332, in pandas._libs.tslib._get_dst_info (pandas/_libs/tslib.c:72500)
  File "pandas/_libs/tslib.pyx", line 1425, in pandas._libs.tslib._get_utcoffset (pandas/_libs/tslib.c:27137)
AttributeError: 'Int64Index' object has no attribute 'utcoffset'
Exception ignored in: 'pandas._libs.tslib._localize_tso'
Traceback (most recent call last):
  File "pandas/_libs/tslib.pyx", line 4332, in pandas._libs.tslib._get_dst_info (pandas/_libs/tslib.c:72500)
  File "pandas/_libs/tslib.pyx", line 1425, in pandas._libs.tslib._get_utcoffset (pandas/_libs/tslib.c:27137)
AttributeError: 'Int64Index' object has no attribute 'utcoffset'
Exception ignored in: 'pandas._libs.tslib._localize_tso'
Traceback (most recent call last):
  File "pandas/_libs/tslib.pyx", line 4332, in pandas._libs.tslib._get_dst_info (pandas/_libs/tslib.c:72500)
  File "pandas/_libs/tslib.pyx", line 1425, in pandas._libs.tslib._get_utcoffset (pandas/_libs/tslib.c:27137)
AttributeError: 'Int64Index' object has no attribute 'utcoffset'
Exception ignored in: 'pandas._libs.tslib._localize_tso'
Traceback (most recent call last):
  File "pandas/_libs/tslib.pyx", line 4332, in pandas._libs.tslib._get_dst_info (pandas/_libs/tslib.c:72500)
  File "pandas/_libs/tslib.pyx", line 1425, in pandas._libs.tslib._get_utcoffset (pandas/_libs/tslib.c:27137)
AttributeError: 'Int64Index' object has no attribute 'utcoffset'
Exception ignored in: 'pandas._libs.tslib._localize_tso'
Traceback (most recent call last):
  File "pandas/_libs/tslib.pyx", line 4332, in pandas._libs.tslib._get_dst_info (pandas/_libs/tslib.c:72500)
  File "pandas/_libs/tslib.pyx", line 1425, in pandas._libs.tslib._get_utcoffset (pandas/_libs/tslib.c:27137)
AttributeError: 'Int64Index' object has no attribute 'utcoffset'

In [6]:
# Concatenate the columns to get a large series of spread prices.
spreads = pd.concat((data[year].dropna() for year in data))

In [7]:
date_mask = spreads.index.droplevel(0).isin(termstructure.index)
filtered_spreads = spreads.where(date_mask).dropna()

In [8]:
# These are the spread prices loaded from file.
filtered_spreads[:,"2017-04-27"]


Out[8]:
month
6    -0.200
7     0.450
8    -0.300
9     0.375
10    0.100
11    0.200
12   -0.800
dtype: float64

In [9]:
# These are spread prices calculated from the term structure.
# Why are these different from each other?
termstructure.loc["2017-04-27"].aggregate(lambda x: [-x[i] + 2*x[i+1] - x[i+2] for i in range(6)])


Out[9]:
[-0.27000000000000135,
 0.46000000000000085,
 -0.3100000000000005,
 0.41999999999999815,
 0.050000000000004263,
 0.24999999999999645]

Problem

Doesn't seem like I can fit the new data of spread prices to the term structures.

Better to get the spread prices in their necessary representation directly from the old proven term structure data.


In [10]:
assert expirations.shape == termstructure.shape
assert expirations.index.equals(termstructure.index)

In [11]:
long_prices = termstructure.apply(lambda x: [np.nan] + [2*x[i] - x[i-1] - x[i+1] for i in range(1, len(x) - 1)] + [np.nan],
                                  axis=1, reduce=False)
long_prices.dropna(axis=(0,1), how="all", inplace=True)

In [12]:
threshold_date = "2006-10-23"

In [13]:
long_prices_thresh = long_prices[threshold_date:]
day_diff = pd.Series(long_prices_thresh.index[1:]- long_prices_thresh.index[:-1])
count_day_diff = Counter(day_diff)

In [14]:
# There's not much difference between two index dates lying next to each other.
# Further modifications aren't necessary.
count_day_diff


Out[14]:
Counter({Timedelta('1 days 00:00:00'): 2081,
         Timedelta('2 days 00:00:00'): 24,
         Timedelta('3 days 00:00:00'): 478,
         Timedelta('4 days 00:00:00'): 70,
         Timedelta('5 days 00:00:00'): 2})

In [15]:
termstructure_thresh = termstructure[threshold_date:]
assert len(termstructure_thresh) == len(long_prices_thresh)

In [16]:
days_into_future = 1

In [17]:
y = pd.concat((long_prices_thresh.iloc[days_into_future:][column]
               for column in long_prices_thresh))
x = termstructure_thresh.iloc[:-days_into_future]
for i in range(len(long_prices_thresh.columns) - 1):
    x = x.append(termstructure_thresh.iloc[:-days_into_future])
assert len(y) == len(x)
x = x.where((np.tile(y.notnull().values, (len(x.columns),1)).T)).dropna(axis=0, how="all")
y = y.where(y.notnull()).dropna()
assert len(y) == len(x)

In [18]:
#x.to_hdf("data/futureswise_mapping.h5", "x", complevel=9, complib="bzip2")
#y.to_hdf("data/futureswise_mapping.h5", "y", complevel=9, complib="bzip2")

Not we've prepared the data

Cool. So $x$ is the data mapping to target $y$. But now I'd like to split in into test and validation set similar to the other representation.


In [19]:
split_between_years = np.append(0, y.index.get_loc("2017-05-11") + 1)
splits = split_between_years
split_between_years


Out[19]:
array([    0,  2655,  5310,  7965, 10593, 13047, 15193])

In [20]:
def split_indices(a: int, b: int, val_split=0.15, test_split=0.15):
    half = int((b - a) / 2)
    val_len = int(half * val_split)
    test_len = int(half * test_split)
    val1 = a + half - val_len - test_len
    test1 = a + half - test_len
    data = a + half
    val2 = b - val_len - test_len
    test2 = b - test_len
    return a, val1, test1, data, val2, test2, b

In [21]:
def splitted_dataset(dataset, splits: Sequence):
    indices = pd.DataFrame((split_indices(splits[i], splits[i+1]) for i in range(len(splits) - 1)),
                           columns=("data1", "val1", "test1", "data2", "val2", "test2", "end"))
    d1, v1, t1, d2, v2, t2 = ([dataset.iloc[a:b] for a, b in zip(indices.iloc[:,i], indices.iloc[:,i+1])]
                              for i in range(len(indices.columns) - 1))
    return pd.concat(d1 + d2), pd.concat(v1 + v2), pd.concat(t1 + t2)

In [22]:
indices = pd.DataFrame((split_indices(splits[i], splits[i+1]) for i in range(len(splits) - 1)),
                       columns=("data1", "val1", "test1", "data2", "val2", "test2", "end"))
indices


Out[22]:
data1 val1 test1 data2 val2 test2 end
0 0 929 1128 1327 2257 2456 2655
1 2655 3584 3783 3982 4912 5111 5310
2 5310 6239 6438 6637 7567 7766 7965
3 7965 8885 9082 9279 10199 10396 10593
4 10593 11452 11636 11820 12679 12863 13047
5 13047 13800 13960 14120 14873 15033 15193

In [23]:
xdata, xval, xtest = splitted_dataset(x, split_between_years)
assert len(xdata) + len(xval) + len(xtest) == len(x)
ydata, yval, ytest = splitted_dataset(y, split_between_years)
assert len(ydata) + len(yval) + len(ytest) == len(y)

Incorporate information about futures month into data

It seems a bit difficult to train a large network which has distinguish different spread prices by month by itself. So my idea would be to train twelve different networks (because there are twelve months obviously) directly with the different months' prices.

Now I have just to think about how I get these spreads out of the data. Maybe with the 8_m_symbols.csv?


In [24]:
symbols_to_month = {"F":"January",
                    "G":"February",
                    "H":"March",
                    "J":"April",
                    "K":"May",
                    "M":"June",
                    "N":"July",
                    "Q":"August",
                    "U":"September",
                    "V":"October",
                    "X":"November",
                    "Z":"December"}

In [25]:
def year_month_repr(self):
    return str((self.year, self.month))
YearMonth = namedtuple("YearMonth", ["year", "month"])
YearMonth.__repr__ = year_month_repr
symbols_short = symbols.applymap(lambda x: YearMonth(int(x[-3:-1]), operator.indexOf(calendar.month_name, symbols_to_month[x[0]])) if isinstance(x, str) else x)

In [26]:
count_ym = Counter(symbols_short.values.reshape(symbols_short.size))

In [27]:
def filter_by_yearmonth(dataframe, year, month):
    return dataframe.applymap(lambda x: True if isinstance(x, YearMonth) and x.year == year and x.month == month else False)

In [28]:
mask_dict = {key:filter_by_yearmonth(symbols_short, key.year, key.month) for key in count_ym
             if isinstance(key, YearMonth)}

In [29]:
long_prices = termstructure.apply(lambda x: [np.nan] + [2*x[i] - x[i-1] - x[i+1] for i in range(1, len(x) - 1)] + [np.nan],
                                  axis=1, reduce=False)
assert long_prices.index.equals(symbols_short.index)
assert long_prices.shape == symbols_short.shape

Some explanation in between

Up to now I prepared masks for all spread prices in mask_dict. With this one can select spreads with the where method.

Next I need all the x-y-mappings from the term structure to the spread prices of the next day with appropricate shape. There will be such a mapping for each month.


In [30]:
test_16_05 = long_prices.where(mask_dict[YearMonth(16,5)]).dropna(how="all", axis=(0,1))

In [31]:
y_16_05 = pd.concat((test_16_05.iloc[days_into_future:][col] for col in test_16_05)).dropna().sort_index()
x_16_05 = termstructure.loc[test_16_05.index].iloc[:-days_into_future]

In [32]:
c_16_05 = Counter(y_16_05.index - x_16_05.index)
for key in c_16_05:
    assert key.days <= 5

The code above was just playing around and testing. Now let's make a function out of this.


In [33]:
def x_y_mapping(long_prices, termstructure, mask, days_into_future=1, yearmonth=None):
    prices = long_prices.where(mask).dropna(how="all", axis=(0,1))
    try:
        if yearmonth:
            assert not prices.empty, "There are no spread prices for year {} and month {}.".format(yearmonth.year,
                                                                                                   yearmonth.month)
        else:
            assert not prices.empty
    except AssertionError as e:
        print(e)
        return None
    y = pd.concat((prices.iloc[days_into_future:][column] for column in prices)).dropna().sort_index()
    x = termstructure.loc[prices.index].iloc[:-days_into_future]
    assert len(x) == len(y)
    counter = Counter(y.index - x.index)
    for key in counter:
        try:
            if yearmonth:
                assert key.days <= 5, str(counter) + "\n" + "At year {} and month {}".format(yearmonth.year, yearmonth.month)
            else:
                assert key.days <= 5, str(counter)
        except AssertionError as e:
            print(e)
    return x, y

In [34]:
mapping_dict = {key:x_y_mapping(long_prices, termstructure, mask_dict[key], yearmonth=key) for key in mask_dict}


There are no spread prices for year 4 and month 5.
There are no spread prices for year 4 and month 6.
There are no spread prices for year 4 and month 8.
There are no spread prices for year 4 and month 11.
There are no spread prices for year 5 and month 2.
There are no spread prices for year 4 and month 9.
There are no spread prices for year 5 and month 5.
There are no spread prices for year 5 and month 1.
There are no spread prices for year 5 and month 8.
There are no spread prices for year 5 and month 3.
There are no spread prices for year 5 and month 11.
There are no spread prices for year 5 and month 6.
There are no spread prices for year 6 and month 2.
There are no spread prices for year 5 and month 10.
There are no spread prices for year 6 and month 5.
There are no spread prices for year 5 and month 12.
There are no spread prices for year 6 and month 8.
There are no spread prices for year 6 and month 3.
There are no spread prices for year 6 and month 6.
There are no spread prices for year 6 and month 9.
Counter({Timedelta('1 days 00:00:00'): 58, Timedelta('3 days 00:00:00'): 12, Timedelta('4 days 00:00:00'): 3, Timedelta('37 days 00:00:00'): 1, Timedelta('5 days 00:00:00'): 1})
At year 7 and month 5
There are no spread prices for year 17 and month 12.

In [35]:
sorted_keys = [item[0] for item in mapping_dict.items() if item[1]]
sorted_keys.sort()
years = list({key[0] for key in sorted_keys})
years.sort()
[str("%02d" % year) for year in years]


Out[35]:
['04', '06', '07', '08', '09', '10', '11', '12', '13', '14', '15', '16', '17']

In [36]:
for key in sorted_keys:
    x, y = mapping_dict[key]
    plt.plot(np.arange(len(y)), y.values)
plt.xlim(0,130)
plt.grid()
plt.show()



In [37]:
y = pd.concat((mapping_dict[key][1] for key in sorted_keys), keys=sorted_keys, names=["year", "month"])
x = pd.concat((mapping_dict[key][0] for key in sorted_keys), keys=sorted_keys, names=["year", "month"])
assert len(x) == len(y)

In [38]:
x.groupby("year").std().plot()
y.groupby("year").std().plot(secondary_y=True)
plt.xticks(np.arange(4,18), np.arange(2004,2018))
plt.show()



In [39]:
x.groupby("month").std().plot()
y.groupby("month").std().plot(secondary_y=True)
plt.xticks(np.arange(1,13), calendar.month_abbr[1:])
plt.show()



In [40]:
for month_nr in range(1,13):
    assert len(y.loc(axis=0)[:,month_nr]) == len(x.loc(axis=0)[:,month_nr])
    print(calendar.month_name[month_nr], len(y.loc(axis=0)[:,month_nr]))


January 1281
February 1239
March 1238
April 1329
May 1289
June 1279
July 1328
August 1224
September 1236
October 1281
November 1256
December 1213

In [41]:
#x.to_hdf("data/futures_per_year_and_month.h5", "x", complevel=9, complib="bzip2")
#y.to_hdf("data/futures_per_year_and_month.h5", "y", complevel=9, complib="bzip2")

I'll just start here anew

Idea: Work on the ordering of the data. At the moment there is this mapping:

  • M1 ... M8 -> Spread

Better would be something like this:

  • Jan ... Dec -> Spread
  • Just map the second level of the row multiindex to column headers

In [42]:
x = pd.read_hdf("data/futures_per_year_and_month.h5", "x")
y = pd.read_hdf("data/futures_per_year_and_month.h5", "y")

In [43]:
x.loc(axis=0)[:, :, "2017-05-10"]


Out[43]:
M1 M2 M3 M4 M5 M6 M7 M8
year month date
17 6 2017-05-10 11.55 12.47 13.68 14.4 15.3 15.73 16.1 16.33
7 2017-05-10 11.55 12.47 13.68 14.4 15.3 15.73 16.1 16.33
8 2017-05-10 11.55 12.47 13.68 14.4 15.3 15.73 16.1 16.33
9 2017-05-10 11.55 12.47 13.68 14.4 15.3 15.73 16.1 16.33
10 2017-05-10 11.55 12.47 13.68 14.4 15.3 15.73 16.1 16.33
11 2017-05-10 11.55 12.47 13.68 14.4 15.3 15.73 16.1 16.33

In [44]:
y.loc(axis=0)[:, :, "2017-05-5"]


Out[44]:
year  month  date      
17    6      2017-05-05   -0.10
      7      2017-05-05    0.43
      8      2017-05-05   -0.32
      9      2017-05-05    0.43
      10     2017-05-05    0.05
      11     2017-05-05    0.24
dtype: float64

In [45]:
symbols.loc["2017-05-5"]


Out[45]:
M1    K (May 17)
M2    M (Jun 17)
M3    N (Jul 17)
M4    Q (Aug 17)
M5    U (Sep 17)
M6    V (Oct 17)
M7    X (Nov 17)
M8    Z (Dec 17)
Name: 2017-05-05 00:00:00, dtype: object

In [46]:
symbols_short.loc["2017-01-20"]


Out[46]:
M1    (17, 2)
M2    (17, 3)
M3    (17, 4)
M4    (17, 5)
M5    (17, 6)
M6    (17, 7)
M7    (17, 8)
M8    (17, 9)
Name: 2017-01-20 00:00:00, dtype: object

In [47]:
x.apply(lambda x: [i for i in 
                   islice(cycle(range(1, 13)), x.name[1] + 10, x.name[1] + 18)], axis=1)


Out[47]:
M1 M2 M3 M4 M5 M6 M7 M8
year month date
4 7 2004-05-24 6 7 8 9 10 11 12 1
2004-05-25 6 7 8 9 10 11 12 1
2004-05-26 6 7 8 9 10 11 12 1
2004-05-27 6 7 8 9 10 11 12 1
2004-05-28 6 7 8 9 10 11 12 1
2004-06-01 6 7 8 9 10 11 12 1
2004-06-02 6 7 8 9 10 11 12 1
2004-06-03 6 7 8 9 10 11 12 1
2004-06-04 6 7 8 9 10 11 12 1
2004-06-07 6 7 8 9 10 11 12 1
2004-06-08 6 7 8 9 10 11 12 1
2004-06-09 6 7 8 9 10 11 12 1
2004-06-10 6 7 8 9 10 11 12 1
2004-06-14 6 7 8 9 10 11 12 1
10 2004-08-23 9 10 11 12 1 2 3 4
2004-08-24 9 10 11 12 1 2 3 4
2004-08-25 9 10 11 12 1 2 3 4
2004-08-26 9 10 11 12 1 2 3 4
2004-08-27 9 10 11 12 1 2 3 4
2004-08-30 9 10 11 12 1 2 3 4
2004-08-31 9 10 11 12 1 2 3 4
2004-09-01 9 10 11 12 1 2 3 4
2004-09-02 9 10 11 12 1 2 3 4
2004-09-03 9 10 11 12 1 2 3 4
2004-09-07 9 10 11 12 1 2 3 4
2004-09-08 9 10 11 12 1 2 3 4
2004-09-09 9 10 11 12 1 2 3 4
2004-09-10 9 10 11 12 1 2 3 4
2004-09-13 9 10 11 12 1 2 3 4
6 1 2005-11-21 12 1 2 3 4 5 6 7
... ... ... ... ... ... ... ... ... ... ...
17 10 2017-04-21 9 10 11 12 1 2 3 4
2017-04-24 9 10 11 12 1 2 3 4
2017-04-25 9 10 11 12 1 2 3 4
2017-04-26 9 10 11 12 1 2 3 4
2017-04-27 9 10 11 12 1 2 3 4
2017-04-28 9 10 11 12 1 2 3 4
2017-05-01 9 10 11 12 1 2 3 4
2017-05-02 9 10 11 12 1 2 3 4
2017-05-03 9 10 11 12 1 2 3 4
2017-05-04 9 10 11 12 1 2 3 4
2017-05-05 9 10 11 12 1 2 3 4
2017-05-08 9 10 11 12 1 2 3 4
2017-05-09 9 10 11 12 1 2 3 4
2017-05-10 9 10 11 12 1 2 3 4
11 2017-04-19 10 11 12 1 2 3 4 5
2017-04-20 10 11 12 1 2 3 4 5
2017-04-21 10 11 12 1 2 3 4 5
2017-04-24 10 11 12 1 2 3 4 5
2017-04-25 10 11 12 1 2 3 4 5
2017-04-26 10 11 12 1 2 3 4 5
2017-04-27 10 11 12 1 2 3 4 5
2017-04-28 10 11 12 1 2 3 4 5
2017-05-01 10 11 12 1 2 3 4 5
2017-05-02 10 11 12 1 2 3 4 5
2017-05-03 10 11 12 1 2 3 4 5
2017-05-04 10 11 12 1 2 3 4 5
2017-05-05 10 11 12 1 2 3 4 5
2017-05-08 10 11 12 1 2 3 4 5
2017-05-09 10 11 12 1 2 3 4 5
2017-05-10 10 11 12 1 2 3 4 5

15193 rows × 8 columns


In [48]:
symbols_month = symbols_short.applymap(lambda x: x if isinstance(x, float) else x[1]).interpolate(axis=1).apply(
                    lambda x: [i for i in islice(cycle(range(1, 13)), int(x[3]) + 8, int(x[3]) + 16)], axis=1)

In [49]:
test_x = x.loc(axis=0)[16,1,:].iloc[0]
test_x.index = symbols_month.loc[test_x.name[2]]
symbols_month.loc[test_x.name[2]]


Out[49]:
M1     7
M2     8
M3     9
M4    10
M5    11
M6    12
M7     1
M8     2
Name: 2015-06-17 00:00:00, dtype: int64

In [50]:
def month_to_header(x):
    global symbols_month
    x.index = symbols_month.loc[x.name[2]]
    return pd.Series({k:x.get(k, default=np.nan) for k in range(1, 13)})
x_yearly = x.apply(month_to_header, axis=1)

In [51]:
# Some tests
assert x.index.equals(x_yearly.index)
for xi, xyi in zip(x.isnull().sum(axis=1), x_yearly.isnull().sum(axis=1)):
    assert xi == xyi - 4  # Because 12 month minus 8 months is 4 months

In [52]:
# More tests
for month in range(1, 13):
    assert len(x.loc(axis=0)[:, month]) == len(y.loc(axis=0)[:, month])
    assert len(x_yearly.loc(axis=0)[:, month]) == len(y.loc(axis=0)[:, month])
#x_yearly.to_hdf("data/futures_per_year_and_month.h5", "x_yearly", complevel=9, complib="bzip2")

In [53]:
print(pd.concat([x_yearly]*3, axis=1).loc(axis=0)[16,1].iloc[:,6:18].to_string())


               7      8      9      10     11     12     1      2      3      4      5      6 
date                                                                                          
2015-06-17  15.65  16.40  17.01  17.54  17.91  18.05  18.75  19.10    NaN    NaN    NaN    NaN
2015-06-18  14.99  15.90  16.60  17.23  17.63  17.80  18.46  18.80    NaN    NaN    NaN    NaN
2015-06-19  15.45  16.35  16.95  17.50  17.82  18.00  18.61  18.90    NaN    NaN    NaN    NaN
2015-06-22  14.55  15.55  16.34  16.90  17.34  17.56  18.18  18.55    NaN    NaN    NaN    NaN
2015-06-23  14.10  15.25  16.10  16.70  17.18  17.42  18.05  18.38    NaN    NaN    NaN    NaN
2015-06-24  14.58  15.62  16.37  16.95  17.33  17.52  18.15  18.42    NaN    NaN    NaN    NaN
2015-06-25  14.65  15.62  16.33  16.95  17.30  17.55  18.18  18.44    NaN    NaN    NaN    NaN
2015-06-26  14.55  15.46  16.20  16.75  17.15  17.35  18.05  18.33    NaN    NaN    NaN    NaN
2015-06-29  17.40  17.42  17.68  17.93  18.17  18.24  18.82  19.10    NaN    NaN    NaN    NaN
2015-06-30  17.33  17.40  17.60  17.90  18.10  18.15  18.70  18.90    NaN    NaN    NaN    NaN
2015-07-01  15.92  16.19  16.60  16.95  17.25  17.40  17.97  18.28    NaN    NaN    NaN    NaN
2015-07-02  16.90  17.02  17.32  17.50  17.75  17.80  18.35  18.57    NaN    NaN    NaN    NaN
2015-07-06  17.25  17.40  17.65  17.90  18.05  18.10  18.65  18.92    NaN    NaN    NaN    NaN
2015-07-07  16.35  16.68  17.09  17.41  17.65  17.75  18.30  18.60    NaN    NaN    NaN    NaN
2015-07-08  18.45  18.20  18.24  18.32  18.42  18.40  18.90  19.05    NaN    NaN    NaN    NaN
2015-07-09  18.84  18.66  18.65  18.69  18.75  18.75  19.22  19.35    NaN    NaN    NaN    NaN
2015-07-10  16.75  17.15  17.56  17.85  18.05  18.12  18.65  18.85    NaN    NaN    NaN    NaN
2015-07-13  14.47  15.58  16.20  16.70  17.02  17.18  17.85  18.09    NaN    NaN    NaN    NaN
2015-07-14  14.09  15.56  16.30  16.82  17.10  17.30  17.90  18.20    NaN    NaN    NaN    NaN
2015-07-15  13.96  15.38  16.07  16.60  16.90  17.15  17.75  18.10    NaN    NaN    NaN    NaN
2015-07-16  12.93  14.48  15.40  16.15  16.60  16.80  17.45  17.80    NaN    NaN    NaN    NaN
2015-07-17  12.90  14.44  15.41  16.10  16.57  16.80  17.50  17.75    NaN    NaN    NaN    NaN
2015-07-20  12.53  14.23  15.20  15.90  16.35  16.65  17.40  17.75    NaN    NaN    NaN    NaN
2015-07-21  12.40  14.20  15.20  16.00  16.50  16.75  17.45  17.75    NaN    NaN    NaN    NaN
2015-07-22    NaN  14.20  15.30  16.10  16.55  16.90  17.50  17.90  18.00    NaN    NaN    NaN
2015-07-23    NaN  14.25  15.42  16.18  16.60  16.90  17.50  17.90  18.10    NaN    NaN    NaN
2015-07-24    NaN  14.95  15.75  16.46  16.80  17.10  17.65  18.15  18.25    NaN    NaN    NaN
2015-07-27    NaN  15.78  16.33  16.75  17.05  17.25  17.85  18.26  18.40    NaN    NaN    NaN
2015-07-28    NaN  14.40  15.50  16.15  16.55  16.80  17.55  17.95  18.09    NaN    NaN    NaN
2015-07-29    NaN  14.05  15.35  16.15  16.60  16.90  17.50  17.92  18.11    NaN    NaN    NaN
2015-07-30    NaN  13.95  15.31  16.09  16.55  16.80  17.49  17.88  18.10    NaN    NaN    NaN
2015-07-31    NaN  14.00  15.30  16.10  16.55  16.85  17.45  17.85  18.08    NaN    NaN    NaN
2015-08-03    NaN  13.90  15.10  15.85  16.30  16.60  17.25  17.75  17.95    NaN    NaN    NaN
2015-08-04    NaN  14.15  15.25  16.00  16.45  16.75  17.35  17.80  18.05    NaN    NaN    NaN
2015-08-05    NaN  13.95  15.20  15.94  16.40  16.70  17.30  17.75  18.00    NaN    NaN    NaN
2015-08-06    NaN  14.50  15.40  16.10  16.50  16.80  17.40  17.85  18.07    NaN    NaN    NaN
2015-08-07    NaN  14.25  15.20  15.90  16.30  16.60  17.25  17.75  17.95    NaN    NaN    NaN
2015-08-10    NaN  13.59  14.86  15.70  16.15  16.45  17.18  17.62  17.85    NaN    NaN    NaN
2015-08-11    NaN  14.50  15.45  16.10  16.50  16.85  17.45  17.90  18.05    NaN    NaN    NaN
2015-08-12    NaN  14.30  15.25  16.00  16.50  16.75  17.35  17.79  18.07    NaN    NaN    NaN
2015-08-13    NaN  14.15  15.05  15.85  16.35  16.60  17.35  17.74  18.01    NaN    NaN    NaN
2015-08-14    NaN  14.29  15.10  15.80  16.35  16.55  17.25  17.65  17.88    NaN    NaN    NaN
2015-08-17    NaN  13.85  15.00  15.75  16.20  16.60  17.20  17.70  17.90    NaN    NaN    NaN
2015-08-18    NaN  14.30  15.20  16.05  16.49  16.80  17.35  17.81  18.05    NaN    NaN    NaN
2015-08-19    NaN    NaN  15.70  16.30  16.75  16.95  17.55  17.95  18.25  18.55    NaN    NaN
2015-08-20    NaN    NaN  17.30  17.35  17.50  17.60  18.15  18.50  18.70  19.00    NaN    NaN
2015-08-21    NaN    NaN  19.90  18.65  18.35  18.25  18.70  19.05  19.17  19.45    NaN    NaN
2015-08-24    NaN    NaN  25.15  22.50  21.30  20.67  20.63  20.60  20.66  20.70    NaN    NaN
2015-08-25    NaN    NaN  25.35  22.60  21.47  20.75  20.75  20.83  20.83  20.90    NaN    NaN
2015-08-26    NaN    NaN  23.70  21.50  20.70  20.10  20.15  20.20  20.25  20.40    NaN    NaN
2015-08-27    NaN    NaN  24.20  21.50  20.73  20.15  20.34  20.43  20.45  20.62    NaN    NaN
2015-08-28    NaN    NaN  24.40  21.85  21.40  21.00  21.15  21.25  21.22  21.30    NaN    NaN
2015-08-31    NaN    NaN  26.45  23.45  22.60  22.10  22.25  22.30  22.30  22.20    NaN    NaN
2015-09-01    NaN    NaN  29.70  25.85  24.65  23.90  23.98  23.97  23.88  23.88    NaN    NaN
2015-09-02    NaN    NaN  25.95  23.85  23.25  22.75  22.90  23.05  22.95  23.05    NaN    NaN
2015-09-03    NaN    NaN  25.65  23.90  23.35  22.90  23.10  23.20  23.15  23.20    NaN    NaN
2015-09-04    NaN    NaN  27.95  25.80  24.75  24.00  23.90  23.90  23.80  23.80    NaN    NaN
2015-09-08    NaN    NaN  24.60  23.60  23.20  22.65  22.85  22.90  22.85  23.00    NaN    NaN
2015-09-09    NaN    NaN  25.85  24.20  23.65  23.00  23.15  23.10  23.05  23.05    NaN    NaN
2015-09-10    NaN    NaN  24.65  23.40  23.00  22.55  22.65  22.80  22.80  22.90    NaN    NaN
2015-09-11    NaN    NaN  23.75  22.95  22.70  22.30  22.60  22.65  22.70  22.75    NaN    NaN
2015-09-14    NaN    NaN  23.70  22.90  22.65  22.40  22.50  22.55  22.60  22.74    NaN    NaN
2015-09-15    NaN    NaN  22.45  20.45  20.63  20.62  20.95  21.17  21.26  21.40    NaN    NaN
2015-09-16    NaN    NaN    NaN  19.00  19.30  19.25  19.60  19.95  20.05  20.36  21.20    NaN
2015-09-17    NaN    NaN    NaN  20.40  20.10  19.96  20.20  20.50  20.55  20.77  20.78    NaN
2015-09-18    NaN    NaN    NaN  21.50  20.95  20.80  21.00  21.15  21.25  21.45  21.40    NaN
2015-09-21    NaN    NaN    NaN  19.68  19.82  19.88  20.20  20.40  20.50  20.80  20.95    NaN
2015-09-22    NaN    NaN    NaN  21.75  21.14  20.90  21.05  21.25  21.30  21.50  21.50    NaN
2015-09-23    NaN    NaN    NaN  21.10  20.70  20.50  20.75  20.95  20.90  21.20  21.16    NaN
2015-09-24    NaN    NaN    NaN  21.90  21.19  20.97  21.20  21.40  21.40  21.65  21.61    NaN
2015-09-25    NaN    NaN    NaN  22.35  21.65  21.40  21.55  21.72  21.65  21.92  21.90    NaN
2015-09-28    NaN    NaN    NaN  24.05  22.73  22.15  22.25  22.30  22.25  22.44  22.40    NaN
2015-09-29    NaN    NaN    NaN  23.85  22.70  22.25  22.35  22.50  22.50  22.60  22.55    NaN
2015-09-30    NaN    NaN    NaN  23.02  22.14  21.78  22.00  22.20  22.20  22.45  22.30    NaN
2015-10-01    NaN    NaN    NaN  22.20  21.40  21.20  21.55  21.80  21.85  22.05  22.17    NaN
2015-10-02    NaN    NaN    NaN  21.05  20.50  20.44  20.75  21.10  21.20  21.50  21.79    NaN
2015-10-05    NaN    NaN    NaN  19.90  19.50  19.50  19.90  20.25  20.40  20.70  20.85    NaN
2015-10-06    NaN    NaN    NaN  20.30  19.95  19.95  20.35  20.55  20.73  21.05  21.10    NaN
2015-10-07    NaN    NaN    NaN  19.45  19.20  19.20  19.65  20.05  20.15  20.45  20.66    NaN
2015-10-08    NaN    NaN    NaN  18.45  18.53  18.80  19.25  19.70  19.85  20.15  20.30    NaN
2015-10-09    NaN    NaN    NaN  18.15  18.40  18.75  19.30  19.70  19.90  20.25  20.35    NaN
2015-10-12    NaN    NaN    NaN  16.70  17.20  17.66  18.35  18.79  19.00  19.45  19.75    NaN
2015-10-13    NaN    NaN    NaN  18.28  18.68  18.94  19.50  19.78  19.96  20.22  20.40    NaN
2015-10-14    NaN    NaN    NaN  18.32  18.72  19.12  19.74  20.06  20.21  20.58  20.65    NaN
2015-10-15    NaN    NaN    NaN  16.55  17.15  17.65  18.45  18.95  19.10  19.55  19.70    NaN
2015-10-16    NaN    NaN    NaN  16.10  17.10  17.74  18.55  18.97  19.20  19.58  19.67    NaN
2015-10-19    NaN    NaN    NaN  15.25  16.00  16.75  17.70  18.20  18.43  18.87  19.08    NaN
2015-10-20    NaN    NaN    NaN  16.10  17.00  17.60  18.45  18.90  19.10  19.50  19.75    NaN
2015-10-21    NaN    NaN    NaN    NaN  18.20  18.60  19.25  19.60  19.75  20.19  20.35  20.50
2015-10-22    NaN    NaN    NaN    NaN  16.05  16.81  17.69  18.18  18.42  18.87  19.11  19.40
2015-10-23    NaN    NaN    NaN    NaN  16.65  17.25  18.05  18.49  18.72  19.15  19.37  19.55
2015-10-26    NaN    NaN    NaN    NaN  16.90  17.50  18.33  18.79  19.03  19.47  19.70  19.91
2015-10-27    NaN    NaN    NaN    NaN  16.50  17.15  18.03  18.45  18.75  19.15  19.30  19.65
2015-10-28    NaN    NaN    NaN    NaN  16.00  16.80  17.70  18.20  18.43  18.83  19.04  19.35
2015-10-29    NaN    NaN    NaN    NaN  16.17  16.98  17.84  18.35  18.65  19.01  19.20  19.45
2015-10-30    NaN    NaN    NaN    NaN  16.65  17.30  18.18  18.65  18.87  19.29  19.50  19.55
2015-11-02    NaN    NaN    NaN    NaN  15.90  16.60  17.51  18.02  18.35  18.72  19.00  19.03
2015-11-03    NaN    NaN    NaN    NaN  16.13  16.93  17.80  18.25  18.50  18.98  19.17  19.34
2015-11-04    NaN    NaN    NaN    NaN  16.75  17.40  18.16  18.65  18.85  19.20  19.45  19.55
2015-11-05    NaN    NaN    NaN    NaN  16.05  16.80  17.70  18.20  18.54  18.95  19.20  19.37
2015-11-06    NaN    NaN    NaN    NaN  15.70  16.50  17.37  17.95  18.35  18.75  19.00  19.20
2015-11-09    NaN    NaN    NaN    NaN  17.04  17.47  18.09  18.55  18.75  19.15  19.35  19.50
2015-11-10    NaN    NaN    NaN    NaN  16.25  16.90  17.70  18.20  18.50  18.95  19.20  19.35
2015-11-11    NaN    NaN    NaN    NaN  16.80  17.40  18.00  18.50  18.80  19.19  19.45  19.60
2015-11-12    NaN    NaN    NaN    NaN  18.58  18.75  19.12  19.50  19.75  20.07  20.20  20.35
2015-11-13    NaN    NaN    NaN    NaN  20.34  19.87  20.15  20.35  20.50  20.70  20.85  20.95
2015-11-16    NaN    NaN    NaN    NaN  18.10  17.85  18.45  18.85  19.10  19.50  19.70  19.85
2015-11-17    NaN    NaN    NaN    NaN  18.95  18.55  18.77  19.18  19.40  19.82  20.02  20.44
2015-11-18  20.10    NaN    NaN    NaN    NaN  17.80  18.35  18.80  19.00  19.40  19.60  19.85
2015-11-19  20.45    NaN    NaN    NaN    NaN  18.15  18.85  19.29  19.45  19.83  20.00  20.20
2015-11-20  20.29    NaN    NaN    NaN    NaN  17.40  18.30  18.85  19.15  19.60  19.80  20.01
2015-11-23  20.00    NaN    NaN    NaN    NaN  17.00  17.80  18.45  18.79  19.26  19.47  19.75
2015-11-24  20.32    NaN    NaN    NaN    NaN  17.45  18.30  18.93  19.20  19.65  19.85  20.10
2015-11-25  20.26    NaN    NaN    NaN    NaN  16.80  17.75  18.55  18.90  19.38  19.63  19.90
2015-11-27  20.18    NaN    NaN    NaN    NaN  17.20  18.05  18.70  19.00  19.47  19.72  20.00
2015-11-30  20.10    NaN    NaN    NaN    NaN  17.00  17.80  18.46  18.75  19.25  19.48  19.75
2015-12-01  19.71    NaN    NaN    NaN    NaN  16.15  17.10  17.89  18.24  18.75  19.00  19.35
2015-12-02  19.93    NaN    NaN    NaN    NaN  16.80  17.55  18.20  18.50  19.05  19.30  19.56
2015-12-03  20.22    NaN    NaN    NaN    NaN  18.35  18.50  19.01  19.20  19.65  19.85  20.10
2015-12-04  19.71    NaN    NaN    NaN    NaN  16.25  17.30  18.00  18.35  18.88  19.13  19.40
2015-12-07  19.84    NaN    NaN    NaN    NaN  16.50  17.50  18.14  18.40  18.95  19.17  19.45
2015-12-08  20.20    NaN    NaN    NaN    NaN  17.80  18.30  18.77  18.95  19.45  19.70  19.90
2015-12-09  20.38    NaN    NaN    NaN    NaN  18.99  19.02  19.30  19.40  19.77  19.90  20.20
2015-12-10  20.60    NaN    NaN    NaN    NaN  19.35  19.10  19.48  19.65  20.00  20.20  20.40
2015-12-11  21.85    NaN    NaN    NaN    NaN  23.70  21.95  21.60  21.40  21.56  21.57  21.72
2015-12-14  21.00    NaN    NaN    NaN    NaN  22.45  20.42  20.45  20.43  20.65  20.82  21.00

Yearly structure has too many NaNs

When looking at future for month x make it so that for the inputs the rows are in the following order:

$x-1, x, x+1, \dots, x+6$ therefore having inputs with eight rows.


In [54]:
pd.concat([x_yearly.loc(axis=0)[:, 5]] * 3, axis=1).iloc[:, 5+10:5+18]


Out[54]:
4 5 6 7 8 9 10 11
year month date
7 5 2006-11-20 13.90 14.560 15.45 NaN NaN NaN NaN NaN
2006-12-27 14.44 15.060 15.31 NaN 15.755 NaN NaN NaN
2006-12-28 14.50 15.130 15.47 NaN 15.850 NaN NaN NaN
2006-12-29 14.53 15.250 15.48 NaN 15.950 NaN NaN NaN
2007-01-03 14.53 15.250 15.48 NaN 15.920 NaN NaN NaN
2007-01-04 14.30 15.150 15.45 NaN 15.950 NaN NaN NaN
2007-01-05 14.39 15.200 15.50 NaN 15.950 NaN NaN NaN
2007-01-08 14.44 15.000 15.46 NaN 15.900 NaN NaN NaN
2007-01-09 14.34 14.900 15.35 NaN 15.940 NaN NaN NaN
2007-01-10 14.39 14.800 15.30 NaN 15.900 NaN NaN NaN
2007-01-11 14.00 14.710 15.04 NaN 15.710 NaN NaN NaN
2007-01-12 13.80 14.410 14.88 NaN 15.440 NaN NaN NaN
2007-01-16 13.72 14.400 14.86 NaN 15.450 NaN NaN NaN
2007-01-17 13.57 14.360 14.71 NaN 15.300 NaN NaN NaN
2007-01-18 13.44 14.010 14.64 NaN 15.080 NaN NaN NaN
2007-01-19 13.41 13.970 14.59 NaN 15.040 NaN NaN NaN
2007-01-22 13.39 14.050 14.54 14.73 15.070 NaN NaN NaN
2007-01-23 13.40 13.910 14.40 14.68 14.900 NaN NaN NaN
2007-01-24 13.22 13.810 14.27 14.41 14.760 NaN NaN NaN
2007-01-25 13.51 13.930 14.11 14.43 14.770 NaN NaN NaN
2007-01-26 13.55 13.910 14.20 14.40 14.800 NaN NaN NaN
2007-01-29 13.45 13.890 14.21 14.40 14.800 NaN NaN NaN
2007-01-30 13.35 13.750 14.14 14.41 14.750 NaN NaN NaN
2007-01-31 13.15 13.700 14.06 14.33 14.660 NaN NaN NaN
2007-02-01 13.08 13.650 14.00 14.31 14.600 NaN NaN NaN
2007-02-02 13.05 13.710 14.05 14.26 14.680 NaN NaN NaN
2007-02-05 13.02 13.760 14.05 14.40 14.750 NaN NaN NaN
2007-02-06 12.83 13.395 13.89 14.32 14.600 NaN NaN NaN
2007-02-07 12.80 13.500 13.82 14.10 14.550 NaN NaN NaN
2007-02-08 12.86 13.520 13.85 14.13 14.600 NaN NaN NaN
... ... ... ... ... ... ... ... ... ... ...
17 5 2017-03-06 14.40 15.350 15.99 16.65 16.900 17.46 17.72 NaN
2017-03-07 14.45 15.350 16.05 16.65 16.950 17.55 17.75 NaN
2017-03-08 14.45 15.320 15.98 16.60 16.880 17.50 17.78 NaN
2017-03-09 14.45 15.250 15.95 16.55 16.850 17.55 17.77 NaN
2017-03-10 14.20 15.100 15.79 16.40 16.800 17.40 17.67 NaN
2017-03-13 13.85 14.870 15.65 16.35 16.700 17.29 17.60 NaN
2017-03-14 14.08 15.010 15.75 16.50 16.850 17.48 17.77 NaN
2017-03-15 13.71 14.710 15.54 16.26 16.600 17.25 17.55 NaN
2017-03-16 13.25 14.330 15.24 16.01 16.350 17.05 17.39 NaN
2017-03-17 13.25 14.230 15.10 15.85 16.300 16.90 17.30 NaN
2017-03-20 13.18 14.140 14.99 15.79 16.200 16.90 17.30 NaN
2017-03-21 13.77 14.550 15.35 16.13 16.510 17.19 17.52 NaN
2017-03-22 13.92 14.580 15.27 16.00 16.450 17.25 17.61 17.83
2017-03-23 14.30 14.770 15.50 16.20 16.700 17.40 17.75 17.94
2017-03-24 13.95 14.350 15.12 15.91 16.360 17.09 17.45 17.70
2017-03-27 13.60 14.050 14.73 15.53 16.020 16.82 17.22 17.50
2017-03-28 12.92 13.560 14.37 15.22 15.740 16.51 16.93 17.29
2017-03-29 12.94 13.500 14.23 15.07 15.600 16.35 16.80 17.10
2017-03-30 12.82 13.350 14.17 15.05 15.490 16.28 16.75 17.00
2017-03-31 13.28 13.580 14.25 15.00 15.500 16.26 16.70 17.03
2017-04-03 13.48 13.580 14.27 15.08 15.570 16.34 16.75 17.05
2017-04-04 13.24 13.370 14.13 14.95 15.400 16.10 16.50 16.82
2017-04-05 13.90 13.850 14.55 15.30 15.680 16.45 16.74 17.00
2017-04-06 13.60 13.550 14.15 14.96 15.350 16.05 16.47 16.78
2017-04-07 14.05 13.860 14.55 15.25 15.600 16.30 16.70 16.95
2017-04-10 15.05 14.330 14.79 15.44 15.760 16.48 16.80 17.07
2017-04-11 16.00 14.530 14.88 15.53 15.870 16.52 16.85 17.15
2017-04-12 16.30 14.950 15.02 15.59 15.900 16.56 16.88 17.18
2017-04-13 16.30 15.200 15.27 15.85 16.100 16.70 17.07 17.34
2017-04-17 15.07 14.300 14.55 15.22 15.550 16.25 16.60 16.85

1289 rows × 8 columns


In [55]:
x_yearly.loc(axis=0)[:, 5].count()


Out[55]:
1      622
2      838
3     1073
4     1289
5     1289
6     1289
7     1039
8      872
9      569
10     364
11     416
12     412
dtype: int64

How to easily extract spread prices from x


In [56]:
testblah = pd.concat([x_yearly] * 3, axis=1).iloc[:, 11:25]

In [57]:
x_spreads_yearly = testblah.apply(lambda x: [np.nan] + [2*x.iloc[i] - x.iloc[i-1] - x.iloc[i+1]
                                                        for i in range(1, len(x) - 1)] + [np.nan],
                                  axis=1).iloc[:, 1:13]

In [58]:
x.apply(lambda x: [np.nan] + [2*x[i] - x[i-1] - x[i+1] for i in range(1, len(x) - 1)] + [np.nan], axis=1).iloc[:, 1:7]


Out[58]:
M2 M3 M4 M5 M6 M7
year month date
4 7 2004-05-24 -1.13 NaN NaN NaN NaN NaN
2004-05-25 -0.61 NaN NaN NaN NaN NaN
2004-05-26 -1.19 NaN NaN NaN NaN NaN
2004-05-27 -1.02 NaN NaN NaN NaN NaN
2004-05-28 -1.23 NaN NaN NaN NaN NaN
2004-06-01 -0.92 NaN NaN NaN NaN NaN
2004-06-02 -0.89 NaN NaN NaN NaN NaN
2004-06-03 -0.86 NaN NaN NaN NaN NaN
2004-06-04 -0.51 NaN NaN NaN NaN NaN
2004-06-07 -0.18 NaN NaN NaN NaN NaN
2004-06-08 0.15 NaN NaN NaN NaN NaN
2004-06-09 0.61 NaN NaN NaN NaN NaN
2004-06-10 -0.40 NaN NaN NaN NaN NaN
2004-06-14 -0.40 NaN NaN NaN NaN NaN
10 2004-08-23 -0.51 NaN NaN NaN NaN NaN
2004-08-24 -0.29 NaN NaN NaN NaN NaN
2004-08-25 -0.61 NaN NaN NaN NaN NaN
2004-08-26 -0.56 NaN NaN NaN NaN NaN
2004-08-27 -0.16 NaN NaN NaN NaN NaN
2004-08-30 -0.24 NaN NaN NaN NaN NaN
2004-08-31 0.24 NaN NaN NaN NaN NaN
2004-09-01 0.25 NaN NaN NaN NaN NaN
2004-09-02 -0.26 NaN NaN NaN NaN NaN
2004-09-03 -0.10 NaN NaN NaN NaN NaN
2004-09-07 0.10 NaN NaN NaN NaN NaN
2004-09-08 0.46 NaN NaN NaN NaN NaN
2004-09-09 0.45 NaN NaN NaN NaN NaN
2004-09-10 0.33 NaN NaN NaN NaN NaN
2004-09-13 0.66 NaN NaN NaN NaN NaN
6 1 2005-11-21 0.51 NaN NaN NaN NaN NaN
... ... ... ... ... ... ... ... ...
17 10 2017-04-21 -0.62 0.21 -0.43 0.37 0.05 0.31
2017-04-24 -0.27 0.38 -0.38 0.46 0.07 0.29
2017-04-25 -0.20 0.30 -0.30 0.35 0.05 0.27
2017-04-26 -0.02 0.36 -0.30 0.42 0.02 0.20
2017-04-27 -0.27 0.46 -0.31 0.42 0.05 0.25
2017-04-28 -0.01 0.37 -0.28 0.35 0.02 0.30
2017-05-01 -0.20 0.38 -0.23 0.42 -0.03 0.25
2017-05-02 0.09 0.48 -0.40 0.50 0.00 0.18
2017-05-03 -0.06 0.42 -0.26 0.30 0.10 0.20
2017-05-04 0.01 0.42 -0.33 0.44 -0.05 0.30
2017-05-05 -0.10 0.43 -0.32 0.43 0.05 0.24
2017-05-08 -0.10 0.57 -0.22 0.40 0.08 0.17
2017-05-09 -0.11 0.51 -0.12 0.33 0.09 0.19
2017-05-10 -0.29 0.49 -0.18 0.47 0.06 0.14
11 2017-04-19 -0.64 0.28 -0.43 0.34 0.10 0.22
2017-04-20 -0.63 0.26 -0.40 0.39 0.10 0.27
2017-04-21 -0.62 0.21 -0.43 0.37 0.05 0.31
2017-04-24 -0.27 0.38 -0.38 0.46 0.07 0.29
2017-04-25 -0.20 0.30 -0.30 0.35 0.05 0.27
2017-04-26 -0.02 0.36 -0.30 0.42 0.02 0.20
2017-04-27 -0.27 0.46 -0.31 0.42 0.05 0.25
2017-04-28 -0.01 0.37 -0.28 0.35 0.02 0.30
2017-05-01 -0.20 0.38 -0.23 0.42 -0.03 0.25
2017-05-02 0.09 0.48 -0.40 0.50 0.00 0.18
2017-05-03 -0.06 0.42 -0.26 0.30 0.10 0.20
2017-05-04 0.01 0.42 -0.33 0.44 -0.05 0.30
2017-05-05 -0.10 0.43 -0.32 0.43 0.05 0.24
2017-05-08 -0.10 0.57 -0.22 0.40 0.08 0.17
2017-05-09 -0.11 0.51 -0.12 0.33 0.09 0.19
2017-05-10 -0.29 0.49 -0.18 0.47 0.06 0.14

15193 rows × 6 columns

Working on some ideas for alternative spread representations


In [59]:
spread_ts = termstructure.apply(lambda x: [np.nan] + [2*x[i] - x[i-1] - x[i+1] for i in range(1, len(x) - 1)] + [np.nan], axis=1).dropna(how="all", axis=(0, 1))

In [73]:
fig = plt.figure(figsize=(10, 15))
for i in range(6):
    ax = fig.add_subplot(6, 1, i + 1)
    y_m = spread_ts.loc["2006-10-23":].iloc[:, i]
    y_m.plot(ax=ax)
    plt.axhline(0, color="black", linewidth=1)
    if i != 5:
        ax.set_xticklabels([])
    plt.xlabel("")
    plt.ylabel("V" + y_m.name[-1])
plt.savefig("spreads-legwise.pdf", format="pdf", dpi=300, bbox_inches="tight")
plt.show()



In [61]:
# Naive prediction when using term structure
spread_per_column = pd.concat((spread_ts.loc["2006-10-23":, column] for column in spread_ts), axis=1)
for column in spread_per_column:
    dropped = spread_per_column.loc[:, column].dropna()
    print(column, np.mean(np.square(dropped.iloc[:-1].values - dropped.iloc[1:].values)))


M2 0.383355706215
M3 0.11088653484
M4 0.0885584180791
M5 0.10773685312
M6 0.152070680522
M7 0.179882983683

In [62]:
spread_per_column.describe()


Out[62]:
M2 M3 M4 M5 M6 M7
count 2656.000000 2656.000000 2656.000000 2629.000000 2455.000000 2146.000000
mean 0.277579 0.178172 0.014480 0.054692 0.006312 0.129515
std 1.289689 0.740439 0.543991 0.497983 0.481377 0.456469
min -13.680000 -9.490000 -3.250000 -2.910000 -3.970000 -3.110000
25% -0.102500 -0.040000 -0.150000 -0.110000 -0.150000 -0.050000
50% 0.290000 0.230000 0.090000 0.100000 0.050000 0.150000
75% 0.800000 0.480000 0.260000 0.300000 0.200000 0.300000
max 3.950000 3.290000 2.100000 4.900000 2.080000 3.120000

Some descriptive statistics about spreads


In [63]:
y_desc = y.describe()
y_mean = y_desc["mean"]
y_std = y_desc["std"]
y_var = np.square(y_std)
y_desc


Out[63]:
count    15193.000000
mean         0.110257
std          0.746659
min        -13.680000
25%         -0.100000
50%          0.130000
75%          0.360000
max          4.900000
dtype: float64

Calculating some SELU stuff

First some test with zero mean and unit variance; then use the values one line above.

From https://arxiv.org/abs/1706.02515, p. 11


In [64]:
def μ_tilde(μ, ω, ν, τ, λ, α):
    t1 = (-(α + μ * ω) *
             erfc((μ * ω) / (sqrt(2) * sqrt(ν * τ))))
    t2 = (α * (μ * ω + (ν * τ) / 2) *
             erfc((μ * ω + ν * τ) / (sqrt(2) * sqrt(ν * τ))))
    t3 = (sqrt(2 / π) * sqrt(ν * τ) *
             (-(sq(μ) * sq(ω)) / (2 * ν * τ)))
    t4 = 2 * μ * ω
    return (λ / 2) * (t1 + t2 + t3 + t4)

def xi_tilde(μ, ω, ν, τ, λ, α):
    t1 = (sq(μ * ω) + ν * τ) * (erfc((μ * ω) / (sqrt(2) * sqrt(ν * τ))) + 1)
    t2 = sq(α) * (-2 * (μ * ω + (ν * τ) / 2) * erfc((μ * ω + ν * τ) / (sqrt(2) * sqrt(ν * τ))) +
                  (2 * (μ * ω + ν * τ)) * erfc((μ * ω + 2 * ν * τ) / (sqrt(2) * sqrt(ν * τ))) +
                  erfc((μ * ω) / (sqrt(2) * sqrt(ν * τ))))
    t3 = sqrt(2 * π) * (μ * ω) * sqrt(ν * τ) * (-(sq(μ * ω)) / (2 * ν * τ))
    return (sq(λ) / 2) * (t1 + t2 + t3)

def ν_tilde(μ, ω, ν, τ, λ, α):
    return xi_tilde(μ, ω, ν, τ, λ, α) - sq(μ_tilde(μ, ω, ν, τ, λ, α))

In [65]:
assert np.isclose(mu_tilde(0, 0, 1, 1, 1.0507, 1.67326), 0, atol=1e-06), "This should be close to 0 but isn't."
assert np.isclose(ny_tilde(0, 0, 1, 1, 1.0507, 1.67326), 1, atol=1e-06), "This should be close to 1 but isn't."


---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-65-eb7811016a53> in <module>()
----> 1 assert np.isclose(mu_tilde(0, 0, 1, 1, 1.0507, 1.67326), 0, atol=1e-06), "This should be close to 0 but isn't."
      2 assert np.isclose(ny_tilde(0, 0, 1, 1, 1.0507, 1.67326), 1, atol=1e-06), "This should be close to 1 but isn't."

NameError: name 'mu_tilde' is not defined

In [ ]:
print(mu_tilde(0, 0, 1, 1, 1.0507, 1.67326))
print(ny_tilde(0, 0, 1, 1, 1.0507, 1.67326))  # Shit. 1.55 != 1

In [ ]:
((y - y.mean()) / y.std()).describe()

Plotting the spreads per month


In [74]:
fig = plt.figure(figsize=(10, 15))
for month in range(1, 13):
    ax = fig.add_subplot(12, 1, month)
    y_m = y.loc(axis=0)[:, month]
    y_m.plot(ax=ax)
    plt.axhline(0, color="black", linewidth=1)
    plt.xticks(np.arange(0, len(y_m), 200), [])
    plt.xlabel("")
    plt.ylabel(calendar.month_name[month])
plt.xticks(np.arange(0, len(y_m), 200), (y_m.index.droplevel([1, 2]) + 2000)[::200])
plt.savefig("spreads-monthwise.pdf", format="pdf", dpi=300, bbox_inches="tight")
plt.show()



In [67]:
y_by_month = pd.concat((y.loc[:, i] for i in range (1,13)), axis=1, keys=calendar.month_abbr[1:])

In [68]:
print(y_by_month.describe().T.applymap(lambda x: '{:.2f}'.format(x)).to_latex())


\begin{tabular}{lllllllll}
\toprule
{} &    count &   mean &   std &     min &    25\% &    50\% &    75\% &   max \\
\midrule
Jan &  1281.00 &   0.54 &  0.67 &   -3.06 &   0.15 &   0.40 &   0.87 &  2.75 \\
Feb &  1239.00 &   0.25 &  0.61 &   -1.55 &   0.00 &   0.18 &   0.35 &  4.90 \\
Mar &  1238.00 &  -0.06 &  0.58 &   -3.05 &  -0.23 &  -0.05 &   0.22 &  1.55 \\
Apr &  1329.00 &   0.28 &  0.53 &   -2.89 &   0.05 &   0.22 &   0.45 &  3.60 \\
May &  1289.00 &   0.04 &  0.56 &   -3.97 &  -0.10 &   0.07 &   0.21 &  2.75 \\
Jun &  1279.00 &   0.03 &  0.39 &   -2.24 &  -0.15 &  -0.01 &   0.16 &  1.86 \\
Jul &  1328.00 &   0.32 &  0.46 &   -2.72 &   0.10 &   0.28 &   0.47 &  3.12 \\
Aug &  1224.00 &  -0.13 &  0.43 &   -2.03 &  -0.30 &  -0.10 &   0.05 &  2.85 \\
Sep &  1236.00 &   0.29 &  0.67 &   -7.08 &   0.09 &   0.21 &   0.45 &  3.60 \\
Oct &  1281.00 &   0.31 &  0.65 &   -4.01 &   0.10 &   0.23 &   0.45 &  3.30 \\
Nov &  1256.00 &   0.23 &  0.93 &  -13.68 &   0.10 &   0.24 &   0.45 &  3.54 \\
Dec &  1213.00 &  -0.87 &  1.15 &  -10.90 &  -0.94 &  -0.60 &  -0.33 &  1.64 \\
\bottomrule
\end{tabular}


In [69]:
sp_ts_d = spread_ts.describe().T
sp_ts_d.index = ["V" + str(i) for i in range(2, 8)]

In [70]:
print(sp_ts_d.to_latex(float_format="%.2f"))


\begin{tabular}{lrrrrrrrr}
\toprule
{} &  count &  mean &  std &    min &   25\% &  50\% &  75\% &  max \\
\midrule
V2 & 2790.0 &  0.26 & 1.27 & -13.68 & -0.13 & 0.27 & 0.79 & 3.95 \\
V3 & 2656.0 &  0.18 & 0.74 &  -9.49 & -0.04 & 0.23 & 0.48 & 3.29 \\
V4 & 2656.0 &  0.01 & 0.54 &  -3.25 & -0.15 & 0.09 & 0.26 & 2.10 \\
V5 & 2629.0 &  0.05 & 0.50 &  -2.91 & -0.11 & 0.10 & 0.30 & 4.90 \\
V6 & 2455.0 &  0.01 & 0.48 &  -3.97 & -0.15 & 0.05 & 0.20 & 2.08 \\
V7 & 2146.0 &  0.13 & 0.46 &  -3.11 & -0.05 & 0.15 & 0.30 & 3.12 \\
\bottomrule
\end{tabular}


In [71]:
spread_ts


Out[71]:
M2 M3 M4 M5 M6 M7
date
2004-05-24 -1.13 NaN NaN NaN NaN NaN
2004-05-25 -0.61 NaN NaN NaN NaN NaN
2004-05-26 -1.19 NaN NaN NaN NaN NaN
2004-05-27 -1.02 NaN NaN NaN NaN NaN
2004-05-28 -1.23 NaN NaN NaN NaN NaN
2004-06-01 -0.92 NaN NaN NaN NaN NaN
2004-06-02 -0.89 NaN NaN NaN NaN NaN
2004-06-03 -0.86 NaN NaN NaN NaN NaN
2004-06-04 -0.51 NaN NaN NaN NaN NaN
2004-06-07 -0.18 NaN NaN NaN NaN NaN
2004-06-08 0.15 NaN NaN NaN NaN NaN
2004-06-09 0.61 NaN NaN NaN NaN NaN
2004-06-10 -0.40 NaN NaN NaN NaN NaN
2004-06-14 -0.40 NaN NaN NaN NaN NaN
2004-06-15 -0.25 NaN NaN NaN NaN NaN
2004-08-23 -0.51 NaN NaN NaN NaN NaN
2004-08-24 -0.29 NaN NaN NaN NaN NaN
2004-08-25 -0.61 NaN NaN NaN NaN NaN
2004-08-26 -0.56 NaN NaN NaN NaN NaN
2004-08-27 -0.16 NaN NaN NaN NaN NaN
2004-08-30 -0.24 NaN NaN NaN NaN NaN
2004-08-31 0.24 NaN NaN NaN NaN NaN
2004-09-01 0.25 NaN NaN NaN NaN NaN
2004-09-02 -0.26 NaN NaN NaN NaN NaN
2004-09-03 -0.10 NaN NaN NaN NaN NaN
2004-09-07 0.10 NaN NaN NaN NaN NaN
2004-09-08 0.46 NaN NaN NaN NaN NaN
2004-09-09 0.45 NaN NaN NaN NaN NaN
2004-09-10 0.33 NaN NaN NaN NaN NaN
2004-09-13 0.66 NaN NaN NaN NaN NaN
... ... ... ... ... ... ...
2017-03-30 -0.29 -0.06 0.44 -0.35 0.32 0.22
2017-03-31 -0.37 -0.08 0.25 -0.26 0.32 0.11
2017-04-03 -0.59 -0.12 0.32 -0.28 0.36 0.11
2017-04-04 -0.63 -0.06 0.37 -0.25 0.30 0.08
2017-04-05 -0.75 -0.05 0.37 -0.39 0.48 0.03
2017-04-06 -0.65 -0.21 0.42 -0.31 0.28 0.11
2017-04-07 -0.88 -0.01 0.35 -0.35 0.30 0.15
2017-04-10 -1.18 -0.19 0.33 -0.40 0.40 0.05
2017-04-11 -1.82 -0.30 0.31 -0.31 0.32 0.03
2017-04-12 -1.42 -0.50 0.26 -0.35 0.34 0.02
2017-04-13 -1.17 -0.51 0.33 -0.35 0.23 0.10
2017-04-17 -1.02 -0.42 0.34 -0.37 0.35 0.10
2017-04-18 -0.60 -0.41 0.31 -0.39 0.36 0.01
2017-04-19 -0.64 0.28 -0.43 0.34 0.10 0.22
2017-04-20 -0.63 0.26 -0.40 0.39 0.10 0.27
2017-04-21 -0.62 0.21 -0.43 0.37 0.05 0.31
2017-04-24 -0.27 0.38 -0.38 0.46 0.07 0.29
2017-04-25 -0.20 0.30 -0.30 0.35 0.05 0.27
2017-04-26 -0.02 0.36 -0.30 0.42 0.02 0.20
2017-04-27 -0.27 0.46 -0.31 0.42 0.05 0.25
2017-04-28 -0.01 0.37 -0.28 0.35 0.02 0.30
2017-05-01 -0.20 0.38 -0.23 0.42 -0.03 0.25
2017-05-02 0.09 0.48 -0.40 0.50 0.00 0.18
2017-05-03 -0.06 0.42 -0.26 0.30 0.10 0.20
2017-05-04 0.01 0.42 -0.33 0.44 -0.05 0.30
2017-05-05 -0.10 0.43 -0.32 0.43 0.05 0.24
2017-05-08 -0.10 0.57 -0.22 0.40 0.08 0.17
2017-05-09 -0.11 0.51 -0.12 0.33 0.09 0.19
2017-05-10 -0.29 0.49 -0.18 0.47 0.06 0.14
2017-05-11 0.08 0.42 -0.29 0.52 0.03 0.23

2790 rows × 6 columns


In [72]:
fig = plt.figure(figsize=(8, 4))
ax = fig.gca(projection='3d')
xs = np.arange(len(spread_ts))
zs = np.arange(6)
verts = []
xm_settle_3dplot = spread_ts.copy()
xm_settle_3dplot.index = xs
for z in zs:
    ys = xm_settle_3dplot.iloc[:,int(z)].fillna(0)
    ys.iloc[0] = 0
    ys.iloc[-1] = 0
    verts.append(list(zip(ys.index.values, ys.values)))
poly = PolyCollection(verts, linewidth=2.0, facecolors=[cm.winter(i, 0.8) for i in  np.linspace(0, 1, 6)])
ax.add_collection3d(poly, zs=zs, zdir='y')

ax.set_xlim3d(0, len(spread_ts))
#ax.set_xticks([(spread_ts.index.year == year).argmax() for year in spread_ts.index.year.unique()[1::2]])
ax.set_xticklabels(spread_ts.index.year.unique()[1::2])
ax.set_ylim3d(0.0, 5.5)
ax.set_yticklabels(["V2", "V3", "V4", "V5", "V6", "V7"])
ax.set_zlim3d(-1, 2)

#plt.savefig("termstructures.pdf", format="pdf", bbox_inches="tight", dpi=300)
plt.show()