In [1]:
# imports
import pandas as pd
import numpy as np
from scipy import stats
import sklearn
from sklearn import preprocessing as pp
import matplotlib as mpl
import matplotlib.pyplot as plt
from matplotlib import interactive
interactive(True)
import sys
import tensorflow as tf
import time
import os
import collections
import os.path
import pickle
import logging as log
log.basicConfig(level=log.DEBUG)
import quandl
import pyfolio as pf
In [2]:
f = 'U.pkl'
P = pickle.load(open(f, 'rb'))  # pickles should be opened in binary mode
log.info('loaded <%s>',f)
P.describe()
Out[2]:
In [3]:
# let's skip the dotcom period
U = P[P.index >= '2005-01-01']
U.describe()
Out[3]:
In [4]:
import sim
_,B = sim.sim(U)
#plot NAV
B.NAV.plot(title='Equal Weight Everyone')
Out[4]:
In [5]:
pd.__version__
Out[5]:
In [6]:
B.index=B.index.tz_localize('UTC')
pf.create_full_tear_sheet(B.NET_Return)
In [7]:
#quandl_auth = os.environ.get('QUANDL_AUTH')  # set your own Quandl key in the environment
#SPY = quandl.get("GOOG/NYSE_SPY", authtoken=quandl_auth)
#SPY.Close.plot()
_,Bw = sim.sim(U, sim_FUN=sim.worst_strat)
Bw.NAV.plot(title="Buy yesterday's 10 worst")
Out[7]:
In [8]:
Bw.index=Bw.index.tz_localize('UTC')
pf.create_full_tear_sheet(Bw.NET_Return)
In [9]:
# let's see if we can give the trade more names to
# add capacity and reduce vol a bit
kvargs = {'num_names':20}
_,Bw20 = sim.sim(U, sim_FUN=sim.worst_strat, kvargs=kvargs)
Bw20.NAV.plot(title="Buy yesterday's 20 worst")
Out[9]:
In [10]:
Bw20.index=Bw20.index.tz_localize('UTC')
pf.create_returns_tear_sheet(Bw20.NET_Return)
In [11]:
pf.plot_annual_returns(Bw20.NET_Return)
#.create_capacity_tear_sheet(Bw20.NET_Return)
Out[11]:
In [12]:
# let's bring the 'best' into this:
kvargs = {'num_names':20}
_,Bb20 = sim.sim(U, sim_FUN=sim.best_strat, kvargs=kvargs)
Bb20.NAV.plot(title="Buy yesterday's 20 best")
Bb20.index=Bb20.index.tz_localize('UTC')
pf.create_full_tear_sheet(Bb20.NET_Return)
None is especially good: only the second (barely) cracks a 0.6 Sharpe, all have volatility above 15% with returns below 10%, and all have suffered wrenching drawdowns.
However, they've all done broadly better than the S&P in most environments, with the unhappy exception of the so-called new normal.
Let's see if we can improve them a bit as-is, and then see whether we can apply any of our machine-learning lessons to improve them a bit more.
Back in 1990, Andrew Lo and Craig MacKinlay described a simple 'contrarian' strategy which combines 1, 2 & 3 into a stat arb trade. The gist of it is simple: buy the prior period's losers and sell the prior period's winners, in proportion to their over- or under-performance versus the equal-weighted market.
Let's take a look at that strategy as our candidate target for amelioration with ML techniques.
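To make the weighting concrete, here is a minimal sketch on a made-up five-name cross-section; note the raw weights are dollar-neutral by construction, and the rescaling used below puts 100% on each side:
# toy cross-section of one day's returns (made-up numbers)
R = np.array([0.03, 0.01, 0.00, -0.01, -0.03])
N = len(R)
# raw contrarian weights: short out-performers, long under-performers
w = -(R - R.mean()) / N
print(w.sum())                          # ~0.0: dollar-neutral
# rescale so each side spends 100% (gross exposure = 200%)
w = 2 * w / np.abs(w).sum()
print(w[w > 0].sum(), w[w < 0].sum())   # 1.0 and -1.0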
In [13]:
def eq_wt( U, cfg, kvargs ) :
    # simple, default strategy: equal weight universe on a daily basis
    U.Weight = 1/float(len(U.index))
    return U

def bestworst( U, cfg, kvargs ) :
    # Buy the prior period's losers and sell the prior period's winners in
    # proportion to their over- or under-performance of the equal-weighted market.
    N = len(U.index)
    mktR = U.Return.mean()
    Weight = np.add( U.Return, -mktR ) / (-N)
    # now let's ensure that we spend 100% on each side
    U.Weight = 2 * np.sign(Weight) * (abs(Weight) / sum(abs(Weight)))
    return U
#aday = U[U.index==U.index.unique()[3]]
#Weight = np.add(aday.Return, -aday.Return.mean())/(-len(aday.index))
#Ws = 2 * np.sign(Weight) * (abs(Weight) / sum(abs(Weight)))
#sum(abs(Ws))
#Ws
In [14]:
# let's run it
Sbw,Bbw = sim.sim(U, sim_FUN=bestworst)
Bbw.NAV.plot(title="LoMacKinlay")
Bbw.index=Bbw.index.tz_localize('UTC')
pf.create_full_tear_sheet(Bbw.NET_Return)
This is a much better strategy than the others, with a Sharpe over 0.8, much lower drawdowns, and decent capacity. But as a cousin, it doesn't escape the family influence: it beats the market most of the time, but not in the recently relevant 'recovery' and 'new normal' phases.
So, let's see if we can improve this strategy by applying the predictive capabilities we fostered with TensorFlow's random forest implementation.
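For reference, the Sharpe figures quoted here (and the sim.sharpe calls further below) aren't defined in this notebook; a minimal sketch of the usual convention, assuming daily returns, 252 trading days per year, and a zero risk-free rate:
def sharpe_daily(returns, periods_per_year=252):
    # annualized Sharpe: mean over std of per-period returns,
    # scaled by the square root of periods per year
    r = np.asarray(returns, dtype=float)
    r = r[~np.isnan(r)]
    return np.sqrt(periods_per_year) * r.mean() / r.std()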
Well, the classifier tells us when it thinks we are headed for a move down, a small move one way or the other, or a move up. When it predicts a move down, mostly the underlying stays still or goes down.
The possibilities are:
- Good: we're buying something predicted to go up
- Good: we're selling something predicted to go down
- OK: we're buying something predicted to stay flat
- OK: we're selling something predicted to stay flat
- Bad: we're buying something predicted to go down
- Bad: we're selling something predicted to go up
So, let's leave the OKs alone, deallocate from the Bads - say, 50% - and reallocate to the Goods.
If we wanted to trade this, we should adjust for liquidity but since we're just playing we'll keep it simple.
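Before the data plumbing, a toy sketch of that reallocation rule (made-up weights and predictions, realloc=0.5; the full bestworst_ML implementation appears below):
# toy book: two longs, two shorts; predictions: 0=down, 1=flat, 2=up
toy = pd.DataFrame({'Weight':       [0.6, 0.4, -0.6, -0.4],
                    'MLPrediction': [0,   2,    2,    0]})
realloc = 0.5
bads = (((toy.MLPrediction == 0) & (toy.Weight > 0))
        | ((toy.MLPrediction == 2) & (toy.Weight < 0)))
goods = (((toy.MLPrediction == 2) & (toy.Weight > 0))
         | ((toy.MLPrediction == 0) & (toy.Weight < 0)))
# weight freed up on each side by trimming the bads
lmove = toy.Weight[bads & (toy.Weight > 0)].sum() * realloc
smove = toy.Weight[bads & (toy.Weight < 0)].sum() * realloc
toy.loc[bads, 'Weight'] *= (1 - realloc)
# hand the freed-up weight to the goods on the same side
toy.loc[goods & (toy.Weight > 0), 'Weight'] += lmove / max((goods & (toy.Weight > 0)).sum(), 1)
toy.loc[goods & (toy.Weight < 0), 'Weight'] += smove / max((goods & (toy.Weight < 0)).sum(), 1)
print(toy)  # each side's gross spend is unchanged at 1.0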
In [15]:
fname = 'forsims.pkl'
Dataset = collections.namedtuple('Dataset', ['data', 'target'])
forsims = pickle.load(open(fname, 'rb'))  # binary mode for pickles
log.info('read %s', fname)
src_train = forsims['src_train']
src_vlad = forsims['src_vlad']
Kvlad = forsims['Kvlad']
forsims = None
Kvlad.head()
Out[15]:
In [16]:
print(Kvlad.shape)
print(src_vlad.data.shape)
print(src_train.data.shape)

def _fitntestRandomForest( train, vlad, max_nodes=1024, steps=100, model_dir='/tmp/rf') :
    # build, fit & test a random forest on the given train/validation datasets
    fsize = len(train.data.columns)
    nclasses = len(train.target.unique())
    hparams = tf.contrib.tensor_forest.python.tensor_forest.ForestHParams(
        num_trees=nclasses, max_nodes=max_nodes, num_classes=nclasses, num_features=fsize)
    # pass model_dir through so each model gets its own checkpoint directory
    classifier = tf.contrib.learn.TensorForestEstimator(hparams, model_dir=model_dir)
    tdata = train.data.as_matrix().astype(np.float32)
    ttgt = train.target.as_matrix().astype(np.float32)
    vdata = vlad.data.as_matrix().astype(np.float32)
    vtgt = vlad.target.as_matrix().astype(np.float32)
    monitors = [tf.contrib.learn.TensorForestLossMonitor(10, 10)]
    classifier.fit(x=tdata, y=ttgt, steps=steps, monitors=monitors)
    result = classifier.evaluate(x=vdata, y=vtgt)#, steps=np.round(steps/10)
    print('Accuracy: {0:f}'.format(result["accuracy"]))
    return result, classifier
In [17]:
# let's train our model
src_rf = _fitntestRandomForest(train=src_train, vlad=src_vlad, model_dir='/tmp/src_rf',steps=100)
In [18]:
# now let's use our trained model to fit the validation set
vdata = src_vlad.data.as_matrix().astype(np.float32)
vtgt = src_vlad.target.as_matrix().astype(np.float32)
p=src_rf[1].predict( x=vdata)
# how'd it do?
R = pd.DataFrame( {'predicted':p,'actual':vtgt})
R['dist'] = np.abs(R.actual-R.predicted)
# avg distance is meaningful. a null predictor should get about .88,
# so anything below provides some edge
print(R.dist.mean())
twos=R.dist[R.dist==2]
len(twos.index)/float(len(R.index))
Out[18]:
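Where does that ~0.88 null baseline come from? For a predictor that guesses by drawing labels independently from the empirical class distribution, the expected |actual - predicted| follows directly from the class frequencies; a quick sketch (assuming the three classes are labeled 0, 1, 2):
# empirical class frequencies of the validation targets
p = np.bincount(vtgt.astype(int), minlength=3) / float(len(vtgt))
# expected |actual - predicted| for label-shuffling (null) predictions
null_dist = sum(p[i] * p[j] * abs(i - j) for i in range(3) for j in range(3))
print(null_dist)  # should land near the ~0.88 quoted above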
In [19]:
# ok, let's create a df with date,symbol and prediction which we'll then join onto the simulation dataset
V = pd.DataFrame( {'Date': Kvlad.Date,
'Sym': Kvlad.Sym,
'MLPrediction': R.predicted })
#Kvlad.head()
print(U.shape)
print(V.shape)
V.head()
Uv = U[U.index >= V.Date.min()].copy()  # copy so reset_index doesn't warn on a view of U
print(Uv.shape)
Uv.reset_index(inplace=True)
Uv.head()
#V.set_index('Date',inplace=True)
Uml = Uv.merge( V, how='left', on=['Date','Sym'] )
Uml.sort_values(['Date','Sym'],inplace=True)
Uml.set_index('Date',inplace=True)
Uml.head()
Out[19]:
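One caveat with a left join like this: rows of Uv without a matching (Date, Sym) in V come back with NaN in MLPrediction, and NaN compares False against both 0 and 2, so such names silently keep their plain bestworst weight in the strategy below. A quick check is cheap:
# count rows that received no ML prediction in the merge
print(Uml.MLPrediction.isnull().sum())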
In [20]:
def bestworst_ML( U, cfg, kvargs ) :
    """ Buy the prior period's losers and sell the prior period's winners in
    proportion to their over- or under-performance of the equal-weighted market.
    Then, cross-reference ML's views with this. The possibilities are:
    - Good: we're buying something predicted to go up
    - Good: we're selling something predicted to go down
    - OK  : we're buying something predicted to stay flat
    - OK  : we're selling something predicted to stay flat
    - Bad : we're buying something predicted to go down
    - Bad : we're selling something predicted to go up
    We leave the *OK*s alone, deallocate from the *Bad*s by the 'realloc'
    fraction in kvargs, and reallocate the freed-up weight to the *Good*s.
    """
    realloc = kvargs.get('realloc', 0.5)
    N = len(U.index)
    mktR = U.Return.mean()
    Weight = np.add( U.Return, -mktR ) / (-N)
    # now let's ensure that we spend 100% on each side
    U.Weight = 2 * np.sign(Weight) * (abs(Weight) / sum(abs(Weight)))
    # now, let's add in our ML insights.
    # we're going to deallocate from these guys...
    bad1 = np.logical_and(U.MLPrediction==0, U.Weight > 0)  # long a predicted loser
    bad2 = np.logical_and(U.MLPrediction==2, U.Weight < 0)  # short a predicted winner
    bads = np.logical_or(bad1, bad2)
    lbads = np.logical_and(bads, U.Weight>0)
    sbads = np.logical_and(bads, U.Weight<0)
    # ...and reallocate to these
    good1 = np.logical_and(U.MLPrediction==0, U.Weight < 0)  # short a predicted loser
    good2 = np.logical_and(U.MLPrediction==2, U.Weight > 0)  # long a predicted winner
    goods = np.logical_or( good1, good2 )
    lgoods = np.logical_and(goods, U.Weight>0)
    sgoods = np.logical_and(goods, U.Weight<0)
    numlgoods = len(U[lgoods].index)
    numsgoods = len(U[sgoods].index)
    # how much weight to add to longs & shorts?
    lwt = U[lbads].Weight.sum() * realloc
    swt = U[sbads].Weight.sum() * realloc
    # deallocate from the bads
    U.Weight = np.where( bads, U.Weight * (1-realloc), U.Weight )
    # and allocate to the goods, long & short
    if numlgoods > 0:
        U.Weight = np.where( lgoods, U.Weight + (lwt/numlgoods), U.Weight )
    if numsgoods > 0:
        U.Weight = np.where( sgoods, U.Weight + (swt/numsgoods), U.Weight )
    return U
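A handy property of this reallocation: whenever there are goods on a side to absorb the freed-up weight, that side's gross spend stays at 100%. A quick sanity sketch on a single (arbitrary) day, assuming Uml carries the Weight column the sim framework expects:
# long and short gross should remain ~1.0 after the ML reallocation
aday = Uml[Uml.index == Uml.index.unique()[0]].copy()
wml = bestworst_ML(aday, None, {'realloc': 0.5}).Weight
print(wml[wml > 0].sum(), wml[wml < 0].sum())  # expect ~1.0 and ~-1.0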
In [21]:
# first, what is our baseline over this period? Let's look at equal-weight and bestworst and then bestworst_ML
_,Beq = sim.sim(Uml)
_,Bbw = sim.sim(Uml,sim_FUN=bestworst)
kvargs = {'realloc': 0.5}
_,Bbwml = sim.sim(Uml,sim_FUN=bestworst_ML,kvargs=kvargs)
In [22]:
Beq.NAV.plot(color='black', label='equal weight')
Bbw.NAV.plot(color='green', label='bestworst')
Bbwml.NAV.plot(color='blue', label='bestworst_ML (realloc=0.5)')
plt.legend(loc='best')
Out[22]:
In [23]:
# let's try reallocating the whole shebang
kvargs = {'realloc': 1}
_,Bbwml1 = sim.sim(Uml,sim_FUN=bestworst_ML,kvargs=kvargs)
In [24]:
Beq.NAV.plot(color='black', label='equal weight')
Bbw.NAV.plot(color='green', label='bestworst')
Bbwml.NAV.plot(color='blue', label='bestworst_ML (realloc=0.5)')
Bbwml1.NAV.plot(color='red', label='bestworst_ML (realloc=1)')
plt.legend(loc='best')

def pyfolio_tf ( Balances ):
    # localize the index for pyfolio and show the full tear sheet.
    # note: localizes in place; calling twice on the same frame will raise
    Balances.index = Balances.index.tz_localize('UTC')
    pf.create_full_tear_sheet(Balances.NET_Return)
In [25]:
print(sim.sharpe(Beq.NET_Return))
print(sim.sharpe(Bbw.NET_Return))
print(sim.sharpe(Bbwml.NET_Return))
print(sim.sharpe(Bbwml1.NET_Return))
pyfolio_tf(Bbwml1)
The market since 2013 has been odd - the 'new normal.' Simply equal-weighting this upward-trending period yields a stout 0.99 Sharpe, while the (long/short) baseline bestworst strat struggled to reach 0.70. Adding the quite limited edge from the random forest model improved things, though how much depends on how heavily you trusted its results. Trusting it about as much as the bestworst model improves the Sharpe to 0.95; trusting it entirely, so that it overrides the bestworst model in disagreements, yields a 1.16 Sharpe with annual returns of 14% and volatility of 12%.
Let's start simple: largely disregarding liquidity concerns, we'll pack equal dollars into the long and short sides, buying names predicted to go up, selling names predicted to go down, and ignoring names predicted to stay flattish.
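A minimal sketch of that idea in the same sim_FUN shape (pureML is a hypothetical name, not part of sim):
def pureML( U, cfg, kvargs ) :
    # equal dollars long the predicted-up names and short the
    # predicted-down names; predicted-flat (and NaN) names get zero weight
    longs = U.MLPrediction == 2
    shorts = U.MLPrediction == 0
    U.Weight = 0.0
    if longs.sum() > 0:
        U.Weight = np.where(longs, 1.0 / longs.sum(), U.Weight)
    if shorts.sum() > 0:
        U.Weight = np.where(shorts, -1.0 / shorts.sum(), U.Weight)
    return U

# e.g.: _, Bpml = sim.sim(Uml, sim_FUN=pureML)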
In [ ]: