SVD of Minute-Market-Data

Dan Schmidt's script, Fall 2016


In [2]:
%reload_ext autoreload
%load_ext autoreload
%autoreload 2
import os
import sys

# Make the project's 'src' directory (a sibling of the current working
# directory) importable so the `preprocess` package below can be found.
src_dir = os.path.join(os.getcwd(), os.pardir, 'src')
# Guard the append so re-running this cell does not pile up duplicate
# entries on sys.path.
if src_dir not in sys.path:
    sys.path.append(src_dir)
%aimport preprocess
from preprocess.process import get_symbol
from preprocess.process import get_symbols_matrix
from preprocess.process import df_to_returns

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os.path
from IPython.core.debugger import Tracer
import scipy as sp
from scipy.linalg import svd
from sklearn.decomposition import PCA
%matplotlib inline


The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload

Preprocessing

Read the data in and convert the dates to a useful format.


In [3]:
# Directory holding the market data and the S&P 500 ticker list.
# NOTE(review): absolute, machine-specific path; point it at your own data
# directory. The trailing slash matters because file names are concatenated
# onto it directly below.
data_loc = "/home/dan/code/market_data/"

# One ticker per line. The context manager closes the file handle as soon as
# the list is built instead of leaving it open for the kernel's lifetime.
with open(data_loc + 'sp500.txt') as symbol_file:
    symbols = [line.rstrip('\n') for line in symbol_file]

# Running the SVD on two weeks in 2015, check error on next week
train_start_date = "03/02/2015"
train_end_date = "03/14/2015"

test_start_date = "03/15/2015"
test_end_date = "03/21/2015"

# Only the tail of the ticker list (index 400 onwards) is used here.
test_symbols = symbols[400:]
print(test_symbols)

# Despite its name, test_mat is built from the *training* date window.
# NOTE(review): syms is presumably the list of symbols actually kept by
# get_symbols_matrix; confirm against preprocess.process.
test_mat, syms = get_symbols_matrix(
    test_symbols, data_loc, train_start_date, train_end_date
)


['SEE', 'SE', 'SHW', 'SIG', 'SJM', 'SLB', 'SLG', 'SNA', 'SNI', 'SO', 'SPGI', 'SPG', 'SPLS', 'SRCL', 'SRE', 'STI', 'STJ', 'STT', 'STX', 'STZ', 'SWKS', 'SWK', 'SWN', 'SYF', 'SYK', 'SYMC', 'SYY', 'TAP', 'TDC', 'TDG', 'TEL', 'TGNA', 'TGT', 'TIF', 'TJX', 'TMK', 'TMO', 'TRIP', 'TROW', 'TRV', 'TSCO', 'TSN', 'TSO', 'TSS', 'TWX', 'TXN', 'TXT', 'T', 'UAL', 'UA', 'UDR', 'UHS', 'ULTA', 'UNH', 'UNM', 'UNP', 'UPS', 'URBN', 'URI', 'USB', 'UTX', 'VAR', 'VFC', 'VIAB', 'VLO', 'VMC', 'VNO', 'VRSK', 'VRSN', 'VRTX', 'VTR', 'V', 'VZ', 'WAT', 'WBA', 'WDC', 'WEC', 'WFC', 'WFM', 'WHR', 'WLTW', 'WMB', 'WMT', 'WM', 'WRK', 'WU', 'WYNN', 'WYN', 'WY', 'XEC', 'XEL', 'XLNX', 'XL', 'XOM', 'XRAY', 'XRX', 'XYL', 'YHOO', 'YUM', 'ZBH', 'ZION', 'ZTS']
Reading SEE from pickle
SEE failed integrity check
Reading SE from pickle
Reading SHW from pickle
SHW failed integrity check
Reading SIG from pickle
SIG failed integrity check
Reading SJM from pickle
SJM failed integrity check
Reading SLB from pickle
Reading SLG from pickle
SLG failed integrity check
Reading SNA from pickle
SNA failed integrity check
Reading SNI from pickle
SNI failed integrity check
Reading SO from pickle
Reading SPGI from pickle
SPGI failed integrity check
Reading SPG from pickle
SPG failed integrity check
Reading SPLS from pickle
Reading SRCL from pickle
SRCL failed integrity check
Reading SRE from pickle
SRE failed integrity check
Reading STI from pickle
Reading STJ from pickle
STJ failed integrity check
Reading STT from pickle
Reading STX from pickle
Reading STZ from pickle
STZ failed integrity check
Reading SWKS from pickle
Reading SWK from pickle
SWK failed integrity check
Reading SWN from pickle
Reading SYF from pickle
SYF failed integrity check
Reading SYK from pickle
SYK failed integrity check
Reading SYMC from pickle
Reading SYY from pickle
Reading TAP from pickle
TAP failed integrity check
Reading TDC from pickle
TDC failed integrity check
Reading TDG from pickle
TDG failed integrity check
Reading TEL from pickle
TEL failed integrity check
Reading TGNA from pickle
TGNA failed integrity check
Reading TGT from pickle
Reading TIF from pickle
TIF failed integrity check
Reading TJX from pickle
Reading TMK from pickle
TMK failed integrity check
Reading TMO from pickle
TMO failed integrity check
Reading TRIP from pickle
TRIP failed integrity check
Reading TROW from pickle
TROW failed integrity check
Reading TRV from pickle
Reading TSCO from pickle
TSCO failed integrity check
Reading TSN from pickle
Reading TSO from pickle
Reading TSS from pickle
TSS failed integrity check
Reading TWX from pickle
Reading TXN from pickle
Reading TXT from pickle
TXT failed integrity check
Dumped  T  to pickle
Dumped  UAL  to pickle
Dumped  UA  to pickle
UA failed integrity check
Dumped  UDR  to pickle
UDR failed integrity check
Dumped  UHS  to pickle
UHS failed integrity check
Dumped  ULTA  to pickle
ULTA failed integrity check
Dumped  UNH  to pickle
Dumped  UNM  to pickle
UNM failed integrity check
Dumped  UNP  to pickle
Dumped  UPS  to pickle
Dumped  URBN  to pickle
URBN failed integrity check
Dumped  URI  to pickle
URI failed integrity check
Dumped  USB  to pickle
Dumped  UTX  to pickle
Dumped  VAR  to pickle
VAR failed integrity check
Dumped  VFC  to pickle
VFC failed integrity check
Dumped  VIAB  to pickle
Dumped  VLO  to pickle
Dumped  VMC  to pickle
VMC failed integrity check
Dumped  VNO  to pickle
VNO failed integrity check
Dumped  VRSK  to pickle
VRSK failed integrity check
Dumped  VRSN  to pickle
VRSN failed integrity check
Dumped  VRTX  to pickle
VRTX failed integrity check
Dumped  VTR  to pickle
Dumped  V  to pickle
Dumped  VZ  to pickle
Dumped  WAT  to pickle
WAT failed integrity check
Dumped  WBA  to pickle
Dumped  WDC  to pickle
WDC failed integrity check
Dumped  WEC  to pickle
WEC failed integrity check
Dumped  WFC  to pickle
Dumped  WFM  to pickle
Dumped  WHR  to pickle
WHR failed integrity check
Dumped  WLTW  to pickle
Dumped  WMB  to pickle
Dumped  WMT  to pickle
Dumped  WM  to pickle
Dumped  WRK  to pickle
Dumped  WU  to pickle
Dumped  WYNN  to pickle
Dumped  WYN  to pickle
WYN failed integrity check
Dumped  WY  to pickle
Dumped  XEC  to pickle
XEC failed integrity check
Dumped  XEL  to pickle
Dumped  XLNX  to pickle
Dumped  XL  to pickle
XL failed integrity check
Dumped  XOM  to pickle
Dumped  XRAY  to pickle
XRAY failed integrity check
Dumped  XRX  to pickle
Dumped  XYL  to pickle
XYL failed integrity check
Dumped  YHOO  to pickle
Dumped  YUM  to pickle
Dumped  ZBH  to pickle
ZBH failed integrity check
Dumped  ZION  to pickle
ZION failed integrity check
Dumped  ZTS  to pickle

In [4]:
# Transpose so that each row of X is one column of test_mat
# (the recorded run prints a shape of (48, 3893)).
X = test_mat.values.T

# Remove the mean over axis 0: Xmean has one entry per column of X, i.e. the
# average across all rows at that column. This is a cross-sectional mean, not
# a per-row average over time.
# NOTE(review): presumably rows are symbols and columns are minute samples, in
# which case Xmean is the equal-weighted average across symbols at each
# minute; confirm against get_symbols_matrix.
Xmean = X.mean(axis=0)
Xd = X - Xmean
# Xd = X  # uncomment to decompose the raw (non-centred) matrix instead

print(Xd.shape)

# Decompose the de-meaned matrix. The original call passed the raw X here,
# which silently ignored the centring done above.
# scipy's default full_matrices=True is kept so the factor shapes stay the
# same as before; full_matrices=False would avoid the large square VT.
U, S, VT = svd(Xd)


(48, 3893)

In [5]:
# Report the dimensions of the three SVD factors and of the subtracted mean.
for factor in (U, S, VT, Xmean):
    print(factor.shape)


(48, 48)
(48,)
(3893, 3893)
(3893,)

In [6]:
# PCA on the de-meaned matrix. The randomized solver is stochastic, so the
# seed is pinned to make the fitted components reproducible across runs.
pca = PCA(svd_solver='randomized', random_state=0)
pca.fit(Xd)

plt.plot(pca.explained_variance_)
plt.title('PCA explained variance')
plt.xlabel('Component index')
plt.ylabel('Explained variance')
plt.show()

# First principal axis; stored but not plotted in this cell.
market = pca.components_[0, :]

# Compound the subtracted mean as if it were a per-step return series.
# NOTE(review): this assumes test_mat holds simple returns; confirm.
plt.plot((1 + Xmean).cumprod())
plt.title('Cumulative product of (1 + Xmean)')
plt.xlabel('Sample index')
plt.ylabel('Cumulative growth')
plt.show()

# compare to actual market
spy = get_symbol("SPY", data_loc, train_start_date, train_end_date)

plt.plot(spy['Close'].values)
plt.title('SPY close')
plt.xlabel('Sample index')
plt.ylabel('Close')
plt.show()


Reading SPY from pickle

In [ ]:


In [8]:


In [10]:


In [ ]:
# Bayesian MLP
#
# Small feed-forward network with a sigmoid output, trained with binary
# cross-entropy on direction labels derived from the target returns.
# NOTE(review): Input, Dense, Model, l2, window_length, aapl_y / val_y,
# aapl_rets / aapl_vol and val_rets / val_vol are not defined in the cells
# visible above; they must be imported or created before this cell can run on
# a fresh kernel.


def _direction_labels(returns, thresh):
    """Turn returns into direction targets that lie in [0, 1].

    Parameters
    ----------
    returns : array-like
        Return values to label.
    thresh : float
        Minimum absolute return for a move to count as up or down.

    Returns
    -------
    numpy.ndarray
        1.0 where ``returns > thresh``, 0.0 where ``returns < -thresh`` and
        0.5 everywhere else (including exact zeros).
    """
    returns = np.asarray(returns, dtype=float)
    is_move = np.abs(returns) > thresh
    return 0.5 + 0.5 * np.sign(returns) * is_move


# Hyperparameters
middle_dim = window_length
encoding_dim = 10
reg = 1e-12
ret_thresh = 1e-2

# The input is twice the window length because two feature blocks are
# concatenated column-wise further down.
# NOTE(review): assumes each block has window_length columns; confirm.
input_w = Input(shape=(2*window_length,))

in_layer = Dense(
    middle_dim,
    init='normal',
    activation='tanh',
    W_regularizer=l2(reg))(input_w)

mid_layer = Dense(
    encoding_dim,
    init='normal',
    activation='tanh',
    W_regularizer=l2(reg))(in_layer)

output = Dense(
    1,
    init='normal',
    activation='sigmoid')(mid_layer)

mlp_pred = Model(input=input_w, output=output)
mlp_pred.compile(optimizer='adam', loss='binary_crossentropy')

# Targets for a sigmoid / binary cross-entropy model must stay within [0, 1].
# The earlier expression used 1 / (2 * y), which is unbounded for small
# returns and evaluates to NaN when a return is exactly zero.
dm_y = _direction_labels(aapl_y[:, 0], ret_thresh)
dm_val = _direction_labels(val_y[:, 0], ret_thresh)

# Stack the two feature blocks side by side for training and validation.
aapl_f = np.concatenate((aapl_rets, aapl_vol), axis=1)
val_f = np.concatenate((val_rets, val_vol), axis=1)

mlp_pred.fit(
    aapl_f,
    dm_y,
    batch_size=100,
    nb_epoch=25,
    shuffle=True,
    validation_data=(val_f, dm_val)
)