In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
# import matplotlib.finance as mf
from matplotlib.widgets import MultiCursor
import statsmodels.tsa.stattools as stt
# import scipy.signal as sgn
import statsmodels.api as sm
# from statsmodels.sandbox.regression.predstd import wls_prediction_std
# from matplotlib.mlab import PCA
In [2]:
%matplotlib inline
In [3]:
# Style. 1
sns.set_context('paper')
sns.set_style("darkgrid")
In [4]:
# Style. 2
sns.set_context('paper')
sns.set_style("dark",
rc={'axes.facecolor': 'black',
'grid.color': 'red',
'grid.linestyle': '--',
'figure.facecolor': 'grey'})
In [5]:
hft = pd.read_hdf('/home/bingnan/HFT_SR_RM_MA_TA.hdf')
In [6]:
ta = hft.minor_xs('TA0001').copy()
In [7]:
#------------------------------------------------
'''Some time length'''
night_len = int(4*3600*2.5)
mor_len = int(4*3600*2.25)
aftn_len = int(4*3600*1.5)
day_len = night_len + mor_len + aftn_len + 4
In [26]:
ta.ix[:, ['bidPrc_0', 'bidPrc_1', 'bidPrc_2', 'askPrc_3', 'bidPrc_4',
'askPrc_1', 'askPrc_2', 'askPrc_3', 'askPrc_4']].any(axis=0) == 0
Out[26]:
In [8]:
#-----------------------------------------------
'''add columns'''
def AddCol(df):
vol = df.ix[:, 'volume'].diff()
# this addition is for the convenience of Log y scale plot
# vol +=1
vol = vol.rename('vol_diff')
openint = df.ix[:, 'openInterest'].diff()
# this addition is for the convenience of Log y scale plot
# openint += 1
openint = openint.rename('openInt_diff')
mid = (df.ix[:, 'askPrc_0'] + df.ix[:, 'bidPrc_0']) / 2.
mid = mid.rename('midPrc')
ret = df.join([vol, openint, mid])
return ret
# -------------------------------------------------
def ForwardDiff(df, n=1):
'''
The reverse of pandas' function 'DataFrame.diff()'
'''
ret = df.diff(periods=n)
ret = ret.shift(periods= -1 * n)
ret = ret.dropna()
return ret
def CutHighVar(df, length=200):
'''
Purpose: Cut a small period after opening in the morning and at night.
Because this time range, the var of price is high, which harmd our model.
df: pd.DataFrame or pd.Series. With datetime index
length: int. the length you want to cut, counted in ticks. Cannot be larger than 240
'''
ret = df
bool_arr1 = np.logical_or(ret.index.hour == 21, ret.index.hour == 9)
bool_arr = np.logical_and.reduce([bool_arr1,
ret.index.minute == 0,
ret.index.second <= int(length//4) - 1])
ret = ret[np.logical_not(bool_arr)]
return ret
def CutTail(df, length=60):
'''
Purpose: Cut a small period before market close.
df: pd.DataFrame or pd.Series. With datetime index
length: int. the length you want to cut, counted in ticks. Cannot be larger than 240
'''
ret = df
last_boolean1 = np.logical_and.reduce(
[ret.index.hour == 14,
ret.index.minute == 59,
ret.index.second >= 60 - int(length//4)])
# this is the last tick
last_boolean2 = ret.index.hour == 15
ret = ret[np.logical_not(np.logical_or(last_boolean1, last_boolean2))]
return ret
def DayChangeNum(ser, distance=7):
'''
ser is price move series after process.
distance counting in hours
'''
h = ser.index.hour
h_diff = np.diff(h)
h_diff = np.insert(h_diff, 1, 0)
ret = np.where(np.abs(h_diff) > distance)[0]
return ret
def NormPriceMove(ser, daychgnum):
ret = ser.copy()
for i in range(len(daychgnum) - 1):
mysamp = ret.iloc[daychgnum[i]: daychgnum[i+1]]
#print mysamp
mystd = mysamp.std()
print mystd
ret.iloc[daychgnum[i]: daychgnum[i+1]] /= mystd
return ret
def CuthlLimit(df, how='all', depth=0):
s1 = 'bidQty_' + str(depth)
s2 = 'askQty_' + str(depth)
if how == 'all':
arr1 = df.ix[:, s1] == 0
arr2 = df[s2] == 0
bool_arr = np.logical_or(arr1, arr2)
#bool_arr = np.logical_or(df[s1] == 0, df[s2] == 0)
elif how == 'bid':
bool_arr = (df[s1] == 0)
elif how == 'ask':
bool_arr = (df[s2] == 0)
else:
print 'ERROR!'
return df[np.logical_not(bool_arr)]
def GiveMePM(df, n=44, lim=[0, 20], cutdepth=0, norm=False, high_var_length=200):
'''
Purpose: from original DataFrame calculate price move Series. Including CutTail and CutHighVar.
df: the pinzhong DataFrame.
n: forward_ticks
lim: can be like (0, 20), counting in days, or an int array of index.
norm: if True, normalize the price move using every day std.
Return (last) price move series.
'''
global day_len
samp = CuthlLimit(df, how='all', depth=cutdepth)
if len(lim) == 2:
samp = samp.ix[day_len*lim[0]: day_len*lim[1], 'midPrc']
else:
samp = samp.ix[lim, 'midPrc']
#print samp.index[0], samp.index[-1]
ret = ForwardDiff(samp, n)
ret = CutTail(ret, n)
ret = CutHighVar(ret, length=high_var_length)
if norm:
ret_daychangenum = DayChangeNum(ret)
ret = NormPriceMove(ret, ret_daychangenum)
return ret
In [9]:
ta.columns
Out[9]:
In [10]:
ta = AddCol(ta)
In [24]:
def myols(ser, pm, norm=False):
'''
ser is indicator Series
pm is Price move Series
sm is satatsmodel module
this function also automatically align index of df and pm
'''
global sm
ser = ser[pm.index]
ser = ser.dropna()
if norm:
ser = (ser - ser.mean()) / ser.std()
X = sm.add_constant(ser)
Y = pm[X.index]
#print Y[211000:211070]
#print X[211000:211070]
model = sm.OLS(Y, X)
ret = model.fit()
return ret
def Rsquare(y, yhat):
ret = 1 - (y-yhat).var() / y.var()
return ret
def Rsquare2(y, yhat):
ret = 1 - ((y-yhat)**2).mean() / y.var()
return ret
def PredictedRsquare(res, xnew, pm):
'''
pm: outsample price move Series
xnew: indicator Series (or DataFrame)
res: insample regression results (comes from statsmodel's model.fit() )
'''
# first we need to align xnew with outsample
xnew = xnew[pm.index]
xnew = xnew.dropna()
y = pm[xnew.index]
xnew = sm.add_constant(xnew)
ynew = res.predict(xnew)
rsq = Rsquare(y, ynew)
return rsq
def PredictedRsquare2(res, xnew, pm):
'''
pm: outsample price move Series
xnew: indicator Series (or DataFrame)
res: insample regression results (comes from statsmodel's model.fit() )
'''
# first we need to align xnew with outsample
xnew = xnew[pm.index]
xnew = xnew.dropna()
y = pm[xnew.index]
xnew = sm.add_constant(xnew)
ynew = res.predict(xnew)
rsq = Rsquare2(y, ynew)
return rsq
In [189]:
%matplotlib auto
In [187]:
def prc_total(df, t1, t2, fs=(15,10)):
fig = plt.figure(figsize=fs)
ax1 = fig.add_subplot(311)
ax1.plot(df.ix[t1: t2, 'last'], color='#f5f112', marker='*')
ax1.plot(df.ix[t1: t2, 'askPrc_0'], color='lightgreen')
ax1.plot(df.ix[t1: t2, 'bidPrc_0'], color='lightcoral')
ax3 = fig.add_subplot(312, sharex=ax1)
ax3.plot(df.ix[t1: t2, 'vol_diff'], color='white', lw=0.4, marker='*')
ax4 = fig.add_subplot(313, sharex=ax1)
ax4.plot(df.ix[t1: t2, 'openInt_diff'],
color='red', marker='*')
return fig
In [191]:
# --------------------------------------------
'''this is for macroly searching for patterns'''
t11, t22 = '2015-12-05 21:00:01', '2015-12-08 15:00:00'
# samp = ta_10day.ix[pm_index, :]
thefig = prc_total(ta, t11, t22, (15,10))
multi = MultiCursor(thefig.canvas, thefig.axes, color='c', lw=1)
thefig.show()
In [137]:
vwap_indicator = VWAPIndicator(ta, 50)
In [147]:
ta.ix[120:190, 'last']
Out[147]:
In [142]:
ta_in_pm
Out[142]:
In [12]:
def VWAPIndicator4(df, mywindow=50):
'''
use both openInt and volume to calculate 'real' new position.
'''
#TODO: ewm
last = df.ix[:, 'midPrc']
new_position = df.ix[:, ['vol_diff', 'openInt_diff']].sum(axis=1)
new_position += .1
last_multiply_newpos = last * new_position
multiply_roll = last_multiply_newpos.rolling(window=mywindow)
newpos_roll = new_position.rolling(window=mywindow)
vwap = multiply_roll.sum()*1. / newpos_roll.sum()
#print vwap[:15]
#print last[:15]
ret = vwap - last
ret.name = 'vwap_indicator2'
vwap.name = 'vwaPrc2'
return vwap, ret
In [13]:
def VWAPIndicator3(df, mywindow=50):
#TODO: ewm
last = df.ix[:, 'midPrc']
#print (last.isnull()).sum()
vol = df.ix[:, 'vol_diff']
vol += 1.
#print (vol.isnull()).sum()
last_multiply_vol = last * vol
#print (last_multiply_vol.isnull()).sum()
multiply_roll = last_multiply_vol.rolling(window=mywindow)
vol_roll = vol.rolling(window=mywindow)
vwap = multiply_roll.sum()*1. / vol_roll.sum()
#print (vwap.isnull()).sum()
#print vwap[:15]
#print last[:15]
ret = vwap - last
ret.name = 'vwap_indicator'
vwap.name = 'vwaPrc'
return vwap, ret
In [14]:
def GiveMeIndex(arri, arro):
'''
Generate integer index
arr is a two dim ndarray, with each element being a time range(counting in days).
'''
global day_len
index_in = list()
for k in arri:
index_in = index_in + list(range(day_len * k[0], day_len * k[1]))
index_out = list()
for k in arro:
index_out = index_out + list(range(day_len * k[0], day_len * k[1]))
return index_in, index_out
In [15]:
insample_index0, outsample_index0 = GiveMeIndex([[0,2], [4, 6], [8, 10], [14, 16], [18,20], [22, 24], [26,28]],
[[2, 4], [6, 8], [12, 14], [16, 18], [20,22], [24, 26], [28,30]])
insample_index1, outsample_index1 = GiveMeIndex([[0, 20]], [[20, 25]])
insample_index2, outsample_index2 = GiveMeIndex([[5, 25]], [[0, 5]])
insample_index3, outsample_index3 = GiveMeIndex([[0, 5], [10, 25]], [[5, 10]])
insample_index4, outsample_index4 = GiveMeIndex([[0, 10], [15, 25]], [[10, 15]])
insample_index5, outsample_index5 = GiveMeIndex([[0, 13]], [[13, 25]])
insample_index6, outsample_index6 = GiveMeIndex([[0, 18]], [[18, 25]])
# different_io_index = ((insample_index0, outsample_index0), (insample_index1, outsample_index1),
# (insample_index2, outsample_index2), (insample_index3, outsample_index3),
# (insample_index4, outsample_index4), (insample_index5, outsample_index5))
insample_index00, outsample_index00 = GiveMeIndex([[0, 20]],
[[25, 30]])
In [26]:
#-------------------------------------------------------------------------------
'''
Automatically generate in and out sample data, then calculate Qty_indicator,
then regression under different parameters
'''
for mywind in [10, 30, 100, 240, 1000, 2400]:
vwap, vwap_indicator = VWAPIndicator4(ta, mywind)
#print pd.concat([ta[['last','vol_diff']],vwap,vwap_indicator],axis=1)
for forward_ticks in range(44, 45, 1):
ta_in_pm = GiveMePM(ta, n=forward_ticks, lim=insample_index1)
#print pd.concat([ta_in_pm,ta['last']],axis=1).dropna()
ta_out_pm = GiveMePM(ta, n=forward_ticks, lim=outsample_index1)
#df = pd.concat([vwap_indicator, ta_in_pm],axis=1)
#df.dropna(inplace=True)
res = myols(vwap_indicator, ta_in_pm)
print ('\n\n\n------------------------------X length: %d, Y length: %d ------------------------------'
%(mywind, forward_ticks))
print ('In sample time range: %s to %s \nOut sample time range: %s to %s.\n' %(ta_in_pm.index[0],
ta_in_pm.index[-1],
ta_out_pm.index[0],
ta_out_pm.index[-1]))
print res.summary()
#print ta_in_pm.describe()
#print ta_out_pm.describe()
print '\n[OutSample R_SQUARE: %f]' % PredictedRsquare(res, vwap_indicator, ta_out_pm)
print '\n[OutSample R_SQUARE: %f]' % PredictedRsquare2(res, vwap_indicator, ta_out_pm)
In [66]:
TEMP1 = pd.concat([ta1,vwap_indicator,vwap],axis=1)
In [69]:
vwap.ix['2015-11-23 23:13:40': '2015-11-23 23:13:50']
Out[69]:
Out[69]:
Out[69]:
In [71]:
TEMP1.ix['2015-11-23 23:13:40': '2015-11-23 23:13:50', ['last', 'volume', 'vol_diff', 'vwap', 'vwap_indicator']]
Out[71]:
In [50]:
ta1.ix['2015-11-23 23:13:40': '2015-11-23 23:13:50', ['last', 'volume', 'vol_diff']]
Out[50]:
In [27]:
ta1.ix['2015-11-23 23:13:50': '2015-11-23 23:14:04', ['last', 'volume', 'vol_diff']]
Out[27]:
In [345]:
%matplotlib inline
In [297]:
plt.plot(vwap.index, ta.ix[vwap.index, 'last'])
plt.plot(vwap.index, vwap)
Out[297]:
In [270]:
len(vwap_indicator)
Out[270]:
In [271]:
len(ta_in_pm)
Out[271]:
In [272]:
day_len*20
Out[272]:
In [310]:
day_len*20 - len(ta_in_pm)
Out[310]:
In [242]:
(200+44)*20
Out[242]:
In [303]:
temp = vwap_indicator[ta_in_pm.index]
In [307]:
temp = temp.dropna()
In [309]:
(vwap_indicator.isnull()).sum()
Out[309]:
In [308]:
len(temp)
Out[308]:
In [284]:
vwap, vwap_indicator = VWAPIndicator3(ta, 40)
In [ ]:
y = pm[xnew.index]
In [321]:
%matplotlib inline
In [48]:
# --------------------------------------------
'''plot fit'''
fig, ax = plt.subplots()
fig = sm.graphics.plot_fit(res, res.model.exog_names[1], ax=ax)
In [54]:
vwap_indicator.hist(bins=np.arange(-10.25, 10.25, .5))
Out[54]:
In [35]:
ta_pm = GiveMePM(ta, n=44, lim=[0, 30])
In [53]:
vwap, vwap_indicator = VWAPIndicator4(ta, 240)
vwap_indicator = vwap_indicator[ta_pm.index]
In [57]:
ta_pm.hist(bins=np.arange(-50.5, 50.5, 1))
Out[57]:
In [185]:
t11, t22 = '2015-11-25 23:03:06.000', '2015-11-25 23:03:12.000'
In [186]:
plt.plot(ta.ix[t11: t22, 'vol_diff'])
#plt.ylim([0,3000])
Out[186]:
In [183]:
ta.ix[t11: t22, ['last', 'volume', 'vol_diff']]
Out[183]:
In [170]:
vwap_indicator[t11: t22]
Out[170]:
In [171]:
vwap[t11: t22]
Out[171]:
In [247]:
ta.ix['2015-11-19 21:00:55.000': '2015-11-19 21:01:58.000', 'last'][:55]
Out[247]:
In [241]:
ta_in_pm.ix['2015-11-19 21:00:55.000': '2015-11-19 21:00:58.000']
Out[241]:
In [197]:
%matplotlib inline
In [ ]:
temp1
In [195]:
vwap_indicator
Out[195]:
In [133]:
ta_out_pm.plot()
Out[133]:
In [82]:
for mywindow in [10, 60, 200, 500, 1000, 2400, 5000, 10000, 24000]:
print '\n\n\n---------------mywindow: %d ----------------------' %mywindow
#mywindow = 2400 * 3
vol_roll = ta.ix[:, 'volume'].rolling(window=mywindow)
last_multiply_vol = ta.ix[:, 'last'] * ta.ix[:, 'volume']
multi_roll = last_multiply_vol.rolling(window=mywindow)
vwap = multi_roll.sum()*1. / vol_roll.sum()
vwap_indicator = vwap - ta.ix[:, 'last']
# vwap_indicator.dropna(inplace=True)
#-------------------------------------------------------------------------------
'''
Automatically generate in and out sample data, then calculate Qty_indicator,
then regression under different parameters
'''
insample_index = list()
for l in [[0,2], [4, 6], [8, 10], [14, 16], [18,20], [22, 24], [26,28]]:
insample_index = insample_index + list(range(day_len*l[0], day_len*l[1]))
outsample_index = list()
for l in [[2, 4], [6, 8], [12, 14], [16, 18], [20,22], [24, 26], [28,30]]:
outsample_index = outsample_index + list(range(day_len*l[0], day_len*l[1]))
for forward_ticks in range(50, 52, 2):
ta_in_pm = GiveMePM(ta, n=forward_ticks, lim=[0, 20])
ta_out_pm = GiveMePM(ta, n=forward_ticks, lim=[25, 30])
res = myols(vwap_indicator, ta_in_pm)
print '\n\n\n--------------------- %d ----------------------' %forward_ticks
print res.summary()
print '\n[OutSample R_SQUARE: %f]' % (PredictedRsquare(res, vwap_indicator, ta_out_pm))[1]
In [ ]: