In [1]:
    
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
#import matplotlib.finance as mf
from matplotlib.widgets import MultiCursor
import statsmodels.tsa.stattools as stt
#import scipy.signal as sgn
import statsmodels.api as sm
#from statsmodels.sandbox.regression.predstd import wls_prediction_std
#from matplotlib.mlab import PCA
    
In [186]:
    
# Render matplotlib figures inline in the notebook output.
%matplotlib inline
    
In [3]:
    
# Style 1: seaborn "paper" context combined with the "darkgrid" theme.
sns.set_context(context='paper')
sns.set_style(style='darkgrid')
    
In [241]:
    
# Style 2: seaborn "dark" theme with a black axes background, a grey figure
# background, and grid lines styled red and dashed.
sns.set_context(context='paper')
sns.set_style(
    style="dark",
    rc={
        'axes.facecolor': 'black',
        'grid.color': 'red',
        'grid.linestyle': '--',
        'figure.facecolor': 'grey',
    },
)
    
In [6]:
    
# Load the tick-data container from an HDF5 file; the path is relative to the
# notebook's working directory.
hft = pd.read_hdf('HFT_SR_RM_MA_TA.hdf')
    
In [178]:
    
# Take the cross-section for the 'TA0001' minor-axis label.
# NOTE(review): minor_xs is a pandas Panel method, so hft is presumably a
# Panel, which recent pandas releases no longer provide -- confirm the pandas
# version this notebook is pinned to.
ta = hft.minor_xs('TA0001')
    
In [7]:
    
#------------------------------------------------
# Session lengths in ticks: 4 * 3600 * hours (presumably 4 ticks per second
# -- TODO confirm against the data feed).
night_len, mor_len, aftn_len = (
    int(4 * 3600 * hours) for hours in (2.5, 2.25, 1.5)
)
# NOTE(review): the 3 extra ticks per day are not explained in this notebook.
day_len = sum((night_len, mor_len, aftn_len)) + 3
    
In [62]:
    
#-----------------------------------------------
'''add columns'''
def AddCol(df):
    '''
    Return a new frame: df extended with three derived columns.

    Added columns:
      vol_diff     -- first difference of 'volume', plus 1
      openInt_diff -- first difference of 'openInterest', plus 1
      midPrc       -- average of 'askPrc_0' and 'bidPrc_0'

    The +1 offset is there for the convenience of log-y-scale plots. The
    first row of both *_diff columns is NaN because diff() has no previous
    row there. The input frame itself is not modified.
    '''
    # Plain column selection replaces the deprecated .ix indexer.
    vol = (df['volume'].diff() + 1).rename('vol_diff')
    openint = (df['openInterest'].diff() + 1).rename('openInt_diff')
    mid = ((df['askPrc_0'] + df['bidPrc_0']) / 2.).rename('midPrc')
    return df.join(vol).join(openint).join(mid)
    
In [179]:
    
# Attach the derived columns (vol_diff, openInt_diff, midPrc) to the frame.
# NOTE: this rebinds ta to the extended frame, so the cell is not re-runnable
# as is -- a second run would try to join columns that already exist.
ta = AddCol(ta)
    
In [180]:
    
#------------------------------------------------
# Training set: the first day_len*10 + 10 rows; out-of-sample set: the rest.
# The split is by row position, so .iloc replaces the deprecated .ix (which
# treats integer slices as positions when the index is not integer-typed;
# ta is datetime-indexed, see the .index.hour checks downstream).
ta_10day = ta.iloc[:day_len*10 + 10, :]
ta_out = ta.iloc[day_len*10 + 10:, :]
    
In [181]:
    
# -------------------------------------------------
def ForwardDiff(df, n=1):
    '''
    Forward-looking counterpart of pandas' diff(): the entry at position t
    is the value n rows ahead minus the value at t.

    NaN entries are dropped from the result, which removes the last n
    positions (they have no value n rows ahead).
    '''
    lookahead = df.diff(periods=n).shift(periods=-n)
    return lookahead.dropna()
def ForwardPricemove(df, n=1):
    '''
    Compute the n-row-ahead move of the 'last' price and drop the rows
    around the 15:00 close.

    df needs a DatetimeIndex and a 'last' column. The result is a Series
    named 'price move' whose entry at t is last[t+n] - last[t].

    Rows are removed when their timestamp is
      * in the final int(n // 4) + 1 seconds of 14:59, or
      * anywhere in the 15:00 hour (the last tick).
    '''
    # Plain column selection replaces the deprecated .ix indexer.
    ret = ForwardDiff(df['last'], n)
    idx = ret.index
    # n // 4 turns rows into seconds (presumably 4 ticks per second -- TODO
    # confirm); one more second is dropped on top of that.
    # NOTE(review): for n >= 240 the cut-off is <= 0, so the whole of 14:59 is
    # dropped but nothing before it; only the 15:00 close is handled here.
    near_close = (
        (idx.hour == 14)
        & (idx.minute == 59)
        & (idx.second >= 60 - int(n // 4) - 1)
    )
    at_close = idx.hour == 15
    ret = ret.loc[~(near_close | at_close)]
    return ret.rename('price move')
    
In [182]:
    
# Look-ahead horizon in rows, shared by the training and out-of-sample moves.
forward_ticks = 40
ta_10day_pm, ta_out_pm = (
    ForwardPricemove(frame, forward_ticks) for frame in (ta_10day, ta_out)
)
    
In [187]:
    
# Plot the training-set forward price move over time.
ta_10day_pm.plot()
    
    Out[187]:
    
In [50]:
    
# Plot the out-of-sample forward price move over time.
ta_out_pm.plot()
    
    Out[50]:
    
In [82]:
    
#---------------------------
def UpDownCalc(df):
    '''
    Sign of the last trade price relative to the mid price.

    Returns an integer Series named 'updown' holding, element-wise,
    +1 where last > midPrc, -1 where last < midPrc and 0 otherwise
    (equal prices, or a NaN on either side, since both comparisons are
    then False).
    '''
    # Plain column selection replaces the deprecated .ix indexer.
    last, mid = df['last'], df['midPrc']
    # The difference of the two 0/1 indicators is +1 / 0 / -1 directly.
    updown = (last > mid).astype(int) - (last < mid).astype(int)
    return updown.rename('updown')
    
In [188]:
    
ta_updown = UpDownCalc(ta)
# ta_10day_updown was not defined anywhere in this notebook; derive it from
# the training frame so the cell runs on a fresh kernel.
ta_10day_updown = UpDownCalc(ta_10day)
plt.figure()
plt.hist(ta_10day_updown, bins=50)
    
    Out[188]:
    
In [200]:
    
def rolling_mean(df, n=3):
    '''
    Rolling mean over n rows, with the rows at the 21:00 open removed.

    df needs a DatetimeIndex. Rows stamped 21:00:00 through
    21:00:(int(n // 4) + 1) are dropped from the result (presumably because
    their window reaches back across the session break -- TODO confirm).
    NaN values from incomplete windows elsewhere are NOT removed.
    '''
    ret = df.rolling(window=n).mean()
    idx = ret.index
    at_open = (
        (idx.hour == 21)
        & (idx.minute == 0)
        & (idx.second <= int(n // 4) + 1)
    )
    # Boolean .loc selection replaces the deprecated .ix indexer.
    return ret.loc[~at_open]
    
In [228]:
    
#------------------------------------------
# Rolling mean of the up/down sign over mywindow rows.
mywindow = 12
lastmid_indicator = rolling_mean(ta_updown, n=mywindow).rename('lastmidIndicator')
    
In [202]:
    
# Histogram of the indicator on the out-of-sample timestamps, then of the
# out-of-sample price move. reindex() replaces the deprecated .ix label lookup
# and yields NaN for timestamps that are missing from the indicator.
plt.figure()
plt.hist(lastmid_indicator.reindex(ta_out_pm.index).dropna(), bins=20)
plt.figure()
plt.hist(ta_out_pm, bins=np.arange(-5.5, 5.5, 1))
    
    Out[202]:
    
    
In [90]:
    
def myols(df, pm, norm=False):
    '''
    Fit an OLS regression (with intercept) of the price move on an indicator.

    df   -- indicator Series or DataFrame, labelled like pm
    pm   -- price move Series (the regressand)
    norm -- if True, standardise the indicator (subtract mean, divide by
            std) before fitting

    The indicator is aligned to pm's index; rows that are missing or NaN are
    dropped, and pm is restricted to the labels that remain. Returns the
    results object from statsmodels' OLS(...).fit(). Relies on the
    module-level name sm (statsmodels.api).
    '''
    # reindex() aligns on labels and gives NaN for labels absent from df,
    # rather than relying on deprecated missing-label indexing; it also aligns
    # rows (not columns) when df is a DataFrame. df's index must be unique.
    df = df.reindex(pm.index).dropna()
    if norm:
        df = (df - df.mean()) / df.std()
    X = sm.add_constant(df)
    # Every label left in df came from pm.index, so this lookup cannot miss.
    Y = pm.loc[df.index]
    return sm.OLS(Y, X).fit()
    
In [215]:
    
def Rsquare(y, yhat):
    '''
    Coefficient of determination of predictions yhat against observations y.

    Computed as 1 - SS_res / SS_tot, where SS_res is the sum of squared
    prediction errors and SS_tot the sum of squared deviations of y from its
    own mean. The previous SS_reg / SS_tot form ignored the prediction errors
    altogether (it equals this value only for an in-sample OLS fit with an
    intercept), so it could report a perfect score for wrong predictions.

    The result is 1 for perfect predictions, 0 for predicting the mean of y,
    and negative when the predictions are worse than that.
    '''
    ybar = y.mean()
    ss_tot = ((y - ybar) ** 2).sum()
    ss_res = ((yhat - y) ** 2).sum()
    return 1. - ss_res / ss_tot
def PredictedRsquare(result, xnew, pm):
    '''
    Score an in-sample regression on out-of-sample data.

    result -- in-sample regression results (from statsmodels' model.fit())
    xnew   -- indicator Series (or DataFrame)
    pm     -- out-of-sample price move Series

    xnew is aligned to pm's index; rows that are missing or NaN are dropped
    and pm is restricted to the labels that remain. Returns the tuple
    (ypredict, rsq): the predictions of result on the aligned indicator and
    the value of Rsquare(pm, ypredict). Relies on the module-level name sm
    (statsmodels.api).
    '''
    # reindex() aligns on labels and gives NaN for labels absent from xnew,
    # rather than relying on deprecated missing-label indexing. xnew's index
    # must be unique.
    xnew = xnew.reindex(pm.index).dropna()
    # Every label left in xnew came from pm.index, so this lookup cannot miss.
    pm = pm.loc[xnew.index]
    ypredict = result.predict(sm.add_constant(xnew))
    return ypredict, Rsquare(pm, ypredict)
    
In [203]:
    
# Count the rows whose vol_diff is exactly 0 (np.log is applied to this
# column further down, and log(0) is -inf). Plain column selection replaces
# the deprecated .ix indexer.
(ta['vol_diff'] == 0).sum()
    
    Out[203]:
In [229]:
    
#------------------------------------------
# Up/down indicator (rolling mean of the sign) multiplied by the rolling mean
# of log(vol_diff), both over mywindow rows.
mywindow = 12
lastmid_indicator = rolling_mean(ta_updown, n=mywindow)
lastmid_indicator = lastmid_indicator.rename('lastmidIndicator')
# Plain column selection replaces the deprecated .ix indexer.
vol_roll = rolling_mean(np.log(ta['vol_diff']), n=mywindow)
vol_roll = vol_roll.rename('vol_roll')
lastmid_indicator1 = lastmid_indicator * vol_roll
    
In [231]:
    
# Regress the training-set price move on the volume-weighted indicator and
# print the statsmodels summary table.
res = myols(lastmid_indicator1, ta_10day_pm)
print(res.summary())
    
    
In [232]:
    
# Apply the training-set fit to the out-of-sample period; the cell displays
# the (predictions, R-square) tuple returned by PredictedRsquare.
PredictedRsquare(res, lastmid_indicator1, ta_out_pm)
    
    Out[232]:
In [138]:
    
# Swarm plot of the out-of-sample price move against the indicator, on a
# subsample. reindex() replaces the deprecated .ix label lookup.
temp1 = lastmid_indicator1.reindex(ta_out_pm.index).dropna()
# print() with one argument behaves the same on Python 2 and Python 3.
print(type(temp1))
plt.figure(figsize=(20,10))
# Subsample the indicator by position (every 500th of the first 300000 rows)
# and look the price move up at those same timestamps. Slicing both series by
# position would pair different rows, because temp1 has lost its NaN rows and
# is shorter than ta_out_pm.
sns.swarmplot(x=temp1.iloc[:300000:500],
              y=ta_out_pm.loc[temp1.index[:300000:500]])
    
    
    Out[138]:
    
In [130]:
    
# --------------------------------------------
# Plot the fit of res against its second exogenous column; myols builds the
# design matrix with sm.add_constant, so index 1 is presumably the indicator
# (the constant sits at index 0 -- TODO confirm).
fig, ax = plt.subplots()
fig = sm.graphics.plot_fit(res, res.model.exog_names[1], ax=ax)
    
    
In [125]:
    
#--------------------------------------------------
# Scan rolling-window lengths 1..59 and report the in-sample R-square of the
# regression for each one. Loop-local names keep the scan from overwriting
# the notebook-level mywindow, lastmid_indicator and res.
for window in range(1, 60):
    indicator = ta_updown.rolling(window=window).mean()
    indicator = indicator.rename('lastmid_indicator')
    fit = myols(indicator, ta_10day_pm)
    # print() with one argument behaves the same on Python 2 and Python 3.
    print('\n--------------------')
    print('window = %d, Rsquare = %f. ' % (window, fit.rsquared))
    
    
In [233]:
    
# Switch matplotlib from inline rendering to an automatically selected GUI
# backend; the cursor-based figure below needs an interactive window.
%matplotlib auto
    
    
In [221]:
    
def prc_total(df, t1, t2, fs=(15,10)):
    '''
    Draw four stacked panels, sharing one x axis, for rows t1..t2 of df.

    Panels, top to bottom:
      1. 'last' with 'askPrc_0' and 'bidPrc_0'
      2. 'vol_diff' / 2 on a log y axis, plus a constant level at 100
      3. 'openInt_diff'
      4. 'TotalBidLot' and 'TotalAskLot'

    df     -- frame holding the columns named above
    t1, t2 -- bounds of the label slice to plot (both ends included)
    fs     -- figure size passed to plt.figure

    Returns the matplotlib Figure. Relies on the module-level names plt and
    np.
    '''
    # Slice once by label; .loc replaces the deprecated .ix indexer.
    window = df.loc[t1:t2]

    fig = plt.figure(figsize=fs)
    ax1 = fig.add_subplot(411)
    ax1.plot(window['last'], color='#f5f112', marker='*')
    ax1.plot(window['askPrc_0'], color='lightgreen')
    ax1.plot(window['bidPrc_0'], color='lightcoral')

    # This panel used to read the notebook globals t11/t22 instead of the
    # t1/t2 arguments, so it ignored the requested range.
    ax2 = fig.add_subplot(412, sharex=ax1)
    # NOTE(review): this plots one constant line per column of the slice,
    # against row positions rather than timestamps; a single horizontal
    # reference line at 100 was probably intended -- confirm.
    ax2.semilogy(100 * np.ones_like(window.values), color='orange')
    ax2.semilogy(window['vol_diff']/2., color='orange', marker='*')

    ax3 = fig.add_subplot(413, sharex=ax1)
    ax3.plot(window['openInt_diff'], color='white', lw=0.4, marker='*')

    ax4 = fig.add_subplot(414, sharex=ax1)
    ax4.plot(window['TotalBidLot'], color='red')
    ax4.plot(window['TotalAskLot'], color='green')
    return fig
    
In [244]:
    
# Plot the training rows that have a forward price move, for the t11..t22
# range, with a cursor linked across all panels.
t11, t22 = '2015-11-19 21:00:01','2015-11-28 15:00:00'
# Every label in ta_10day_pm.index comes from ta_10day, so .loc (replacing the
# deprecated .ix) cannot hit a missing label here.
temp = ta_10day.loc[ta_10day_pm.index, :]
thefig = prc_total(temp, t11, t22, (15,10))
# Keep a reference to the cursor in a variable so it is not garbage-collected.
multi = MultiCursor(thefig.canvas, thefig.axes, color='c', lw=1)
thefig.show()
    
    
In [237]:
    
# Replace the third panel with the volume-weighted indicator.
# reindex()/.loc replace the deprecated .ix lookups; reindex() gives NaN for
# timestamps that are missing from the indicator.
# NOTE(review): the hard-coded December range and t11..t22 are applied one
# after the other; with the t11/t22 values set in this notebook (19-28 Nov)
# the two ranges do not overlap, so nothing would be drawn -- confirm which
# range is intended.
thefig.axes[2].cla()
thefig.axes[2].plot(lastmid_indicator1.reindex(ta_10day_pm.index).loc['2015-12-01 21:00:01': '2015-12-05 15:00:00'].loc[t11:t22])
    
    Out[237]:
In [ ]:
    
# Annotate the price panel with the quantities at the first two ask/bid
# levels, and the volume panel with vol_diff / 2.
# The slice is taken once (with .loc instead of the deprecated .ix) rather
# than on every loop iteration, and the five copy-pasted loops are driven by
# one table of (axes index, label values, y positions).
annot_window = temp.loc[t11:t22, :]
annot_layers = [
    (0, annot_window['askQty_0'], annot_window['askPrc_0'] + .3),
    (0, annot_window['askQty_1'], annot_window['askPrc_1'] + .7),
    (0, annot_window['bidQty_0'], annot_window['bidPrc_0'] - .6),
    (0, annot_window['bidQty_1'], annot_window['bidPrc_1'] - 1.),
    (1, annot_window['vol_diff'] / 2., annot_window['vol_diff'] + .3),
]
for ax_idx, labels, heights in annot_layers:
    target_ax = thefig.axes[ax_idx]
    # NOTE(review): as before, the x coordinate is the row position i, not the
    # timestamp; confirm this matches the x units of the plotted panels.
    for i, (txt, y) in enumerate(zip(labels, heights)):
        target_ax.annotate(txt, (i, y), color='white', size=10)