In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
#import matplotlib.finance as mf
from matplotlib.widgets import MultiCursor
import statsmodels.tsa.stattools as stt
#import scipy.signal as sgn
import statsmodels.api as sm
#from statsmodels.sandbox.regression.predstd import wls_prediction_std
#from matplotlib.mlab import PCA

In [186]:
%matplotlib inline

In [3]:
# Style. 1
sns.set_context('paper')
sns.set_style("darkgrid")

In [241]:
# Style. 2
sns.set_context('paper')
sns.set_style("dark", 
	rc={'axes.facecolor': 'black', 
	'grid.color': 'red', 
	'grid.linestyle': '--', 
	'figure.facecolor': 'grey'})

In [6]:
hft = pd.read_hdf('HFT_SR_RM_MA_TA.hdf')

In [178]:
ta = hft.minor_xs('TA0001')

In [7]:
#------------------------------------------------
'''Some time length'''
night_len = int(4*3600*2.5)
mor_len = int(4*3600*2.25)
aftn_len = int(4*3600*1.5)
day_len = night_len + mor_len + aftn_len + 3

In [62]:
#-----------------------------------------------
'''add columns'''
def AddCol(df):
    vol = df.ix[:, 'volume'].diff()
    # this addition is for the convenience of Log y scale plot
    vol += 1
    vol = vol.rename('vol_diff')
    df = df.join(vol)

    openint = df.ix[:, 'openInterest'].diff()
    # this addition is for the convenience of Log y scale plot
    openint += 1
    openint = openint.rename('openInt_diff')
    df = df.join(openint)

    mid = (df.ix[:, 'askPrc_0'] + df.ix[:, 'bidPrc_0']) / 2.
    mid = mid.rename('midPrc')
    df = df.join(mid)
    return df

In [179]:
ta = AddCol(ta)

In [180]:
#------------------------------------------------   
'''training dataset and outsample dataset'''
ta_10day = ta.ix[:day_len*10 + 10, :]
ta_out = ta.ix[day_len*10 + 10: , :]

In [181]:
# -------------------------------------------------
def ForwardDiff(df, n=1):
    '''
    The reverse of pandas' function 'DataFrame.diff()'
    '''
    ret = df.diff(periods=n)
    ret = ret.shift(periods= -1 * n)
    ret.dropna(inplace=True)
    return ret

def ForwardPricemove(df, n=1):
    '''
    calculate price move and delete NaN
    '''
    ret = ForwardDiff(df.ix[:, 'last'], n)
    last_boolean1 = np.logical_and.reduce(
                                          [ret.index.hour == 14, 
                                           ret.index.minute == 59, 
                                           ret.index.second >= 60 - int(n//4) - 1])
    
    # this is the last tick
    last_boolean2 = ret.index.hour == 15

    # outlier_boolean = abs(ta_10day_pm) > 10
    # ta_10day_pm_no_outlier = ta_10day_pm.ix[np.logical_not(outlier_boolean)]
    ret = ret.ix[np.logical_not(np.logical_or(last_boolean1, last_boolean2))]
    ret = ret.rename('price move')
    return ret

In [182]:
forward_ticks = 40
ta_10day_pm = ForwardPricemove(ta_10day, forward_ticks)
ta_out_pm = ForwardPricemove(ta_out, forward_ticks)

In [187]:
ta_10day_pm.plot()


Out[187]:
<matplotlib.axes.AxesSubplot at 0x7f5371d86f90>

In [50]:
ta_out_pm.plot()


Out[50]:
<matplotlib.axes.AxesSubplot at 0x7f53b0095590>

UpDownIndicator Test for in and out sample


In [82]:
#---------------------------

def UpDownCalc(df):
    '''
    map up as +1, down as -1
    return (elementwise) 1 for last > mid; -1 for last < mid; 0 for last == mid
    '''
    up = df.ix[:, 'last'] > df.ix[:, 'midPrc']
    down = df.ix[:, 'last'] < df.ix[:, 'midPrc']
    up *= 1
    down *= -1
    updown = up + down
    updown = updown.rename('updown')
    return updown

In [188]:
ta_updown = UpDownCalc(ta)
plt.figure()
plt.hist(ta_10day_updown, bins=50)


Out[188]:
(array([ 455528.,       0.,       0.,       0.,       0.,       0.,
             0.,       0.,       0.,       0.,       0.,       0.,
             0.,       0.,       0.,       0.,       0.,       0.,
             0.,       0.,       0.,       0.,       0.,       0.,
             0.,    7573.,       0.,       0.,       0.,       0.,
             0.,       0.,       0.,       0.,       0.,       0.,
             0.,       0.,       0.,       0.,       0.,       0.,
             0.,       0.,       0.,       0.,       0.,       0.,
             0.,  436939.]),
 array([-1.  , -0.96, -0.92, -0.88, -0.84, -0.8 , -0.76, -0.72, -0.68,
       -0.64, -0.6 , -0.56, -0.52, -0.48, -0.44, -0.4 , -0.36, -0.32,
       -0.28, -0.24, -0.2 , -0.16, -0.12, -0.08, -0.04,  0.  ,  0.04,
        0.08,  0.12,  0.16,  0.2 ,  0.24,  0.28,  0.32,  0.36,  0.4 ,
        0.44,  0.48,  0.52,  0.56,  0.6 ,  0.64,  0.68,  0.72,  0.76,
        0.8 ,  0.84,  0.88,  0.92,  0.96,  1.  ]),
 <a list of 50 Patch objects>)

In [200]:
def rolling_mean(df, n=3):
    '''
    calculate rolling mean and delete NaN
    '''
    roll_obj = df.rolling(window=n)
    ret = (roll_obj.mean())
    
    last_boolean = np.logical_and.reduce(
                                          [ret.index.hour == 21, 
                                           ret.index.minute == 0, 
                                           ret.index.second <= int(n//4) + 1])
    ret = ret.ix[np.logical_not(last_boolean)]
    #ret = ret.rename('price move')
    return ret

In [228]:
#------------------------------------------
'''rolling mean'''
mywindow = 12
lastmid_indicator = rolling_mean(ta_updown, n=mywindow)
lastmid_indicator = lastmid_indicator.rename('lastmidIndicator')

In [202]:
plt.figure()
plt.hist(lastmid_indicator.ix[ta_out_pm.index].dropna(), bins=20)
plt.figure()
plt.hist(ta_out_pm, bins=np.arange(-5.5, 5.5, 1))


Out[202]:
(array([      0.,   21461.,       0.,  377740.,       0.,  995665.,
             0.,  376945.,       0.,   21679.]),
 array([-5.5, -4.5, -3.5, -2.5, -1.5, -0.5,  0.5,  1.5,  2.5,  3.5,  4.5]),
 <a list of 10 Patch objects>)

In [90]:
def myols(df, pm, norm=False):
    '''
    df is indicator DataFrame
    pm is Price move Series
    sm is satatsmodel module
    this function also automatically align index of df and pm
    '''
    global sm
    df = df[pm.index]
    df.dropna(inplace=True)
    if norm:
        df = (df - df.mean()) / df.std()
    X = sm.add_constant(df)
    Y = pm[df.index]
    model = sm.OLS(Y, X)
    ret = model.fit()
    return ret

In [215]:
def Rsquare(y, yhat):
    ybar = y.mean()
    #print ybar
    #print y-ybar
    ss_tot = ((y - ybar) ** 2).sum()
    ss_reg = ((yhat - ybar) ** 2).sum()
    ss_res = ((yhat - y) ** 2).sum()
    #print ss_reg, ss_tot
    ret = ss_reg / ss_tot
    return ret
def PredictedRsquare(result, xnew, pm):
    '''
    pm: outsample price move Series
    xnew: indicator Series (or DataFrame)
    result: insample regression results (comes from statsmodel's model.fit() )
    '''
    global sm
    # first we need to align xnew with outsample
    xnew = xnew[pm.index]
    xnew.dropna(inplace=True)
    pm = pm[xnew.index]
    xnew = sm.add_constant(xnew)
    ypredict = result.predict(xnew)
    rsq = Rsquare(pm, ypredict)
    return ypredict, rsq

In [203]:
(ta.ix[:, 'vol_diff'] == 0).sum()


Out[203]:
0

In [229]:
#------------------------------------------
'''rolling mean'''
mywindow = 12
lastmid_indicator = rolling_mean(ta_updown, n=mywindow)
lastmid_indicator = lastmid_indicator.rename('lastmidIndicator')
vol_roll = rolling_mean(np.log(ta.ix[:, 'vol_diff']), n=mywindow)
vol_roll = vol_roll.rename('vol_roll')
lastmid_indicator1 = lastmid_indicator * vol_roll

In [231]:
res = myols(lastmid_indicator1, ta_10day_pm)
print(res.summary())


                            OLS Regression Results                            
==============================================================================
Dep. Variable:             price move   R-squared:                       0.004
Model:                            OLS   Adj. R-squared:                  0.004
Method:                 Least Squares   F-statistic:                     3980.
Date:                Sun, 10 Jul 2016   Prob (F-statistic):               0.00
Time:                        13:02:24   Log-Likelihood:            -1.8292e+06
No. Observations:              899390   AIC:                         3.658e+06
Df Residuals:                  899388   BIC:                         3.658e+06
Df Model:                           1                                         
==============================================================================
                 coef    std err          t      P>|t|      [95.0% Conf. Int.]
------------------------------------------------------------------------------
const          0.0063      0.002      3.226      0.001         0.002     0.010
None          -0.1906      0.003    -63.088      0.000        -0.196    -0.185
==============================================================================
Omnibus:                   221416.941   Durbin-Watson:                   0.347
Prob(Omnibus):                  0.000   Jarque-Bera (JB):         13593133.442
Skew:                           0.229   Prob(JB):                         0.00
Kurtosis:                      22.040   Cond. No.                         1.55
==============================================================================

In [232]:
PredictedRsquare(res, lastmid_indicator1, ta_out_pm)


Out[232]:
(array([-0.14766787,  0.00629304, -0.14696118, ...,  0.07110718,
        0.07536682,  0.07589793]),
 0.0060778064651004719)

In [138]:
temp1 = lastmid_indicator1.ix[ta_out_pm.index]
temp1.dropna(inplace=True)
print type(temp1)
plt.figure(figsize=(20,10))
sns.swarmplot(x=temp1.ix[:300000:500], y=ta_out_pm.ix[:300000:500])


<class 'pandas.core.series.Series'>
Out[138]:
<matplotlib.axes.AxesSubplot at 0x7f53a535c8d0>

In [130]:
# --------------------------------------------
'''plot fit'''
fig, ax = plt.subplots()
fig = sm.graphics.plot_fit(res, res.model.exog_names[1], ax=ax)



In [125]:
#--------------------------------------------------
'''find good window parameter'''
for mywindow in np.arange(1, 60, 1):
#     mywindow = 
    r = ta_updown.rolling(window=mywindow)
    lastmid_indicator = (r.mean())
    lastmid_indicator = lastmid_indicator.rename('lastmid_indicator')
    res = myols(lastmid_indicator, ta_10day_pm)
    print '\n--------------------'
    print ('window = %d, Rsquare = %f. ' %(mywindow, res.rsquared))


--------------------
window = 1, Rsquare = 0.079394. 

--------------------
window = 2, Rsquare = 0.054486. 

--------------------
window = 3, Rsquare = 0.039333. 

--------------------
window = 4, Rsquare = 0.029954. 

--------------------
window = 5, Rsquare = 0.023794. 

--------------------
window = 6, Rsquare = 0.019466. 

--------------------
window = 7, Rsquare = 0.016307. 

--------------------
window = 8, Rsquare = 0.013948. 

--------------------
window = 9, Rsquare = 0.012110. 

--------------------
window = 10, Rsquare = 0.010672. 

--------------------
window = 11, Rsquare = 0.009513. 

--------------------
window = 12, Rsquare = 0.008578. 

--------------------
window = 13, Rsquare = 0.007821. 

--------------------
window = 14, Rsquare = 0.007165. 

--------------------
window = 15, Rsquare = 0.006591. 

--------------------
window = 16, Rsquare = 0.006094. 

--------------------
window = 17, Rsquare = 0.005655. 

--------------------
window = 18, Rsquare = 0.005268. 

--------------------
window = 19, Rsquare = 0.004926. 

--------------------
window = 20, Rsquare = 0.004627. 

--------------------
window = 21, Rsquare = 0.004371. 

--------------------
window = 22, Rsquare = 0.004132. 

--------------------
window = 23, Rsquare = 0.003908. 

--------------------
window = 24, Rsquare = 0.003706. 

--------------------
window = 25, Rsquare = 0.003524. 

--------------------
window = 26, Rsquare = 0.003363. 

--------------------
window = 27, Rsquare = 0.003217. 

--------------------
window = 28, Rsquare = 0.003075. 

--------------------
window = 29, Rsquare = 0.002946. 

--------------------
window = 30, Rsquare = 0.002823. 

--------------------
window = 31, Rsquare = 0.002712. 

--------------------
window = 32, Rsquare = 0.002610. 

--------------------
window = 33, Rsquare = 0.002513. 

--------------------
window = 34, Rsquare = 0.002428. 

--------------------
window = 35, Rsquare = 0.002343. 

--------------------
window = 36, Rsquare = 0.002265. 

--------------------
window = 37, Rsquare = 0.002191. 

--------------------
window = 38, Rsquare = 0.002125. 

--------------------
window = 39, Rsquare = 0.002062. 

--------------------
window = 40, Rsquare = 0.002000. 

--------------------
window = 41, Rsquare = 0.001946. 

--------------------
window = 42, Rsquare = 0.001899. 

--------------------
window = 43, Rsquare = 0.001856. 

--------------------
window = 44, Rsquare = 0.001816. 

--------------------
window = 45, Rsquare = 0.001778. 

--------------------
window = 46, Rsquare = 0.001742. 

--------------------
window = 47, Rsquare = 0.001706. 

--------------------
window = 48, Rsquare = 0.001671. 

--------------------
window = 49, Rsquare = 0.001637. 

--------------------
window = 50, Rsquare = 0.001601. 

--------------------
window = 51, Rsquare = 0.001568. 

--------------------
window = 52, Rsquare = 0.001537. 

--------------------
window = 53, Rsquare = 0.001504. 

--------------------
window = 54, Rsquare = 0.001474. 

--------------------
window = 55, Rsquare = 0.001446. 

--------------------
window = 56, Rsquare = 0.001417. 

--------------------
window = 57, Rsquare = 0.001390. 

--------------------
window = 58, Rsquare = 0.001364. 

--------------------
window = 59, Rsquare = 0.001340. 

In [233]:
%matplotlib auto


Using matplotlib backend: TkAgg

In [221]:
def prc_total(df, t1, t2, fs=(15,10)):
    fig = plt.figure(figsize=fs)
    ax1 = fig.add_subplot(411)
    
    ax1.plot(df.ix[t1: t2, 'last'], color='#f5f112', marker='*')
    ax1.plot(df.ix[t1: t2, 'askPrc_0'], color='lightgreen')
    ax1.plot(df.ix[t1: t2, 'bidPrc_0'], color='lightcoral')
    
    ax2 = fig.add_subplot(412, sharex=ax1)
    ax2.semilogy(100 * np.ones_like(df.ix[t11: t22].values), color='orange')
    ax2.semilogy(df.ix[t11: t22, 'vol_diff']/2., color='orange', marker='*')
    
    ax3 = fig.add_subplot(413, sharex=ax1)
    ax3.plot(df.ix[t1: t2, 'openInt_diff'], color='white', lw=0.4, marker='*')
    
    ax4 = fig.add_subplot(414, sharex=ax1)
    ax4.plot(df.ix[t1: t2, 'TotalBidLot'], 
             color='red')
    ax4.plot(df.ix[t1: t2, 'TotalAskLot'], 
             color='green')
    return fig

In [244]:
t11, t22 = '2015-11-19 21:00:01','2015-11-28 15:00:00'
temp = ta_10day.ix[ta_10day_pm.index, :]

thefig = prc_total(temp, t11, t22, (15,10))
multi = MultiCursor(thefig.canvas, thefig.axes, color='c', lw=1)
thefig.show()


  File "/usr/lib/pymodules/python2.7/matplotlib/cbook.py", line 527, in process
    proxy(*args, **kwargs)
  File "/usr/lib/pymodules/python2.7/matplotlib/cbook.py", line 405, in __call__
    return mtd(*args, **kwargs)
  File "/usr/lib/pymodules/python2.7/matplotlib/widgets.py", line 995, in onmove
    self._update()
  File "/usr/lib/pymodules/python2.7/matplotlib/widgets.py", line 1003, in _update
    ax.draw_artist(line)
  File "/usr/lib/pymodules/python2.7/matplotlib/axes.py", line 2097, in draw_artist
    assert self._cachedRenderer is not None
AssertionError

In [237]:
thefig.axes[2].cla()
thefig.axes[2].plot(lastmid_indicator1.ix[ta_10day_pm.index].ix['2015-12-01 21:00:01': '2015-12-05 15:00:00'].ix[t11:t22])


Out[237]:
[<matplotlib.lines.Line2D at 0x7f53496fea50>]

In [ ]:
n = len(temp.ix[t11:t22, :])
for i, txt in enumerate(temp.ix[t11:t22, 'askQty_0']):
    (thefig.axes[0]).annotate(txt, ((xrange(n))[i], (temp.ix[t11: t22, 'askPrc_0'])[i] + .3), color='white', size=10)
for i, txt in enumerate(temp.ix[t11:t22, 'askQty_1']):
    (thefig.axes[0]).annotate(txt, ((xrange(n))[i], (temp.ix[t11: t22, 'askPrc_1'])[i] +.7), color='white', size=10)
for i, txt in enumerate(temp.ix[t11:t22, 'bidQty_0']):
    (thefig.axes[0]).annotate(txt, ((xrange(n))[i], (temp.ix[t11: t22, 'bidPrc_0'])[i] - .6), color='white', size=10)
for i, txt in enumerate(temp.ix[t11:t22, 'bidQty_1']):
    (thefig.axes[0]).annotate(txt, ((xrange(n))[i], (temp.ix[t11: t22, 'bidPrc_1'])[i] - 1.), color='white', size=10)
for i, txt in enumerate(temp.ix[t11:t22, 'vol_diff']/2.):
    (thefig.axes[1]).annotate(txt, ((xrange(n))[i], (temp.ix[t11: t22, 'vol_diff'])[i] + .3), color='white', size=10)