In [1]:
    
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
# import matplotlib.finance as mf
from matplotlib.widgets import MultiCursor
import statsmodels.tsa.stattools as stt
# import scipy.signal as sgn
import statsmodels.api as sm
# from statsmodels.sandbox.regression.predstd import wls_prediction_std
# from matplotlib.mlab import PCA
    
In [2]:
    
%matplotlib inline
    
In [3]:
    
# Style. 1
sns.set_context('paper')
sns.set_style("darkgrid")
    
In [4]:
    
# Style. 2
sns.set_context('paper')
sns.set_style("dark", 
    rc={'axes.facecolor': 'black', 
    'grid.color': 'red', 
    'grid.linestyle': '--', 
    'figure.facecolor': 'grey'})
    
In [5]:
    
hft = pd.read_hdf('/home/bingnan/HFT_SR_RM_MA_TA.hdf')
    
    
In [6]:
    
ta = hft.minor_xs('TA0001').copy()
    
In [7]:
    
#------------------------------------------------
'''Some time length'''
night_len = int(4*3600*2.5)
mor_len = int(4*3600*2.25)
aftn_len = int(4*3600*1.5)
day_len = night_len + mor_len + aftn_len + 4
    
In [26]:
    
ta.ix[:, ['bidPrc_0', 'bidPrc_1', 'bidPrc_2', 'askPrc_3', 'bidPrc_4', 
          'askPrc_1', 'askPrc_2', 'askPrc_3', 'askPrc_4']].any(axis=0) == 0
    
    Out[26]:
In [8]:
    
#-----------------------------------------------
'''add columns'''
def AddCol(df):
    vol = df.ix[:, 'volume'].diff()
    # this addition is for the convenience of Log y scale plot
    # vol +=1
    vol = vol.rename('vol_diff')
    openint = df.ix[:, 'openInterest'].diff()
    # this addition is for the convenience of Log y scale plot
    # openint += 1
    openint = openint.rename('openInt_diff')
    mid = (df.ix[:, 'askPrc_0'] + df.ix[:, 'bidPrc_0']) / 2.
    mid = mid.rename('midPrc')
    ret = df.join([vol, openint, mid])
    return ret
# -------------------------------------------------
def ForwardDiff(df, n=1):
    '''
    The reverse of pandas' function 'DataFrame.diff()'
    '''
    ret = df.diff(periods=n)
    ret = ret.shift(periods= -1 * n)
    ret = ret.dropna()
    return ret
def CutHighVar(df, length=200):
    '''
    Purpose: Cut a small period after opening in the morning and at night.
    Because this time range, the var of price is high, which harmd our model.
    df: pd.DataFrame or pd.Series. With datetime index
    length: int. the length you want to cut, counted in ticks. Cannot be larger than 240
    '''
    ret = df
    bool_arr1 = np.logical_or(ret.index.hour == 21, ret.index.hour == 9)
    bool_arr = np.logical_and.reduce([bool_arr1, 
        ret.index.minute == 0, 
        ret.index.second <= int(length//4) - 1])
    ret = ret[np.logical_not(bool_arr)]
    return ret
def CutTail(df, length=60):
    '''
    Purpose: Cut a small period before market close.
    df: pd.DataFrame or pd.Series. With datetime index
    length: int. the length you want to cut, counted in ticks. Cannot be larger than 240
    '''
    ret = df
    last_boolean1 = np.logical_and.reduce(
                                          [ret.index.hour == 14, 
                                           ret.index.minute == 59, 
                                           ret.index.second >= 60 - int(length//4)])  
    # this is the last tick
    last_boolean2 = ret.index.hour == 15
    ret = ret[np.logical_not(np.logical_or(last_boolean1, last_boolean2))]
    return ret
def DayChangeNum(ser, distance=7):
    '''
    ser is price move series after process.
    distance counting in hours
    '''
    h = ser.index.hour
    h_diff = np.diff(h)
    h_diff = np.insert(h_diff, 1, 0)
    ret = np.where(np.abs(h_diff) > distance)[0]
    return ret
def NormPriceMove(ser, daychgnum):
    ret = ser.copy()
    for i in range(len(daychgnum) - 1):
        mysamp = ret.iloc[daychgnum[i]: daychgnum[i+1]]
        #print mysamp
        mystd = mysamp.std()
        print mystd
        ret.iloc[daychgnum[i]: daychgnum[i+1]] /= mystd
    return ret
def CuthlLimit(df, how='all', depth=0):
    s1 = 'bidQty_' + str(depth)
    s2 = 'askQty_' + str(depth)
    if how == 'all':
        arr1 = df.ix[:, s1] == 0
        arr2 = df[s2] == 0
        bool_arr = np.logical_or(arr1, arr2)
        #bool_arr = np.logical_or(df[s1] == 0, df[s2] == 0)
    elif how == 'bid':
        bool_arr = (df[s1] == 0)
    elif how == 'ask':
        bool_arr = (df[s2] == 0)
    else:
        print 'ERROR!'
    return df[np.logical_not(bool_arr)]
def GiveMePM(df, n=44, lim=[0, 20], cutdepth=0, norm=False, high_var_length=200):
    '''
    Purpose: from original DataFrame calculate price move Series. Including CutTail and CutHighVar.
    df: the pinzhong DataFrame.
    n: forward_ticks
    lim: can be like (0, 20), counting in days, or an int array of index.
    norm: if True, normalize the price move using every day std.
    Return (last) price move series.
    '''
    global day_len
    samp = CuthlLimit(df, how='all', depth=cutdepth)
    if len(lim) == 2:
        samp = samp.ix[day_len*lim[0]: day_len*lim[1], 'midPrc']
    else:
        samp = samp.ix[lim, 'midPrc']
    #print samp.index[0], samp.index[-1]
    ret = ForwardDiff(samp, n)
    ret = CutTail(ret, n)
    ret = CutHighVar(ret, length=high_var_length)
    if norm:
        ret_daychangenum = DayChangeNum(ret)
        ret = NormPriceMove(ret, ret_daychangenum)
    return ret
    
In [9]:
    
ta.columns
    
    Out[9]:
In [10]:
    
ta = AddCol(ta)
    
In [24]:
    
def myols(ser, pm, norm=False):
    '''
    ser is indicator Series
    pm is Price move Series
    sm is satatsmodel module
    this function also automatically align index of df and pm
    '''
    global sm
    ser = ser[pm.index]
    ser = ser.dropna()
    if norm:
        ser = (ser - ser.mean()) / ser.std()
    X = sm.add_constant(ser)
    Y = pm[X.index]
    #print Y[211000:211070]
    #print X[211000:211070]
    model = sm.OLS(Y, X)
    ret = model.fit()
    return ret
def Rsquare(y, yhat):
    ret = 1 - (y-yhat).var() / y.var()
    return ret
def Rsquare2(y, yhat):
    ret = 1 - ((y-yhat)**2).mean() / y.var()
    return ret
def PredictedRsquare(res, xnew, pm):
    '''
    pm: outsample price move Series
    xnew: indicator Series (or DataFrame)
    res: insample regression results (comes from statsmodel's model.fit() )
    '''
    # first we need to align xnew with outsample
    xnew = xnew[pm.index]
    xnew = xnew.dropna()
    y = pm[xnew.index]
    
    xnew = sm.add_constant(xnew)
    ynew = res.predict(xnew)
    rsq = Rsquare(y, ynew)
    return rsq
def PredictedRsquare2(res, xnew, pm):
    '''
    pm: outsample price move Series
    xnew: indicator Series (or DataFrame)
    res: insample regression results (comes from statsmodel's model.fit() )
    '''
    # first we need to align xnew with outsample
    xnew = xnew[pm.index]
    xnew = xnew.dropna()
    y = pm[xnew.index]
    
    xnew = sm.add_constant(xnew)
    ynew = res.predict(xnew)
    rsq = Rsquare2(y, ynew)
    return rsq
    
In [189]:
    
%matplotlib auto
    
    
In [187]:
    
def prc_total(df, t1, t2, fs=(15,10)):
    fig = plt.figure(figsize=fs)
    ax1 = fig.add_subplot(311)
    
    ax1.plot(df.ix[t1: t2, 'last'], color='#f5f112', marker='*')
    ax1.plot(df.ix[t1: t2, 'askPrc_0'], color='lightgreen')
    ax1.plot(df.ix[t1: t2, 'bidPrc_0'], color='lightcoral')
    ax3 = fig.add_subplot(312, sharex=ax1)
    ax3.plot(df.ix[t1: t2, 'vol_diff'], color='white', lw=0.4, marker='*')
    
    ax4 = fig.add_subplot(313, sharex=ax1)
    ax4.plot(df.ix[t1: t2, 'openInt_diff'],
             color='red', marker='*')
    return fig
    
In [191]:
    
# --------------------------------------------
'''this is for macroly searching for patterns'''
t11, t22 = '2015-12-05 21:00:01', '2015-12-08 15:00:00'
# samp = ta_10day.ix[pm_index, :]
thefig = prc_total(ta, t11, t22, (15,10))
multi = MultiCursor(thefig.canvas, thefig.axes, color='c', lw=1)
thefig.show()
    
In [137]:
    
vwap_indicator = VWAPIndicator(ta, 50)
    
In [147]:
    
ta.ix[120:190, 'last']
    
    Out[147]:
In [142]:
    
ta_in_pm
    
    Out[142]:
In [12]:
    
def VWAPIndicator4(df, mywindow=50):
    '''
    use both openInt and volume to calculate 'real' new position.
    '''
    #TODO: ewm
    last = df.ix[:, 'midPrc']
    new_position = df.ix[:, ['vol_diff', 'openInt_diff']].sum(axis=1)
    new_position += .1
    last_multiply_newpos = last * new_position
    multiply_roll = last_multiply_newpos.rolling(window=mywindow)
    newpos_roll = new_position.rolling(window=mywindow)
    
    vwap = multiply_roll.sum()*1. / newpos_roll.sum()
    #print vwap[:15]
    #print last[:15]
    ret = vwap - last
    ret.name = 'vwap_indicator2'
    vwap.name = 'vwaPrc2'
    return vwap, ret
    
In [13]:
    
def VWAPIndicator3(df, mywindow=50):
    #TODO: ewm
    last = df.ix[:, 'midPrc']
    #print (last.isnull()).sum()
    vol = df.ix[:, 'vol_diff']
    vol += 1.
    #print (vol.isnull()).sum()
    last_multiply_vol = last * vol
    #print (last_multiply_vol.isnull()).sum()
    multiply_roll = last_multiply_vol.rolling(window=mywindow)
    vol_roll = vol.rolling(window=mywindow)
    
    vwap = multiply_roll.sum()*1. / vol_roll.sum()
    #print (vwap.isnull()).sum()
    #print vwap[:15]
    #print last[:15]
    ret = vwap - last
    ret.name = 'vwap_indicator'
    vwap.name = 'vwaPrc'
    return vwap, ret
    
In [14]:
    
def GiveMeIndex(arri, arro):
    '''
    Generate integer index
    arr is a two dim ndarray, with each element being a time range(counting in days).
    '''
    global day_len
    index_in = list()
    for k in arri:
        index_in = index_in + list(range(day_len * k[0], day_len * k[1]))
    index_out = list()
    for k in arro:
        index_out = index_out + list(range(day_len * k[0], day_len * k[1]))
    return index_in, index_out
    
In [15]:
    
insample_index0, outsample_index0 = GiveMeIndex([[0,2], [4, 6], [8, 10], [14, 16], [18,20], [22, 24], [26,28]],
                                               [[2, 4], [6, 8], [12, 14], [16, 18], [20,22], [24, 26], [28,30]])
insample_index1, outsample_index1 = GiveMeIndex([[0, 20]], [[20, 25]])
insample_index2, outsample_index2 = GiveMeIndex([[5, 25]], [[0, 5]])
insample_index3, outsample_index3 = GiveMeIndex([[0, 5], [10, 25]], [[5, 10]])
insample_index4, outsample_index4 = GiveMeIndex([[0, 10], [15, 25]], [[10, 15]])
insample_index5, outsample_index5 = GiveMeIndex([[0, 13]], [[13, 25]])
insample_index6, outsample_index6 = GiveMeIndex([[0, 18]], [[18, 25]])
# different_io_index = ((insample_index0, outsample_index0), (insample_index1, outsample_index1), 
#                       (insample_index2, outsample_index2), (insample_index3, outsample_index3),
#                       (insample_index4, outsample_index4), (insample_index5, outsample_index5))
insample_index00, outsample_index00 = GiveMeIndex([[0, 20]],
                                               [[25, 30]])
    
In [26]:
    
#-------------------------------------------------------------------------------
'''
Automatically generate in and out sample data, then calculate Qty_indicator,
then regression under different parameters
'''
for mywind in [10, 30, 100, 240, 1000, 2400]:
    vwap, vwap_indicator = VWAPIndicator4(ta, mywind)
    #print pd.concat([ta[['last','vol_diff']],vwap,vwap_indicator],axis=1)
    for forward_ticks in range(44, 45, 1):
        ta_in_pm = GiveMePM(ta, n=forward_ticks, lim=insample_index1)
        #print pd.concat([ta_in_pm,ta['last']],axis=1).dropna()
        ta_out_pm = GiveMePM(ta, n=forward_ticks, lim=outsample_index1)
        #df = pd.concat([vwap_indicator, ta_in_pm],axis=1)
        #df.dropna(inplace=True)
        res = myols(vwap_indicator, ta_in_pm)
        print ('\n\n\n------------------------------X length: %d, Y length: %d ------------------------------' 
               %(mywind, forward_ticks))
        print ('In sample time range: %s to %s \nOut sample time range: %s to %s.\n' %(ta_in_pm.index[0], 
                                                                                       ta_in_pm.index[-1],
                                                                                       ta_out_pm.index[0],
                                                                                       ta_out_pm.index[-1]))
        print res.summary()
        #print ta_in_pm.describe()
        #print ta_out_pm.describe()
        print '\n[OutSample R_SQUARE: %f]' % PredictedRsquare(res, vwap_indicator, ta_out_pm)
        print '\n[OutSample R_SQUARE: %f]' % PredictedRsquare2(res, vwap_indicator, ta_out_pm)
    
    
In [66]:
    
TEMP1 = pd.concat([ta1,vwap_indicator,vwap],axis=1)
    
In [69]:
    
vwap.ix['2015-11-23 23:13:40': '2015-11-23 23:13:50']
    
    Out[69]:
    Out[69]:
    Out[69]:
In [71]:
    
TEMP1.ix['2015-11-23 23:13:40': '2015-11-23 23:13:50', ['last', 'volume', 'vol_diff', 'vwap', 'vwap_indicator']]
    
    Out[71]:
In [50]:
    
ta1.ix['2015-11-23 23:13:40': '2015-11-23 23:13:50', ['last', 'volume', 'vol_diff']]
    
    Out[50]:
In [27]:
    
ta1.ix['2015-11-23 23:13:50': '2015-11-23 23:14:04', ['last', 'volume', 'vol_diff']]
    
    Out[27]:
In [345]:
    
%matplotlib inline
    
In [297]:
    
plt.plot(vwap.index, ta.ix[vwap.index, 'last'])
plt.plot(vwap.index, vwap)
    
    Out[297]:
In [270]:
    
len(vwap_indicator)
    
    Out[270]:
In [271]:
    
len(ta_in_pm)
    
    Out[271]:
In [272]:
    
day_len*20
    
    Out[272]:
In [310]:
    
day_len*20 - len(ta_in_pm)
    
    Out[310]:
In [242]:
    
(200+44)*20
    
    Out[242]:
In [303]:
    
temp = vwap_indicator[ta_in_pm.index]
    
In [307]:
    
temp = temp.dropna()
    
In [309]:
    
(vwap_indicator.isnull()).sum()
    
    Out[309]:
In [308]:
    
len(temp)
    
    Out[308]:
In [284]:
    
vwap, vwap_indicator = VWAPIndicator3(ta, 40)
    
    
In [ ]:
    
y = pm[xnew.index]
    
In [321]:
    
%matplotlib inline
    
In [48]:
    
# --------------------------------------------
'''plot fit'''
fig, ax = plt.subplots()
fig = sm.graphics.plot_fit(res, res.model.exog_names[1], ax=ax)
    
    
In [54]:
    
vwap_indicator.hist(bins=np.arange(-10.25, 10.25, .5))
    
    Out[54]:
    
In [35]:
    
ta_pm = GiveMePM(ta, n=44, lim=[0, 30])
    
In [53]:
    
vwap, vwap_indicator = VWAPIndicator4(ta, 240)
vwap_indicator = vwap_indicator[ta_pm.index]
    
In [57]:
    
ta_pm.hist(bins=np.arange(-50.5, 50.5, 1))
    
    Out[57]:
    
In [185]:
    
t11, t22 = '2015-11-25 23:03:06.000', '2015-11-25 23:03:12.000'
    
In [186]:
    
plt.plot(ta.ix[t11: t22, 'vol_diff'])
#plt.ylim([0,3000])
    
    Out[186]:
    
In [183]:
    
ta.ix[t11: t22, ['last', 'volume', 'vol_diff']]
    
    Out[183]:
In [170]:
    
vwap_indicator[t11: t22]
    
    Out[170]:
In [171]:
    
vwap[t11: t22]
    
    Out[171]:
In [247]:
    
ta.ix['2015-11-19 21:00:55.000': '2015-11-19 21:01:58.000', 'last'][:55]
    
    Out[247]:
In [241]:
    
ta_in_pm.ix['2015-11-19 21:00:55.000': '2015-11-19 21:00:58.000']
    
    Out[241]:
In [197]:
    
%matplotlib inline
    
In [ ]:
    
temp1
    
In [195]:
    
vwap_indicator
    
    Out[195]:
In [133]:
    
ta_out_pm.plot()
    
    Out[133]:
    
In [82]:
    
for mywindow in [10, 60, 200, 500, 1000, 2400, 5000, 10000, 24000]:
    print '\n\n\n---------------mywindow: %d ----------------------' %mywindow
    #mywindow = 2400 * 3
    vol_roll = ta.ix[:, 'volume'].rolling(window=mywindow)
    last_multiply_vol = ta.ix[:, 'last'] * ta.ix[:, 'volume']
    multi_roll = last_multiply_vol.rolling(window=mywindow)
    vwap = multi_roll.sum()*1. / vol_roll.sum()
    vwap_indicator = vwap - ta.ix[:, 'last']
#     vwap_indicator.dropna(inplace=True)
#-------------------------------------------------------------------------------
    '''
    Automatically generate in and out sample data, then calculate Qty_indicator,
    then regression under different parameters
    '''
    insample_index = list()
    for l in [[0,2], [4, 6], [8, 10], [14, 16], [18,20], [22, 24], [26,28]]:
        insample_index = insample_index + list(range(day_len*l[0], day_len*l[1]))
    outsample_index = list()
    for l in [[2, 4], [6, 8], [12, 14], [16, 18], [20,22], [24, 26], [28,30]]:
        outsample_index = outsample_index + list(range(day_len*l[0], day_len*l[1]))
    
    for forward_ticks in range(50, 52, 2):
        ta_in_pm = GiveMePM(ta, n=forward_ticks, lim=[0, 20])
        ta_out_pm = GiveMePM(ta, n=forward_ticks, lim=[25, 30])
    
        res = myols(vwap_indicator, ta_in_pm)
        print '\n\n\n--------------------- %d ----------------------' %forward_ticks
        print res.summary()
        print '\n[OutSample R_SQUARE: %f]' % (PredictedRsquare(res, vwap_indicator, ta_out_pm))[1]
    
    
In [ ]: