notebook.community

Edit and run



In [3]:

    
# Direct Python to plot all figures inline (i.e., not in a separate window)
%matplotlib inline

# Load libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from scipy import stats

# Division of two integers in Python 2.7 does not return a floating point result. The default is to round down 
# to the nearest integer. The following piece of code changes the default.
from __future__ import division



In [115]:

    
# Directory where the csv data file is located
workdir =  '../../data/raw_data/'
data_message = pd.read_csv('../../data/raw_data/AAPL_05222012_0930_1300_message.tar.gz',compression='gzip')
data_lob = pd.read_csv('../../data/raw_data/AAPL_05222012_0930_1300_LOB_2.tar.gz',compression='gzip')



In [116]:

    
#drop redundant time
col_names=data_lob.columns
delete_list=[i for i in col_names if 'UPDATE_TIME' in i]
for i in delete_list:
    data_lob=data_lob.drop(i,1)



In [117]:

    
#functions for renaming
def rename(txt):
    txt=txt[16:].split('..')[0]
    index=0
    ask_bid=''
    p_v=''
    if txt[-2].isdigit():
        index=txt[-2:]
    else:
        index=txt[-1]
    if txt[:3]=="BID":
        ask_bid='bid'
    else:
        ask_bid='ask'
    if txt[4:9]=="PRICE":
        p_v='P'
    else:
        p_v='V'
    return('_'.join([p_v,index,ask_bid]))



In [118]:

    
#rename columns
col_names=data_lob.columns
new_col_names=[]
new_col_names.append('index')
new_col_names.append('Time')
for i in col_names[2:]:
    new_col_names.append(rename(i))
len(new_col_names)
data_lob.columns=new_col_names



In [119]:

    
#feature: bid-ask spreads and mid price
for i in list(range(1, 11)):
    bid_ask_col_name='_'.join(['spreads',str(i)])
    p_i_ask='_'.join(['P',str(i),'ask'])
    p_i_bid='_'.join(['P',str(i),'bid'])
    data_lob[bid_ask_col_name]=data_lob[p_i_ask]-data_lob[p_i_bid]
    
    mid_price_col_name = '_'.join(['mid_price',str(i)])
    data_lob[mid_price_col_name]=(data_lob[p_i_ask]+data_lob[p_i_bid])/2



In [228]:

    
data_lob.head()









    Out[228]:






  
    
      
      index
      Time
      P_1_bid
      V_1_bid
      P_1_ask
      V_1_ask
      P_2_bid
      V_2_bid
      P_2_ask
      V_2_ask
      ...
      spreads_6
      mid_price_6
      spreads_7
      mid_price_7
      spreads_8
      mid_price_8
      spreads_9
      mid_price_9
      spreads_10
      mid_price_10
    
  
  
    
      0
      1
      2012/05/22 09:30:00.000
      569.02
      40
      570.0
      400
      568.80
      100
      570.01
      100
      ...
      2.4
      569.20
      2.80
      569.270
      3.64
      569.170
      3.70
      569.150
      4.08
      569.200
    
    
      1
      2
      2012/05/22 09:30:00.003
      569.02
      40
      570.0
      400
      568.80
      100
      570.01
      100
      ...
      2.4
      569.20
      2.68
      569.210
      3.32
      569.010
      3.69
      569.145
      3.84
      569.080
    
    
      2
      3
      2012/05/22 09:30:00.003
      569.02
      40
      570.0
      400
      568.80
      100
      570.01
      100
      ...
      2.4
      569.20
      2.67
      569.205
      3.20
      568.950
      3.37
      568.985
      3.83
      569.075
    
    
      3
      4
      2012/05/22 09:30:00.003
      569.03
      8
      570.0
      400
      569.02
      40
      570.01
      100
      ...
      2.3
      569.25
      2.54
      569.270
      2.68
      569.210
      3.32
      569.010
      3.69
      569.145
    
    
      4
      5
      2012/05/22 09:30:00.003
      569.03
      8
      570.0
      400
      569.02
      40
      570.01
      100
      ...
      2.3
      569.25
      2.52
      569.260
      2.67
      569.205
      3.20
      568.950
      3.37
      568.985
    
  

5 rows × 62 columns

Analyze $\Delta t$ and generate good graphs!



In [184]:

    
#compute frequency of midprice stationary
diff=data_lob['mid_price_1']
bid = data_lob['P_1_bid']
ask = data_lob['P_1_ask']

# print('t', 'up', 'stationary','down', 'up', 'stationary','down')

upper_step = 100
for t in list(range(1, upper_step, 5)):
    print(t)
    # label from mid-price
    delta_midprice = np.array(diff[t:]) - np.array(diff[:(-1*t)])
    dummy_up = delta_midprice > 0
    dummy_down = delta_midprice < 0
    dummy_stationary = delta_midprice == 0
    freq_stationary = sum(dummy_stationary) / len(dummy_stationary)
    freq_up   = sum(dummy_up) / len(dummy_up)
    freq_down = sum(dummy_down) / len(dummy_down)
    
    if t == 1:
        delta_t = np.array([t,])
        up_MP = np.array([freq_up,])
        stationary_MP = np.array([freq_stationary,])
        down_MP = np.array([freq_down,])
    else:
        delta_t = np.append(delta_t, t)
        up_MP = np.append(up_MP, freq_up)
        stationary_MP = np.append(stationary_MP, freq_stationary)
        down_MP = np.append(down_MP, freq_down)

df_MP = pd.DataFrame({'delta_t' : delta_t,
                              '1' : up_MP  ,
                              '0' : stationary_MP ,
                              '-1': down_MP})


upper_step_sc = 2000
for t in range(1, upper_step_sc, 100):
    print(t)
    # label from spread crossing
    up            = np.array(bid[t:]) - np.array(ask[:(-1*t)]) > 0
    down          = np.array(ask[t:]) - np.array(bid[:(-1*t)]) < 0
    stationary    = np.logical_not(np.logical_or(up, down))
    freq_stationary_crossing = sum(stationary) / len(stationary)
    freq_up_crossing = sum(up) / len(up)
    freq_down_crossing = sum(down) / len(down)

    if t == 1:
        delta_t = np.array([t,])
        up_SC = np.array([freq_up_crossing,])
        stationary_SC = np.array([freq_stationary_crossing,])
        down_SC = np.array([freq_down_crossing,])
    else:
        delta_t = np.append(delta_t, t)
        up_SC = np.append(up_SC, freq_up_crossing)
        stationary_SC = np.append(stationary_SC, freq_stationary_crossing)
        down_SC = np.append(down_SC, freq_down_crossing)

df_SC = pd.DataFrame({'delta_t' : delta_t,
                              '1' : up_SC  ,
                              '0' : stationary_SC ,
                              '-1': down_SC})



In [188]:

    
print(df_MP)
print(df_SC)









    



          -1         0         1  delta_t
0   0.085643  0.829787  0.084570        1
1   0.259719  0.497485  0.242795        6
2   0.344446  0.335358  0.320196       11
3   0.394626  0.241642  0.363732       16
4   0.429767  0.176322  0.393910       21
5   0.451250  0.135645  0.413105       26
6   0.466886  0.105792  0.427321       31
7   0.477896  0.084951  0.437152       36
8   0.486294  0.068676  0.445029       41
9   0.490165  0.058564  0.451271       46
10  0.494327  0.049783  0.455890       51
11  0.496559  0.044123  0.459318       56
12  0.499387  0.037765  0.462849       61
13  0.500846  0.034154  0.464999       66
14  0.502598  0.029919  0.467484       71
15  0.503516  0.027625  0.468859       76
16  0.504360  0.025827  0.469813       81
17  0.505426  0.024421  0.470154       86
18  0.506317  0.022274  0.471409       91
19  0.505949  0.021890  0.472161       96
          -1         0         1  delta_t
0   0.000000  1.000000  0.000000        1
1   0.056117  0.886828  0.057055      101
2   0.135287  0.734372  0.130342      201
3   0.190970  0.627748  0.181282      301
4   0.226965  0.565064  0.207971      401
5   0.256427  0.515567  0.228005      501
6   0.278858  0.477746  0.243396      601
7   0.293022  0.448243  0.258736      701
8   0.309375  0.418267  0.272358      801
9   0.324593  0.393590  0.281817      901
10  0.339519  0.370625  0.289856     1001
11  0.349182  0.349417  0.301401     1101
12  0.357101  0.337347  0.305552     1201
13  0.365218  0.328326  0.306456     1301
14  0.371682  0.318415  0.309902     1401
15  0.375732  0.310555  0.313713     1501
16  0.381198  0.303665  0.315137     1601
17  0.388945  0.290526  0.320529     1701
18  0.396105  0.277660  0.326235     1801
19  0.404541  0.268780  0.326678     1901



In [231]:

    
fig_MP = plt.figure()
plt.plot(df_MP['delta_t'], df_MP['1'], 'r--', 
         df_MP['delta_t'], df_MP['0'], 'b--',
         df_MP['delta_t'], df_MP['-1'], 'g--')
plt.legend(['Up', 'Stationary', 'Down'])
plt.xlabel('delta t')
plt.ylabel('Frequency')
plt.title('Proportion of labels: Midprice')

plt.show()

fig_SC = plt.figure()
plt.plot(df_SC['delta_t'], df_SC['1'], 'r--', 
         df_SC['delta_t'], df_SC['0'], 'b--',
         df_SC['delta_t'], df_SC['-1'], 'g--')
plt.legend(['Up', 'Stationary', 'Down'])
plt.xlabel('delta t')
plt.ylabel('Frequency')
plt.title('Proportion of labels: Bid-ask spread crossing')
plt.show()

fig_MP.savefig('../report/delta_t_MP.png')
fig_SC.savefig('../report/delta_t_SC.png')

A test for profit calculator



In [21]:

    
##### Old version #####

def profit_calculator(data, delta_t = 30, simple = True):
    """Calculate the profit of trading strategy based on precisely the prediction of the model
        Parameters
        ----------
        data    : a data frame with "predicted" "P_1_bid" "P_1_ask"
        delta_t : time gap between 
        simple  : a dummy, True, means we make transection decisions only every delta_t period. False, means we track the current 
                  hand every period, only if we don't have anything at hand, we make transactions

        Returns
        -------
        profit        : a numeric, the net profit at the end
        profit_series : a np.array, time series tracking net profit at each point of time
        
        """    
    if simple == True:
        data_effective = data.loc[np.arange(len(data)) % delta_t == 0]
        bid = data_effective['P_1_bid']
        ask = data_effective['P_1_ask']
        trade_decision = data_effective['predicted'][:-1]
        buy_profit = np.array(bid[1:]) - np.array(ask[:-1])
        profit = sum(trade_decision * buy_profit)
        return profit
    else:
        
#         print(data_effective)
#         print(data['P_1_bid'])
#         print(data['P_1_ask'])
        
#         print(data['P_1_bid'][delta_t:])
#         print(data['P_1_ask'][:(-1 * delta_t)])
        
#         print(np.array(data['P_1_bid'][delta_t:]))
#         print(np.array(data['P_1_ask'][:(-1 * delta_t)]))
        
        buy_profit = np.array(data['P_1_bid'][delta_t:]) - np.array(data['P_1_ask'][:(-1 * delta_t)])
        trade_decision_draft = data['predicted'][:(-1 * delta_t)]
        T = len(buy_profit)
        
        #print(T)
        #print(buy_profit)
        #print(trade_decision_draft)
        
        current_state = [0] * T
        trade_decision = [0] * T
        #print(current_state[T])
        for i in range(T):
            if current_state[i] == 1:
                trade_decision[i] = 0
            else:
                trade_decision[i] = trade_decision_draft[i]
                if i < T-1:
                    #print(i)
                    current_state[i+1] = int(sum(trade_decision[max(0, i - delta_t):i]) != 0)
        profit = sum(trade_decision * buy_profit)
        return profit



In [22]:

    
d = {'P_1_bid' : pd.Series([1., 2., 3., 5., 1., 2., 3., 5.]),
     'P_1_ask' : pd.Series([1., 2., 3., 4., 1., 2., 3., 4.]),
     'predicted' : pd.Series([0, 1, -1, 1, 1, 1, -1, 1])} 
df = pd.DataFrame(d)
print(df)









    



   P_1_ask  P_1_bid  predicted
0      1.0      1.0          0
1      2.0      2.0          1
2      3.0      3.0         -1
3      4.0      5.0          1
4      1.0      1.0          1
5      2.0      2.0          1
6      3.0      3.0         -1
7      4.0      5.0          1



In [27]:

    
print(profit_calculator(df, delta_t = 1, simple = True))
print(profit_calculator(df, delta_t = 1, simple = False))









    



[False  True False  True  True  True False] [False False  True False False False  True] [ 1.  1.  2. -3.  1.  1.  2.] [-1. -1. -1.  4. -1. -1. -1.]
[ 0.  1.  0. -3.  1.  1.  0.]
-2.0
-2.0



In [29]:

    
df.iloc[:-1,:]



In [26]:

    
##### New version #####

def profit_calculator(data, delta_t = 30, simple = False):
    """Calculate the profit of trading strategy based on precisely the prediction of the model
        Parameters
        ----------
        data    : a data frame with "predicted" "P_1_bid" "P_1_ask"
        delta_t : time gap between 
        simple  : a dummy, True, means we make transection decisions only every delta_t period. False, means we track the current 
                  hand every period, only if we don't have anything at hand, we make new transactions

        Returns
        -------
        profit        : a numeric, the net profit at the end
        
        """    
    if simple == True:
        data_effective = data.loc[np.arange(len(data)) % delta_t == 0]
        bid            = data_effective['P_1_bid']
        ask            = data_effective['P_1_ask']
        trade_decision = data_effective['predicted'][:-1]
        buy_profit     = np.array(bid[1:]) - np.array(ask[:-1])
        sell_profit    = np.array(bid[:-1]) - np.array(ask[1:])
        profit         = sum((np.array(trade_decision) > 0) * buy_profit + (np.array(trade_decision) < 0) * sell_profit)
        print(np.array(trade_decision) > 0, np.array(trade_decision) < 0, buy_profit, sell_profit)
        print((np.array(trade_decision) > 0) * buy_profit)
        return profit
    else:
        buy_profit           = np.array(data['P_1_bid'][delta_t:]) - np.array(data['P_1_ask'][:(-1 * delta_t)])
        sell_profit           = np.array(data['P_1_bid'][:(-1 * delta_t)]) - np.array(data['P_1_ask'][delta_t:])
        trade_decision_draft = data['predicted'][:(-1 * delta_t)]
        T                    = len(buy_profit)
        current_state        = [0] * T
        trade_decision       = [0] * T
        profit               = 0
        for i in range(T):
            if current_state[i] == 1:
                trade_decision[i] = 0
            else:
                trade_decision[i] = trade_decision_draft[i]
                if trade_decision[i] == 1:
                    profit += buy_profit[i]
                elif trade_decision[i] == -1:
                    profit += sell_profit[i]
                    
            if i < T-1:
                current_state[i+1] = int(sum(trade_decision[max(0, i - delta_t):i]) != 0)
        profit = sum(trade_decision * buy_profit)
        return profit

	index	Time	P_1_bid	V_1_bid	P_1_ask	V_1_ask	P_2_bid	V_2_bid	P_2_ask	V_2_ask	...	spreads_6	mid_price_6	spreads_7	mid_price_7	spreads_8	mid_price_8	spreads_9	mid_price_9	spreads_10	mid_price_10
0	1	2012/05/22 09:30:00.000	569.02	40	570.0	400	568.80	100	570.01	100	...	2.4	569.20	2.80	569.270	3.64	569.170	3.70	569.150	4.08	569.200
1	2	2012/05/22 09:30:00.003	569.02	40	570.0	400	568.80	100	570.01	100	...	2.4	569.20	2.68	569.210	3.32	569.010	3.69	569.145	3.84	569.080
2	3	2012/05/22 09:30:00.003	569.02	40	570.0	400	568.80	100	570.01	100	...	2.4	569.20	2.67	569.205	3.20	568.950	3.37	568.985	3.83	569.075
3	4	2012/05/22 09:30:00.003	569.03	8	570.0	400	569.02	40	570.01	100	...	2.3	569.25	2.54	569.270	2.68	569.210	3.32	569.010	3.69	569.145
4	5	2012/05/22 09:30:00.003	569.03	8	570.0	400	569.02	40	570.01	100	...	2.3	569.25	2.52	569.260	2.67	569.205	3.20	568.950	3.37	568.985