In [ ]:
# Trading Physics To Orders Cancels Trades 2.0 -converter
##### Usage examples:

In [ ]:
# Example 1: Download orderbook data from trading physics
obdt = OrderBookDataTool()
dl_path = 'D:/Data' # target folder
ticker = 'SPY' # ticker
min_date = '2014-05-26' # first date to get
max_date = '2015-01-01' # first date not to get
c_num = '111-222-333' # trading physics account customer number
pw_hash = 'asd1GHlaR9IU13094u8dLi' # trading physics account password hash
obdt.getdata(ticker,min_date,max_date,c_num,pw_hash,dl_path)

In [ ]:
# Example 2: Convert a folder of trading physics input data into OCT2-format.
obdt = OrderBookDataTool()
input_folder = 'F:/Data/20130111-SPY.csv'
output_folder = 'F:/Output'
mode = 'w'
obdt.convert(input_folder,output_folder,mode) # convert data to oct2 format

In [ ]:
# Example 3: extend initially converted OCT2 data with order book state dependent features.
ob = OrderBook('SPY','F:/Output/SPY_OCT2.h5')
ob.build_features(date='2013_01_11',save_path='F:/Output/SPY_OCT2_PLUS.h5',debug_mode=True)

In [ ]:
# Example 4: plot ob data using step
ob = OrderBook('SPY','C:/temp/Data/output/SPY_OCT2.h5')
ylim = [1468000, 1473100]
xlim = [-30000, 32000]
ob.plot('2013_01_11',67000,ylim,xlim)

In [ ]:
# Example 5: plot ob data using ms time (12:00:00.000)
ob = OrderBook('SPY','C:/temp/Data/output/SPY_OCT2.h5')
ylim = [1468000, 1473100]
xlim = [-30000, 32000]
date= '2013_01_11'
ob.plot(date,ob.time2step(date,43200000)[0],ylim,xlim)

In [ ]:
# Example 6: plot ob data using time string
ob = OrderBook('SPY','C:/temp/Data/output/SPY_OCT2.h5')
ylim = [1468000, 1473100]
xlim = [-30000, 32000]
date= '2013_01_11'
strtime = '12:00:00.000'
step = ob.str2step(date,strtime)[0]
ob.plot(date,step,ylim,xlim)

In [ ]:
import h5py as h5          # 2.6.0    + hdf5 1.8.15.1
import numpy as np         # 1.11.1
import pandas as pd        # 0.19.2
import bisect as bs
import random as rn
import warnings as wrn
import requests as rq
from matplotlib import pyplot as plt
from matplotlib.figure import Figure
import matplotlib.patches as patches
from datetime import datetime as dt
import math
import time
import os
import re
from IPython import display

# figure/plotting specifications
%matplotlib inline
fig_size = [20,16]
plt.rcParams["figure.figsize"] = fig_size

wrn.filterwarnings('ignore')

class OrderBookDataTool(object):
    """Downloads Trading Physics order flow data and converts it into the
    OCT2 (Orders/Cancels/Trades 2.0) HDF5 format."""

    def __init__(self):
        pass

    def convert(self,input_path,output_path,mode):
        """Convert Trading Physics CSV input data into an OCT2-format HDF5 file.

        params:
        input_path - input file, or folder containing (only) input files
        output_path - output folder; the file is named <TICKER>_OCT2.h5
        mode - 'w' (over)write or 'a' append; appending requires an existing output file

        returns 1 on success, or a negative int when an argument is invalid.
        """
        # time the function
        start_time = time.time()

        # suppress warnings generated by having hdf5 group named with yyyy_mm_dd since it is not really a problem
        wrn.filterwarnings('ignore', '.*NaturalNameWarning.*') 
        
        # handle params
        if os.path.isfile(input_path): # if input_path points to a single file
            file_path ,file_name = os.path.split(input_path)
            files = [file_name]
        elif os.path.isdir(input_path): # else if input_path points to a directory
            files = [f for f in os.listdir(input_path) if os.path.isfile(os.path.join(input_path, f))] # get all files in the dir
            file_path = input_path
        else: # invalid input_path
            print('Invalid input_path. Stopping.')
            return -2
        
        ticker = files[0].split('-',2)[1].split('.',2)[0] # get the ticker code from the first file
        file_re = re.compile(r'[0-9]{8}-'+ticker+r'\.csv') # construct file name re (dot escaped so it only matches a literal '.')
        convert_files = (list(filter(file_re.match, files))) # find all that match the re
        convert_files = [os.path.join(file_path, f) for f in convert_files] # extend to full paths
        if (len(convert_files) != len(files)): # if all files didn't match
            print('Invalid input files. All files should match: "[0-9]{8}-'+ticker+'.csv". Stopping.')
            return -2
        
        file_out = ticker + '_OCT2.h5' # output file name  
        if os.path.isdir(output_path): # if output_path points to an existing directory
            path_out_full = os.path.join(output_path,file_out)# everything ok
        else: # invalid output_path
            print('Invalid output_path. Stopping.')
            return -3
        
        if ((mode=='w') or ((mode=='a') and os.path.isfile(path_out_full))):
            pass # everything ok
        else: # invalid mode
            print('Invalid mode. Needs to be "w"(rite) or "a"(ppend). To append the file must exist in output_path. Stopping.')
            return -4        
        
        print('Started converting '+str(len(convert_files))+' file(s).')
        
        # Loop input files 
        for file in convert_files:
            
            loop_time = time.time() # time the loop
            
            ## get the date from the input file name
            path ,file_name = os.path.split(file)
            yyyy_str = file_name[0:4] # yyyy date str
            mm_str = file_name[4:6] # mm date str
            dd_str = file_name[6:8] # dd date str          
            date = yyyy_str + mm_str + dd_str # yyyymmdd date str
            grp_date = yyyy_str + '_' + mm_str + '_' + dd_str # yyyy_mm_dd date str
            
            ## Open/Read input file
            allcols = {'Time','Ticker','Order','T','Shares','Price', 'MPID', 'X'} # Bring in all data for the testing
            cols = {'Time','Order','T','Shares','Price', 'MPID'} # Bring in just the columns that we want
            cols_dtype = {'Time': 'u4', 'Ticker':'a3', 'Order':'u4', 'T': 'object', 'Shares': 'i4', 'Price': 'u4', 'MPID': 'object', 'X': 'object'}
            df = pd.read_csv(file, dtype = cols_dtype, engine = 'c',usecols = cols) # read input file into pandas dataframe
            df.reset_index(level=0, inplace=True) # create a column from the index

            ## Reorganize the input data into the OCT2 format
            ### Create the orders table first
            or_df = df.query('T == "B" or T == "S" or T == "E" or T == "C"').copy() # filter a dataframe with only new limit order creations
            rn_cols = {'index':'STEP','Time':'T_CREATED','Order':'ORDER','T':'SIDE','Shares':'DELTA_Q','Price':'PRICE','MPID':'PARTICIPANT'};
            or_df.rename(columns=rn_cols, inplace=True) # rename the columns to match the output format
            or_df.loc[:,'STEP'] = or_df['STEP'].astype('u4') # convert the key from int64 to uint32
            or_df.loc[or_df['PRICE'] == 0,'DELTA_Q'] = -or_df.loc[or_df['PRICE']==0,'DELTA_Q'] # multiply by -1 the quantities of partials 
            or_df['UPDATE'] = or_df['SIDE'].copy() # get a copy of the 'SIDE'-column and name it as 'UPDATE'
            or_df['STEP_PRIORITY'] = or_df['STEP'].copy() # get a copy of the 'STEP'-column and name it as 'STEP_PRIORITY'
            ind_update = (or_df['PRICE'] == 0) # find boolean array indicating rows with order updates
            or_df.loc[(ind_update == 0),'UPDATE'] = 'L'; # the non-updates are new (L)imit orders
            or_df.loc[ind_update,'STEP_PRIORITY'] = np.nan # the update rows priority is invalid so fill it with nan's
            or_df.loc[ind_update,'SIDE'] = np.nan # the update rows data side & price is empty/invalid so fill them with nan's
            or_df.loc[ind_update,'PRICE'] = np.nan # the update rows data side & price is empty/invalid so fill them with nan's

            ### Filter and use the data on full executions and cancellations to get the destruction times
            fd_df = df[['index','Order','Time','T']].query('T == "F" or T == "D"').copy().set_index('Order') # filter to get full executions or cancellations
            orj_df = or_df.join(fd_df,on = ['ORDER'],how = 'left') # join the destruction times back to the order submission dataframe
            orj_df.rename(columns={'Time':'T_DESTROYED','index':'S_DESTROYED','T':'DESTROYED_BY'},inplace=True) # rename the newly acquired destruction times accordingly
            orj_df.set_index(['ORDER','STEP'],drop=False,inplace=True) # change to a multi-index to group the orders' rows together
            orj_df.sort_index(ascending=True, inplace=True) # sort by the new index
            ind_update_next = (orj_df.shift(-1)['UPDATE'] != 'L') # find boolean array indicating rows prior to order updates
            ind_update_next.iloc[-1] = False # there cannot be an update after the last entry
            orj_df.loc[ind_update_next,'S_DESTROYED'] = orj_df.shift(-1).loc[ind_update_next,'STEP'].astype('u4') # assign correct destruction STEPs
            orj_df.loc[ind_update_next,'T_DESTROYED'] = orj_df.shift(-1).loc[ind_update_next,'T_CREATED'].astype('u4') # assign correct destruction times
            orj_df[['PRICE','SIDE','STEP_PRIORITY']] = orj_df[['PRICE','SIDE','STEP_PRIORITY']].ffill() # forward fill in the price, side and step_priority for the orders updates (fillna(method=...) is deprecated)
            
            ### Create the traded column:
            ind_cancel_next = (orj_df.shift(-1)['UPDATE'] == 'C') # find boolean array indicating rows prior to partial cancellations
            ind_trade_next = (ind_update_next & ~(ind_cancel_next)) # find boolean array indicating rows prior to partial trades   
            ind_final_before_full_trade = (~(ind_update_next) & (orj_df['DESTROYED_BY'] == 'F')) # find barray to indicate that there is no more updates and order was eventually traded (F messages)
            orj_df['TRADED'] = False # create boolean field to indicate if the order is eventually traded or is eventually cancelled
            orj_df.loc[ind_trade_next,'TRADED'] = True # traded bool is true when there is a partial trade next
            orj_df.loc[ind_final_before_full_trade,'TRADED'] = True # ... or when there is no more updates and order was eventually traded (F messages)

            orj_df.drop(['ORDER','STEP','DESTROYED_BY'],axis=1,inplace=True) # drop the extra columns (they are in the index as well)               
            
            ### Cumsum to get the remaining quantities. This will take a couple of minutes.
            orj_df['QUANTITY'] = orj_df.groupby(level=[0])['DELTA_Q'].transform(pd.Series.cumsum) # cumsum over ORDERs to get quantities            
            
            ### Cumsum to get the quantities to be traded.
            orj_df['DELTA_Q_TO_TRADE'] = 0 # create new temporary column for the following trades delta q's 
            orj_df.loc[ind_trade_next,'DELTA_Q_TO_TRADE'] = orj_df.shift(-1).loc[ind_trade_next,'DELTA_Q'].astype('i4') # get the partial trade quantities
            orj_df.loc[ind_final_before_full_trade,'DELTA_Q_TO_TRADE'] = -orj_df.loc[ind_final_before_full_trade,'QUANTITY'] # get the full trade quantities
            orj_df.sort_index(level=[0,1], ascending=[True,False], inplace=True, sort_remaining=False) # sort again to get the descending step order
            orj_df['Q_TO_TRADE'] = -orj_df.groupby(level=[0])['DELTA_Q_TO_TRADE'].transform(pd.Series.cumsum) # cumsum to get the remaining  quantities to be traded
            orj_df.sort_index(ascending=True,inplace=True) # return to the original order            
            
            ### Create the cancellations table
            cd_df = df[['index','Order', 'Time','T','Shares','MPID']].query('T == "C" or T == "D"').copy() # filter all of the cancellations
            rn_cols = {'index':'STEP','Time':'TIME','Order':'ORDER','T':'UPDATE','Shares':'DELTA_Q','MPID':'PARTICIPANT'};
            cd_df.rename(columns=rn_cols, inplace=True) # rename the columns to match the output format
            cd_df.set_index(['ORDER','STEP'],inplace =True) # change to a multi-index to group the orders' rows together
            cd_df.sort_index(ascending=True, inplace=True) # sort by the new index so that ORDERs are grouped together

            ### Join the last updates' remaining quantities back to the cancellation data to determine the delta_q
            cdj_df = cd_df.join(orj_df.groupby(level=[0],as_index=True).last(),how = 'left',rsuffix = '_OR')
            ixd = (cdj_df['UPDATE'] == 'D') # filter for full cancellations
            ixc = (cdj_df['UPDATE'] == 'C') # filter for partial cancellations
            cdj_df.loc[ixd,'DELTA_Q'] = -cdj_df.loc[ixd,'QUANTITY'] # update full cancellations to match the remaining quantity times -1
            cdj_df.loc[ixc,'DELTA_Q'] = -cdj_df.loc[ixc,'DELTA_Q'] # update partial cancellations to be negative in delta_q
            cdj_df.drop(['T_CREATED','DELTA_Q_OR','UPDATE_OR','T_DESTROYED','QUANTITY','S_DESTROYED'], axis=1,inplace=True) # drop the extra columns

            ### Create the trades table
            ef_df = df[['index','Order', 'Time','T','Price','Shares','MPID']].query('T == "E" or T == "F" or T == "T"').copy() # filter all of the trades
            rn_cols = {'index':'STEP','Time':'TIME','Price':'TMP_PRICE','Order':'ORDER','T':'UPDATE','Shares':'DELTA_Q','MPID':'PARTICIPANT'};
            ef_df.rename(columns=rn_cols, inplace=True) # rename the columns to match the output format
            ef_df.set_index(['ORDER','STEP'],inplace =True) # change to a multi-index to group the orders' rows together
            ef_df.sort_index(ascending=True, inplace=True) # sort by the new index so that ORDERs are grouped together
            
            ### Join the last updates' remaining quantities back to the trade data to determine the delta_q
            efj_df = ef_df.join(orj_df.groupby(level=[0],as_index=True).last(),how = 'left',rsuffix = '_OR')
            ixf = (efj_df['UPDATE'] == 'F') # filter for full executions (trades)
            ixe = (efj_df['UPDATE'] == 'E') # filter for partial executions
            efj_df.loc[ixf,'DELTA_Q'] = -efj_df.loc[ixf,'QUANTITY'] # update full executions to match the remaining quantity times -1
            efj_df.loc[~(ixf),'DELTA_Q'] = -efj_df.loc[~(ixf),'DELTA_Q'] # update partial and non-display trades to be negative in delta_q
            efj_df.loc[~(ixe|ixf),'PRICE'] = efj_df.loc[~(ixe|ixf),'TMP_PRICE'] # set correct prices for non-display trades
            efj_df.drop(['T_CREATED','DELTA_Q_OR','UPDATE_OR','TMP_PRICE','T_DESTROYED','QUANTITY','S_DESTROYED'], 
                        axis=1,inplace=True) # drop extra columns
            efj_df.loc[(efj_df['UPDATE'] == 'T'),['STEP_PRIORITY']] = 0 # fill nans with 0s ...
            efj_df['STEP_PRIORITY'] = efj_df['STEP_PRIORITY'].astype('u4') # ... so it's possible to convert them back to unsigned ints
            
            ### Get the cross events
            x_df = df.query('T == "X"').copy() # filter a dataframe with only cross events
            x_df.drop(['T','MPID'], axis=1,inplace=True)
            rn_cols = {'index':'STEP','Time':'TIME','Order':'ORDER','Shares':'QUANTITY','Price':'PRICE'};
            x_df.rename(columns=rn_cols, inplace=True) # rename the columns to match the output format
            x_df['QUANTITY'] = x_df['QUANTITY'].astype("u4") # dtype to uint32            
            
            ### Convert datatypes and prepare the data to be written into the hdf5 file
            orj_df.reset_index(inplace=True) # Switch back to using row number as the index
            orj_df.index.names = ['INDEX'] # Rename with caps so it fits in with the other column names
            orj_df['ORDER'] = orj_df['ORDER'].astype("u4") # dtype to uint32
            orj_df['T_DESTROYED'] = orj_df['T_DESTROYED'].astype("u4") # dtype to uint32
            orj_df['S_DESTROYED'] = orj_df['S_DESTROYED'].astype("u4") # dtype to uint32
            orj_df['STEP_PRIORITY'] = orj_df['STEP_PRIORITY'].astype("u4") # dtype to uint32
            orj_df['QUANTITY'] = orj_df['QUANTITY'].astype("u4") # dtype to uint32
            orj_df['PRICE'] = orj_df['PRICE'].astype("u4") # dtype to uint32
            orj_df['DELTA_Q'] = orj_df['DELTA_Q'].astype("i4") # dtype to int32
            orj_df['SIDE'] = orj_df['SIDE'].astype('category') # dtype to category
            orj_df['UPDATE'] = orj_df['UPDATE'].astype('category') # dtype to category
            orj_df['PARTICIPANT'] = orj_df['PARTICIPANT'].astype('category') # dtype to category
            rn_cols = {'STEP':'STEP_CREATED','S_DESTROYED':'STEP_DESTROYED','T_DESTROYED':'TIME_DESTROYED','T_CREATED':'TIME_CREATED','DELTA_Q':'DELTA_QUANTITY','Q_TO_TRADE':'QUANTITY_TO_BE_TRADED'}
            orj_df.rename(columns=rn_cols, inplace=True) # rename 

            # rename_categories(inplace=True) is deprecated -> assign the result back
            orj_df['SIDE'] = orj_df['SIDE'].cat.rename_categories(['BID','ASK']) # rename category labels to be more descriptive
            orj_df['UPDATE'] = orj_df['UPDATE'].cat.rename_categories(['CANCEL','TRADE','ORDER']) # rename category labels to be more descriptive

            cdj_df.reset_index(inplace=True) # Switch back to using row number as the index
            cdj_df.index.names = ['INDEX'] # Rename with caps so it fits in with the other column names
            cdj_df['ORDER'] = cdj_df['ORDER'].astype("u4") # dtype to uint32
            cdj_df['STEP'] = cdj_df['STEP'].astype("u4") # dtype to uint32
            cdj_df['TIME'] = cdj_df['TIME'].astype("u4") # dtype to uint32
            cdj_df['PRICE'] = cdj_df['PRICE'].astype("u4") # dtype to uint32
            cdj_df['DELTA_Q'] = cdj_df['DELTA_Q'].astype("i4") # dtype to int32
            cdj_df['SIDE'] = cdj_df['SIDE'].astype('category') # dtype to category
            cdj_df['UPDATE'] = cdj_df['UPDATE'].astype('category') # dtype to category
            cdj_df['PARTICIPANT'] = cdj_df['PARTICIPANT'].astype('category') # dtype to category
            rn_cols = {'DELTA_Q':'DELTA_QUANTITY'}
            cdj_df.rename(columns=rn_cols, inplace=True) # rename 
            
            cdj_df['SIDE'] = cdj_df['SIDE'].cat.rename_categories(['BID','ASK']) # rename category labels to be more descriptive
            cdj_df['UPDATE'] = cdj_df['UPDATE'].cat.rename_categories(['PART','FULL']) # rename category labels to be more descriptive

            efj_df.reset_index(inplace=True) # Switch back to using row number as the index
            efj_df.index.names = ['INDEX'] # Rename with caps so it fits in with the other column names
            efj_df['ORDER'] = efj_df['ORDER'].astype("u4") # dtype to uint32
            efj_df['STEP'] = efj_df['STEP'].astype("u4") # dtype to uint32
            efj_df['TIME'] = efj_df['TIME'].astype("u4") # dtype to uint32
            efj_df['PRICE'] = efj_df['PRICE'].astype("u4") # dtype to uint32
            efj_df['DELTA_Q'] = efj_df['DELTA_Q'].astype("i4") # dtype to int32
            efj_df['SIDE'] = efj_df['SIDE'].astype('category') # dtype to category
            efj_df['UPDATE'] = efj_df['UPDATE'].astype('category') # dtype to category
            efj_df['PARTICIPANT'] = efj_df['PARTICIPANT'].astype('category') # dtype to category
            rn_cols = {'DELTA_Q':'DELTA_QUANTITY'}
            efj_df.rename(columns=rn_cols, inplace=True) # rename 

            efj_df['SIDE'] = efj_df['SIDE'].cat.rename_categories(['BID','ASK']) # rename category labels to be more descriptive
            efj_df['UPDATE'] = efj_df['UPDATE'].cat.rename_categories(['PART','FULL','HIDDEN']) # rename category labels to be more descriptive

            ## Write the tables into the HDF5 file
            cols = ['ORDER','STEP_CREATED','STEP_DESTROYED','STEP_PRIORITY','TIME_CREATED','TIME_DESTROYED','SIDE',
                    'UPDATE','PRICE','QUANTITY','DELTA_QUANTITY','TRADED','QUANTITY_TO_BE_TRADED','PARTICIPANT']
            grp_orders = ('/' + grp_date + '/ORDERS')
            orj_df[cols].to_hdf(path_out_full, grp_orders, mode=mode,format='table',data_columns=cols,complevel=0,complib='blosc') # orders
            mode = 'a' # even if mode was write it still needs to be 'a'ppend for the rest of the datasets
            cols = ['ORDER','STEP','STEP_PRIORITY','TIME','SIDE','UPDATE','PRICE','DELTA_QUANTITY','PARTICIPANT']
            grp_cancels = ('/' + grp_date + '/CANCELS')
            cdj_df[cols].to_hdf(path_out_full, grp_cancels, mode=mode,format='table',data_columns=cols,complevel=0,complib='blosc') # cancels
            cols = ['ORDER','STEP','STEP_PRIORITY','TIME','SIDE','UPDATE','PRICE','DELTA_QUANTITY','PARTICIPANT']
            grp_trades = ('/' + grp_date + '/TRADES')
            efj_df[cols].to_hdf(path_out_full, grp_trades, mode=mode,format='table',data_columns=cols,complevel=0,complib='blosc') # trades
            cols = ['ORDER','STEP','TIME','PRICE','QUANTITY']
            grp_crosses = ('/' + grp_date + '/CROSS_EVENTS')
            x_df[cols].to_hdf(path_out_full, grp_crosses, mode=mode,format='table',data_columns=cols,complevel=0,complib='blosc') # cross events
            
            print('Conversion of '+ file +' complete. It took '+str(time.time()-loop_time)+' seconds.')
            
        print('Conversion complete. It took '+str(time.time()-start_time)+' seconds.')
        return 1

    def getdata(self,ticker,min_date,max_date,c_num,pw_hash,dl_path):
        """Download order flow data files from the Trading Physics API.

        params:
        ticker - ticker code, e.g. 'SPY'
        min_date - first date to get, 'yyyy-mm-dd'
        max_date - first date NOT to get (exclusive upper bound), 'yyyy-mm-dd'
        c_num - trading physics account customer number
        pw_hash - trading physics account password hash
        dl_path - existing folder to download the files into

        returns 1 on success, a negative int on invalid arguments or request failure.
        """
        start_time = time.time() # time the function       

        if os.path.isdir(dl_path): # if dl_path is directory
            pass # everything ok
        else: # invalid dl_path
            print('Invalid dl_path. Stopping.')
            return -2        
        
        data_type = 'orderflow' # type of data to get - it should always be 'orderflow' or the converter will not work
        data_format = 'CSV'     # format of data to get - it should always be 'CSV' or the converter will not work
        data_comp = 'stream'    # compression of data to use - it should always be 'stream' or the converter will not work
        api_url = 'http://api.tradingphysics.com' # trading physics data api url

        # Get the available dates and filter the ones we want
        r = rq.get(api_url+'/getdates',params={'type':data_type})
        datelist = r.text.splitlines(); # split by newline
        all_dts = np.array([dt.strptime(datestr, '%Y-%m-%d') for datestr in datelist]) # convert strings to datetime objects
        min_dt = dt.strptime(min_date, '%Y-%m-%d') # datetime lower limit (inclusive)
        max_dt = dt.strptime(max_date, '%Y-%m-%d') # datetime upper limit (exclusive)
        get_dts = all_dts[(all_dts < max_dt)&(all_dts >= min_dt)] # find the span of dates to get
        get_dts = get_dts[::-1] # flip 
        
        # download data for date each in get_dts
        for date in get_dts:

            strdate = dt.strftime(date,'%Y%m%d') # make a YYYYmmdd -string from date
            filename = strdate+'-'+ticker+'.csv' # create name for the file that will be downloaded

            # get the ticket
            ticket_params = (('date',strdate),('stock',ticker),('format',data_format),('compression',data_comp))
            ticket_url = api_url+'/getticket?C='+c_num+'&P='+pw_hash+'?getdata?type=orderflow' # api doesn't follow normal syntax so this gets messy
            r = rq.get(ticket_url,params=ticket_params) # request to get the ticket 
            if (r.status_code!=200): # if there is a problem with the request
                print('Could not get a dl ticket using url: '+r.url)
                print('Status code: '+str(r.status_code))
                print('Reason: '+r.reason)
                print('Stopping because otherwise download credits might be wasted.')
                return -1
            ticket = r.text # get the ticket from the response
            
            data_params = (('type',data_type),('date',strdate),('stock',ticker),('format',data_format),
                           ('compression',data_comp),('t',ticket))
            data_url = api_url+'/getdata' # use the ticket to get the data
            r = rq.get(data_url,params=data_params,stream=True) # request to get the data
            if (r.status_code==200): # if there is no problem with the request
                # context manager guarantees the file is closed even if a chunk write fails
                with open(os.path.join(dl_path,filename), 'wb') as file:
                    for chunk in r.iter_content(chunk_size=512 * 1024): # avoid running out of memory with large files -> write in chunks
                        if chunk: # filter out keep-alive new chunks
                            file.write(chunk)
            else:
                print('Could not download file using url: '+r.url)
                print('Status code: '+str(r.status_code))
                print('Reason: '+r.reason)
                print('Stopping because otherwise download credits might be wasted.')
                return -1
        
        print('Completed loading of '+str(get_dts.size)+' files.')
        print('Time taken: ' +str((time.time()-start_time))+' seconds.')
        return 1

## Class to build and hold order book states from oct2 data

class OrderBookData(object):
    # orders, pd.DataFrame    - orders table as a pandas dataframe
    # cancels, pd.DataFrame    - cancels table as a pandas dataframe
    # trades, pd.DataFrame    - trades table as a pandas dataframe
    # cross_trades, pd.DataFrame    - cross_trades table as a pandas dataframe
    # tick_size              - int price tick size *10000
    
    def __init__(self,date,orders,cancels,trades,cross_trades,tick_size):    

        self.date = date # date of the data
        self.tick_size = tick_size # size of price tick
        
        # self.fig =  plt.figure()
        # self.ax = self.fig.add_subplot(1, 1, 1, facecolor=[0.15,0.15,0.15])  

        orders.set_index(['STEP_CREATED','STEP_DESTROYED'],drop=False,inplace=True) # index by timestamps
        orders.sort_index(inplace=True) # sort by index so they are in chronological order by creation time (S_CREATED)
        
        self.bid_orders = orders[(orders['SIDE'] == 'BID')].copy() # filter to get only bid side orders
        self.ask_orders = orders[(orders['SIDE'] == 'ASK')].copy() # filter to get only bid side orders 
        
        self.bsc = self.bid_orders.ix[:,'STEP_CREATED'] # get list of bid state creation times
        self.bsd = self.bid_orders.ix[:,'STEP_DESTROYED'] # get list of bid state destruction times
        self.ssc = self.ask_orders.ix[:,'STEP_CREATED'] # get list of bid state creation times
        self.ssd = self.ask_orders.ix[:,'STEP_DESTROYED'] # get list of bid state destruction times        

        self.orders = orders
        self.cancels = cancels.set_index(['STEP'],drop=False)
        self.trades = trades.set_index(['STEP'],drop=False) 
        self.cross_trades = cross_trades.set_index(['STEP'],drop=False)
        self.nd_trades = self.trades[self.trades['ORDER']==0]
        
        # create events dataframe
        events = orders[['STEP_CREATED','STEP_PRIORITY','TIME_CREATED','SIDE','PRICE','UPDATE']]
        rncols = {'STEP_CREATED':'STEP','TIME_CREATED':'TIME','UPDATE':'UPDATE_OLD'}
        events.rename(columns=rncols,inplace=True)
        events.set_index(['STEP'],drop=False,inplace=True) 
        events.loc[events['UPDATE_OLD']=='ORDER','UPDATE'] = 'O'
        events.loc[events['UPDATE_OLD']=='TRADE','UPDATE'] = 'PT'
        events.loc[events['UPDATE_OLD']=='CANCEL','UPDATE'] = 'PC'
        
        cancels = cancels[cancels['UPDATE']=='FULL']
        cancels = cancels[['STEP','STEP_PRIORITY','TIME','SIDE','PRICE']]
        cancels.set_index(['STEP'],drop=False,inplace=True) 
        cancels['UPDATE'] = 'C'
        events = events.append(cancels)
        
        trades = trades[trades['UPDATE']=='FULL']
        trades = trades[['STEP','STEP_PRIORITY','TIME','SIDE','PRICE']]
        trades.set_index(['STEP'],drop=False,inplace=True) 
        trades['UPDATE'] = 'T'
        events = events.append(trades)
        
        events['UPDATE'] = events['UPDATE'].astype('category')
        events.drop('UPDATE_OLD',axis=1,inplace=True)
        events['UPDATE'].cat.rename_categories(['CANCEL','ORDER','PART_CANCEL','PART_TRADE','TRADE'],inplace=True)  
        events.sort_index(ascending=True, inplace=True)
        self.events = events
        self.ask_events = events[events['SIDE']=='ASK'].copy()
        self.bid_events = events[events['SIDE']=='BID'].copy()

    def update_rm_ask_state(self,step): # update ask side by removing everything that has expired up to current step
        self.ask_state = self.ask_state[self.ask_state['STEP_DESTROYED']!=step]   
    
    def update_rm_bid_state(self,step): # update bid side by removing everything that has expired up to current step
        self.bid_state = self.bid_state[self.bid_state['STEP_DESTROYED']!=step]

    def update_add_ask_state(self,step):  # update ask side by adding current step order
        self.ask_state = pd.concat([self.ask_state, self.ask_orders.xs(step, level=0, drop_level=False)])
    
    def update_add_bid_state(self,step): # update bid side by adding current step order
        self.bid_state = pd.concat([self.bid_state, self.bid_orders.xs(step, level=0, drop_level=False)])
    
    def update_addrm_ask_state(self,step): # update bid side by removing everything that has expired up to current step
        self.ask_state = pd.concat([self.ask_state[self.ask_state['STEP_DESTROYED']!=step], self.ask_orders.xs(step, level=0, drop_level=False)])
     
    def update_addrm_bid_state(self,step): # update bid side by adding current step order
        self.bid_state = pd.concat([self.bid_state[self.bid_state['STEP_DESTROYED']!=step], self.bid_orders.xs(step, level=0, drop_level=False)])    
    
    def init_ask_state(self,step): # get the collection of orders on ask side (regardless of sequence of states)
        self.ask_state = self.ask_orders[(self.ssc <= step)&(self.ssd > step)]
        pass
        
    def init_bid_state(self,step): # get the collection of orders on bid side (regardless of sequence of states)
        self.bid_state = self.bid_orders[(self.bsc <= step)&(self.bsd > step)]
        pass
    
    def update_event_ask_lvl(self,p): # get the level relative to ask side events price: 0.5 if p is a new best, 1 at lvl1, 1.5 if at [lvl1-lvl2] etc...
        if not self.ask_levels.empty:                           
            lvl = bs.bisect_left(self.ask_levels.index.values,p) + 1 # ask level of the event
            return (lvl if (p in self.ask_levels.index.values) else lvl - 0.5)
        else:
            return 0
                                                                          
    def update_event_bid_lvl(self,p):  # get the level relative to bid side events price: 0.5 if p is a new best, 1 at lvl1, 1.5 if at [lvl1-lvl2] etc...
        if not self.bid_levels.empty: 
            lvl = bs.bisect_left(-self.bid_levels.index.values[::-1],-p) + 1 # bid level of the event
            return (lvl if (p in self.bid_levels.index.values) else lvl - 0.5)
        else: 
            return 0
    
    def update_ask_tick(self,p): # get the ask side updates tick depth of ask order (deeper means lower price priority negatives are inside the spread)
        return (p-self.s_best)/self.tick_size if not(np.isnan(self.s_best)) else np.nan
    
    def update_bid_tick(self,p): # get the bid side updates tick depth of ask order (deeper means lower price priority negatives are inside the spread)
        return (self.b_best-p)/self.tick_size if not(np.isnan(self.b_best)) else np.nan
    
    def update_nth_ask_lvl(self,n): # update ask levels and return nth level [p,q]
        self.ask_levels = self.ask_state[['PRICE','QUANTITY']].groupby(['PRICE']).agg({'QUANTITY': np.sum}).sort_index()
        if not self.ask_levels.empty:
            self.s_best = self.ask_levels.index[n-1]
            return [self.ask_levels.index[n-1], self.ask_levels['QUANTITY'].iloc[n-1]]
        else: 
            return [-1,-1]
    
    def update_nth_bid_lvl(self, n):
        """Rebuild the bid price ladder from bid_state and return level n as [price, quantity].

        Side effects: caches the ladder in self.bid_levels and stores level n's
        price in self.b_best. Returns [-1, -1] when the bid side is empty.
        """
        grouped = self.bid_state[['PRICE', 'QUANTITY']].groupby('PRICE')[['QUANTITY']].sum()
        self.bid_levels = grouped.sort_index()
        if self.bid_levels.empty:
            return [-1, -1]
        self.b_best = self.bid_levels.index[-n]  # ladder is ascending; best bid sits at the end
        return [self.b_best, self.bid_levels['QUANTITY'].iloc[-n]]
    
    def update_ask_q_in_front(self, p, sp):
        """Total quantity queued ahead of an ask order at price p with step priority sp.

        Higher priority = strictly lower price, or same price with an earlier
        (smaller) STEP_PRIORITY. Returns 0 when nothing is ahead.
        """
        state = self.ask_state
        ahead = (state['PRICE'] < p) | ((state['PRICE'] == p) & (state['STEP_PRIORITY'] < sp))
        return state.loc[ahead, 'QUANTITY'].sum() if ahead.any() else 0
    
    def update_bid_q_in_front(self, p, sp):
        """Total quantity queued ahead of a bid order at price p with step priority sp.

        Higher priority = strictly higher price, or same price with an earlier
        (smaller) STEP_PRIORITY. Returns 0 when nothing is ahead.
        """
        state = self.bid_state
        ahead = (state['PRICE'] > p) | ((state['PRICE'] == p) & (state['STEP_PRIORITY'] < sp))
        return state.loc[ahead, 'QUANTITY'].sum() if ahead.any() else 0
        
    def init_first_state(self):
        """Reset the book to its initial (empty) state before replaying events."""
        # no order exists at step -1, so both sides come back empty
        self.init_bid_state(-1)
        self.init_ask_state(-1)
        # prime the (empty) ladders so later level lookups see valid objects
        self.update_nth_ask_lvl(1)
        self.update_nth_bid_lvl(1)
        # an empty book has no best quotes yet
        self.b_best = np.nan
        self.s_best = np.nan
         
    def update_ask_data_on_order(self, row):
        """Apply an ask-side ORDER event and return its book-dependent features.

        Returns pd.Series([ask_price, ask_quantity, event_level, tick_dist, qty_in_front]).
        """
        # level and tick distance are measured against the book BEFORE the event
        lvl = self.update_event_ask_lvl(row['PRICE'])
        tick = self.update_ask_tick(row['PRICE']) if lvl > 0 else np.nan
        self.update_add_ask_state(row['STEP'])  # now fold the event into the state
        q_front = self.update_ask_q_in_front(row['PRICE'], row['STEP_PRIORITY'])
        best_p, best_q = self.update_nth_ask_lvl(1)  # level 1 after the event
        return pd.Series([best_p, best_q, lvl, tick, q_front])
        
    def update_bid_data_on_order(self, row):
        """Apply a bid-side ORDER event and return its book-dependent features.

        Returns pd.Series([bid_price, bid_quantity, event_level, tick_dist, qty_in_front]).
        """
        # level and tick distance are measured against the book BEFORE the event
        lvl = self.update_event_bid_lvl(row['PRICE'])
        tick = self.update_bid_tick(row['PRICE']) if lvl > 0 else np.nan
        self.update_add_bid_state(row['STEP'])  # now fold the event into the state
        q_front = self.update_bid_q_in_front(row['PRICE'], row['STEP_PRIORITY'])
        best_p, best_q = self.update_nth_bid_lvl(1)  # level 1 after the event
        return pd.Series([best_p, best_q, lvl, tick, q_front])

    def update_ask_data_on_cancel(self, row):
        """Apply an ask-side CANCEL event and return its book-dependent features.

        Returns pd.Series([ask_price, ask_quantity, event_level, tick_dist, qty_in_front]).
        """
        # level and tick distance are measured against the book BEFORE the event
        lvl = self.update_event_ask_lvl(row['PRICE'])
        tick = self.update_ask_tick(row['PRICE']) if lvl > 0 else np.nan
        self.update_rm_ask_state(row['STEP'])  # remove the cancelled order
        q_front = self.update_ask_q_in_front(row['PRICE'], row['STEP_PRIORITY'])
        best_p, best_q = self.update_nth_ask_lvl(1)  # level 1 after the event
        return pd.Series([best_p, best_q, lvl, tick, q_front])
 
    def update_bid_data_on_cancel(self, row):
        """Apply a bid-side CANCEL event and return its book-dependent features.

        Returns pd.Series([bid_price, bid_quantity, event_level, tick_dist, qty_in_front]).
        """
        # level and tick distance are measured against the book BEFORE the event
        lvl = self.update_event_bid_lvl(row['PRICE'])
        tick = self.update_bid_tick(row['PRICE']) if lvl > 0 else np.nan
        self.update_rm_bid_state(row['STEP'])  # remove the cancelled order
        q_front = self.update_bid_q_in_front(row['PRICE'], row['STEP_PRIORITY'])
        best_p, best_q = self.update_nth_bid_lvl(1)  # level 1 after the event
        return pd.Series([best_p, best_q, lvl, tick, q_front])
 
    def update_ask_data_on_part_cancel(self, row):
        """Apply an ask-side PART_CANCEL event and return its book-dependent features.

        Returns pd.Series([ask_price, ask_quantity, event_level, tick_dist, qty_in_front]).
        """
        # level and tick distance are measured against the book BEFORE the event
        lvl = self.update_event_ask_lvl(row['PRICE'])
        tick = self.update_ask_tick(row['PRICE']) if lvl > 0 else np.nan
        self.update_addrm_ask_state(row['STEP'])  # shrink the partially cancelled order
        q_front = self.update_ask_q_in_front(row['PRICE'], row['STEP_PRIORITY'])
        best_p, best_q = self.update_nth_ask_lvl(1)  # level 1 after the event
        return pd.Series([best_p, best_q, lvl, tick, q_front])
 
    def update_bid_data_on_part_cancel(self, row):
        """Apply a bid-side PART_CANCEL event and return its book-dependent features.

        Returns pd.Series([bid_price, bid_quantity, event_level, tick_dist, qty_in_front]).
        """
        # level and tick distance are measured against the book BEFORE the event
        lvl = self.update_event_bid_lvl(row['PRICE'])
        tick = self.update_bid_tick(row['PRICE']) if lvl > 0 else np.nan
        self.update_addrm_bid_state(row['STEP'])  # shrink the partially cancelled order
        q_front = self.update_bid_q_in_front(row['PRICE'], row['STEP_PRIORITY'])
        best_p, best_q = self.update_nth_bid_lvl(1)  # level 1 after the event
        return pd.Series([best_p, best_q, lvl, tick, q_front])

    def update_ask_data_on_trade(self, row):
        """Apply an ask-side TRADE (full fill): the order leaves the book.

        Level/tick/qty-in-front are fixed at 1.0/0.0/0.0 for trades.
        """
        self.update_rm_ask_state(row['STEP'])  # traded order is removed
        best_p, best_q = self.update_nth_ask_lvl(1)  # level 1 after the event
        return pd.Series([best_p, best_q, 1.0, 0.0, 0.0])
      
    def update_bid_data_on_trade(self, row):
        """Apply a bid-side TRADE (full fill): the order leaves the book.

        Level/tick/qty-in-front are fixed at 1.0/0.0/0.0 for trades.
        """
        self.update_rm_bid_state(row['STEP'])  # traded order is removed
        best_p, best_q = self.update_nth_bid_lvl(1)  # level 1 after the event
        return pd.Series([best_p, best_q, 1.0, 0.0, 0.0])
  
    def update_ask_data_on_part_trade(self, row):
        """Apply an ask-side PART_TRADE (partial fill): the order shrinks in place.

        Level/tick/qty-in-front are fixed at 1.0/0.0/0.0 for trades.
        """
        self.update_addrm_ask_state(row['STEP'])  # replace the order with its remainder
        best_p, best_q = self.update_nth_ask_lvl(1)  # level 1 after the event
        return pd.Series([best_p, best_q, 1.0, 0.0, 0.0])
      
    def update_bid_data_on_part_trade(self, row):
        """Apply a bid-side PART_TRADE (partial fill): the order shrinks in place.

        Level/tick/qty-in-front are fixed at 1.0/0.0/0.0 for trades.
        """
        self.update_addrm_bid_state(row['STEP'])  # replace the order with its remainder
        best_p, best_q = self.update_nth_bid_lvl(1)  # level 1 after the event
        return pd.Series([best_p, best_q, 1.0, 0.0, 0.0])

    def update_data_on_nd_trade(self, row):
        """Apply a non-display trade by rebuilding both book sides at its step.

        Inefficient, but such trades are rare. Returns
        pd.Series([ask_price, ask_quantity, bid_price, bid_quantity, tick_dist]).
        """
        self.init_bid_state(row['STEP'])
        bid_p, bid_q = self.update_nth_bid_lvl(1)
        self.init_ask_state(row['STEP'])
        ask_p, ask_q = self.update_nth_ask_lvl(1)
        # NOTE(review): (ask_p - bid_p)/2 is the half-spread, not the midpoint
        # (ask_p + bid_p)/2 — preserved from the original, but worth confirming.
        mid_tick = (row['PRICE'] - ((ask_p - bid_p) / 2.0)) / self.tick_size
        return pd.Series([ask_p, ask_q, bid_p, bid_q, mid_tick])

    def init_updates(self):
        """Build per-side dispatch tables mapping UPDATE codes to handler methods."""
        kinds = ('ORDER', 'CANCEL', 'PART_CANCEL', 'TRADE', 'PART_TRADE')
        self.ask_updates = {k: getattr(self, 'update_ask_data_on_' + k.lower()) for k in kinds}
        self.bid_updates = {k: getattr(self, 'update_bid_data_on_' + k.lower()) for k in kinds}
        
    def update_ask_data(self, row):
        """Dispatch an ask-side event row to its UPDATE-specific handler."""
        handler = self.ask_updates[row['UPDATE']]
        return handler(row)
        
    def update_bid_data(self, row):
        """Dispatch a bid-side event row to its UPDATE-specific handler."""
        handler = self.bid_updates[row['UPDATE']]
        return handler(row)
    
    def build_ask_features(self):
        """Replay every ask-side event from the empty book, writing the
        state-dependent feature columns into self.ask_events in place."""
        self.init_first_state()
        feature_cols = ['ASK_PRICE', 'ASK_QUANTITY', 'UPDATE_LEVEL', 'TICK_DIST', 'QUANTITY_IN_FRONT']
        # the bound method itself is the row mapper; no lambda needed
        self.ask_events[feature_cols] = self.ask_events.apply(self.update_ask_data, axis=1)
        
    def build_bid_features(self):
        """Replay every bid-side event from the empty book, writing the
        state-dependent feature columns into self.bid_events in place."""
        self.init_first_state()
        feature_cols = ['BID_PRICE', 'BID_QUANTITY', 'UPDATE_LEVEL', 'TICK_DIST', 'QUANTITY_IN_FRONT']
        # the bound method itself is the row mapper; no lambda needed
        self.bid_events[feature_cols] = self.bid_events.apply(self.update_bid_data, axis=1)
                                       
    def build_nd_features(self):
        """Compute book-state features for non-display trades, writing them
        into self.nd_trades in place."""
        feature_cols = ['ASK_PRICE', 'ASK_QUANTITY', 'BID_PRICE', 'BID_QUANTITY', 'TICK_DIST']
        # the bound method itself is the row mapper; no lambda needed
        self.nd_trades[feature_cols] = self.nd_trades.apply(self.update_data_on_nd_trade, axis=1)
    
    def join_event_data(self):
        """Merge bid/ask/non-display events into one step-indexed timeline and
        attach the book-state feature columns to orders, cancels and trades.

        Fixes over the previous revision:
        - the `sort_index()` results for orders/cancels/trades were discarded
          (no-ops); the sorted frames are now assigned back,
        - chained `fillna(..., inplace=True)` on column views and
          `DataFrame.append` are replaced with plain column assignment,
          `ffill()` and `pd.concat`, which behave identically but also work
          (and actually mutate the frame) on current pandas.
        """
        self.events = pd.concat([self.bid_events, self.ask_events])

        # non-display trades carry no level/tick info relative to the book
        # NOTE(review): this wipes the TICK_DIST computed in build_nd_features —
        # preserved from the original logic, but worth confirming it is intended.
        self.nd_trades = self.nd_trades[['STEP','STEP_PRIORITY','TIME','SIDE','PRICE','ASK_PRICE','ASK_QUANTITY','BID_PRICE','BID_QUANTITY','TICK_DIST']]
        self.nd_trades['UPDATE_LEVEL'] = np.nan
        self.nd_trades['TICK_DIST'] = np.nan
        self.nd_trades.set_index(['STEP'],drop=False,inplace=True)

        self.events = pd.concat([self.events, self.nd_trades])
        self.events.sort_index(ascending=True,inplace=True)

        # forward-fill the quote columns, mark the leading gap with -1, store as int32
        for col in ['ASK_PRICE', 'ASK_QUANTITY', 'BID_PRICE', 'BID_QUANTITY']:
            self.events[col] = self.events[col].ffill().fillna(-1).astype('i4')

        # index each table by step so it can be joined against the event timeline
        self.orders = self.orders.set_index('STEP_CREATED', drop=False).sort_index()
        self.cancels = self.cancels.set_index('STEP', drop=False).sort_index()
        self.trades = self.trades.set_index('STEP', drop=False).sort_index()

        state_cols = ['ASK_PRICE','ASK_QUANTITY','BID_PRICE','BID_QUANTITY','UPDATE_LEVEL','TICK_DIST','QUANTITY_IN_FRONT']
        self.orders = self.orders.join(self.events[state_cols], how='left')
        self.cancels = self.cancels.join(self.events[state_cols], how='left')
        self.trades = self.trades.join(self.events[state_cols], how='left')
        
    def build_features(self, debug_mode):
        """Run the full feature-building pipeline for this date.

        When debug_mode is True, only events with STEP < 5000 are kept so the
        (slow) apply-based replay finishes quickly.
        """
        start = time.time()  # NOTE(review): relies on `time` being imported at file level — not in the visible import cell
        print('Building features for ' + self.date + '.')

        if debug_mode == True:
            step_cap = 5000  # small cap for testing/debugging runs
            self.bid_events = self.bid_events[self.bid_events['STEP'] < step_cap]
            self.ask_events = self.ask_events[self.ask_events['STEP'] < step_cap]

        self.init_updates()
        self.build_ask_features()
        self.build_bid_features()
        self.build_nd_features()
        self.join_event_data()

        print('Building features done. The process took ' + str((time.time() - start) / 60) + ' minutes.')
        
    def save_data(self, save_path):
        """Write the extended orders/cancels/trades/cross-event tables for this
        date into the HDF5 file at save_path, under the '/<date>/...' groups."""
        print('Saving extended data for ' + self.date + '.')

        # store the repetitive text columns as categoricals to shrink the file
        for table in (self.orders, self.cancels, self.trades):
            for col in ('UPDATE', 'SIDE', 'PARTICIPANT'):
                table[col] = table[col].astype('category')

        hdf_opts = dict(format='table', complevel=0, complib='blosc')

        order_cols = ['ORDER','STEP_CREATED','STEP_DESTROYED','STEP_PRIORITY','TIME_CREATED','TIME_DESTROYED','SIDE','UPDATE','PRICE','QUANTITY','DELTA_QUANTITY','TRADED','QUANTITY_TO_BE_TRADED','PARTICIPANT','ASK_PRICE','ASK_QUANTITY','BID_PRICE','BID_QUANTITY','UPDATE_LEVEL','TICK_DIST','QUANTITY_IN_FRONT']
        # the first write opens the file in 'w' mode; every later write must append
        self.orders[order_cols].to_hdf(save_path, '/' + self.date + '/ORDERS', mode='w',
                                       data_columns=order_cols, **hdf_opts)

        event_cols = ['ORDER','STEP','STEP_PRIORITY','TIME','SIDE','UPDATE','PRICE','DELTA_QUANTITY','PARTICIPANT','ASK_PRICE','ASK_QUANTITY','BID_PRICE','BID_QUANTITY','UPDATE_LEVEL','TICK_DIST','QUANTITY_IN_FRONT']
        self.cancels[event_cols].to_hdf(save_path, '/' + self.date + '/CANCELS', mode='a',
                                        data_columns=event_cols, **hdf_opts)
        self.trades[event_cols].to_hdf(save_path, '/' + self.date + '/TRADES', mode='a',
                                       data_columns=event_cols, **hdf_opts)

        cross_cols = ['ORDER','STEP','TIME','PRICE','QUANTITY']
        self.cross_trades[cross_cols].to_hdf(save_path, '/' + self.date + '/CROSS_EVENTS', mode='a',
                                             data_columns=cross_cols, **hdf_opts)
        print('Extended data for ' + self.date + ' saved to ' + save_path + '.')
        
    def time2step(self, t):
        """Map a time (ms since midnight) to [first_step, last_step] at that time.

        If no event occurred at exactly t, both entries are the step of the
        last event strictly before t (assumes at least one earlier event exists).
        """
        exact = self.events.loc[self.events['TIME'] == t, 'STEP']
        if exact.empty:
            earlier = self.events.loc[self.events['TIME'] < t, 'STEP']
            last_step = earlier.iloc[-1]
            return [last_step, last_step]
        return [min(exact), max(exact)]
    
    def step2time(self, s):
        """Map a step index back to its event time (ms since midnight)."""
        return self.events['TIME'].loc[s]
        
    def time2str(self, t):
        """Convert a time in ms since midnight to an 'HH:MM:SS.mmm' string.

        Fix: milliseconds are now zero-padded to three digits — the previous
        '{}'.format(mils) produced e.g. '12:00:00.5' for 5 ms, which str2time
        (expecting a fixed-width '.mmm' suffix) could not round-trip. Pure
        integer arithmetic also removes the dependency on the `math` and
        `time` modules, which are not in the visible import cell.
        """
        total_secs, mils = divmod(int(t), 1000)
        mins, secs = divmod(total_secs, 60)
        hrs, mins = divmod(mins, 60)
        return '{:02d}:{:02d}:{:02d}.{:03d}'.format(hrs, mins, secs, mils)
    
    def str2time(self, s):
        """Convert an 'HH:MM:SS.mmm' string to milliseconds since midnight."""
        millis = int(s[-3:])  # last three characters are the milliseconds
        parsed = time.strptime(s[:-4], "%H:%M:%S")  # strip the '.mmm' suffix
        return ((parsed.tm_hour * 60 + parsed.tm_min) * 60 + parsed.tm_sec) * 1000 + millis

    def plot(self,step,ylim,xlim):      
        """Draw the resting order book at `step` onto self.ax.

        Each resting order becomes a rectangle: asks stack rightward from x=0
        at their price level, bids stack leftward. Colors alternate per order
        within a level; the cyan/magenta overlay marks each order's
        QUANTITY_TO_TRADE portion. `ylim`/`xlim` are [lo, hi] axis limits.
        NOTE(review): relies on module-level `patches` (matplotlib.patches)
        and `display` (IPython.display), which are not in the visible import
        cell — confirm they are imported elsewhere in the notebook.
        """
        print(self.time2str(self.step2time(step)))                
                        
        tick = self.tick_size
        border = 30  # vertical shrink per rectangle, leaves a gap between price levels
        alp = 0.5  # shared alpha so overlapping rectangles stay readable

        self.ax.clear()
        # rebuild both sides of the book as of `step`
        self.init_ask_state(step)
        self.init_bid_state(step)
        
        # order both sides by (price, arrival priority) for queue-position drawing
        ss = self.ask_state.set_index(['PRICE','STEP_PRIORITY','ORDER'],drop=False)
        ss.sort_index(inplace=True)
        # NOTE(review): this local `bs` shadows the module-level `bisect as bs` alias
        bs = self.bid_state.set_index(['PRICE','STEP_PRIORITY','ORDER'],drop=False)
        bs.sort_index(inplace=True)        
        
        # maxq/minq accumulate the widest level seen but are never read afterwards
        maxq = 0
        minq = 0
        lastp = 0  # price of the previous row's level
        lastq = 0  # cumulative quantity already drawn at the current level (x offset)
        bflip = False  # alternates per order to vary the color within a level
        
        for ind, row in ss.iterrows():
            if (row['PRICE']!=lastp):  # entering a new price level: reset the stack
                maxq = max(lastq,maxq)
                lastp = 0
                lastq = 0
                bflip = False               
            col_ca = 'sandybrown' if bflip==True else 'orangered'
            col_tr = 'cyan' if bflip==True else 'darkcyan' 

            # full resting quantity of this ask order, stacked after its queue predecessors
            self.ax.add_patch(
                patches.Rectangle(
                    (lastq, row['PRICE']-(tick-border)/2),   # (x,y)
                    row['QUANTITY'],                         # width
                    (tick-border),                           # height
                    facecolor=col_ca,
                    alpha=alp  
                    )               
                )
            
            # overlay: the order's QUANTITY_TO_TRADE portion, left-aligned with it
            self.ax.add_patch(
                patches.Rectangle(
                    (lastq, row['PRICE']-(tick-border)/2),    # (x,y)
                    row['QUANTITY_TO_TRADE'],                 # width
                    (tick-border),                            # height
                    facecolor=col_tr,
                    alpha=alp  
                    )
                )            
            
            lastp = row['PRICE']
            lastq += row['QUANTITY']
            bflip = not bflip

        # reset the accumulators before drawing the bid side
        lastp = 0
        lastq = 0
        bflip = False            
            
        for ind, row in bs.iterrows():
            if (row['PRICE']!=lastp):  # entering a new price level: reset the stack
                
                minq = min(-lastq,minq)
                lastp = 0
                lastq = 0
                bflip = False               
            
            col_ca = 'palegreen' if bflip==True else 'seagreen'
            col_tr = 'magenta' if bflip==True else 'darkmagenta' 
            
            # full resting quantity of this bid order, drawn leftward from -lastq
            self.ax.add_patch(
                patches.Rectangle(
                    (-row['QUANTITY']-lastq, row['PRICE']-(tick-border)/2),   # (x,y)
                    row['QUANTITY'],                                          # width
                    (tick-border),                                            # height
                    facecolor=col_ca,
                    alpha=alp  
                    )               
                )
            
            # overlay: QUANTITY_TO_TRADE portion, right edge shared with the order's rectangle
            self.ax.add_patch(
                patches.Rectangle(
                    (-row['QUANTITY_TO_TRADE']-lastq, row['PRICE']-(tick-border)/2),   # (x,y)
                    row['QUANTITY_TO_TRADE'],                                          # width
                    (tick-border),                                                     # height
                    facecolor=col_tr,
                    alpha=alp  
                    )
                )
            
            lastp = row['PRICE']
            lastq += row['QUANTITY']
            bflip = not bflip
                         
        
        self.ax.set_ylim(ylim[0],ylim[1])
        self.ax.set_xlim(xlim[0],xlim[1])
        display.display(self.fig)
        return 1

class OrderBook(object):
    """In-memory view over an OCT2-format HDF5 file: one OrderBookData per date.

    Loads the ORDERS / CANCELS / TRADES / CROSS_EVENTS tables of every date
    group found in the file and exposes thin per-date wrappers for feature
    building, plotting and time/step conversions.
    """

    def __init__(self, ticker, data_path):
        print('Started order book ' + ticker)
        self.ticker = ticker
        self.data_path = data_path

        self.data = {}
        # context manager guarantees the file handle is released even on error
        with h5.File(data_path, 'r') as f:
            self.dates = [date for date in f]

        for date in self.dates:
            print('Reading in data for ' + date)
            orders = pd.read_hdf(data_path, date + '/ORDERS')
            cancels = pd.read_hdf(data_path, date + '/CANCELS')
            trades = pd.read_hdf(data_path, date + '/TRADES')
            cross_trades = pd.read_hdf(data_path, date + '/CROSS_EVENTS')
            # 100 is presumably the tick size — confirm against OrderBookData.__init__
            self.data[date] = OrderBookData(date, orders, cancels, trades, cross_trades, 100)

    def build_features(self, date, save_path, debug_mode=False):
        """Build book-state features for `date` and save them to save_path."""
        self.data[date].build_features(debug_mode)
        self.data[date].save_data(save_path)

    def plot(self, date, step, ylim, xlim):
        """Plot the book state of `date` at `step` with the given axis limits."""
        self.data[date].plot(step, ylim, xlim)

    def time2step(self, date, time):
        """[first, last] step at `time` (ms since midnight) on `date`."""
        return self.data[date].time2step(time)

    def step2time(self, date, step):
        """Time (ms since midnight) of `step` on `date`.

        Fix: the previous revision indexed `self.date[date]` — there is no
        `self.date` attribute, so this always raised AttributeError; the
        per-date data lives in `self.data`.
        """
        return self.data[date].step2time(step)

    def time2str(self, date, time):
        """'HH:MM:SS.mmm' string for `time` (ms since midnight) on `date`."""
        return self.data[date].time2str(time)

    def str2step(self, date, strtime):
        """[first, last] step at the 'HH:MM:SS.mmm' time `strtime` on `date`."""
        return self.data[date].time2step(self.data[date].str2time(strtime))

In [ ]: