notebook.community

Edit and run



In [7]:

    
import numpy as np
# Display the NumPy version this notebook was executed with.
np.__version__









    Out[7]:





'1.10.1'



In [1]:

    
import taq



In [3]:

    
# Create the chunk reader for the zipped sample BBO file. The path is relative,
# so it resolves against the kernel's working directory.
chunks = taq.TAQ2Chunks('../test-data/EQY_US_ALL_BBO_20140206.zip')



In [4]:

    
# Pull only the first chunk; everything below works on this single array.
c0 = next(chunks)



In [14]:

    
# Inspect the record layout: field names and their storage types.
c0.dtype









    Out[14]:





dtype([('Time', '<f8'), ('hour', 'i1'), ('minute', 'i1'), ('msec', '<u2'), ('Exchange', 'S1'), ('Symbol_root', 'S6'), ('Symbol_suffix', 'S10'), ('Bid_Price', '<f8'), ('Bid_Size', '<i4'), ('Ask_Price', '<f8'), ('Ask_Size', '<i4'), ('Quote_Condition', 'S1'), ('Market_Maker', 'S4'), ('Bid_Exchange', 'S1'), ('Ask_Exchange', 'S1'), ('Sequence_Number', '<i8'), ('National_BBO_Ind', 'S1'), ('NASDAQ_BBO_Ind', 'S1'), ('Quote_Cancel_Correction', 'S1'), ('Source_of_Quote', 'S1'), ('Retail_Interest_Indicator_RPI', 'S1'), ('Short_Sale_Restriction_Indicator', 'S1'), ('LULD_BBO_Indicator_CQS', 'S1'), ('LULD_BBO_Indicator_UTP', 'S1'), ('FINRA_ADF_MPID_Indicator', 'S1'), ('SIP_generated_Message_Identifier', 'S1'), ('National_BBO_LULD_Indicator', 'S1')])



In [16]:

    
# Indexing with a list of field names yields a structured array whose elements
# are (Symbol_root, Symbol_suffix) records, so np.unique compares whole records
# rather than individual columns.
symbol_keys = c0[['Symbol_root', 'Symbol_suffix']]

# np.unique returns the distinct records in sorted order together with the
# index of each record's first occurrence in c0.
unique_symbols, start_indices = np.unique(symbol_keys, return_index=True)

# Those first-occurrence indices are only usable as split points when every
# symbol occupies one contiguous run of rows and the runs appear in sorted
# order. Verify that against the positions where the key actually changes
# between neighbouring rows, so a differently ordered chunk fails here instead
# of being split incorrectly further down.
symbol_boundaries = np.flatnonzero(symbol_keys[1:] != symbol_keys[:-1]) + 1
assert np.array_equal(symbol_boundaries, start_indices[1:]), \
    "rows are not grouped by symbol in sorted order; start_indices are not valid split points"



In [21]:

    
# Show the distinct (root, suffix) pairs alongside the row index at which each
# one first appears in c0.
unique_symbols, start_indices









    Out[21]:





(array([(b'A     ', b'          '), (b'AA    ', b'          '),
        (b'AA    ', b'PR        '), (b'AADR  ', b'          '),
        (b'AAIT  ', b'          '), (b'AAL   ', b'          ')], 
       dtype=[('Symbol_root', 'S6'), ('Symbol_suffix', 'S10')]),
 array([     0, 187943, 950194, 950397, 952619, 959557]))



In [30]:

    
# Cut c0 at the first row of every symbol after the first one.
split_c = np.split(c0, start_indices[1:])  # skip index 0: splitting there would only add an empty leading array



In [28]:

    
# Each element of split_c holds the rows of one symbol; list the row count of
# every piece.
list(map(len, split_c))









    Out[28]:





[187943, 762251, 203, 2222, 6938, 40443]



In [29]:

    
# Distances between consecutive start indices. These match the per-symbol
# lengths listed above except for the last symbol, which has no following
# start index to subtract from.
np.diff(start_indices)









    Out[29]:





array([187943, 762251,    203,   2222,   6938])



In [ ]: