In [7]:
import numpy as np
# Record the NumPy version this notebook was run with (shown in the output below).
np.__version__


Out[7]:
'1.10.1'

In [1]:
import taq

In [3]:
# Build a chunk reader over the zipped sample file. The result is consumed
# with next() in a later cell, so it is treated as an iterator of chunks.
# NOTE(review): the path is relative to the notebook's working directory.
chunks = taq.TAQ2Chunks('../test-data/EQY_US_ALL_BBO_20140206.zip')

In [4]:
# Pull one chunk from the reader (the first one when run on a fresh kernel).
# Re-running this cell advances the iterator and rebinds c0 to a later chunk.
c0 = next(chunks)

In [14]:
# Inspect the record layout of the chunk: field names and their types.
c0.dtype


Out[14]:
dtype([('Time', '<f8'), ('hour', 'i1'), ('minute', 'i1'), ('msec', '<u2'), ('Exchange', 'S1'), ('Symbol_root', 'S6'), ('Symbol_suffix', 'S10'), ('Bid_Price', '<f8'), ('Bid_Size', '<i4'), ('Ask_Price', '<f8'), ('Ask_Size', '<i4'), ('Quote_Condition', 'S1'), ('Market_Maker', 'S4'), ('Bid_Exchange', 'S1'), ('Ask_Exchange', 'S1'), ('Sequence_Number', '<i8'), ('National_BBO_Ind', 'S1'), ('NASDAQ_BBO_Ind', 'S1'), ('Quote_Cancel_Correction', 'S1'), ('Source_of_Quote', 'S1'), ('Retail_Interest_Indicator_RPI', 'S1'), ('Short_Sale_Restriction_Indicator', 'S1'), ('LULD_BBO_Indicator_CQS', 'S1'), ('LULD_BBO_Indicator_UTP', 'S1'), ('FINRA_ADF_MPID_Indicator', 'S1'), ('SIP_generated_Message_Identifier', 'S1'), ('National_BBO_LULD_Indicator', 'S1')])

In [16]:
# Selecting two fields from a structured array yields a 1-D array whose
# elements are (Symbol_root, Symbol_suffix) records, i.e. a single "column"
# with a structured (compound) dtype. np.unique therefore de-duplicates whole
# (root, suffix) pairs rather than each field separately.
# return_index=True also returns the index of the first occurrence of each
# unique pair, which is used as the start of that symbol's run of rows.
# NOTE(review): treating these as run boundaries assumes all rows for a given
# symbol are contiguous in c0 -- confirm against the input file's ordering.
unique_symbols, start_indices = np.unique(c0[['Symbol_root', 'Symbol_suffix']], return_index=True)

In [21]:
# Display the unique (root, suffix) pairs next to the row index at which
# each pair first appears in c0.
unique_symbols, start_indices


Out[21]:
(array([(b'A     ', b'          '), (b'AA    ', b'          '),
        (b'AA    ', b'PR        '), (b'AADR  ', b'          '),
        (b'AAIT  ', b'          '), (b'AAL   ', b'          ')], 
       dtype=[('Symbol_root', 'S6'), ('Symbol_suffix', 'S10')]),
 array([     0, 187943, 950194, 950397, 952619, 959557]))

In [30]:
# Cut c0 into one piece per symbol, splitting at the first row of each symbol.
# The leading start index (0) is skipped: splitting at position 0 would
# produce an empty array as the first piece.
split_c = np.split(c0, start_indices[1:])

In [28]:
# Each element of split_c is the run of rows belonging to one symbol;
# show how many rows ended up in each piece.
list(map(len, split_c))


Out[28]:
[187943, 762251, 203, 2222, 6938, 40443]

In [29]:
# Cross-check: the gaps between consecutive start indices should equal the
# lengths of every piece in split_c except the last, which has no following
# start index to difference against.
np.diff(start_indices)


Out[29]:
array([187943, 762251,    203,   2222,   6938])

In [ ]: