In [3]:
%pylab inline
import time
import spikedetekt

import matplotlib.pyplot as plt
import networkx as nx
#import graphviz_layout

import itertools
import re
import pprint


Populating the interactive namespace from numpy and matplotlib

Mouse 2541

Defining probe layout for .probe file

SpikeDetekt requires a graph describing the probe geometry. For tetrodes, I will represent each wire bundle as four mutually connected nodes.

NOTE: These are uncorrected tetrode numbers. No dead channels assigned.


In [7]:
# Channel numbering
# !NOTE: 1-based index
channels_2541 = {1: [ 5, 6, 7, 8], 2: [ 2, 3, 4, 9], 3: [ 1,10,11,32], 4: [12,13,30,31],
                 5: [14,15,16,29], 6: [17,18,19,28], 7: [20,21,26,27], 8: [22,23,24,25],
                 9: [37,38,39,40], 10: [34,35,36,41], 11: [33,42,43,64], 12: [44,45,62,63],
                 13: [46,47,48,61], 14: [49,50,51,60], 15: [52,53,58,59], 16: [54,55,56,57]}

# zero index it...
channels_2541 = {t_num: [channel-1 for channel in tetrode] for t_num, tetrode in channels_2541.items()}

tetrodes_2541 = {k: list(itertools.combinations(l, r=2)) for k, l in channels_2541.items()}
tetrodes_2541


Out[7]:
{1: [(4, 5), (4, 6), (4, 7), (5, 6), (5, 7), (6, 7)],
 2: [(1, 2), (1, 3), (1, 8), (2, 3), (2, 8), (3, 8)],
 3: [(0, 9), (0, 10), (0, 31), (9, 10), (9, 31), (10, 31)],
 4: [(11, 12), (11, 29), (11, 30), (12, 29), (12, 30), (29, 30)],
 5: [(13, 14), (13, 15), (13, 28), (14, 15), (14, 28), (15, 28)],
 6: [(16, 17), (16, 18), (16, 27), (17, 18), (17, 27), (18, 27)],
 7: [(19, 20), (19, 25), (19, 26), (20, 25), (20, 26), (25, 26)],
 8: [(21, 22), (21, 23), (21, 24), (22, 23), (22, 24), (23, 24)],
 9: [(36, 37), (36, 38), (36, 39), (37, 38), (37, 39), (38, 39)],
 10: [(33, 34), (33, 35), (33, 40), (34, 35), (34, 40), (35, 40)],
 11: [(32, 41), (32, 42), (32, 63), (41, 42), (41, 63), (42, 63)],
 12: [(43, 44), (43, 61), (43, 62), (44, 61), (44, 62), (61, 62)],
 13: [(45, 46), (45, 47), (45, 60), (46, 47), (46, 60), (47, 60)],
 14: [(48, 49), (48, 50), (48, 59), (49, 50), (49, 59), (50, 59)],
 15: [(51, 52), (51, 57), (51, 58), (52, 57), (52, 58), (57, 58)],
 16: [(53, 54), (53, 55), (53, 56), (54, 55), (54, 56), (55, 56)]}
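
Since the goal is a .probe file, the pair lists above can be written straight to disk. Below is a minimal sketch assuming the legacy SpikeDetekt probe format, i.e. a Python file defining a `probes` dictionary that maps shank number to a list of channel-pair edges; the helper name and file path are my own and should be checked against the SpikeDetekt documentation before use.

def write_probe_file(fname, tetrodes):
    # Dump the adjacency lists into a Python file defining `probes`
    # (assumed legacy SpikeDetekt .probe layout).
    with open(fname, 'w') as fid:
        fid.write('probes = {\n')
        for shank, edges in sorted(tetrodes.items()):
            fid.write('    {0:d}: {1!r},\n'.format(shank, edges))
        fid.write('}\n')

#write_probe_file('./proc/2541.probe', tetrodes_2541)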

In [8]:
tetrodes = tetrodes_2541

G = nx.Graph()
for k, t in tetrodes.items():
    G.add_edges_from(t)
pos = nx.graphviz_layout(G, prog='neato', args='')
plt.figure(figsize=(6, 6))
#colors = np.linspace(0.0, 1.0, num=len(tetrodes))
colors = [random.random() for n in xrange(len(tetrodes))]
C = nx.connected_component_subgraphs(G)
for i, g in enumerate(C):
    # draw each connected component (tetrode) in its own color
    nx.draw(g, pos,
            node_size=400, node_color=[colors[i]]*nx.number_of_nodes(g),
            vmin=0.0, vmax=1.0, alpha=0.2,
            with_labels=True)


Looking into Open Ephys .continuous files

Continuous data files (.continuous)

Each continuous channel within each processor has its own file, titled "XXX_CHY.continuous," where XXX = the processor ID #, and Y = the channel number. For each record, it saves:

  • One int64 timestamp (actually a sample number; this can be converted to seconds using the sampleRate variable in the header)
  • One uint16 number (N) indicating the samples per record (always 1024, at least for now)
  • One uint16 recording number (version 0.2 and higher)
  • 1024 int16 samples
  • 10-byte record marker (0 1 2 3 4 5 6 7 8 255)

If a file is opened or closed in the middle of a record, the leading or trailing samples are set to zero.
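
Adding those fields up gives 2070 bytes per record; a one-line check of that arithmetic (just bookkeeping, not part of the reading code):

# timestamp (int64) + sample count (uint16) + recording number (uint16)
# + 1024 samples (int16) + 10-byte record marker
assert 8 + 2 + 2 + 1024 * 2 + 10 == 2070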


In [9]:
# fixed header describing data
SIZE_HEADER = 1024

# 2070 bytes per record: 12 byte header (timestamp, sample count, recording number)
# plus 2048 bytes of samples and the 10 byte record marker
SIZE_RECORD = 2070

NUM_SAMPLES = 1024
REC_MARKER = np.array([0, 1, 2, 3, 4, 5, 6, 7, 8, 255], dtype=uint8)

In [10]:
def oe_read_header(fname):
    """ Return a dict with header content.
    """
    # TODO: Use alternative reading method when fname is a file id. If string, use numpy.fromfile
    import re

    # 1 kiB header
    header_dt = np.dtype([('Header', 'S%d' % SIZE_HEADER)])
    header = np.fromfile(fname, dtype=header_dt, count=1)
    
    # alternative which moves file pointer
    #fid = open(fname, 'rb')
    #header = fid.read(SIZE_HEADER)
    #fid.close()
    
    # Stand back! I know regex!
    # Annoyingly, there is a newline character missing in the header
    regex = r"header\.([\d\w\.\s]{1,}).=.\'*([^\;\']{1,})\'*"
    header_str = str(header[0][0]).rstrip(' ')
    header_dict = {entry[0]: entry[1] for entry in re.compile(regex).findall(header_str)}
    for key in ['bitVolts', 'sampleRate']:
        header_dict[key] = float(header_dict[key])
    for key in ['blockLength', 'bufferSize', 'header_bytes', 'channel']:
        header_dict[key] = int(header_dict[key]) if not key == 'channel' else int(header_dict[key][2:])
    return header_dict
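
For illustration, here is what the regex pulls out of a hand-made header line; the values below are a mock-up in the spirit of the real header, not copied from a file.

# Mock header fragment (hypothetical values) showing the (key, value) pairs the regex yields.
mock = "header.sampleRate = 20000; header.channel = 'CH12';"
regex = r"header\.([\d\w\.\s]{1,}).=.\'*([^\;\']{1,})\'*"
print re.compile(regex).findall(mock)
# [('sampleRate', '20000'), ('channel', 'CH12')]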

In [11]:
def oe_read_records(fname, offset=0, count=10000):
    """ Read `count` records from a .continuous file, skipping the first
    `offset` records after the header.
    """
    with open(fname, 'rb') as fid:
        # move pointer to new position
        fid.seek(SIZE_HEADER + offset * SIZE_RECORD)

        # n times x B data
        data_dt = np.dtype([('timestamp', np.int64),
                            ('n_samples', np.uint16),
                            ('rec_num', np.uint16),
                            # !Note: endianness!
                            ('samples', ('>i2', NUM_SAMPLES)),
                            ('rec_mark', (np.uint8, 10))])

        data = np.fromfile(fid, dtype=data_dt, count=count)

    return data

Header + data example:


In [12]:
example = 'examples/2541_2014-03-13_20-45-41/100_CH12.continuous'
pprint.pprint(oe_read_header(example))


{'bitVolts': 0.194999993,
 'blockLength': 1024,
 'bufferSize': 1024,
 'channel': 12,
 'channelType': 'Continuous',
 'date_created': '13-Mar-2014 204546',
 'description': 'each record contains one 64-bit timestamp, one 16-bit sample count (N), 1 uint16 recordingNumber, N 16-bit samples, and one 10-byte record marker (0 1 2 3 4 5 6 7 8 255)',
 'format': 'Open Ephys Data Format',
 'header_bytes': 1024,
 'sampleRate': 20000.0,
 'version': '0.2'}

In [13]:
data = oe_read_records(example, offset=5, count=1000)
n = 2
plt.figure(figsize=(13, 5))
plot(data['samples'][:n].ravel())
xlim(0, 1024*n)
data['samples'].__array_interface__


Out[13]:
{'data': (140296749948956, False),
 'descr': [('', '>i2')],
 'shape': (1000, 1024),
 'strides': (2070, 2),
 'typestr': '>i2',
 'version': 3}

Sanity checks on the records read


In [14]:
# Timestamps should increase monotonically in steps of 1024
assert len(set(np.diff(data['timestamp']))) == 1 and np.diff(data['timestamp'][:2]) == 1024
print 'timestamps: ', data['timestamp'][0]

# Number of samples in each record should be NUM_SAMPLES, or 1024
assert len(set(data['n_samples'])) == 1 and data['n_samples'][0] == NUM_SAMPLES
print 'N samples: ', data['n_samples'][0]

# should be byte pattern [0...8, 255]
stringified = map(str, data['rec_mark']) # <- slow
assert len(set(stringified)) == 1 and str(data['rec_mark'][0]) == str(REC_MARKER)
print 'record marker: ', data['rec_mark'][0]

# should be zero, or there are multiple recordings in this file
assert len(set(data['rec_num'])) == 1 and data['rec_num'][0] == 0
print 'Number recording: ', data['rec_num'][0]


timestamps:  20024120
N samples:  1024
record marker:  [  0   1   2   3   4   5   6   7   8 255]
Number recording:  0

Plotting Open Ephys 0.2x files

Chunk-wise reading of all 64 neural channels.

The simplest approach is to seek and read chunk by chunk: no generators, no context managers held across reads, no files left open in between.

TODO: timings (especially whether the big-endian byte order affects read speed)


In [15]:
def data_to_buf(path, channels=64, count=1000, proc_node=100, channel_offset=500):
    base_end = '{proc_node:d}_CH{channel:d}.continuous'

    # channels to include, given either as a number of channels or as an iterable of indices
    channels = channels if np.iterable(channels) else list(range(channels))

    # temporary storage
    buf = np.zeros((len(channels), count * NUM_SAMPLES), dtype='>i2')

    # load a chunk of data from all channels
    print 'Reading chunk'
    for n in channels:
        fname = path + base_end.format(proc_node=proc_node, channel=n+1)
        # channel offset is a clunky shortcut to spread the traces out for plotting
        if channel_offset:
            buf[n] = oe_read_records(fname, count=count)['samples'].ravel() + channel_offset * (n-32)
        else:
            buf[n] = oe_read_records(fname, count=count)['samples'].ravel()

    return buf

In [16]:
# path to folder of current session
example_path = 'examples/2541_2014-03-13_20-45-41/'

# number of records to load at once
num_records = 1000

# load the first num_records records from ALL channels in the session folder
t = time.time()
tmp = data_to_buf(path=example_path, count=num_records).transpose()
size, elapsed = tmp.nbytes/1e6, time.time() - t

# Take speed with a grain of salt. Linux is smart enough to keep files in RAM once used.
print 'Read {0:.1f} MB in {1:.2f} s, {2:.2f} MB/s'.format(size, elapsed, size*1./elapsed)


Reading chunk
Read 131.1 MB in 5.43 s, 24.14 MB/s

In [17]:
plt.figure(figsize=(14, 18))
print tmp.shape
plot(tmp[5000:10000]);


(1024000, 64)

Conversion to .dat files

Ideally, channels should be taken from the probe file and matched with a list of dead/excluded channels.
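
As a stopgap until that bookkeeping exists, the exclusion could look like the sketch below; `dead_channels` is a hypothetical placeholder list (none have been assigned for this mouse, see the note at the top).

# Hypothetical dead-channel list; empty until bad channels are identified.
dead_channels = []

# Drop dead channels from each tetrode before conversion.
channels_clean = {t_num: [ch for ch in tetrode if ch not in dead_channels]
                  for t_num, tetrode in channels_2541.items()}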


In [21]:
def oe_to_dat(dirname, outfile='./proc/raw.dat', channels=64, proc_node=100, chunk_size=10000):
    # TODO: Check that no samples are getting lost, at start, chunk-borders or file end
    # TODO: Instead of channel list, give channel dict and write to appropriate raw file grouping
    #       This won't require re-reading for each tetrode. Not that it takes that long...
    import os
    start_t = time.time()
    fname_base = dirname + '{proc_node:d}_CH{channel:d}.continuous'
    size_header = 1024  # Bytes
    size_record = 2070  # Bytes
    num_samples = 1024  # per record

    # channels to include, given either as an iterable of channel indices or as a number of channels
    channels = channels if np.iterable(channels) else list(range(channels))
    
    # list of files
    fnames = [fname_base.format(proc_node=proc_node, channel=chan+1) for chan in channels]
    #print fnames
    
    # check that filesizes are equal
    fsizes = [os.path.getsize(fname) for fname in fnames]
    assert len(set(fsizes)) == 1
    
    # there should be an integer number of records, i.e. no leftover bytes!
    assert not (fsizes[0] - size_header) % size_record

    # calculate number of records from the file size
    num_records = (fsizes[0] - size_header) / size_record
    
    # pre-allocate array
    buf = np.zeros((len(channels), chunk_size*1024), dtype='>i2')
    #print 'Allocated {0:.2f} MB buffer'.format(buf.nbytes/1e6)
    
    with open(outfile, 'wb') as fid:
        # loop over all records, in chunk sizes
        records_left = num_records
        chunk_n = 0
        written = 0
        while records_left:
            count = min(records_left, chunk_size)
            offset = num_records - records_left
            #print '-> Reading chunk {0:d}'.format(chunk_n)
            for i, fname in enumerate(fnames):
                buf[i, 0:count*num_samples] = \
                oe_read_records(fname, count=count, offset=offset)['samples'].ravel()
            records_left -= count

            # write chunk of interleaved data
            #print '<- Writing chunk {0:d}'.format(chunk_n)
            if count == chunk_size:
                buf.transpose().tofile(fid)
            else:
                # We don't want to write the trailing zeros on the last chunk
                buf[:, 0:count*num_samples].transpose().tofile(fid)
            written += count * 2048 * len(channels)  # 2048 sample bytes per record and channel
            chunk_n += 1
            #print '{0:.0f}% completed.'.format((num_records-records_left)*100.0/num_records),
 
    elapsed = time.time() - start_t
    written /= 1e6
    speed = written/elapsed
    msg_str = '\nConverted {0:.2f} MB into {1:s}, took {2:.2f} s, effectively {3:.2f} MB/s'
    print msg_str.format(written, outfile, elapsed, speed)
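
A quick way to spot-check one of the resulting files is to read it back as interleaved int16 and look at the shape; note that the samples are written big-endian here, exactly as they come out of the .continuous records. A minimal sketch, using one of the tetrode files produced below:

def check_dat(fname, n_channels):
    # Read back an interleaved .dat file ('>i2', as written above) and report its shape.
    dat = np.fromfile(fname, dtype='>i2').reshape(-1, n_channels)
    print fname, dat.shape, dat.dtype
    return dat

#check_dat('./proc/TT1_Ch4-5-6-7.dat', 4)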

In [20]:
t = time.time()
for i, tt in channels_2541.items():
    outfile = ''.join(['./proc/TT', str(i), '_Ch' ,'-'.join([str(c) for c in tt]), '.dat'])
    oe_to_dat(example_path, outfile=outfile, channels=tt, chunk_size=50000)
print '\nTotal elapsed: {0:.2f} s'.format(time.time()-t)


Converted 384.93 MB into ./proc/TT1_Ch4-5-6-7.dat, took 8.82 s, effectively 43.67 MB/s


Converted 384.93 MB into ./proc/TT2_Ch1-2-3-8.dat, took 10.48 s, effectively 36.74 MB/s


Converted 384.93 MB into ./proc/TT3_Ch0-9-10-31.dat, took 8.12 s, effectively 47.38 MB/s


Converted 384.93 MB into ./proc/TT4_Ch11-12-29-30.dat, took 8.04 s, effectively 47.88 MB/s


Converted 384.93 MB into ./proc/TT5_Ch13-14-15-28.dat, took 8.01 s, effectively 48.05 MB/s


Converted 384.93 MB into ./proc/TT6_Ch16-17-18-27.dat, took 11.37 s, effectively 33.86 MB/s


Converted 384.93 MB into ./proc/TT7_Ch19-20-25-26.dat, took 11.17 s, effectively 34.46 MB/s


Converted 384.93 MB into ./proc/TT8_Ch21-22-23-24.dat, took 8.13 s, effectively 47.37 MB/s


Converted 384.93 MB into ./proc/TT9_Ch36-37-38-39.dat, took 8.04 s, effectively 47.85 MB/s


Converted 384.93 MB into ./proc/TT10_Ch33-34-35-40.dat, took 9.95 s, effectively 38.69 MB/s


Converted 384.93 MB into ./proc/TT11_Ch32-41-42-63.dat, took 8.29 s, effectively 46.45 MB/s


Converted 384.93 MB into ./proc/TT12_Ch43-44-61-62.dat, took 14.82 s, effectively 25.98 MB/s


Converted 384.93 MB into ./proc/TT13_Ch45-46-47-60.dat, took 7.97 s, effectively 48.32 MB/s


Converted 384.93 MB into ./proc/TT14_Ch48-49-50-59.dat, took 8.06 s, effectively 47.74 MB/s


Converted 384.93 MB into ./proc/TT15_Ch51-52-57-58.dat, took 8.20 s, effectively 46.93 MB/s


Converted 384.93 MB into ./proc/TT16_Ch53-54-55-56.dat, took 8.49 s, effectively 45.32 MB/s


Total elapsed: 147.98 s

In [24]:
%timeit buf = empty((64, 100000), dtype='>i2')


100000 loops, best of 3: 4.1 µs per loop

In [33]:
buf = zeros((1000, 1000), dtype='>i2')
buf.__array_interface__


Out[33]:
{'data': (140123308075120, False),
 'descr': [('', '>i2')],
 'shape': (1000, 1000),
 'strides': None,
 'typestr': '>i2',
 'version': 3}

In [35]:
buf[1, 900]


Out[35]:
0

In [ ]: