In [1]:
import os
import timeit
import tables as tb
import numpy as np

In [2]:
def create_contiguous(filename):
    with tb.openFile(filename, "w") as f:
        a = f.createArray('/', 'test', atom=tb.Float32Atom(),
                          shape=(n,k))
        n_ = n//10
        for i in range(10):
            print i,
            a[i*n_:(i+1)*n_,...] = np.random.rand(n_, k)

In [3]:
def create_chunked(filename):
    with tb.openFile(filename, "w") as f:
        a = f.createEArray('/', 'test', atom=tb.Float32Atom(),
                           shape=(0,k), chunkshape=(100, k))
        n_ = n//10
        for i in range(10):
            print i,
            a.append(np.random.rand(n_, k))

In [4]:
# Time each callable in `funcs` once against the 'test' node of an HDF5 file
# opened read-only.
# NOTE(review): `filename` is not a parameter; it is looked up as a notebook
# global, and the visible cells only define filename_contiguous and
# filename_chunked -- confirm it is assigned before this is called.
def benchmark(*funcs):
    with tb.openFile(filename, "r") as f:
        a = f.root.test
        for func in funcs:
            # -r1 -n1: a single repeat of a single loop, i.e. one timed call.
            # NOTE(review): depending on the IPython version, %timeit may
            # evaluate its statement in the notebook's global namespace
            # instead of this function's locals -- verify that `func` and
            # `a` resolve to the local values here.
            %timeit -r1 -n1 func(a)

Create the array


In [5]:
# Dimensions of the test arrays; both creation functions use shape (n, k).
n = 5000000  # number of rows
k = 100      # number of columns

In [6]:
# Build the contiguous test file only when it is not already on disk, so
# re-running the notebook reuses the existing file instead of regenerating it.
# NOTE: only existence is checked -- a file left behind by an interrupted or
# differently-sized earlier run would be reused as-is.
filename_contiguous = 'features_contiguous.h5'
if not os.path.exists(filename_contiguous):
    create_contiguous(filename_contiguous)


0 1 2 3 4 5 6 7 8 9

In [7]:
# Build the chunked test file only when it is not already on disk, so
# re-running the notebook reuses the existing file instead of regenerating it.
# NOTE: only existence is checked -- a file left behind by an interrupted or
# differently-sized earlier run would be reused as-is.
filename_chunked = 'features_chunked.h5'
if not os.path.exists(filename_chunked):
    create_chunked(filename_chunked)


0 1 2 3 4 5 6 7 8 9

Create the indices


In [8]:
# Draw 50000 random row numbers in [0, n) and keep the distinct ones;
# np.unique also returns them in ascending order.
ind = np.unique(np.random.randint(low=0, high=n, size=50000))

Contiguous

Solution 1: fancy indexing, slow


In [9]:
def read1(a, indices=None):
    """Read the selected rows of `a` with a single fancy-indexing operation.

    Parameters
    ----------
    a : object indexable as ``a[rows, :]``
        For example the 'test' node of one of the HDF5 files, or a NumPy array.
    indices : sequence of int, optional
        Row numbers to read.  When omitted, the notebook-global `ind` is
        used, so existing ``read1(a)`` calls behave as before.

    Returns
    -------
    Whatever ``a[indices, :]`` yields for the given `a`.
    """
    if indices is None:
        # Looked up at call time so the most recent `ind` is used.
        indices = ind
    return a[indices, :]
with tb.openFile(filename_contiguous, "r") as f:
    a = f.root.test
    %timeit -r1 -n1 read1(a)

Solution 2: for loop


In [10]:
def read2(a, indices=None):
    """Read the selected rows of `a` one at a time into a new NumPy array.

    Parameters
    ----------
    a : object with ``shape`` and ``dtype`` that supports slice indexing
        For example the 'test' node of one of the HDF5 files, or a NumPy array.
    indices : sequence of int, optional
        Row numbers to read, in output order.  When omitted, the
        notebook-global `ind` is used, so existing ``read2(a)`` calls behave
        as before.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(len(indices),) + a.shape[1:]`` and dtype
        ``a.dtype``; row ``j`` holds ``a[indices[j]]``.
    """
    if indices is None:
        # Looked up at call time so the most recent `ind` is used.
        indices = ind
    out = np.empty((len(indices),) + a.shape[1:], dtype=a.dtype)
    for j, i in enumerate(indices):
        # One-row slices on both sides: each iteration is a plain slice read
        # from `a` rather than a fancy-indexing selection.
        out[j:j + 1, ...] = a[i:i + 1, ...]
    return out

In [11]:
# Time the row-by-row loop (read2) on the contiguous file.  -r1 -n1 requests
# a single repeat of a single loop, since one pass already takes tens of
# seconds.
with tb.openFile(filename_contiguous, "r") as f:
    a = f.root.test
    %timeit -r1 -n1 read2(a)


1 loops, best of 1: 22.6 s per loop

Chunked

with tb.openFile(filename_chunked, "r") as f:
    a = f.root.test
    %timeit -r1 -n1 read1(a)

In [12]:
# Time the row-by-row loop (read2) on the chunked file, with the same single
# run (-r1 -n1) as the contiguous measurement so the two are comparable.
with tb.openFile(filename_chunked, "r") as f:
    a = f.root.test
    %timeit -r1 -n1 read2(a)


1 loops, best of 1: 22.5 s per loop