Performance comparisons on arrays between PyTables and h5py


In [1]:
import tables as tb
import h5py
import numpy as np

In [5]:
import tables as tb
import numpy as np
n, k = 200000, 100
ind = np.random.rand(n) < .01
ind = np.nonzero(ind)[0]
print len(ind), "rows to select"
for chunk in (1, 10, 100, 1000, 10000):
    f = tb.openFile("test", "w")
    a = f.createEArray('/', 'test', 
                       obj=np.random.rand(n, k),
                       chunkshape=(chunk, k))
    print "chunk =", chunk
    %timeit -r1 -n1 a[ind].reshape((-1, k))
    print
    f.close()


2007 rows to select
chunk= 1
1 loops, best of 1: 1.28 s per loop

chunk= 10
1 loops, best of 1: 554 ms per loop

chunk= 100
1 loops, best of 1: 370 ms per loop

chunk= 1000
1 loops, best of 1: 226 ms per loop

chunk= 1000
1 loops, best of 1: 240 ms per loop


In [10]:
# Dimensions of the benchmark array: n rows by k columns.
n = 100000
k = 100
shape = n, k

In [11]:
# Build the benchmark file "test": an EArray at /test filled with random
# values, stored with one full row per chunk (chunkshape=(1, k)).
# The context manager closes the file even if creation or an append fails.
with tb.openFile("test", "w") as f:
    # The first batch of n // 10 rows creates the array ...
    a = f.createEArray('/', 'test', obj=np.random.rand(n // 10, k),
                       chunkshape=(1, k))
    # ... and nine more batches of the same size are appended to it.
    for _ in range(9):
        a.append(np.random.rand(n // 10, k))

PyTables

1. Array[:n/10]

Slice.


In [12]:
with tb.openFile("test", "r") as f:
    a = f.root.test
    %timeit -r1 -n1 a[:n // 10]


1 loops, best of 1: 86.8 ms per loop

Indices (with PyTables hack: index with an integer array, then reshape the result to (rows, k)).


In [13]:
with tb.openFile("test", "r") as f:
    a = f.root.test
    ind = np.arange(n // 10)
    %timeit -r1 -n1 a[ind].reshape((n // 10, k))


1 loops, best of 1: 220 ms per loop

2. Array[::10]

Slice (note: the timed expression reads only column 0 of every 10th row).


In [14]:
with tb.openFile("test", "r") as f:
    a = f.root.test
    %timeit -r1 -n1 a[::10,0]


1 loops, best of 1: 10.4 s per loop

Indices (with PyTables hack: index with an integer array, then reshape the result to (rows, k)).


In [15]:
with tb.openFile("test", "r") as f:
    a = f.root.test
    ind = np.arange(0, n, 10)
    %timeit -r1 -n1 a[ind].reshape((n // 10, k))


1 loops, best of 1: 16.6 s per loop

3. Random subselection

p = 0.01 (each row is selected independently with probability p)


In [17]:
ind = np.random.rand(n) < .01
ind = np.nonzero(ind)[0]
with tb.openFile("test", "r") as f:
    a = f.root.test
    %timeit -r1 -n1 a[ind].reshape((-1, k))


1 loops, best of 1: 342 ms per loop

In [19]:
ind = np.random.rand(n) < .01
ind = np.nonzero(ind)[0]
with tb.openFile("test", "r") as f:
    a = f.root.test
    %timeit -r1 -n1 a[ind,0]


1 loops, best of 1: 332 ms per loop

p = 0.1


In [18]:
ind = np.random.rand(n) < .1
ind = np.nonzero(ind)[0]
with tb.openFile("test", "r") as f:
    a = f.root.test
    %timeit -r1 -n1 a[ind].reshape((-1, k))


1 loops, best of 1: 14.1 s per loop

h5py


In [ ]:
# Open the benchmark file read-only through h5py.
# NOTE(review): no close() for this handle appears in the visible cells, and
# the next cell rebinds `f` inside a `with` block — confirm this cell is
# still needed.
f = h5py.File("test", mode="r")

p = 0.1


In [20]:
ind = np.random.rand(n) < .1
ind = np.nonzero(ind)[0]
with h5py.File("test", "r") as f:
    a = f.get("test")
    %timeit -r1 -n1 a[ind, 0]


1 loops, best of 1: 16.8 s per loop

In [ ]: