There are lies, damn lies and benchmarks...

Setup


In [1]:
import zarr
zarr.__version__


Out[1]:
'2.2.0a2.dev22+dirty'

In [2]:
import bsddb3
bsddb3.__version__


Out[2]:
'6.2.5'

In [3]:
import lmdb
lmdb.__version__


Out[3]:
'0.93'

In [4]:
import numpy as np

In [5]:
import dbm.gnu
import dbm.ndbm

In [6]:
import os
import shutil
bench_dir = '../data/bench'


def clean():
    """Reset the benchmark directory: delete it if it exists, then create it empty."""
    already_present = os.path.isdir(bench_dir)
    if already_present:
        # drop everything left behind by an earlier run
        shutil.rmtree(bench_dir)
    os.makedirs(bench_dir)

    
def setup(a, name='foo/bar'):
    """Create one empty zarr array per storage backend, each laid out like ``a``.

    The arrays are published as the module-level globals ``fdict_z``,
    ``hdict_z``, ``lmdb_z``, ``gdbm_z``, ``ndbm_z``, ``bdbm_btree_z``,
    ``bdbm_hash_z``, ``zip_z`` and ``dir_z`` so the benchmark cells can
    refer to them directly.

    Parameters
    ----------
    a : array-like
        Template passed to ``empty_like``; also written into ``fdict_z``.
    name : str
        Path of the array inside each root group.

    Returns
    -------
    The ``info`` report of ``fdict_z`` after ``a`` has been stored in it,
    which is used to check the compression ratio.
    """
    global fdict_z, hdict_z, lmdb_z, gdbm_z, ndbm_z, bdbm_btree_z, bdbm_hash_z, zip_z, dir_z

    # setup() is called once per input array. Stores opened by a previous call
    # still hold open handles on files under bench_dir; close them before
    # clean() deletes those files and new stores are opened on the same paths.
    for var_name in ('fdict_z', 'hdict_z', 'lmdb_z', 'gdbm_z', 'ndbm_z',
                     'bdbm_btree_z', 'bdbm_hash_z', 'zip_z', 'dir_z'):
        stale = globals().get(var_name)
        close = getattr(getattr(stale, 'store', None), 'close', None)
        if callable(close):
            close()

    clean()
    fdict_root = zarr.group(store=dict())
    hdict_root = zarr.group(store=zarr.DictStore())
    lmdb_root = zarr.group(store=zarr.LMDBStore(os.path.join(bench_dir, 'lmdb')))
    gdbm_root = zarr.group(store=zarr.DBMStore(os.path.join(bench_dir, 'gdbm'), open=dbm.gnu.open))
    ndbm_root = zarr.group(store=zarr.DBMStore(os.path.join(bench_dir, 'ndbm'), open=dbm.ndbm.open))
    bdbm_btree_root = zarr.group(store=zarr.DBMStore(os.path.join(bench_dir, 'bdbm_btree'), open=bsddb3.btopen))
    bdbm_hash_root = zarr.group(store=zarr.DBMStore(os.path.join(bench_dir, 'bdbm_hash'), open=bsddb3.hashopen))
    zip_root = zarr.group(store=zarr.ZipStore(os.path.join(bench_dir, 'zip'), mode='w'))
    dir_root = zarr.group(store=zarr.DirectoryStore(os.path.join(bench_dir, 'dir')))

    fdict_z = fdict_root.empty_like(name, a)
    hdict_z = hdict_root.empty_like(name, a)
    lmdb_z = lmdb_root.empty_like(name, a)
    gdbm_z = gdbm_root.empty_like(name, a)
    ndbm_z = ndbm_root.empty_like(name, a)
    bdbm_btree_z = bdbm_btree_root.empty_like(name, a)
    bdbm_hash_z = bdbm_hash_root.empty_like(name, a)
    zip_z = zip_root.empty_like(name, a)
    dir_z = dir_root.empty_like(name, a)

    # check compression ratio
    fdict_z[:] = a
    return fdict_z.info

Main benchmarks


In [7]:
def save(a, z):
    """Write all of ``a`` into array ``z``, then flush the backing store if it can be flushed."""
    target_store = z.store
    if isinstance(target_store, zarr.ZipStore):
        # empty the zip first so repeated saves do not accumulate duplicate entries
        target_store.clear()
    z[:] = a
    if hasattr(target_store, 'flush'):
        target_store.flush()
    
    
def load(z, a):
    """Read array ``z`` into the existing buffer ``a`` via ``get_basic_selection(out=a)``."""
    read_into = z.get_basic_selection
    read_into(out=a)

arange


In [8]:
# Benchmark input 1: consecutive integers, which compress very well
# (see the storage ratio in the info report returned by setup).
a = np.arange(500000000)
setup(a)


Out[8]:
Name/foo/bar
Typezarr.core.Array
Data typeint64
Shape(500000000,)
Chunk shape(488282,)
OrderC
Read-onlyFalse
CompressorBlosc(cname='lz4', clevel=5, shuffle=SHUFFLE, blocksize=0)
Store typebuiltins.dict
No. bytes4000000000 (3.7G)
No. bytes stored59269657 (56.5M)
Storage ratio67.5
Chunks initialized1024/1024

save


In [9]:
%timeit save(a, fdict_z)


324 ms ± 60.9 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)

In [10]:
%timeit save(a, hdict_z)


302 ms ± 11.6 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)

In [11]:
%timeit save(a, lmdb_z)


316 ms ± 12.6 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)

In [12]:
%timeit save(a, gdbm_z)


938 ms ± 111 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)

In [13]:
%timeit save(a, ndbm_z)


406 ms ± 8.93 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)

In [14]:
%timeit save(a, bdbm_btree_z)


1.43 s ± 156 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)

In [15]:
%timeit save(a, bdbm_hash_z)


1.24 s ± 260 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)

In [16]:
%timeit save(a, zip_z)


519 ms ± 59.7 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)

In [17]:
%timeit save(a, dir_z)


609 ms ± 48.9 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)

load


In [18]:
%timeit load(fdict_z, a)


454 ms ± 56.8 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)

In [19]:
%timeit load(hdict_z, a)


428 ms ± 13.1 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)

In [20]:
%timeit load(lmdb_z, a)


429 ms ± 19.1 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)

In [21]:
%timeit load(gdbm_z, a)


459 ms ± 10 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)

In [22]:
%timeit load(ndbm_z, a)


473 ms ± 5.71 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)

In [23]:
%timeit load(bdbm_btree_z, a)


504 ms ± 8 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)

In [24]:
%timeit load(bdbm_hash_z, a)


519 ms ± 9.59 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)

In [25]:
%timeit load(zip_z, a)


575 ms ± 12.5 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)

In [26]:
%timeit load(dir_z, a)


494 ms ± 10.7 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)

randint


In [28]:
# Benchmark input 2: seeded random integers drawn from [0, 2**30), which
# compress far less well than the arange input (see the storage ratio in
# the info report returned by setup).
np.random.seed(42)
a = np.random.randint(0, 2**30, size=500000000)
setup(a)


Out[28]:
Name/foo/bar
Typezarr.core.Array
Data typeint64
Shape(500000000,)
Chunk shape(488282,)
OrderC
Read-onlyFalse
CompressorBlosc(cname='lz4', clevel=5, shuffle=SHUFFLE, blocksize=0)
Store typebuiltins.dict
No. bytes4000000000 (3.7G)
No. bytes stored2020785466 (1.9G)
Storage ratio2.0
Chunks initialized1024/1024

save


In [29]:
%timeit -r3 save(a, fdict_z)


670 ms ± 78.1 ms per loop (mean ± std. dev. of 3 runs, 1 loop each)

In [30]:
%timeit -r3 save(a, hdict_z)


611 ms ± 6.11 ms per loop (mean ± std. dev. of 3 runs, 1 loop each)

In [31]:
%timeit -r3 save(a, lmdb_z)


846 ms ± 24 ms per loop (mean ± std. dev. of 3 runs, 1 loop each)

In [32]:
%timeit -r3 save(a, gdbm_z)


6.35 s ± 785 ms per loop (mean ± std. dev. of 3 runs, 1 loop each)

In [33]:
%timeit -r3 save(a, ndbm_z)


4.62 s ± 1.09 s per loop (mean ± std. dev. of 3 runs, 1 loop each)

In [34]:
%timeit -r3 save(a, bdbm_btree_z)


7.84 s ± 1.66 s per loop (mean ± std. dev. of 3 runs, 1 loop each)

In [35]:
%timeit -r3 save(a, bdbm_hash_z)


6.49 s ± 808 ms per loop (mean ± std. dev. of 3 runs, 1 loop each)

In [36]:
%timeit -r3 save(a, zip_z)


3.68 s ± 441 ms per loop (mean ± std. dev. of 3 runs, 1 loop each)

In [38]:
%timeit -r3 save(a, dir_z)


3.55 s ± 1.24 s per loop (mean ± std. dev. of 3 runs, 1 loop each)

load


In [39]:
%timeit -r3 load(fdict_z, a)


566 ms ± 72.8 ms per loop (mean ± std. dev. of 3 runs, 1 loop each)

In [40]:
%timeit -r3 load(hdict_z, a)


521 ms ± 16.1 ms per loop (mean ± std. dev. of 3 runs, 1 loop each)

In [41]:
%timeit -r3 load(lmdb_z, a)


532 ms ± 16.1 ms per loop (mean ± std. dev. of 3 runs, 1 loop each)

In [42]:
%timeit -r3 load(gdbm_z, a)


1.2 s ± 10.9 ms per loop (mean ± std. dev. of 3 runs, 1 loop each)

In [43]:
%timeit -r3 load(ndbm_z, a)


1.18 s ± 13.2 ms per loop (mean ± std. dev. of 3 runs, 1 loop each)

In [44]:
%timeit -r3 load(bdbm_btree_z, a)


1.59 s ± 16.7 ms per loop (mean ± std. dev. of 3 runs, 1 loop each)

In [45]:
%timeit -r3 load(bdbm_hash_z, a)


1.61 s ± 7.31 ms per loop (mean ± std. dev. of 3 runs, 1 loop each)

In [46]:
%timeit -r3 load(zip_z, a)


2.33 s ± 19.8 ms per loop (mean ± std. dev. of 3 runs, 1 loop each)

In [47]:
%timeit -r3 load(dir_z, a)


790 ms ± 56 ms per loop (mean ± std. dev. of 3 runs, 1 loop each)

dask


In [48]:
import dask.array as da

In [50]:
def dask_op(source, sink, chunks=None):
    """Run a small dask computation that reads from ``source`` and writes to ``sink``.

    ``source`` is wrapped as a dask array, ``(x // 2) * 2`` is applied, and the
    result is stored into ``sink``. When ``chunks`` is not given, the chunking
    of ``sink`` is used, falling back to that of ``source`` if ``sink`` has no
    ``chunks`` attribute. The sink's store is flushed afterwards if it has a
    ``flush`` method.
    """
    sink_store = sink.store
    if isinstance(sink_store, zarr.ZipStore):
        sink_store.clear()

    if chunks is None:
        try:
            chunks = sink.chunks
        except AttributeError:
            chunks = source.chunks

    lazy_source = da.from_array(
        source,
        chunks=chunks,
        asarray=False,
        fancy=False,
        lock=False,
    )
    rounded_down = (lazy_source // 2) * 2
    da.store(rounded_down, sink, lock=False)

    if hasattr(sink_store, 'flush'):
        sink_store.flush()

Compare sources


In [76]:
%time dask_op(fdict_z, fdict_z)


CPU times: user 15.6 s, sys: 1.8 s, total: 17.4 s
Wall time: 3.07 s

In [77]:
%time dask_op(hdict_z, fdict_z)


CPU times: user 16.5 s, sys: 104 ms, total: 16.6 s
Wall time: 2.59 s

In [78]:
%time dask_op(lmdb_z, fdict_z)


CPU times: user 15.1 s, sys: 524 ms, total: 15.6 s
Wall time: 3.02 s

In [79]:
%time dask_op(gdbm_z, fdict_z)


CPU times: user 16.5 s, sys: 712 ms, total: 17.2 s
Wall time: 3.13 s

In [80]:
%time dask_op(ndbm_z, fdict_z)


CPU times: user 16.3 s, sys: 604 ms, total: 16.9 s
Wall time: 3.22 s

In [81]:
%time dask_op(bdbm_btree_z, fdict_z)


CPU times: user 19.6 s, sys: 1.24 s, total: 20.9 s
Wall time: 3.27 s

In [82]:
%time dask_op(bdbm_hash_z, fdict_z)


CPU times: user 20.3 s, sys: 1.08 s, total: 21.4 s
Wall time: 3.53 s

In [83]:
%time dask_op(zip_z, fdict_z)


CPU times: user 15.7 s, sys: 700 ms, total: 16.4 s
Wall time: 3.12 s

In [84]:
%time dask_op(dir_z, fdict_z)


CPU times: user 17.4 s, sys: 1.08 s, total: 18.5 s
Wall time: 2.91 s

Compare sinks


In [51]:
%time dask_op(fdict_z, hdict_z)


CPU times: user 15.8 s, sys: 1.4 s, total: 17.2 s
Wall time: 3.04 s

In [52]:
%time dask_op(fdict_z, lmdb_z)


CPU times: user 16.2 s, sys: 1.6 s, total: 17.8 s
Wall time: 2.71 s

In [59]:
%time dask_op(fdict_z, gdbm_z)


CPU times: user 16.8 s, sys: 3.05 s, total: 19.8 s
Wall time: 8.01 s

In [54]:
%time dask_op(fdict_z, ndbm_z)


CPU times: user 17.9 s, sys: 3.01 s, total: 20.9 s
Wall time: 5.46 s

In [61]:
%time dask_op(fdict_z, bdbm_btree_z)


CPU times: user 13.8 s, sys: 3.39 s, total: 17.2 s
Wall time: 7.87 s

In [56]:
%time dask_op(fdict_z, bdbm_hash_z)


CPU times: user 13.9 s, sys: 3.27 s, total: 17.2 s
Wall time: 6.73 s

In [57]:
%time dask_op(fdict_z, zip_z)


CPU times: user 13.9 s, sys: 2.5 s, total: 16.4 s
Wall time: 3.8 s

In [58]:
%time dask_op(fdict_z, dir_z)


CPU times: user 15.7 s, sys: 3.72 s, total: 19.4 s
Wall time: 3.1 s

In [62]:
# Close the stores that setup() opened on files under bench_dir
# (LMDB, the four DBM variants and the zip file).
lmdb_z.store.close()
gdbm_z.store.close()
ndbm_z.store.close()
bdbm_btree_z.store.close()
bdbm_hash_z.store.close()
zip_z.store.close()

In [ ]: