In [1]:
import sys
sys.path.insert(0, '..')
import zarr
import numpy as np
np.random.seed(42)
import cProfile
zarr.__version__
Out[1]:
In [2]:
a = np.arange(10)
za = zarr.array(a, chunks=2)
ix = [False, True, False, True, False, True, False, True, False, True]
In [3]:
# get items
za.vindex[ix]
Out[3]:
In [4]:
# get items
za.oindex[ix]
Out[4]:
In [5]:
# set items
za.vindex[ix] = a[ix] * 10
za[:]
Out[5]:
In [6]:
# set items
za.oindex[ix] = a[ix] * 100
za[:]
Out[6]:
In [7]:
# if using .oindex, indexing array can be any array-like, e.g., Zarr array
zix = zarr.array(ix, chunks=2)
za = zarr.array(a, chunks=2)
za.oindex[zix] # will not load all zix into memory
Out[7]:
In [8]:
a = np.arange(10)
za = zarr.array(a, chunks=2)
ix = [1, 3, 5, 7, 9]
In [9]:
# get items
za.vindex[ix]
Out[9]:
In [10]:
# get items
za.oindex[ix]
Out[10]:
In [11]:
# set items
za.vindex[ix] = a[ix] * 10
za[:]
Out[11]:
In [12]:
# set items
za.oindex[ix] = a[ix] * 100
za[:]
Out[12]:
In [13]:
a = np.arange(10)
za = zarr.array(a, chunks=2)
ix = np.array([[1, 3, 5], [2, 4, 6]])
In [14]:
# get items
za.vindex[ix]
Out[14]:
In [15]:
# set items
za.vindex[ix] = a[ix] * 10
za[:]
Out[15]:
In [16]:
a = np.arange(10)
za = zarr.array(a, chunks=2)
In [17]:
# get items
za[1::2]
Out[17]:
In [18]:
# set items
za.oindex[1::2] = a[1::2] * 10
za[:]
Out[18]:
Orthogonal (a.k.a. outer) indexing is supported with either Boolean or integer arrays, in combination with integers and slices. This functionality is provided via the get/set_orthogonal_selection() methods. For convenience, this functionality is also available via the .oindex[] property.
In [19]:
a = np.arange(15).reshape(5, 3)
za = zarr.array(a, chunks=(3, 2))
za[:]
Out[19]:
In [20]:
# orthogonal indexing with Boolean arrays
ix0 = [False, True, False, True, False]
ix1 = [True, False, True]
za.get_orthogonal_selection((ix0, ix1))
Out[20]:
In [21]:
# alternative API
za.oindex[ix0, ix1]
Out[21]:
In [22]:
# orthogonal indexing with integer arrays
ix0 = [1, 3]
ix1 = [0, 2]
za.get_orthogonal_selection((ix0, ix1))
Out[22]:
In [23]:
# alternative API
za.oindex[ix0, ix1]
Out[23]:
In [24]:
# combine with slice
za.oindex[[1, 3], :]
Out[24]:
In [25]:
# combine with slice
za.oindex[:, [0, 2]]
Out[25]:
In [26]:
# set items via Boolean selection
ix0 = [False, True, False, True, False]
ix1 = [True, False, True]
selection = ix0, ix1
value = 42
za.set_orthogonal_selection(selection, value)
za[:]
Out[26]:
In [27]:
# alternative API
za.oindex[ix0, ix1] = 44
za[:]
Out[27]:
In [28]:
# set items via integer selection
ix0 = [1, 3]
ix1 = [0, 2]
selection = ix0, ix1
value = 46
za.set_orthogonal_selection(selection, value)
za[:]
Out[28]:
In [29]:
# alternative API
za.oindex[ix0, ix1] = 48
za[:]
Out[29]:
Selecting arbitrary points from a multi-dimensional array by indexing with integer (coordinate) arrays is supported. This functionality is provided via the get/set_coordinate_selection() methods. For convenience, this functionality is also available via the .vindex[] property.
In [30]:
a = np.arange(15).reshape(5, 3)
za = zarr.array(a, chunks=(3, 2))
za[:]
Out[30]:
In [31]:
# get items
ix0 = [1, 3]
ix1 = [0, 2]
za.get_coordinate_selection((ix0, ix1))
Out[31]:
In [32]:
# alternative API
za.vindex[ix0, ix1]
Out[32]:
In [33]:
# set items
za.set_coordinate_selection((ix0, ix1), 42)
za[:]
Out[33]:
In [34]:
# alternative API
za.vindex[ix0, ix1] = 44
za[:]
Out[34]:
In [35]:
a = np.arange(15).reshape(5, 3)
za = zarr.array(a, chunks=(3, 2))
za[:]
Out[35]:
In [36]:
ix = np.zeros_like(a, dtype=bool)
ix[1, 0] = True
ix[3, 2] = True
za.get_mask_selection(ix)
Out[36]:
In [37]:
za.vindex[ix]
Out[37]:
In [38]:
za.set_mask_selection(ix, 42)
za[:]
Out[38]:
In [39]:
za.vindex[ix] = 44
za[:]
Out[39]:
In [42]:
# structured (record) array with three named fields:
# 'foo' (3-byte string), 'bar' (32-bit int) and 'baz' (64-bit float)
a = np.array([(b'aaa', 1, 4.2),
(b'bbb', 2, 8.4),
(b'ccc', 3, 12.6)],
dtype=[('foo', 'S3'), ('bar', 'i4'), ('baz', 'f8')])
# store the three records in a Zarr array, two records per chunk
za = zarr.array(a, chunks=2, fill_value=None)
za[:]
Out[42]:
In [43]:
za['foo']
Out[43]:
In [44]:
za['foo', 'baz']
Out[44]:
In [45]:
za[:2, 'foo']
Out[45]:
In [46]:
za[:2, 'foo', 'baz']
Out[46]:
In [47]:
za.oindex[[0, 2], 'foo']
Out[47]:
In [48]:
za.vindex[[0, 2], 'foo']
Out[48]:
In [49]:
za['bar'] = 42
za[:]
Out[49]:
In [50]:
za[:2, 'bar'] = 84
za[:]
Out[50]:
Note that this API differs from numpy when selecting multiple fields. E.g.:
In [51]:
a['foo', 'baz']
In [52]:
a[['foo', 'baz']]
Out[52]:
In [53]:
za['foo', 'baz']
Out[53]:
In [54]:
za[['foo', 'baz']]
In [53]:
# 1D array of 100 million consecutive integers; display its in-memory size in bytes
c = np.arange(100000000)
c.nbytes
Out[53]:
In [54]:
%time zc = zarr.array(c)
zc.info
Out[54]:
In [55]:
%timeit c.copy()
In [56]:
%timeit zc[:]
In [57]:
# relatively dense selection - 10%
ix_dense_bool = np.random.binomial(1, 0.1, size=c.shape[0]).astype(bool)
np.count_nonzero(ix_dense_bool)
Out[57]:
In [58]:
%timeit c[ix_dense_bool]
In [59]:
%timeit zc.oindex[ix_dense_bool]
In [60]:
%timeit zc.vindex[ix_dense_bool]
In [61]:
import tempfile
import cProfile
import pstats
def profile(statement, sort='time', restrictions=(7,)):
    """Profile a statement with cProfile and print a summary of the hot spots.

    Parameters
    ----------
    statement : str
        Code to execute under the profiler. It is executed in the
        ``__main__`` namespace, so it can refer to notebook variables.
    sort : str, optional
        Sort key passed to ``pstats.Stats.sort_stats``.
    restrictions : tuple, optional
        Arguments forwarded to ``pstats.Stats.print_stats`` to limit the
        report; the default ``(7,)`` prints the first 7 entries.

    Returns
    -------
    None
        The report is printed; nothing is returned.
    """
    # Keep the statistics in memory rather than round-tripping through a
    # NamedTemporaryFile: re-opening such a file by name while it is still
    # open is not portable (it fails on Windows) and the file adds nothing.
    profiler = cProfile.Profile()
    profiler.run(statement)
    pstats.Stats(profiler).sort_stats(sort).print_stats(*restrictions)
In [62]:
profile('zc.oindex[ix_dense_bool]')
NumPy's `nonzero` method is called internally to convert Boolean selections into integer selections; there is no way to avoid this.
In [63]:
profile('zc.vindex[ix_dense_bool]')
.vindex[] is a bit slower, possibly because internally it converts to a coordinate array first.
In [64]:
# dense integer selection: one tenth as many indices as there are items in c,
# drawn with replacement, plus an independently sorted copy of the same indices
ix_dense_int = np.random.choice(len(c), size=len(c) // 10, replace=True)
ix_dense_int_sorted = np.sort(ix_dense_int)
len(ix_dense_int)
Out[64]:
In [65]:
%timeit c[ix_dense_int_sorted]
In [66]:
%timeit zc.oindex[ix_dense_int_sorted]
In [67]:
%timeit zc.vindex[ix_dense_int_sorted]
In [68]:
%timeit c[ix_dense_int]
In [69]:
%timeit zc.oindex[ix_dense_int]
In [70]:
%timeit zc.vindex[ix_dense_int]
In [71]:
profile('zc.oindex[ix_dense_int_sorted]')
In [72]:
profile('zc.vindex[ix_dense_int_sorted]')
In [73]:
profile('zc.oindex[ix_dense_int]')
In [74]:
profile('zc.vindex[ix_dense_int]')
When indices are not sorted, zarr needs to partially sort them so they occur in chunk order, so we only have to visit each chunk once. This sorting dominates the processing time and is unavoidable AFAIK.
In [75]:
# relatively sparse selection
ix_sparse_bool = np.random.binomial(1, 0.0001, size=c.shape[0]).astype(bool)
np.count_nonzero(ix_sparse_bool)
Out[75]:
In [76]:
%timeit c[ix_sparse_bool]
In [77]:
%timeit zc.oindex[ix_sparse_bool]
In [78]:
%timeit zc.vindex[ix_sparse_bool]
In [79]:
profile('zc.oindex[ix_sparse_bool]')
In [80]:
profile('zc.vindex[ix_sparse_bool]')
In [81]:
# sparse integer selection: one index per 10000 items in c, drawn with
# replacement, plus an independently sorted copy of the same indices
ix_sparse_int = np.random.choice(len(c), size=len(c) // 10000, replace=True)
ix_sparse_int_sorted = np.sort(ix_sparse_int)
len(ix_sparse_int)
Out[81]:
In [82]:
%timeit c[ix_sparse_int_sorted]
In [83]:
%timeit c[ix_sparse_int]
In [84]:
%timeit zc.oindex[ix_sparse_int_sorted]
In [85]:
%timeit zc.vindex[ix_sparse_int_sorted]
In [86]:
%timeit zc.oindex[ix_sparse_int]
In [87]:
%timeit zc.vindex[ix_sparse_int]
In [88]:
profile('zc.oindex[ix_sparse_int]')
In [89]:
profile('zc.vindex[ix_sparse_int]')
For sparse selections, processing time is dominated by decompression, so we can't do any better.
In [90]:
zix_sparse_bool = zarr.array(ix_sparse_bool)
zix_sparse_bool.info
Out[90]:
In [91]:
%timeit zc.oindex[zix_sparse_bool]
In [92]:
%timeit np.array(c[::2])
In [93]:
%timeit zc[::2]
In [94]:
%timeit zc[::10]
In [95]:
%timeit zc[::100]
In [96]:
%timeit zc[::1000]
In [97]:
profile('zc[::2]')
In [99]:
c.shape
Out[99]:
In [100]:
d = c.reshape(-1, 1000)
d.shape
Out[100]:
In [101]:
zd = zarr.array(d)
zd.info
Out[101]:
In [102]:
# one random Boolean mask per dimension of d; each entry is True with probability 0.5
ix0 = np.random.binomial(1, 0.5, size=d.shape[0]).astype(bool)
ix1 = np.random.binomial(1, 0.5, size=d.shape[1]).astype(bool)
In [103]:
%timeit d[np.ix_(ix0, ix1)]
In [104]:
%timeit zd.oindex[ix0, ix1]
In [105]:
# one random integer index array per dimension of d, half as long as that
# dimension and drawn with replacement (so indices are unsorted and may repeat)
ix0 = np.random.choice(d.shape[0], size=int(d.shape[0] * .5), replace=True)
ix1 = np.random.choice(d.shape[1], size=int(d.shape[1] * .5), replace=True)
In [106]:
%timeit d[np.ix_(ix0, ix1)]
In [107]:
%timeit zd.oindex[ix0, ix1]
In [108]:
# number of points to select: 10% of the total number of elements in d
n = int(d.size * .1)
# random (row, column) coordinates for the n points, drawn with replacement
ix0 = np.random.choice(d.shape[0], size=n, replace=True)
ix1 = np.random.choice(d.shape[1], size=n, replace=True)
n
Out[108]:
In [109]:
%timeit d[ix0, ix1]
In [110]:
%timeit zd.vindex[ix0, ix1]
In [111]:
profile('zd.vindex[ix0, ix1]')
Points need to be partially sorted so all points in the same chunk are grouped and processed together. This requires argsort which dominates time.
In [65]:
import h5py
import tempfile
In [78]:
# mode='w' is given explicitly: h5py 3.x opens files read-only by default,
# which fails here because the file does not exist yet. Per the h5py docs,
# driver='core' with backing_store=False keeps the data in memory and
# discards it on close, so the temporary name only serves as an identifier.
h5f = h5py.File(tempfile.mktemp(), mode='w', driver='core', backing_store=False)
In [79]:
# HDF5 dataset holding the same data as c, using zc's chunk shape,
# gzip compression (compression_opts=1) and the shuffle filter
hc = h5f.create_dataset('c', data=c, compression='gzip', compression_opts=1, chunks=zc.chunks, shuffle=True)
hc
Out[79]:
In [80]:
%time hc[:]
Out[80]:
In [81]:
%time hc[ix_sparse_bool]
Out[81]:
In [82]:
# # this is pathological, takes minutes
# %time hc[ix_dense_bool]
In [83]:
# this is pretty slow
%time hc[::1000]
Out[83]:
In [ ]: