Object arrays

See #212 for more information.


In [1]:
import numpy as np

In [2]:
import zarr
zarr.__version__


Out[2]:
'2.2.0a2.dev82+dirty'

In [3]:
import numcodecs
numcodecs.__version__


Out[3]:
'0.5.0'

API changes in Zarr version 2.2

Creating an object array now requires providing the new object_codec argument:


In [4]:
z = zarr.empty(10, chunks=5, dtype=object, object_codec=numcodecs.MsgPack())
z


Out[4]:
<zarr.core.Array (10,) object>

To maintain backwards compatibility with previously-created data, the object codec is treated as a filter and inserted as the first filter in the chain:


In [5]:
z.info


Out[5]:
Type               : zarr.core.Array
Data type          : object
Shape              : (10,)
Chunk shape        : (5,)
Order              : C
Read-only          : False
Filter [0]         : MsgPack(encoding='utf-8')
Compressor         : Blosc(cname='lz4', clevel=5, shuffle=SHUFFLE, blocksize=0)
Store type         : builtins.dict
No. bytes          : 80
No. bytes stored   : 396
Storage ratio      : 0.2
Chunks initialized : 0/2

In [6]:
z[0] = 'foo'
z[1] = b'bar'  # msgpack doesn't support bytes objects correctly
z[2] = 1
z[3] = [2, 4, 6, 'baz']
z[4] = {'a': 'b', 'c': 'd'}
a = z[:]
a


Out[6]:
array(['foo', 'bar', 1, list([2, 4, 6, 'baz']), {'a': 'b', 'c': 'd'}, None,
       None, None, None, None], dtype=object)

If no object_codec is provided, a ValueError is raised:


In [7]:
z = zarr.empty(10, chunks=5, dtype=object)


---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
<ipython-input-7-ddcd85b97ce0> in <module>()
----> 1 z = zarr.empty(10, chunks=5, dtype=object)

~/src/github/alimanfoo/zarr/zarr/creation.py in empty(shape, **kwargs)
    204 
    205     """
--> 206     return create(shape=shape, fill_value=None, **kwargs)
    207 
    208 

~/src/github/alimanfoo/zarr/zarr/creation.py in create(shape, chunks, dtype, compressor, fill_value, order, store, synchronizer, overwrite, path, chunk_store, filters, cache_metadata, read_only, object_codec, **kwargs)
    112     init_array(store, shape=shape, chunks=chunks, dtype=dtype, compressor=compressor,
    113                fill_value=fill_value, order=order, overwrite=overwrite, path=path,
--> 114                chunk_store=chunk_store, filters=filters, object_codec=object_codec)
    115 
    116     # instantiate array

~/src/github/alimanfoo/zarr/zarr/storage.py in init_array(store, shape, chunks, dtype, compressor, fill_value, order, overwrite, path, chunk_store, filters, object_codec)
    289                          order=order, overwrite=overwrite, path=path,
    290                          chunk_store=chunk_store, filters=filters,
--> 291                          object_codec=object_codec)
    292 
    293 

~/src/github/alimanfoo/zarr/zarr/storage.py in _init_array_metadata(store, shape, chunks, dtype, compressor, fill_value, order, overwrite, path, chunk_store, filters, object_codec)
    346             if not filters:
    347                 # there are no filters so we can be sure there is no object codec
--> 348                 raise ValueError('missing object_codec for object array')
    349             else:
    350                 # one of the filters may be an object codec, issue a warning rather

ValueError: missing object_codec for object array

For API backward compatibility, if the object codec is provided via filters instead of the object_codec argument, a warning is issued but no error is raised:


In [8]:
z = zarr.empty(10, chunks=5, dtype=object, filters=[numcodecs.MsgPack()])


/home/aliman/src/github/alimanfoo/zarr/zarr/storage.py:353: FutureWarning: missing object_codec for object array; this will raise a ValueError in version 3.0
  'ValueError in version 3.0', FutureWarning)

If a user tries to subvert the system and create an object array with no object codec, a runtime check ensures that no object arrays are passed down to the compressor (which could lead to nasty errors and/or segfaults):


In [9]:
z = zarr.empty(10, chunks=5, dtype=object, object_codec=numcodecs.MsgPack())
z._filters = None  # try to live dangerously, manually wipe filters

In [10]:
z[0] = 'foo'


---------------------------------------------------------------------------
RuntimeError                              Traceback (most recent call last)
<ipython-input-10-3ac17b59bc55> in <module>()
----> 1 z[0] = 'foo'

~/src/github/alimanfoo/zarr/zarr/core.py in __setitem__(self, selection, value)
   1094 
   1095         fields, selection = pop_fields(selection)
-> 1096         self.set_basic_selection(selection, value, fields=fields)
   1097 
   1098     def set_basic_selection(self, selection, value, fields=None):

~/src/github/alimanfoo/zarr/zarr/core.py in set_basic_selection(self, selection, value, fields)
   1189             return self._set_basic_selection_zd(selection, value, fields=fields)
   1190         else:
-> 1191             return self._set_basic_selection_nd(selection, value, fields=fields)
   1192 
   1193     def set_orthogonal_selection(self, selection, value, fields=None):

~/src/github/alimanfoo/zarr/zarr/core.py in _set_basic_selection_nd(self, selection, value, fields)
   1480         indexer = BasicIndexer(selection, self)
   1481 
-> 1482         self._set_selection(indexer, value, fields=fields)
   1483 
   1484     def _set_selection(self, indexer, value, fields=None):

~/src/github/alimanfoo/zarr/zarr/core.py in _set_selection(self, indexer, value, fields)
   1528 
   1529             # put data
-> 1530             self._chunk_setitem(chunk_coords, chunk_selection, chunk_value, fields=fields)
   1531 
   1532     def _chunk_getitem(self, chunk_coords, chunk_selection, out, out_selection,

~/src/github/alimanfoo/zarr/zarr/core.py in _chunk_setitem(self, chunk_coords, chunk_selection, value, fields)
   1633         with lock:
   1634             self._chunk_setitem_nosync(chunk_coords, chunk_selection, value,
-> 1635                                        fields=fields)
   1636 
   1637     def _chunk_setitem_nosync(self, chunk_coords, chunk_selection, value, fields=None):

~/src/github/alimanfoo/zarr/zarr/core.py in _chunk_setitem_nosync(self, chunk_coords, chunk_selection, value, fields)
   1707 
   1708         # encode chunk
-> 1709         cdata = self._encode_chunk(chunk)
   1710 
   1711         # store

~/src/github/alimanfoo/zarr/zarr/core.py in _encode_chunk(self, chunk)
   1753         # check object encoding
   1754         if isinstance(chunk, np.ndarray) and chunk.dtype == object:
-> 1755             raise RuntimeError('cannot write object array without object codec')
   1756 
   1757         # compress

RuntimeError: cannot write object array without object codec

Here is another way to subvert the system: wiping the filters after storing some data. To cover this case, a runtime check ensures that no object arrays are handled inappropriately during decoding (which could lead to nasty errors and/or segfaults):


In [11]:
from numcodecs.tests.common import greetings
z = zarr.array(greetings, chunks=5, dtype=object, object_codec=numcodecs.MsgPack())
z[:]


Out[11]:
array(['¡Hola mundo!', 'Hej Världen!', 'Servus Woid!', 'Hei maailma!',
       'Xin chào thế giới', 'Njatjeta Botë!', 'Γεια σου κόσμε!', 'こんにちは世界',
       '世界,你好!', 'Helló, világ!', 'Zdravo svete!', 'เฮลโลเวิลด์'], dtype=object)

In [12]:
z._filters = []  # try to live dangerously, manually wipe filters
z[:]


---------------------------------------------------------------------------
RuntimeError                              Traceback (most recent call last)
<ipython-input-12-8915cc0b3395> in <module>()
      1 z._filters = []  # try to live dangerously, manually wipe filters
----> 2 z[:]

~/src/github/alimanfoo/zarr/zarr/core.py in __getitem__(self, selection)
    551 
    552         fields, selection = pop_fields(selection)
--> 553         return self.get_basic_selection(selection, fields=fields)
    554 
    555     def get_basic_selection(self, selection=Ellipsis, out=None, fields=None):

~/src/github/alimanfoo/zarr/zarr/core.py in get_basic_selection(self, selection, out, fields)
    677         else:
    678             return self._get_basic_selection_nd(selection=selection, out=out,
--> 679                                                 fields=fields)
    680 
    681     def _get_basic_selection_zd(self, selection, out=None, fields=None):

~/src/github/alimanfoo/zarr/zarr/core.py in _get_basic_selection_nd(self, selection, out, fields)
    719         indexer = BasicIndexer(selection, self)
    720 
--> 721         return self._get_selection(indexer=indexer, out=out, fields=fields)
    722 
    723     def get_orthogonal_selection(self, selection, out=None, fields=None):

~/src/github/alimanfoo/zarr/zarr/core.py in _get_selection(self, indexer, out, fields)
   1007             # load chunk selection into output array
   1008             self._chunk_getitem(chunk_coords, chunk_selection, out, out_selection,
-> 1009                                 drop_axes=indexer.drop_axes, fields=fields)
   1010 
   1011         if out.shape:

~/src/github/alimanfoo/zarr/zarr/core.py in _chunk_getitem(self, chunk_coords, chunk_selection, out, out_selection, drop_axes, fields)
   1597 
   1598             # decode chunk
-> 1599             chunk = self._decode_chunk(cdata)
   1600 
   1601             # select data from chunk

~/src/github/alimanfoo/zarr/zarr/core.py in _decode_chunk(self, cdata)
   1733                 chunk = chunk.astype(self._dtype)
   1734             else:
-> 1735                 raise RuntimeError('cannot read object array without object codec')
   1736         elif isinstance(chunk, np.ndarray):
   1737             chunk = chunk.view(self._dtype)

RuntimeError: cannot read object array without object codec