Setup


In [1]:
import numpy as np
import numcodecs
numcodecs.__version__


Out[1]:
'0.4.2.dev15+dirty'

Unicode string benchmarks


In [2]:
import fastparquet


class FastParquetCodec(numcodecs.abc.Codec):
    """Benchmark-only codec that delegates to fastparquet's UTF-8 helpers.

    This is a hack for timing comparisons: `encode` returns an
    `(n, payload)` tuple rather than a single buffer, and `decode` expects
    that same tuple back.
    """

    codec_id = 'xxx-fastparquet'

    def encode(self, buf):
        """Encode `buf` and return `(n, payload)`.

        `n` is the number of items in `buf`; `payload` is whatever
        `fastparquet.speedups.pack_byte_array` produces for the UTF-8
        encoded items.
        """
        items = np.asanyarray(buf)
        utf8_items = fastparquet.speedups.array_encode_utf8(items)
        payload = fastparquet.speedups.pack_byte_array(utf8_items.tolist())
        # hack for now: the item count travels alongside the payload
        return items.size, payload

    def decode(self, data, out=None):
        """Decode an `(n, payload)` tuple produced by `encode`.

        When `out` is given, the decoded items are written into it and `out`
        is returned; otherwise the freshly decoded array is returned.
        """
        count, payload = data
        raw_items = fastparquet.speedups.unpack_byte_array(payload, count)
        decoded = fastparquet.speedups.array_decode_utf8(
            np.array(raw_items, dtype=object)
        )
        if out is None:
            return decoded
        out[:] = decoded
        return out

In [3]:
# Zstd codecs at three settings (1, 5 and 9). benchmark_codec uses them to
# report how well each codec's encoded output compresses.
zstd1 = numcodecs.Zstd(1)
zstd5 = numcodecs.Zstd(5)
zstd9 = numcodecs.Zstd(9)


def benchmark_codec(codec, a):
    print(codec)
    print('encode')
    %timeit codec.encode(a)
    enc = codec.encode(a)
    print('decode')
    %timeit codec.decode(enc)
    if isinstance(codec, FastParquetCodec):
        enc = enc[1]  # hack
    print('size         : {:,}'.format(len(enc)))
    print('size (zstd 1): {:,}'.format(len(zstd1.encode(enc))))
    print('size (zstd 5): {:,}'.format(len(zstd5.encode(enc))))
    print('size (zstd 9): {:,}'.format(len(zstd9.encode(enc))))

In [4]:
# `greetings` holds the sample strings imported from numcodecs' test helpers.
from numcodecs.tests.common import greetings
# One instance of each codec under comparison, created once and reused by
# every benchmark cell below.
msgpack_codec = numcodecs.MsgPack()
json_codec = numcodecs.JSON()
pickle_codec = numcodecs.Pickle()
# Categorize is given the full set of labels up front and a 'u1' output type.
cat_codec = numcodecs.Categorize(greetings, dtype=object, astype='u1')
vlen_codec = numcodecs.VLenUTF8()
fp_codec = FastParquetCodec()

Greetings benchmark


In [5]:
# Fixed seed so the same sample is drawn on every run.
np.random.seed(42)
# One million greetings picked at random, stored as an object array.
data = np.random.choice(greetings, size=1000000).astype(object)
data


Out[5]:
array(['Γεια σου κόσμε!', 'Hei maailma!', 'Zdravo svete!', ...,
       'Servus Woid!', 'เฮลโลเวิลด์', 'Zdravo svete!'], dtype=object)

In [7]:
%time enc = vlen_codec.encode(data)


CPU times: user 84 ms, sys: 20 ms, total: 104 ms
Wall time: 105 ms

In [9]:
%time dec = vlen_codec.decode(enc)


CPU times: user 160 ms, sys: 24 ms, total: 184 ms
Wall time: 185 ms

In [8]:
benchmark_codec(msgpack_codec, data)


MsgPack(encoding='utf-8')
encode
128 ms ± 1.96 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)
decode
232 ms ± 6.94 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
size         : 18,996,503
size (zstd 1): 1,576,435
size (zstd 5): 1,409,320
size (zstd 9): 1,310,380

In [7]:
benchmark_codec(json_codec, data)


JSON(encoding='utf-8', allow_nan=True, check_circular=True, ensure_ascii=True,
     indent=None, separators=(',', ':'), skipkeys=False, sort_keys=True,
     strict=True)
encode
178 ms ± 4.09 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)
decode
422 ms ± 4.02 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
size         : 33,322,595
size (zstd 1): 1,840,414
size (zstd 5): 1,675,163
size (zstd 9): 1,522,853

In [8]:
benchmark_codec(pickle_codec, data)


Pickle(protocol=4)
encode
238 ms ± 25.2 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
decode
212 ms ± 3.05 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
size         : 20,835,275
size (zstd 1): 1,608,227
size (zstd 5): 1,436,093
size (zstd 9): 1,333,676

In [9]:
benchmark_codec(cat_codec, data)


Categorize(dtype='|O', astype='|u1', labels=['¡Hola mundo!', 'Hej Världen!', 'Servus Woid!', ...])
encode
221 ms ± 10.3 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
decode
34.8 ms ± 11 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)
size         : 1,000,000
size (zstd 1): 458,191
size (zstd 5): 493,638
size (zstd 9): 490,483

In [10]:
benchmark_codec(vlen_codec, data)


VLenUTF8()
encode
72.4 ms ± 811 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)
decode
166 ms ± 202 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)
size         : 21,830,275
size (zstd 1): 1,762,784
size (zstd 5): 1,546,616
size (zstd 9): 1,358,840

In [11]:
benchmark_codec(fp_codec, data)


FastParquetCodec()
encode
107 ms ± 497 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)
decode
223 ms ± 4.44 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
size         : 21,830,271
size (zstd 1): 1,762,809
size (zstd 5): 1,546,612
size (zstd 9): 1,358,813

Lorem benchmark


In [11]:
from faker import Faker
# NOTE(review): no seed is set on Faker here, so the generated sentences (and
# therefore data2 and its benchmark numbers) presumably differ from run to run.
fake = Faker()

In [12]:
# Generate 200,000 fake sentences, join them into one string, then split on
# whitespace so that each word becomes one item of an object array.
data2 = np.array(' '.join(fake.sentences(nb=200000)).split(), dtype=object)
# Show the item count and the first ten items.
len(data2), data2[:10]


Out[12]:
(1101346, array(['Aspernatur', 'ratione', 'ea', 'eius', 'unde', 'sed', 'id',
        'provident.', 'Harum', 'expedita'], dtype=object))

In [14]:
benchmark_codec(msgpack_codec, data2)


MsgPack(encoding='utf-8')
encode
99.4 ms ± 7.05 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
decode
141 ms ± 334 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)
size         : 8,372,944
size (zstd 1): 2,627,063
size (zstd 5): 2,354,835
size (zstd 9): 2,262,203

In [15]:
benchmark_codec(json_codec, data2)


JSON(encoding='utf-8', allow_nan=True, check_circular=True, ensure_ascii=True,
     indent=None, separators=(',', ':'), skipkeys=False, sort_keys=True,
     strict=True)
encode
120 ms ± 1.31 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)
decode
144 ms ± 12.9 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)
size         : 10,574,596
size (zstd 1): 2,544,155
size (zstd 5): 2,435,165
size (zstd 9): 2,310,001

In [16]:
benchmark_codec(pickle_codec, data2)


Pickle(protocol=4)
encode
235 ms ± 7.98 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
decode
133 ms ± 577 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)
size         : 10,571,711
size (zstd 1): 2,581,019
size (zstd 5): 2,454,825
size (zstd 9): 2,452,222

In [13]:
benchmark_codec(vlen_codec, data2)


VLenUTF8()
encode
50.5 ms ± 614 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)
decode
93.2 ms ± 641 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)
size         : 11,683,595
size (zstd 1): 2,957,265
size (zstd 5): 2,859,653
size (zstd 9): 2,557,328

In [15]:
benchmark_codec(fp_codec, data2)


FastParquetCodec()
encode
80.4 ms ± 841 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)
decode
148 ms ± 1.08 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)
size         : 11,680,588
size (zstd 1): 2,956,144
size (zstd 5): 2,859,941
size (zstd 9): 2,555,154

Byte strings benchmark


In [14]:
vlen_bytes_codec = numcodecs.VLenBytes()

In [15]:
# Same seed as the unicode greetings benchmark.
np.random.seed(42)
# UTF-8 encode every greeting so the items are bytes rather than str.
greetings_bytes = [g.encode('utf-8') for g in greetings]
# One million byte strings picked at random, stored as an object array.
data3 = np.random.choice(greetings_bytes, size=1000000).astype(object)
data3


Out[15]:
array([ b'\xce\x93\xce\xb5\xce\xb9\xce\xb1 \xcf\x83\xce\xbf\xcf\x85 \xce\xba\xcf\x8c\xcf\x83\xce\xbc\xce\xb5!',
       b'Hei maailma!', b'Zdravo svete!', ..., b'Servus Woid!',
       b'\xe0\xb9\x80\xe0\xb8\xae\xe0\xb8\xa5\xe0\xb9\x82\xe0\xb8\xa5\xe0\xb9\x80\xe0\xb8\xa7\xe0\xb8\xb4\xe0\xb8\xa5\xe0\xb8\x94\xe0\xb9\x8c',
       b'Zdravo svete!'], dtype=object)

In [16]:
vlen_bytes_codec.decode(vlen_bytes_codec.encode(data3))


Out[16]:
array([ b'\xce\x93\xce\xb5\xce\xb9\xce\xb1 \xcf\x83\xce\xbf\xcf\x85 \xce\xba\xcf\x8c\xcf\x83\xce\xbc\xce\xb5!',
       b'Hei maailma!', b'Zdravo svete!', ..., b'Servus Woid!',
       b'\xe0\xb9\x80\xe0\xb8\xae\xe0\xb8\xa5\xe0\xb9\x82\xe0\xb8\xa5\xe0\xb9\x80\xe0\xb8\xa7\xe0\xb8\xb4\xe0\xb8\xa5\xe0\xb8\x94\xe0\xb9\x8c',
       b'Zdravo svete!'], dtype=object)

In [8]:
benchmark_codec(pickle_codec, data3)


Pickle(protocol=4)
encode
217 ms ± 9.6 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
decode
90.6 ms ± 2.28 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)
size         : 20,835,275
size (zstd 1): 1,608,240
size (zstd 5): 1,436,091
size (zstd 9): 1,333,674

In [17]:
benchmark_codec(vlen_bytes_codec, data3)


VLenBytes()
encode
14.2 ms ± 76.6 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)
decode
45.5 ms ± 2.38 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)
size         : 21,830,275
size (zstd 1): 1,762,784
size (zstd 5): 1,546,616
size (zstd 9): 1,358,840

Array benchmarks


In [30]:
# Fixed seed so the same arrays are generated on every run.
np.random.seed(42)
# 100,000 int32 arrays collected into one object array. Each array's length
# comes from np.random.randint(0, 20) and its values from
# np.random.randint(0, 100, size=<that length>).
data4 = np.array([np.random.randint(0, 100, size=np.random.randint(0, 20)).astype('i4')
                  for i in range(100000)], dtype=object)
data4


Out[30]:
array([array([51, 92, 14, 71, 60, 20], dtype=int32),
       array([82, 86, 74, 74, 87, 99], dtype=int32),
       array([23,  2, 21, 52,  1, 87, 29], dtype=int32), ...,
       array([19, 62, 18], dtype=int32),
       array([93, 20,  7, 50], dtype=int32), array([51, 28], dtype=int32)], dtype=object)

In [28]:
vlen_arr_codec = numcodecs.VLenArray('<i4')

In [29]:
vlen_arr_codec.decode(vlen_arr_codec.encode(data4))


Out[29]:
array([array([51, 92, 14, 71, 60, 20], dtype=int32),
       array([82, 86, 74, 74, 87, 99], dtype=int32),
       array([23,  2, 21, 52,  1, 87, 29], dtype=int32), ...,
       array([19, 62, 18], dtype=int32),
       array([93, 20,  7, 50], dtype=int32), array([51, 28], dtype=int32)], dtype=object)

In [25]:
benchmark_codec(vlen_arr_codec, data4)


VLenArray(dtype='<i4')
encode
152 ms ± 317 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)
decode
82.3 ms ± 1.08 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)
size         : 4,195,540
size (zstd 1): 1,316,155
size (zstd 5): 1,119,896
size (zstd 9): 1,200,561

In [26]:
benchmark_codec(pickle_codec, data4)


Pickle(protocol=4)
encode
311 ms ± 6.38 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
decode
126 ms ± 1.36 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)
size         : 6,791,830
size (zstd 1): 1,716,068
size (zstd 5): 1,566,619
size (zstd 9): 1,488,721

In [ ]: