In [1]:
import numpy as np
import numcodecs
# Display the installed numcodecs version as this cell's output.
numcodecs.__version__
Out[1]:
In [2]:
import fastparquet
class FastParquetCodec(numcodecs.abc.Codec):
    """Benchmark-only codec built on fastparquet's UTF-8 speedup routines.

    This is a deliberate hack: ``encode`` returns an ``(item_count, packed)``
    tuple rather than a single buffer, because ``decode`` needs the item
    count to unpack the payload.
    """

    codec_id = 'xxx-fastparquet'

    def encode(self, buf):
        """Encode ``buf`` and return ``(number_of_items, packed_payload)``.

        ``buf`` is converted with ``np.asanyarray``, UTF-8 encoded with
        ``array_encode_utf8`` and packed with ``pack_byte_array``.
        """
        items = np.asanyarray(buf)
        utf8_items = fastparquet.speedups.array_encode_utf8(items)
        packed = fastparquet.speedups.pack_byte_array(utf8_items.tolist())
        # Hack: the item count travels next to the payload so decode can use it.
        return items.size, packed

    def decode(self, data, out=None):
        """Decode a ``(number_of_items, packed_payload)`` pair from ``encode``.

        When ``out`` is given, the decoded values are written into it and
        ``out`` is returned; otherwise the decoded array is returned.
        """
        count, packed = data
        unpacked = fastparquet.speedups.unpack_byte_array(packed, count)
        decoded = fastparquet.speedups.array_decode_utf8(
            np.array(unpacked, dtype=object)
        )
        if out is None:
            return decoded
        out[:] = decoded
        return out
In [3]:
# Zstd compressors at levels 1, 5 and 9, used by benchmark_codec to report
# how well each codec's output compresses.
zstd1, zstd5, zstd9 = (numcodecs.Zstd(lvl) for lvl in (1, 5, 9))
def benchmark_codec(codec, a):
print(codec)
print('encode')
%timeit codec.encode(a)
enc = codec.encode(a)
print('decode')
%timeit codec.decode(enc)
if isinstance(codec, FastParquetCodec):
enc = enc[1] # hack
print('size : {:,}'.format(len(enc)))
print('size (zstd 1): {:,}'.format(len(zstd1.encode(enc))))
print('size (zstd 5): {:,}'.format(len(zstd5.encode(enc))))
print('size (zstd 9): {:,}'.format(len(zstd9.encode(enc))))
In [4]:
from numcodecs.tests.common import greetings
# One instance of each codec compared in the benchmarks below.
msgpack_codec = numcodecs.MsgPack()
json_codec = numcodecs.JSON()
pickle_codec = numcodecs.Pickle()
# Categorize is given the `greetings` list and a 1-byte ('u1') encoded dtype.
cat_codec = numcodecs.Categorize(greetings, dtype=object, astype='u1')
vlen_codec = numcodecs.VLenUTF8()
# The fastparquet-based benchmark codec defined earlier in this notebook.
fp_codec = FastParquetCodec()
In [5]:
# Dataset 1: one million strings drawn at random from `greetings`, stored as
# an object array. The seed is set in this cell so re-running it gives the
# same sample.
np.random.seed(42)
data = np.random.choice(greetings, size=1000000).astype(object)
data
Out[5]:
In [7]:
# Single timed run of VLenUTF8 encoding on the greetings dataset; the result
# is kept in `enc`.
%time enc = vlen_codec.encode(data)
In [9]:
# Single timed run of VLenUTF8 decoding of `enc`; the result is kept in `dec`.
%time dec = vlen_codec.decode(enc)
In [8]:
# Benchmark MsgPack on the greetings dataset.
benchmark_codec(msgpack_codec, data)
In [7]:
# Benchmark JSON on the greetings dataset.
benchmark_codec(json_codec, data)
In [8]:
# Benchmark Pickle on the greetings dataset.
benchmark_codec(pickle_codec, data)
In [9]:
# Benchmark Categorize on the greetings dataset.
benchmark_codec(cat_codec, data)
In [10]:
# Benchmark VLenUTF8 on the greetings dataset.
benchmark_codec(vlen_codec, data)
In [11]:
# Benchmark the fastparquet-based codec on the greetings dataset.
benchmark_codec(fp_codec, data)
In [11]:
from faker import Faker
# Faker instance used to generate the sentence text for the second dataset.
fake = Faker()
In [12]:
# Dataset 2: the individual words of 200,000 Faker-generated sentences,
# stored as an object array.
# Seed this Faker instance in the generating cell, the same way the NumPy
# datasets in this notebook seed their RNG, so re-running the cell produces
# the same text instead of a fresh random sample each time.
fake.seed_instance(42)
data2 = np.array(' '.join(fake.sentences(nb=200000)).split(), dtype=object)
# Show the number of words and the first ten as this cell's output.
len(data2), data2[:10]
Out[12]:
In [14]:
# Benchmark MsgPack on the Faker word dataset.
benchmark_codec(msgpack_codec, data2)
In [15]:
# Benchmark JSON on the Faker word dataset.
benchmark_codec(json_codec, data2)
In [16]:
# Benchmark Pickle on the Faker word dataset.
benchmark_codec(pickle_codec, data2)
In [13]:
# Benchmark VLenUTF8 on the Faker word dataset.
benchmark_codec(vlen_codec, data2)
In [15]:
# Benchmark the fastparquet-based codec on the Faker word dataset.
benchmark_codec(fp_codec, data2)
In [14]:
# Codec instance used for the bytes dataset below.
vlen_bytes_codec = numcodecs.VLenBytes()
In [15]:
# Dataset 3: one million byte strings drawn at random from the UTF-8 encoded
# greetings, stored as an object array. The seed is set in this cell so
# re-running it gives the same sample.
np.random.seed(42)
greetings_bytes = [g.encode('utf-8') for g in greetings]
data3 = np.random.choice(greetings_bytes, size=1000000).astype(object)
data3
Out[15]:
In [16]:
# Round trip: encode the bytes dataset with VLenBytes, decode it again, and
# display the result.
vlen_bytes_codec.decode(vlen_bytes_codec.encode(data3))
Out[16]:
In [8]:
# Benchmark Pickle on the bytes dataset.
benchmark_codec(pickle_codec, data3)
In [17]:
# Benchmark VLenBytes on the bytes dataset.
benchmark_codec(vlen_bytes_codec, data3)
In [30]:
# Dataset 4: an object array of 100,000 int32 ('i4') arrays. Each array's
# length is drawn with randint(0, 20) and its values with randint(0, 100).
# The seed is set in this cell so re-running it gives the same arrays.
np.random.seed(42)
data4 = np.array([np.random.randint(0, 100, size=np.random.randint(0, 20)).astype('i4')
for i in range(100000)], dtype=object)
data4
Out[30]:
In [28]:
# VLenArray codec configured with the '<i4' dtype, for the array dataset.
vlen_arr_codec = numcodecs.VLenArray('<i4')
In [29]:
# Round trip: encode the array dataset with VLenArray, decode it again, and
# display the result.
vlen_arr_codec.decode(vlen_arr_codec.encode(data4))
Out[29]:
In [25]:
# Benchmark VLenArray on the array dataset.
benchmark_codec(vlen_arr_codec, data4)
In [26]:
# Benchmark Pickle on the array dataset.
benchmark_codec(pickle_codec, data4)
In [ ]: