In [3]:
import numpy as np
import pandas as pd
import pyarrow as pa
import feather as fth

# Benchmark fixture: a single ~1 GiB DataFrame of random values in which
# every fifth row is null.
type_ = np.dtype('float64')
DATA_SIZE = (1 << 30)  # target in-memory size of the frame, in bytes
NCOLS = 100
# Floor division keeps NROWS an exact int; the previous true division
# produced a float that had to be truncated with int() wherever it was used.
NROWS = DATA_SIZE // NCOLS // type_.itemsize
SEED = 0  # fixed seed so a fresh kernel regenerates exactly the same data

rng = np.random.RandomState(SEED)
data = {
    # astype(copy=False) makes the column dtype follow `type_` without
    # copying when the generated array already has that dtype.
    'c' + str(i): rng.randn(NROWS).astype(type_, copy=False)
    for i in range(NCOLS)
}
df = pd.DataFrame(data)
# Null out rows 0, 5, 10, ... across all columns so the written files
# contain missing values.
df[::5] = np.nan

In [4]:
# IPython introspection: the trailing `?` displays the docstring of
# fth.write_dataframe. Development aid only; it produces no data.
fth.write_dataframe?

In [5]:
# Number of copies of the frame written to disk; read_all() uses this as
# its default file count.
NFILES = 20

# Write the same DataFrame to df0.feather, df1.feather, ... in the current
# working directory.
for file_no in range(NFILES):
    out_path = 'df%d.feather' % file_no
    fth.write_dataframe(df, out_path)

In [9]:
def read_all(nfiles=NFILES, nthreads=1):
    """Read the first `nfiles` benchmark files and discard each result.

    Files are looked up as 'df0.feather', 'df1.feather', ... relative to
    the current working directory. The loaded frames are not kept and the
    function returns None; it exists so the read can be timed.

    Parameters
    ----------
    nfiles : int
        How many files to read, starting from index 0.
    nthreads : int
        Passed through unchanged to fth.read_dataframe.
    """
    paths = ['df{0}.feather'.format(n) for n in range(nfiles)]
    for path in paths:
        fth.read_dataframe(path, nthreads=nthreads)

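Make sure to run the following shell command (as root, ideally after `sync`) before each timed command below, so that the Linux page cache is cold and the files are actually read from disk: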

echo 3 > /proc/sys/vm/drop_caches


In [21]:
# Baseline timing: read the first 10 files with read_all's default nthreads=1.
%time read_all(10)


CPU times: user 1.74 s, sys: 7.75 s, total: 9.49 s
Wall time: 16.5 s

In [22]:
# Same 10 files, this time passing nthreads=4 through to fth.read_dataframe.
%time read_all(10, nthreads=4)


CPU times: user 2.08 s, sys: 8.56 s, total: 10.6 s
Wall time: 10.9 s

In [16]:
# IPython introspection: the trailing `??` displays the source of
# fth.read_dataframe. Development aid only.
fth.read_dataframe??

In [ ]: