In [13]:
from __future__ import print_function, division
import pandas as pd
import numpy as np
import itertools, os, time

# Record the library versions the timings and file sizes below were produced with.
for label, module in (("Pandas version = ", pd),
                      ("Numpy version =  ", np)):
    print(label, module.__version__)


Pandas version =  0.13.1
Numpy version =   1.8.0

Create some toy data


In [14]:
# Size of the synthetic dataset.
# N_PERIODS must be an int: it is used both as the number of index periods
# and as an array dimension, and NumPy rejects a float dimension.
N_PERIODS = int(1E6)
N_METERS = 5
N_MEASUREMENTS_PER_METER = 1

# Seed for the synthetic readings, so that repeated runs write the same data
# and the reported file sizes are reproducible.
SEED = 0


def flatten_2d_list(lst):
    """Flatten a list of lists by one level and return a new list."""
    return list(itertools.chain.from_iterable(lst))


# Top column level: each meter name repeated once per measurement.
meter_names = ['meter{:d}'.format(i) for i in range(1, N_METERS + 1)]
meters = flatten_2d_list([[m] * N_MEASUREMENTS_PER_METER for m in meter_names])

# Second and third column levels: the first N_MEASUREMENTS_PER_METER
# (quantity, type) pairs, repeated for every meter.
level2 = ['power', 'power', 'voltage'][:N_MEASUREMENTS_PER_METER] * N_METERS
level3 = ['active', 'reactive', ''][:N_MEASUREMENTS_PER_METER] * N_METERS

columns = pd.MultiIndex.from_arrays([meters, level2, level3])

# One row per second starting at the beginning of 2012.
# Lowercase 's' is used because the uppercase 'S' alias is deprecated in
# recent pandas releases.
rng = pd.date_range('2012', freq='s', periods=N_PERIODS)

# Local generator: reproducible without touching NumPy's global random state.
random_state = np.random.RandomState(SEED)
data = random_state.randint(low=0, high=1000,
                            size=(N_PERIODS,
                                  N_METERS * N_MEASUREMENTS_PER_METER))
df = pd.DataFrame(data=data, index=rng, columns=columns, dtype=np.float32)

# df.iloc[:10]

Save the data to an HDF5 file, one big table


In [19]:
# Compression library passed to HDFStore as `complib`.
COMPRESSION = 'zlib'

# Write the whole DataFrame as a single table and time the write.
filename = 'one_big_table.h5'
t0 = time.time()
store = pd.HDFStore(filename, 'w', complevel=9, complib=COMPRESSION)
try:
    store.put('df', df, format='table')
finally:
    # Always release the file handle, even if the write fails.
    store.close()

def print_runtime(start):
    """Print the seconds elapsed since `start`, to one decimal place.

    `start` is a timestamp on the same clock as time.time().
    """
    elapsed = time.time() - start
    message = 'Time = {:.1f}'.format(elapsed)
    print(message)

def print_filesize(filename):
    """Print the size of the file at `filename` in megabytes (bytes / 1E6),
    to two decimal places."""
    n_bytes = os.path.getsize(filename)
    megabytes = n_bytes / 1E6
    print("Filesize of '{:s}' is {:.2f} MBytes".format(filename, megabytes))
    
# Report the time elapsed since t0 and the size of the file just written.
print_runtime(t0)
print_filesize(filename)


Time = 9.1
Filesize of 'one_big_table.h5' is 10.47 MBytes

Save the data to an HDF5 file, one table per meter


In [20]:
# Write each meter's columns as its own table in a single HDF5 file,
# using the same compression settings as the one-big-table run.
filename = 'one_table_per_meter.h5'
t0 = time.time()
store = pd.HDFStore(filename, 'w', complevel=9, complib=COMPRESSION)
try:
    # The first column level holds the meter names; each one becomes a key.
    for meter in df.columns.levels[0]:
        store.put(meter, df[meter], format='table')
finally:
    # Always release the file handle, even if a write fails part-way.
    store.close()

print_runtime(t0)
print_filesize(filename)


Time = 14.0
Filesize of 'one_table_per_meter.h5' is 16.89 MBytes

RESULTS

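Each result below compares one big table versus one table per meter (file size, plus write time where it was recorded). Unless a line says otherwise, the configuration is 5 meters, 3 params per meter, 1 million rows, float32 for power data, fixed format. Note that the code cells above are currently set to 1 param per meter, zlib compression and table format.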

BLOSC

5 meters: 35.57 MBytes versus 39.26 MBytes = 1.10 x larger

5 meters, 1 param per meter: 12.49 MB (0.3 sec) vs 16.16 MB (0.4 sec) = 1.29 x larger

5 meters, 1 param per meter, table format: 16.48 MB (1.3 sec) vs 22.13 MB (4.6 sec) = 1.34 x larger

50 meters: 346.93 MB versus 392.56 MB = 1.13 x larger

ZLIB

(48 seconds to compress!)

5 meters: 24.80 MB versus 26.77 MB = 1.07 x larger

BZIP2

5 meters, 3 params per meter: 23.73 MB (8.7 sec) versus 25.87 MB (12.1 sec) = 1.09 x larger

5 meters, 1 param per meter: 8.28 MB (3.3 sec) vs 10.52 MB (6.7 sec) = 1.27 x larger

5 meters, 1 param per meter, table format: 10.38 MB (5.3 sec) vs 17.55 MB (13.7 sec) = 1.69 x larger (!)

LZO

5 meters: 30.17 MB (0.3 sec) versus 34.17 MB (0.5 sec) = 1.13 x larger


In [ ]: