In [13]:
from __future__ import print_function, division
import pandas as pd
import numpy as np
import itertools, os, time
# Show which pandas / NumPy versions this notebook ran against.
for _label, _module in (("Pandas", pd), ("Numpy", np)):
    print("{} version = ".format(_label), _module.__version__)
In [14]:
# --- Benchmark configuration ------------------------------------------------
# N_PERIODS must be an int (not 1E6, which is a float): it is used both as
# `periods=` for the DatetimeIndex and as an array dimension, and NumPy's
# `size` argument only accepts integers.
N_PERIODS = 1000000           # number of rows (one sample per second)
N_METERS = 5
N_MEASUREMENTS_PER_METER = 1  # 1..3: selects a prefix of the name lists below


def flatten_2d_list(lst):
    """Concatenate a list of lists into a single flat list."""
    return list(itertools.chain.from_iterable(lst))


# --- Three-level column index: (meter, quantity, kind) ----------------------
# Level 1: every meter name repeated once per measurement, e.g. with two
# measurements per meter: ['meter1', 'meter1', 'meter2', 'meter2', ...].
meters = flatten_2d_list(
    [['meter{:d}'.format(i)] * N_MEASUREMENTS_PER_METER
     for i in range(1, N_METERS + 1)])
# Levels 2 and 3: the per-meter measurement names, tiled across all meters.
level2 = ['power', 'power', 'voltage'][:N_MEASUREMENTS_PER_METER] * N_METERS
level3 = ['active', 'reactive', ''][:N_MEASUREMENTS_PER_METER] * N_METERS
columns = pd.MultiIndex.from_arrays([meters, level2, level3])

# --- Random payload ---------------------------------------------------------
# Lowercase 's' is the one-second frequency alias; the uppercase spelling is
# deprecated in recent pandas releases.
rng = pd.date_range('2012', freq='s', periods=N_PERIODS)
data = np.random.randint(
    low=0, high=1000,
    size=(N_PERIODS, N_METERS * N_MEASUREMENTS_PER_METER))
# Integer samples are stored as float32.
df = pd.DataFrame(data=data, index=rng, columns=columns, dtype=np.float32)
# df.iloc[:10]
In [19]:
# Benchmark 1: store the whole DataFrame as a single HDF5 table.
COMPRESSION = 'zlib'
filename = 'one_big_table.h5'
t0 = time.time()
# Open the store as a context manager so the file handle is closed even if
# `put` raises; the original only closed it on the success path.
with pd.HDFStore(filename, 'w', complevel=9, complib=COMPRESSION) as store:
    store.put('df', df, format='table')
def print_runtime(start):
    """Print the seconds elapsed since `start`, to one decimal place.

    `start` is compared against the current `time.time()` value.
    """
    elapsed = time.time() - start
    print('Time = {:.1f}'.format(elapsed))
def print_filesize(filename):
    """Print the on-disk size of `filename` in MBytes (units of 10**6 bytes).

    `filename` may be a string or any other path object that
    `os.path.getsize` accepts. It is converted with `str()` for display,
    because the `{:s}` format spec raises TypeError for non-string objects
    such as `pathlib.Path`. Output for plain strings is unchanged.
    """
    size_mbytes = os.path.getsize(filename) / 1E6
    print('Filesize of \'{:s}\' is {:.2f} MBytes'
          .format(str(filename), size_mbytes))
# Report the time elapsed since `t0` and the on-disk size of `filename`.
print_runtime(t0)
print_filesize(filename)
In [20]:
# Benchmark 2: store each meter's columns as its own HDF5 table.
filename = 'one_table_per_meter.h5'
t0 = time.time()
# Open the store as a context manager so the file handle is closed even if
# one of the `put` calls raises; the original only closed it on success.
with pd.HDFStore(filename, 'w', complevel=9, complib=COMPRESSION) as store:
    # The first level of the column MultiIndex holds the meter names;
    # df[meter] selects that meter's sub-frame.
    for meter in df.columns.levels[0]:
        store.put(meter, df[meter], format='table')
print_runtime(t0)
print_filesize(filename)
Default config: 5 meters, 3 params per meter, 1 million rows, float32 for power data, fixed format.
5 meters: 35.57 MBytes versus 39.26 MBytes = 1.10 x larger
5 meters, 1 param per meter: 12.49 MB (0.3 sec) vs 16.16 MB (0.4 sec) = 1.29 x larger
5 meters, 1 param per meter, table format: 16.48 MB (1.3 sec) vs 22.13 MB (4.6 sec) = 1.34 x larger
50 meters: 346.93 MB versus 392.56 MB = 1.13 x larger
(48 seconds to compress!)
5 meters: 24.80 MB versus 26.77 MB = 1.07 x larger
5 meters, 3 params per meter: 23.73 MB (8.7 sec) versus 25.87 MB (12.1 sec) = 1.09 x larger
5 meters, 1 param per meter: 8.28 MB (3.3 sec) vs 10.52 MB (6.7 sec) = 1.27 x larger
5 meters, 1 param per meter, table format: 10.38 MB (5.3 sec) vs 17.55 MB (13.7 sec) = 1.69 x larger (!)
5 meters: 30.17 MB (0.3 sec) versus 34.17 MB (0.5 sec) = 1.13 x larger
In [ ]: