In [ ]:
import os
import time
from pathlib import Path
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import cellpy
from cellpy import cellreader

%matplotlib inline

In [ ]:
# NOTE(review): hardcoded absolute path — consider a configurable DATA_DIR.
filename = Path("/Users/jepe/Arbeid/Data/celldata/20171120_nb034_11_cc.h5")
# 1_048_576 = 1024**2, i.e. bytes -> MiB; original printed a raw float with no units
print(f"size: {filename.stat().st_size / 1_048_576:.2f} MB")

In [ ]:
# Open the cellpy HDF5 file; the handle stays open until store.close() below.
store = pd.HDFStore(filename)

In [ ]:
# List the nodes (tables) stored inside the HDF5 file.
store.keys()

In [ ]:
# HDF5 node names used by cellpy when saving a cell data set.
infoname = '/CellpyData/info'          # run metadata / parameters (one row)
dataname = '/CellpyData/dfdata'        # raw data points (the big table)
summaryname = '/CellpyData/dfsummary'  # per-cycle summary
fidname = '/CellpyData/fidtable'       # file-id bookkeeping table
stepname = '/CellpyData/step_table'    # step characterisation table

In [ ]:
# Load every table from the open HDF5 store into its own DataFrame.
infotable, datatable, summarytable, fidtable, steptable = (
    store.select(key)
    for key in (infoname, dataname, summaryname, fidname, stepname)
)

In [ ]:
# Quick numeric overview of the raw-data table.
datatable.describe()

In [ ]:
# NOTE: %timeit executes the statement in its own scope, so this does NOT
# rebind the notebook's `datatable` — it only measures the read speed.
%timeit datatable = store.select(dataname)

In [ ]:
# Release the HDF5 file handle opened above.
store.close()

In [ ]:
# making sure steptable has correct dtypes
# Make sure steptable has the expected dtypes: numeric everywhere except the
# "type" column, which holds step-type labels as strings.
for col in steptable.columns:
    if col == "type":
        steptable[col] = steptable[col].astype(str)
    else:
        # pd.to_numeric on the whole Series is vectorized; the original
        # .apply(pd.to_numeric) invoked the converter once per element.
        steptable[col] = pd.to_numeric(steptable[col])

What I want to do

  • figure out how compression works
  • find out how to use indexing properly

Compression


In [ ]:
# Scratch file used by the save/read speed tests below.
# NOTE(review): hardcoded absolute path — adjust to your own machine.
path_name = '/Users/jepe/scripting/cellpy/dev_data/speedtest2.hf5'

In [ ]:
def save_and_read_them(mode=None, complevel=None, complib=None,
                       fletcher32=False, format='fixed', all=False, dtable=None):
    """Save the notebook's tables to ``path_name`` and read them back, timed.

    Relies on the module-level globals ``path_name``, ``dataname``,
    ``datatable``, ``fidname``/``fidtable``, ``infoname``/``infotable``,
    ``summaryname``/``summarytable`` and ``stepname``/``steptable``.

    Parameters
    ----------
    mode, complevel, complib, fletcher32 :
        Passed straight to ``pd.HDFStore``.
    format : str
        'fixed' or 'table'; passed to ``HDFStore.put`` for all tables
        except the fid table (which is always stored as 'fixed').
    all : bool
        If True, also store and re-load the fid/info/summary/step tables.
        NOTE(review): shadows the builtin ``all``; name kept so existing
        keyword callers (``all=True``) keep working.
    dtable : pandas.DataFrame or None
        Alternative data table to store instead of the global ``datatable``.

    Returns
    -------
    tuple
        ``(dt_save, dt_read, size)`` — save time [s], read time [s],
        resulting file size [bytes].
    """
    new_path = Path(path_name)
    new_store = pd.HDFStore(
        new_path,
        mode=mode,
        complevel=complevel,
        complib=complib,
        fletcher32=fletcher32,
    )

    # -- saving -------------------------------------------------------------
    t0 = time.time()
    # was: `if not dtable is None` — PEP8 prefers `is not None`
    new_store.put(dataname, dtable if dtable is not None else datatable,
                  format=format)
    if all:
        # fid table contains object columns, so it must be stored as 'fixed'
        new_store.put(fidname, fidtable, format='fixed')
        for name, table in zip([infoname, summaryname, stepname],
                               [infotable, summarytable, steptable]):
            try:
                new_store.put(name, table, format=format)
            except Exception as e:
                # keep going: a failure on one table should not abort the test
                print(f" -- FAILED storing {name}")
                print(e)
    new_store.close()
    size = new_path.stat().st_size
    dt_save = time.time() - t0

    # -- reading back -------------------------------------------------------
    t0 = time.time()
    new_store = pd.HDFStore(new_path)
    datatable2 = new_store.select(dataname)
    if all:
        fidtable2 = new_store.select(fidname)
        infotable2 = new_store.select(infoname)
        summarytable2 = new_store.select(summaryname)
        steptable2 = new_store.select(stepname)
    dt_read = time.time() - t0
    new_store.close()

    return dt_save, dt_read, size

In [ ]:
# Benchmark: data table only, 'table' format, all complib/complevel combos.
collected_test = []
m = "w"
f = False
form = 'table'
for clib in ['zlib', 'lzo', 'bzip2', 'blosc']:
    for clevel in range(3):
        # BUG FIX: the original passed complib='zlib' here, so the `clib`
        # loop variable was never used — every row silently re-tested zlib.
        dts, dtr, size = save_and_read_them(
            mode=m, complevel=clevel, complib=clib,
            fletcher32=f, format=form, all=False,
        )
        collected_test.append(
            [m, clib, clevel, f, dts * 1000, dtr * 1000, size / 1_048_576]
        )
results_only_data = pd.DataFrame(
    collected_test,
    columns=["mode", "clib", "clevel", "fle32", "savetime", "readtime", "filesize"],
)
results_only_data

In [ ]:
# Benchmark: all tables, 'fixed' format, all complib/complevel combos.
collected_test = []
m = "w"
f = False
form = 'fixed'
for clib in ['zlib', 'lzo', 'bzip2', 'blosc']:
    for clevel in range(3):
        # BUG FIX: the original passed complib='zlib' here, so the `clib`
        # loop variable was never used — every row silently re-tested zlib.
        dts, dtr, size = save_and_read_them(
            mode=m, complevel=clevel, complib=clib,
            fletcher32=f, format=form, all=True,
        )
        collected_test.append(
            [m, clib, clevel, f, dts * 1000, dtr * 1000, size / 1_048_576]
        )
results_all_fixed = pd.DataFrame(
    collected_test,
    columns=["mode", "clib", "clevel", "fle32", "savetime", "readtime", "filesize"],
)
results_all_fixed

In [ ]:
# Benchmark: all tables, 'table' format, all complib/complevel combos.
collected_test = []
m = "w"
f = False
form = 'table'
for clib in ['zlib', 'lzo', 'bzip2', 'blosc']:
    for clevel in range(3):
        # BUG FIX: the original passed complib='zlib' here, so the `clib`
        # loop variable was never used — every row silently re-tested zlib.
        dts, dtr, size = save_and_read_them(
            mode=m, complevel=clevel, complib=clib,
            fletcher32=f, format=form, all=True,
        )
        collected_test.append(
            [m, clib, clevel, f, dts * 1000, dtr * 1000, size / 1_048_576]
        )
results_all = pd.DataFrame(
    collected_test,
    columns=["mode", "clib", "clevel", "fle32", "savetime", "readtime", "filesize"],
)
results_all

Summarizing compression tests

Seems like the best option is

  • use zlib (default) compression library
  • use a compression level of 1
  • use 'table' format for data and possibly for summary and step_table (but make sure to fix the dtypes of step_table), and possibly also for the infotable
  • use 'fixed' format for fidtable

The most critical parameter is read speed. Then comes save speed. Another important parameter is lookup speed!

Look-up speed and indexing

First - the influence of indexing on save/read speed


In [ ]:
# Baseline: a copy of the data table with its default RangeIndex.
datatable_not_indexed = datatable.copy()

In [ ]:
# Copy of the data table re-indexed on Data_Point (moves it out of the columns).
datatable_indexed_point = datatable.copy().set_index("Data_Point")

In [ ]:
# Benchmark: all tables, 'table' format, using the NON-indexed data table.
collected_test = []
m = "w"
f = False
form = 'table'
for clib in ['zlib', 'lzo', 'bzip2', 'blosc']:
    for clevel in range(3):
        # BUG FIX: the original passed complib='zlib' here, so the `clib`
        # loop variable was never used — every row silently re-tested zlib.
        dts, dtr, size = save_and_read_them(
            mode=m, complevel=clevel, complib=clib,
            fletcher32=f, format=form, all=True,
            dtable=datatable_not_indexed,
        )
        collected_test.append(
            [m, clib, clevel, f, dts * 1000, dtr * 1000, size / 1_048_576]
        )
results_all = pd.DataFrame(
    collected_test,
    columns=["mode", "clib", "clevel", "fle32", "savetime", "readtime", "filesize"],
)
results_all

In [ ]:
# Benchmark: all tables, 'table' format, using the Data_Point-indexed table.
collected_test = []
m = "w"
f = False
form = 'table'
for clib in ['zlib', 'lzo', 'bzip2', 'blosc']:
    for clevel in range(3):
        # BUG FIX: the original passed complib='zlib' here, so the `clib`
        # loop variable was never used — every row silently re-tested zlib.
        dts, dtr, size = save_and_read_them(
            mode=m, complevel=clevel, complib=clib,
            fletcher32=f, format=form, all=True,
            dtable=datatable_indexed_point,
        )
        collected_test.append(
            [m, clib, clevel, f, dts * 1000, dtr * 1000, size / 1_048_576]
        )
results_all = pd.DataFrame(
    collected_test,
    columns=["mode", "clib", "clevel", "fle32", "savetime", "readtime", "filesize"],
)
results_all

did not see much difference here (a bit smaller, but that could be because we dropped one column).

Then lets see if we can get cycle data directly from the hdf5 file

(but first, lets get to know the tables a little bit more)


In [ ]:
# File-format version recorded in the info table (single-row frame -> [0]).
infotable.cellpy_file_version.values[0]

NOTICE: a trick to read prms and save them using pandas


In [ ]:
# Trick: transpose the one-row info table and dump it to a plain dict keyed by
# column name, then rebuild a one-row DataFrame from that dict — a convenient
# way to round-trip parameters through pandas/HDF5.
infodict = infotable.T.to_dict()[0]
new_infotable = pd.DataFrame(infodict, index=[0])

In [ ]:
# Display the parameters as a plain dict.
infodict

In [ ]:
# Display the original info table for comparison.
infotable

In [ ]:
# Display the round-tripped info table — should match the original.
new_infotable

In [ ]:
# Inspect the step table statistics (reveals the missing data-point columns).
steptable.describe()

Oh no: datapoints are missing from the step table!!!!!!!


In [ ]:


In [ ]:


In [ ]:


In [ ]:


In [ ]:

TODOs

  1. Update the function that makes the step-table OK
    1. Ensure correct dtypes
    2. Add start and stop data-point numbers
  2. Make the Data_Point column the index for the dfdata DataFrame TOO MUCH WORK
  3. Save the data in hdf5-format using OK
    • table format
    • default complib
    • complevel of 1
  4. Use the "trick" explained (infotable.T etc) above for reading and saving prms from hdf5 using pandas DataFrames

In [ ]: