In [ ]:
import os
import time
from pathlib import Path
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import cellpy
from cellpy import cellreader
%matplotlib inline
In [ ]:
filename = Path("/Users/jepe/Arbeid/Data/celldata/20171120_nb034_11_cc.h5")
print(f"size: {filename.stat().st_size/1_048_576}")
In [ ]:
store = pd.HDFStore(filename)
In [ ]:
store.keys()
In [ ]:
infoname = '/CellpyData/info'
dataname = '/CellpyData/dfdata'
summaryname = '/CellpyData/dfsummary'
fidname = '/CellpyData/fidtable'
stepname = '/CellpyData/step_table'
In [ ]:
infotable = store.select(infoname)
datatable = store.select(dataname)
summarytable = store.select(summaryname)
fidtable = store.select(fidname)
steptable = store.select(stepname)
In [ ]:
datatable.describe()
In [ ]:
%timeit datatable = store.select(dataname)
In [ ]:
store.close()
In [ ]:
# make sure steptable has the correct dtypes
for col in steptable.columns:
    if col != "type":
        steptable[col] = steptable[col].apply(pd.to_numeric)
    else:
        steptable[col] = steptable[col].astype(str)
In [ ]:
path_name = '/Users/jepe/scripting/cellpy/dev_data/speedtest2.hf5'
In [ ]:
def save_and_read_them(mode=None, complevel=None, complib=None, fletcher32=False,
                       format='fixed', store_all=False, dtable=None):
    # open (create) the hdf5 store
    t0 = time.time()
    new_path = Path(path_name)
    new_store = pd.HDFStore(
        new_path,
        mode=mode,
        complevel=complevel,
        complib=complib,
        fletcher32=fletcher32,
    )
    t1 = time.time()
    dt = t1 - t0  # time spent opening the store (not returned)
    # put the table(s) into the store
    t0 = time.time()
    if dtable is not None:
        new_store.put(dataname, dtable, format=format)
    else:
        new_store.put(dataname, datatable, format=format)
    if store_all:
        new_store.put(fidname, fidtable, format='fixed')
        for name, table in zip([infoname, summaryname, stepname],
                               [infotable, summarytable, steptable]):
            try:
                new_store.put(name, table, format=format)
                # print(f" -- stored {name}")
            except Exception as e:
                print(f" -- FAILED storing {name}")
                print(e)
    new_store.close()
    size = new_path.stat().st_size
    t1 = time.time()
    dt_save = t1 - t0
    # re-open the store and read everything back
    t0 = time.time()
    new_path = Path(path_name)
    new_store = pd.HDFStore(new_path)
    datatable2 = new_store.select(dataname)
    if store_all:
        fidtable2 = new_store.select(fidname)
        infotable2 = new_store.select(infoname)
        summarytable2 = new_store.select(summaryname)
        steptable2 = new_store.select(stepname)
    t1 = time.time()
    dt_read = t1 - t0
    new_store.close()
    return dt_save, dt_read, size
In [ ]:
collected_test = []
m = "w"
f = False
form = 'table'
for clib in ['zlib', 'lzo', 'bzip2', 'blosc']:
    for clevel in range(3):
        dts, dtr, size = save_and_read_them(mode=m, complevel=clevel, complib=clib, fletcher32=f, format=form, store_all=False)
        # print(f"[cl = {clevel}] savetime: {dts*1000} ms - readtime: {dtr*1000} ms - size: {size}")
        collected_test.append([m, clib, clevel, f, dts*1000, dtr*1000, size/1_048_576])
results_only_data = pd.DataFrame(collected_test, columns=["mode", "clib", "clevel", "fle32", "savetime", "readtime", "filesize"])
results_only_data
In [ ]:
collected_test = []
m = "w"
f = False
form = 'fixed'
for clib in ['zlib', 'lzo', 'bzip2', 'blosc']:
    for clevel in range(3):
        dts, dtr, size = save_and_read_them(mode=m, complevel=clevel, complib=clib, fletcher32=f, format=form, store_all=True)
        # print(f"[cl = {clevel}] savetime: {dts*1000} ms - readtime: {dtr*1000} ms - size: {size}")
        collected_test.append([m, clib, clevel, f, dts*1000, dtr*1000, size/1_048_576])
results_all_fixed = pd.DataFrame(collected_test, columns=["mode", "clib", "clevel", "fle32", "savetime", "readtime", "filesize"])
results_all_fixed
In [ ]:
collected_test = []
m = "w"
f = False
form = 'table'
for clib in ['zlib', 'lzo', 'bzip2', 'blosc']:
    for clevel in range(3):
        dts, dtr, size = save_and_read_them(mode=m, complevel=clevel, complib=clib, fletcher32=f, format=form, store_all=True)
        # print(f"[cl = {clevel}] savetime: {dts*1000} ms - readtime: {dtr*1000} ms - size: {size}")
        collected_test.append([m, clib, clevel, f, dts*1000, dtr*1000, size/1_048_576])
results_all = pd.DataFrame(collected_test, columns=["mode", "clib", "clevel", "fle32", "savetime", "readtime", "filesize"])
results_all
Seems like the best option is ...
The most critical parameter is read speed, followed by save speed. Another important parameter is lookup speed!
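Lookup speed is not measured above. Below is a minimal sketch of how it could be timed, assuming the file at path_name was last written with format='table' (the 'fixed' format does not support where-queries) and that the stored index is numeric; the bounds in the query are arbitrary example values.
In [ ]:
# Sketch: compare a full read with a partial read (lookup) via a where-query.
# Assumes the last save used format='table'.
with pd.HDFStore(path_name, mode="r") as lookup_store:
    t0 = time.time()
    full = lookup_store.select(dataname)
    t_full = time.time() - t0
    t0 = time.time()
    subset = lookup_store.select(dataname, where="index >= 1000 & index < 2000")
    t_lookup = time.time() - t0
print(f"full read:   {t_full*1000:.1f} ms ({len(full)} rows)")
print(f"where-query: {t_lookup*1000:.1f} ms ({len(subset)} rows)")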
In [ ]:
datatable_not_indexed = datatable.copy()
In [ ]:
datatable_indexed_point = datatable.copy()
datatable_indexed_point = datatable_indexed_point.set_index("Data_Point");
In [ ]:
collected_test = []
m = "w"
f = False
form = 'table'
for clib in ['zlib', 'lzo', 'bzip2', 'blosc']:
    for clevel in range(3):
        dts, dtr, size = save_and_read_them(mode=m, complevel=clevel, complib=clib, fletcher32=f, format=form, store_all=True, dtable=datatable_not_indexed)
        # print(f"[cl = {clevel}] savetime: {dts*1000} ms - readtime: {dtr*1000} ms - size: {size}")
        collected_test.append([m, clib, clevel, f, dts*1000, dtr*1000, size/1_048_576])
results_all_not_indexed = pd.DataFrame(collected_test, columns=["mode", "clib", "clevel", "fle32", "savetime", "readtime", "filesize"])
results_all_not_indexed
In [ ]:
collected_test = []
m = "w"
f = False
form = 'table'
for clib in ['zlib', 'lzo', 'bzip2', 'blosc']:
    for clevel in range(3):
        dts, dtr, size = save_and_read_them(mode=m, complevel=clevel, complib=clib, fletcher32=f, format=form, store_all=True, dtable=datatable_indexed_point)
        # print(f"[cl = {clevel}] savetime: {dts*1000} ms - readtime: {dtr*1000} ms - size: {size}")
        collected_test.append([m, clib, clevel, f, dts*1000, dtr*1000, size/1_048_576])
results_all_indexed = pd.DataFrame(collected_test, columns=["mode", "clib", "clevel", "fle32", "savetime", "readtime", "filesize"])
results_all_indexed
Did not see much difference here (the indexed version is a bit smaller, but that could be because setting Data_Point as the index effectively drops one column from the data block).
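As a rough sanity check on that explanation, the in-memory footprints of the two variants can be compared (a sketch; memory_usage is only a proxy, since the on-disk size also depends on compression and the HDF5 layout).
In [ ]:
# Sketch: compare in-memory sizes of the indexed and non-indexed variants.
# Only a rough proxy for on-disk size (compression and layout differ).
size_not_indexed = datatable_not_indexed.memory_usage(deep=True).sum()
size_indexed = datatable_indexed_point.memory_usage(deep=True).sum()
print(f"not indexed:           {size_not_indexed/1_048_576:.1f} MB")
print(f"indexed on Data_Point: {size_indexed/1_048_576:.1f} MB")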
In [ ]:
infotable.cellpy_file_version.values[0]
NOTICE: a trick for reading and saving parameters (prms) using pandas
In [ ]:
infodict = infotable.T.to_dict()[0]
new_infotable = pd.DataFrame(infodict, index=[0])
In [ ]:
infodict
In [ ]:
infotable
In [ ]:
new_infotable
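To complete the round trip, the single-row parameter frame can be written to its own hdf5 file and read back into a plain dict (a sketch; the file path below is only an example).
In [ ]:
# Sketch: save the single-row parameter DataFrame and read it back as a dict.
# The path is just an example.
prm_path = Path('/Users/jepe/scripting/cellpy/dev_data/prms_test.h5')
with pd.HDFStore(prm_path, mode='w') as prm_store:
    prm_store.put(infoname, new_infotable, format='fixed')
with pd.HDFStore(prm_path, mode='r') as prm_store:
    prms = prm_store.select(infoname).iloc[0].to_dict()
prms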
In [ ]:
steptable.describe()
Oh no: data points are missing from the step table!
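A quick check of what is actually there, comparing the step-table columns against the data-table columns (a sketch; the exact column names depend on the cellpy file version).
In [ ]:
# Sketch: list step-table columns and look for data-point related columns
# that exist in the data table but not in the step table.
print("step-table columns:", list(steptable.columns))
missing = [c for c in datatable.columns
           if "point" in c.lower() and c not in steptable.columns]
print("point-like columns missing from the step table:", missing)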
Things to consider:
- making the Data_Point column the index for the dfdata DataFrame (TOO MUCH WORK)
- using the table format
- using the trick (infotable.T etc) above for reading and saving prms from hdf5 using pandas DataFrames
In [ ]: