Currently HDF5 is our standard file format for storing simulation results, and this notebook checks whether working with the raw data yields better performance, in terms of both speed and file size.
The idea is to emulate an ALFASim simulation that generates a large amount of data (profiles and trends). We save all pipe variables as profiles and, for simplicity, define the trends for each pipe as the values at the first position.
Set some constants that determine the problem size and the output frequencies, and decide where the data will be stored:
In [145]:
number_of_pipes = 10
number_of_cells = 500
total_simulation_time = 6 * 60 * 60 # [s], 6 hours of simulation
trend_output_frequency = 2 # [s]
profile_output_frequency = 5 * 60 # [s]
initial_time_step = 1.0e-2 # [s]
time_step_increase_factor = 1.1
maximum_time_step = 2 # [s]
import tempfile
import os
import json
import numpy
root_path = os.getcwd()
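As a quick sanity check (my addition, not part of the original notebook), the constants above imply upper bounds on the number of outputs and the raw size of each profile snapshot (the strict '>' comparisons in the simulation loop below skip some outputs, so the actual counts come out lower):

max_trend_outputs = total_simulation_time // trend_output_frequency      # 10800 at most
max_profile_outputs = total_simulation_time // profile_output_frequency  # 72 at most
# 10 pipes x 4 variables x 500 cells x 8 bytes (float64) = 160000 bytes per profile snapshot
bytes_per_profile_snapshot = number_of_pipes * 4 * number_of_cells * 8
print(max_trend_outputs, max_profile_outputs, bytes_per_profile_snapshot)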
Let's create a list of pipes, each one with some output properties and random values. During a real simulation the data in each pipe would change, but for our purposes it can stay the same throughout.
In [146]:
import numpy as np

output_variables_per_pipe = [
    'pressure',
    'temperature',
    'gas_velocity',
    'liquid_velocity',
]
positions = number_of_pipes * [0]
class Pipe:
    __slots__ = list(output_variables_per_pipe)

pipes = []
for i in range(number_of_pipes):
    pipes.append(Pipe())
    for var in output_variables_per_pipe:
        setattr(pipes[-1], var, np.random.rand(number_of_cells))
Time step sizes are not constant during a simulation, so emulate a time step that grows until it reaches the maximum value.
In [147]:
def new_time_step(time_step, time_step_increase_factor, maximum_time_step):
    new_time_step = time_step * time_step_increase_factor
    if new_time_step > maximum_time_step:
        return maximum_time_step
    else:
        return new_time_step
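Equivalently, the whole body could be a one-liner: return min(time_step * time_step_increase_factor, maximum_time_step).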
Create a folder to store the simulation results. In a real application this folder would be named after the project, but here a simple name is enough; just check whether it already exists and clear its content if necessary. A folder for the metadata is also created inside the main one.
In [148]:
def create_dirs():
    project_name = 'my_project_folder'
    project_path = os.path.join(root_path, project_name)
    meta_path = os.path.join(project_path, 'meta')
    if not os.path.exists(project_path):
        os.makedirs(project_path)
    else:
        # clear the directory content
        # in real life we would need to ask the user what to do: delete and rerun the simulation, or stop
        import shutil
        for file in os.listdir(project_path):
            file_path = os.path.join(project_path, file)
            if os.path.isfile(file_path):
                os.unlink(file_path)
            elif os.path.isdir(file_path):
                shutil.rmtree(file_path, ignore_errors=True)
    os.makedirs(meta_path)
    return project_path, meta_path
Some auxiliary functions to actually write the data...
In [149]:
def write_profiles(pipes, tmp_path):
    profile_path = os.path.join(tmp_path, 'profiles')
    os.makedirs(profile_path)
    metadata = []
    profile_index = 0
    for pipe_index, pipe in enumerate(pipes):
        pipe_name = 'pipe_' + '{:03d}'.format(pipe_index)
        for var in sorted(output_variables_per_pipe):
            md = {}
            output_file = 'profile_' + '{:03d}'.format(profile_index) + '.binary'
            data = getattr(pipe, var)
            data.tofile(os.path.join(profile_path, output_file))
            # np.savez_compressed(os.path.join(profile_path, output_file), a=data)
            md['property_id'] = var
            md['edge'] = pipe_name
            md['fn'] = output_file
            md['size'] = len(data)
            profile_index += 1
            metadata.append(md)
    return metadata
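For reference, each entry appended to the profiles metadata is a small dictionary; the first one produced above (the variables are sorted, so gas_velocity comes first) looks like:

{
    "property_id": "gas_velocity",
    "edge": "pipe_000",
    "fn": "profile_000.binary",
    "size": 500
}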
In [150]:
def write_trends(pipes, positions, tmp_path):
    trends_path = os.path.join(tmp_path, 'trends')
    os.makedirs(trends_path)
    metadata = []
    trend_data = []
    output_file = 'trends.binary'
    for pipe_index, (pipe, position) in enumerate(zip(pipes, positions)):
        pipe_name = 'pipe_' + '{:03d}'.format(pipe_index)
        for var in sorted(output_variables_per_pipe):
            md = {}
            full_data = getattr(pipe, var)
            trend_data.append(full_data[position])
            md['property_id'] = var
            md['edge'] = pipe_name
            md['fn'] = output_file
            md['position'] = position
            metadata.append(md)
    trend_data = np.array(trend_data)
    trend_data.tofile(os.path.join(trends_path, output_file))
    return metadata
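Note that all trend values for a given output time end up in a single trends.binary file, laid out pipe by pipe with the variables in alphabetical order; a reader has to rely on the order of the entries in the metadata to slice the file correctly.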
In [151]:
def write_metadata(folder_path, metadata, filename):
    with open(os.path.join(folder_path, filename), 'w') as file:
        json.dump(obj=metadata, fp=file, indent=4)
In [152]:
def write_timeset(folder_path, timeset):
    ts = numpy.array(timeset)
    ts_values_file = os.path.join(folder_path, 'timeset.values.binary')
    ts.tofile(ts_values_file)
    content = {'unit': 's', 'count': len(timeset), 'fn': 'timeset.values.binary'}
    with open(os.path.join(folder_path, 'timeset.json'), 'w') as file:
        json.dump(obj=content, fp=file, indent=4)
In [153]:
def write_metadata_and_timeset(metadata_path, folder_name, profile_metadata, trend_metadata, timeset):
    tmp_path = tempfile.mkdtemp(dir=metadata_path)
    write_metadata(tmp_path, profile_metadata, 'profiles.json')
    write_metadata(tmp_path, trend_metadata, 'trends.json')
    write_timeset(tmp_path, timeset)
    os.rename(tmp_path, os.path.join(metadata_path, folder_name))
In [154]:
def update_timeset_file(timeset, meta_path):
    ts = numpy.array(timeset)
    tmp_values_file = os.path.join(meta_path, 'tmp.values.binary')
    ts.tofile(tmp_values_file)
    tmp_meta_file = os.path.join(meta_path, 'tmp.json')
    content = {'unit': 's', 'count': len(timeset), 'fn': 'timeset.values.binary'}
    with open(tmp_meta_file, 'w') as file:
        json.dump(obj=content, fp=file, indent=4)
    ts_values_file = os.path.join(meta_path, 'timeset.values.binary')
    ts_meta_file = os.path.join(meta_path, 'timeset.json')
    if os.path.exists(ts_values_file):
        os.remove(ts_values_file)
    if os.path.exists(ts_meta_file):
        os.remove(ts_meta_file)
    os.rename(tmp_values_file, ts_values_file)
    os.rename(tmp_meta_file, ts_meta_file)
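Note the pattern used here (and in write_metadata_and_timeset above): the data is first written to temporary files and then renamed into place. On POSIX systems the rename is atomic, so a concurrent reader never observes a half-written timeset; os.rename fails on Windows when the target exists, which is why the os.remove calls come first, at the cost of a small window where the files are absent (on Python 3, os.replace would overwrite atomically and avoid that window).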
This is the actual simulation loop, encapsulated in a function just to make timing it below easier:
In [155]:
def solve():
    time_step_index = 0
    t = 0
    time_step = initial_time_step
    elapsed_time_since_last_trend_output = 0.0
    elapsed_time_since_last_profile_output = 0.0
    timeset = []
    # create the basic directory structure if it doesn't exist; if it exists, clear its content
    project_path, meta_path = create_dirs()
    meta_folder_name = ''
    last_profile_metadata = profile_metadata = None
    last_trend_metadata = trend_metadata = None
    while t < total_simulation_time:
        ts_folder_name = 'TS_' + '{:08d}'.format(time_step_index)
        timeset.append(t)
        if time_step_index == 0:
            # always save the initial condition, profiles and trends
            tmp_path = tempfile.mkdtemp(dir=project_path)
            profile_metadata = write_profiles(pipes, tmp_path)
            trend_metadata = write_trends(pipes, positions, tmp_path)
            os.rename(tmp_path, os.path.join(project_path, ts_folder_name))
            # also create the metadata folder if everything went fine up to here
            meta_folder_name = 'meta_' + '{:08d}'.format(time_step_index)
            write_metadata_and_timeset(meta_path, meta_folder_name, profile_metadata, trend_metadata, timeset)
            last_profile_metadata = profile_metadata
            last_trend_metadata = trend_metadata
        # call the solve methods; in this case just use the values already in the pipes list
        elapsed_time_since_last_trend_output += time_step
        elapsed_time_since_last_profile_output += time_step
        if elapsed_time_since_last_trend_output > trend_output_frequency or \
                elapsed_time_since_last_profile_output > profile_output_frequency:
            tmp_path = tempfile.mkdtemp(dir=project_path)
            if elapsed_time_since_last_trend_output > trend_output_frequency:
                trend_metadata = write_trends(pipes, positions, tmp_path)
                elapsed_time_since_last_trend_output = 0.0
            if elapsed_time_since_last_profile_output > profile_output_frequency:
                profile_metadata = write_profiles(pipes, tmp_path)
                elapsed_time_since_last_profile_output = 0.0
            os.rename(tmp_path, os.path.join(project_path, ts_folder_name))
            # update the timeset file
            update_timeset_file(timeset, os.path.join(meta_path, meta_folder_name))
            # if the metadata changed, create a new directory and store the updated information
            if last_profile_metadata != profile_metadata or last_trend_metadata != trend_metadata:
                meta_folder_name = 'meta_' + '{:08d}'.format(time_step_index)
                write_metadata_and_timeset(meta_path, meta_folder_name, profile_metadata, trend_metadata, timeset)
                # keep the references up to date so a new folder is only created on the next change
                last_profile_metadata = profile_metadata
                last_trend_metadata = trend_metadata
        time_step = new_time_step(time_step, time_step_increase_factor, maximum_time_step)
        t += time_step
        time_step_index += 1
In [156]:
%%timeit
solve()
The first implementation used a single .json file to store the timeset but, since it can become quite large (it holds all simulated time steps, not only the saved ones), the dump to the .json file was a bottleneck: 93.7% of the time was spent in the update_timeset_file method alone:
%lprun -f solve solve()
Timer unit: 3.01859e-07 s
Total time: 378.498 s
File: <ipython-input-38-26d1dd4204e0>
Function: solve at line 1
Line # Hits Time Per Hit % Time Line Contents
==============================================================
1 def solve():
2
3 1 6 6.0 0.0 time_step_index = 0
4 1 3 3.0 0.0 t = 0
5 1 3 3.0 0.0 time_step = initial_time_step
6
7 1 2 2.0 0.0 elapsed_time_since_last_trend_output = 0.0
8 1 2 2.0 0.0 elapsed_time_since_last_profile_output = 0.0
9
10 1 3 3.0 0.0 timeset = []
11
12 1 29782315 29782315.0 2.4 project_path, meta_path = create_dirs()
13
14 1 10 10.0 0.0 last_profile_metadata = None
15 1 5 5.0 0.0 last_trend_metadata = None
16
17 10846 38824 3.6 0.0 while t < total_simulation_time:
18
19 10845 210044 19.4 0.0 ts_folder_name = 'TS_' + '{:08d}'.format(time_step_index)
20
21 10845 50351 4.6 0.0 timeset.append(t)
22
23 10845 32718 3.0 0.0 if time_step_index == 0:
24 # always save initial condition
25 1 2049 2049.0 0.0 tmp_path = tempfile.mkdtemp(dir=project_path)
26
27 1 107427 107427.0 0.0 profile_metadata = write_profiles(pipes, tmp_path)
28 1 4265 4265.0 0.0 trend_metadata = write_trends(pipes, positions, tmp_path)
29 1 1919 1919.0 0.0 os.rename(tmp_path, os.path.join(project_path, ts_folder_name))
30
31 # also create the metadata folder if everyting goes fine until here
32 1 26 26.0 0.0 meta_folder_name = 'meta_' + '{:08d}'.format(time_step_index)
33 1 42555 42555.0 0.0 write_metadata_and_timeset(meta_path, meta_folder_name, profile_metadata, trend_metadata, timeset)
34
35 1 7 7.0 0.0 last_profile_metadata = profile_metadata
36 1 2 2.0 0.0 last_trend_metadata = trend_metadata
37
38 # set up and run the model but here this is not necessary as there is no simulation,
39 # we just use the values in the pipes list
40
41 10845 36874 3.4 0.0 elapsed_time_since_last_trend_output += time_step
42 10845 33156 3.1 0.0 elapsed_time_since_last_profile_output += time_step
43
44 10845 42624 3.9 0.0 if elapsed_time_since_last_trend_output > trend_output_frequency or elapsed_time_since_last_profile_output > profile_output_frequency:
45
46 5437 5030271 925.2 0.4 tmp_path = tempfile.mkdtemp(dir=project_path)
47
48 5437 57617 10.6 0.0 if elapsed_time_since_last_trend_output > trend_output_frequency:
49 5402 23472668 4345.2 1.9 trend_metadata = write_trends(pipes, positions, tmp_path)
50 5402 37765 7.0 0.0 elapsed_time_since_last_trend_output = 0.0
51
52 5437 34341 6.3 0.0 if elapsed_time_since_last_profile_output > profile_output_frequency:
53 71 7417821 104476.4 0.6 profile_metadata = write_profiles(pipes, tmp_path)
54 71 685 9.6 0.0 elapsed_time_since_last_profile_output = 0.0
55
56 5437 11762379 2163.4 0.9 os.rename(tmp_path, os.path.join(project_path, ts_folder_name))
57
58 # update timeset file
59 5437 1174911177 216095.5 93.7 update_timeset_file(timeset, os.path.join(meta_path, meta_folder_name))
60
61 # if metadata changed, create a new directory and store the updated information
62 5437 548502 100.9 0.0 if last_profile_metadata != profile_metadata or last_trend_metadata != trend_metadata:
63 meta_folder_name = 'meta_' + '{:08d}'.format(time_step_index)
64 write_metadata_and_timeset(meta_path, meta_folder_name, profile_metadata, trend_metadata, timeset)
65
66
67 10845 143380 13.2 0.0 time_step = new_time_step(time_step, time_step_increase_factor, maximum_time_step)
68 10845 48121 4.4 0.0 t += time_step
69 10845 39312 3.6 0.0 time_step_index += 1
In more detail:
%lprun -f update_timeset_file solve()
Timer unit: 3.01859e-07 s
Total time: 356.392 s
File: <ipython-input-37-7e67500ff9fe>
Function: update_timeset_file at line 1
Line # Hits Time Per Hit % Time Line Contents
==============================================================
1 def update_timeset_file(timeset, meta_path):
2
3 5437 369593 68.0 0.0 tmp_file = os.path.join(meta_path, 'tmp.json')
4 5437 320227 58.9 0.0 timeset_file = os.path.join(meta_path, 'timeset.json')
5
6 5437 64538 11.9 0.0 content = dict({'unit': 's', 'values': timeset})
7 5437 3746910 689.2 0.3 with open(tmp_file, 'w') as file:
8 5437 1156926499 212787.7 98.0 json.dump(obj=content, fp=file, indent=4)
9
10 5437 2704180 497.4 0.2 if os.path.exists(timeset_file):
11 5437 6365348 1170.7 0.5 os.remove(timeset_file)
12
13 5437 10157657 1868.2 0.9 os.rename(tmp_file, timeset_file)
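A minimal standalone sketch (my addition, not from the original notebook) confirming the effect: serializing a timeset-sized list of floats through json.dump costs far more than dumping the same values as raw bytes with ndarray.tofile:

import json, os, tempfile, time
import numpy as np

values = list(np.random.rand(100000))  # a timeset-sized list of floats
with tempfile.TemporaryDirectory() as d:
    start = time.perf_counter()
    with open(os.path.join(d, 'timeset.json'), 'w') as f:
        json.dump({'unit': 's', 'values': values}, f, indent=4)
    json_time = time.perf_counter() - start
    start = time.perf_counter()
    np.array(values).tofile(os.path.join(d, 'timeset.values.binary'))
    binary_time = time.perf_counter() - start
print(json_time, binary_time)  # the binary dump is typically orders of magnitude faster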
The decision was to split the information into two files: one containing the timeset metadata (timeset.json) and another containing the values as a numpy dump (timeset.values):
%lprun -f solve solve()
Timer unit: 3.01859e-07 s
Total time: 35.2779 s
File: <ipython-input-11-9d0838aa97cd>
Function: solve at line 1
Line # Hits Time Per Hit % Time Line Contents
==============================================================
1 def solve():
2
3 1 6 6.0 0.0 time_step_index = 0
4 1 3 3.0 0.0 t = 0
5 1 3 3.0 0.0 time_step = initial_time_step
6
7 1 2 2.0 0.0 elapsed_time_since_last_trend_output = 0.0
8 1 3 3.0 0.0 elapsed_time_since_last_profile_output = 0.0
9
10 1 2 2.0 0.0 timeset = []
11
12 # create basic directory structire if it doesn't exist, if it exits clear content
13 1 197836 197836.0 0.2 project_path, meta_path = create_dirs()
14 1 7 7.0 0.0 meta_folder_name = ''
15
16 1 3 3.0 0.0 last_profile_metadata = profile_metadata = None
17 1 2 2.0 0.0 last_trend_metadata = trend_metadata = None
18
19 10846 39773 3.7 0.0 while t < total_simulation_time:
20
21 10845 165287 15.2 0.1 ts_folder_name = 'TS_' + '{:08d}'.format(time_step_index)
22
23 10845 47141 4.3 0.0 timeset.append(t)
24
25 10845 31701 2.9 0.0 if time_step_index == 0:
26 # always save initial condition, profiles and trends
27 1 953 953.0 0.0 tmp_path = tempfile.mkdtemp(dir=project_path)
28
29 1 103800 103800.0 0.1 profile_metadata = write_profiles(pipes, tmp_path)
30 1 3713 3713.0 0.0 trend_metadata = write_trends(pipes, positions, tmp_path)
31 1 1826 1826.0 0.0 os.rename(tmp_path, os.path.join(project_path, ts_folder_name))
32
33 # also create the metadata folder if everyting goes fine until here
34 1 18 18.0 0.0 meta_folder_name = 'meta_' + '{:08d}'.format(time_step_index)
35 1 41405 41405.0 0.0 write_metadata_and_timeset(meta_path, meta_folder_name, profile_metadata, trend_metadata, timeset)
36
37 1 6 6.0 0.0 last_profile_metadata = profile_metadata
38 1 3 3.0 0.0 last_trend_metadata = trend_metadata
39
40 # call solve methods or in this case just use the values in the pipes list
41
42 10845 37281 3.4 0.0 elapsed_time_since_last_trend_output += time_step
43 10845 32066 3.0 0.0 elapsed_time_since_last_profile_output += time_step
44
45 10845 41421 3.8 0.0 if elapsed_time_since_last_trend_output > trend_output_frequency or elapsed_time_since_last_profile_output > profile_output_frequency:
46
47 5437 4479454 823.9 3.8 tmp_path = tempfile.mkdtemp(dir=project_path)
48
49 5437 46541 8.6 0.0 if elapsed_time_since_last_trend_output > trend_output_frequency:
50 5402 22644276 4191.8 19.4 trend_metadata = write_trends(pipes, positions, tmp_path)
51 5402 36690 6.8 0.0 elapsed_time_since_last_trend_output = 0.0
52
53 5437 32757 6.0 0.0 if elapsed_time_since_last_profile_output > profile_output_frequency:
54 71 7820215 110143.9 6.7 profile_metadata = write_profiles(pipes, tmp_path)
55 71 714 10.1 0.0 elapsed_time_since_last_profile_output = 0.0
56
57 5437 12242185 2251.6 10.5 os.rename(tmp_path, os.path.join(project_path, ts_folder_name))
58
59 # update timeset file
60 5437 68224529 12548.2 58.4 update_timeset_file(timeset, os.path.join(meta_path, meta_folder_name))
61
62 # if metadata changed, create a new directory and store the updated information
63 5437 392792 72.2 0.3 if last_profile_metadata != profile_metadata or last_trend_metadata != trend_metadata:
64 meta_folder_name = 'meta_' + '{:08d}'.format(time_step_index)
65 write_metadata_and_timeset(meta_path, meta_folder_name, profile_metadata, trend_metadata, timeset)
66
67
68 10845 122973 11.3 0.1 time_step = new_time_step(time_step, time_step_increase_factor, maximum_time_step)
69 10845 45225 4.2 0.0 t += time_step
70 10845 36260 3.3 0.0 time_step_index += 1
Again in more detail:
%lprun -f update_timeset_file solve()
Timer unit: 3.01859e-07 s
Total time: 20.5189 s
File: <ipython-input-10-1a69b20160e0>
Function: update_timeset_file at line 1
Line # Hits Time Per Hit % Time Line Contents
==============================================================
1 def update_timeset_file(timeset, meta_path):
2
3 5437 6109988 1123.8 9.0 ts = numpy.array(timeset)
4 5437 370292 68.1 0.5 tmp_values_file = os.path.join(meta_path, 'tmp.values')
5 5437 14579577 2681.5 21.4 ts.dump(tmp_values_file)
6
7 5437 455100 83.7 0.7 tmp_meta_file = os.path.join(meta_path, 'tmp.json')
8 5437 63703 11.7 0.1 content = dict({'unit': 's', 'count': len(timeset), 'fn': 'timeset.values'})
9 5437 3681250 677.1 5.4 with open(tmp_meta_file, 'w') as file:
10 5437 9022392 1659.4 13.3 json.dump(obj=content, fp=file, indent=4)
11
12 5437 455519 83.8 0.7 ts_values_file = os.path.join(meta_path, 'timeset.values')
13 5437 314816 57.9 0.5 ts_meta_file = os.path.join(meta_path, 'timeset.json')
14
15 5437 2227507 409.7 3.3 if os.path.exists(ts_values_file):
16 5437 5394110 992.1 7.9 os.remove(ts_values_file)
17 5437 2048175 376.7 3.0 if os.path.exists(ts_meta_file):
18 5437 4809399 884.6 7.1 os.remove(ts_meta_file)
19
20 5437 9412867 1731.3 13.8 os.rename(tmp_values_file, ts_values_file)
21 5437 9030373 1660.9 13.3 os.rename(tmp_meta_file, ts_meta_file)
Before: 378.498 s
After: 35.2779 s
In [157]:
#%load_ext line_profiler
http://glowingpython.blogspot.com.br/2014/08/quick-hdf5-with-pandas.html
The inspiration comes from the link above: it seems Pandas' DataFrame can be used to easily represent the HDF groups, creating and appending data where necessary. Let's try it then...
In [158]:
import numpy as np
from pandas import HDFStore, DataFrame, set_option
set_option('io.hdf.default_format', 'table')
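This default matters below: unlike the 'fixed' format (pandas' normal default), the 'table' format supports appending rows to an existing key, which write_trends_2_hdf relies on.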
In [159]:
def write_profiles_2_hdf(pipes, output_file, time_step_index, time):
    hdf = HDFStore(output_file, complevel=9, complib='blosc')
    for pipe_index, pipe in enumerate(pipes):
        pipe_name = 'pipe_' + '{:03d}'.format(pipe_index)
        folder_name = 'profiles/time_' + '{:06d}'.format(time_step_index) + '/' + pipe_name
        local = {}
        for var in output_variables_per_pipe:
            local[var] = getattr(pipe, var)
        df = DataFrame(local)
        hdf.put(folder_name, df)
    hdf.close()
In [160]:
def write_trends_2_hdf(pipes, positions, output_file, time_step_index, time):
    hdf = HDFStore(output_file, complevel=9, complib='blosc')
    for pipe_index, pipe in enumerate(pipes):
        pipe_name = 'pipe_' + '{:03d}'.format(pipe_index)
        folder_name = 'trends/' + pipe_name
        local = {}
        for var, pos in zip(output_variables_per_pipe, positions):
            local[var] = getattr(pipe, var)[pos]
        df = DataFrame(local, index=[time_step_index])
        if time_step_index == 0:
            hdf.put(folder_name, df, index=False)
        else:
            hdf.append(folder_name, df, index=False)
    hdf.close()
In [161]:
def update_timeset_2_hdf(timeset, time_step_index, output_file):
    hdf = HDFStore(output_file, complevel=9, complib='blosc')
    hdf.put('timeset', DataFrame(timeset))
    hdf.close()
In [162]:
def solve_hdf():
    time_step_index = 0
    t = 0
    time_step = initial_time_step
    elapsed_time_since_last_trend_output = 0.0
    elapsed_time_since_last_profile_output = 0.0
    timeset = []
    output_file = os.path.join(root_path, 'hdf_results_file.h5')
    if os.path.exists(output_file):
        os.remove(output_file)
    last_profile_metadata = profile_metadata = None
    last_trend_metadata = trend_metadata = None
    while t < total_simulation_time:
        timeset.append(t)
        if time_step_index == 0:
            # always save the initial condition, profiles and trends
            write_profiles_2_hdf(pipes, output_file, time_step_index, t)
            write_trends_2_hdf(pipes, positions, output_file, time_step_index, t)
        # call the solve methods; in this case just use the values already in the pipes list
        elapsed_time_since_last_trend_output += time_step
        elapsed_time_since_last_profile_output += time_step
        if elapsed_time_since_last_trend_output > trend_output_frequency or \
                elapsed_time_since_last_profile_output > profile_output_frequency:
            if elapsed_time_since_last_trend_output > trend_output_frequency:
                write_trends_2_hdf(pipes, positions, output_file, time_step_index, t)
                elapsed_time_since_last_trend_output = 0.0
            if elapsed_time_since_last_profile_output > profile_output_frequency:
                write_profiles_2_hdf(pipes, output_file, time_step_index, t)
                elapsed_time_since_last_profile_output = 0.0
            # update the timeset
            update_timeset_2_hdf(timeset, time_step_index, output_file)
        time_step = new_time_step(time_step, time_step_increase_factor, maximum_time_step)
        t += time_step
        time_step_index += 1
In [163]:
%%timeit
solve_hdf()
Using compression makes the file size drop from 62.7 MB to 24.3 MB.
Using index=False while appending data made the total time drop from ~780 to ~400 seconds.
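For reference, a minimal standalone sketch (my addition, hypothetical file names) of how the compression settings are passed and how the resulting on-disk sizes can be compared:

df = DataFrame({'x': np.random.rand(100000)})
with HDFStore('uncompressed.h5') as hdf:  # default: no compression
    hdf.put('data', df)
with HDFStore('compressed.h5', complevel=9, complib='blosc') as hdf:
    hdf.put('data', df)
print(os.path.getsize('uncompressed.h5'), os.path.getsize('compressed.h5'))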
The results are almost the same in terms of file size, but in terms of time, dumping the raw numpy data proved much faster. The main cause is that appending trend data to the HDF file is an expensive operation. If, instead of appending, the trends are stored with a structure similar to the profiles' (saving the trend data at every output time), the file gets much bigger, around 335 MB, and writing slows down as more elements are added to the HDF file (a sketch of this variant appears after the profiler listings below).
%lprun -f solve_hdf solve_hdf()
Timer unit: 3.01859e-07 s
Total time: 561.247 s
File: <ipython-input-18-5225533419f9>
Function: solve_hdf at line 1
Line # Hits Time Per Hit % Time Line Contents
==============================================================
1 def solve_hdf():
2
3 1 7 7.0 0.0 time_step_index = 0
4 1 3 3.0 0.0 t = 0
5 1 3 3.0 0.0 time_step = initial_time_step
6
7 1 4 4.0 0.0 elapsed_time_since_last_trend_output = 0.0
8 1 2 2.0 0.0 elapsed_time_since_last_profile_output = 0.0
9
10 1 3 3.0 0.0 timeset = []
11
12 1 140 140.0 0.0 output_file = os.path.join(root_path, 'hdf_results_file.h5')
13 1 2035 2035.0 0.0 if os.path.exists(output_file):
14 1 9709 9709.0 0.0 os.remove(output_file)
15
16 1 9 9.0 0.0 last_profile_metadata = profile_metadata = None
17 1 5 5.0 0.0 last_trend_metadata = trend_metadata = None
18
19 10846 38872 3.6 0.0 while t < total_simulation_time:
20
21 10845 39093 3.6 0.0 timeset.append(t)
22
23 10845 29691 2.7 0.0 if time_step_index == 0:
24 # always save initial condition, profiles and trends
25 1 615936 615936.0 0.0 write_profiles_2_hdf(pipes, output_file, time_step_index, t)
26 1 250852 250852.0 0.0 write_trends_2_hdf(pipes, positions, output_file, time_step_index, t)
27
28 # call solve methods or in this case just use the values in the pipes list
29
30 10845 33713 3.1 0.0 elapsed_time_since_last_trend_output += time_step
31 10845 31333 2.9 0.0 elapsed_time_since_last_profile_output += time_step
32
33 10845 42973 4.0 0.0 if elapsed_time_since_last_trend_output > trend_output_frequency or elapsed_time_since_last_profile_output > profile_output_frequency:
34
35 5437 14523 2.7 0.0 if elapsed_time_since_last_trend_output > trend_output_frequency:
36 5402 1437442723 266094.5 77.3 write_trends_2_hdf(pipes, positions, output_file, time_step_index, t)
37 5402 44686 8.3 0.0 elapsed_time_since_last_trend_output = 0.0
38
39 5437 50935 9.4 0.0 if elapsed_time_since_last_profile_output > profile_output_frequency:
40 71 38007387 535315.3 2.0 write_profiles_2_hdf(pipes, output_file, time_step_index, t)
41 71 593 8.4 0.0 elapsed_time_since_last_profile_output = 0.0
42
43 # update timeset file
44 5437 382398143 70332.6 20.6 timeset = update_timeset_2_hdf(timeset, time_step_index, output_file)
45
46 10845 167670 15.5 0.0 time_step = new_time_step(time_step, time_step_increase_factor, maximum_time_step)
47 10845 44826 4.1 0.0 t += time_step
48 10845 34471 3.2 0.0 time_step_index += 1
%lprun -f write_trends_2_hdf solve_hdf()
Timer unit: 3.01859e-07 s
Total time: 466.429 s
File: <ipython-input-16-0960896438b6>
Function: write_trends_2_hdf at line 1
Line # Hits Time Per Hit % Time Line Contents
==============================================================
1 def write_trends_2_hdf(pipes, positions, output_file, time_step_index, time):
2
3 5403 26576072 4918.8 1.7 hdf = HDFStore(output_file, complevel=9, complib='blosc')
4
5 5403 33586 6.2 0.0 pipe_index = 0
6
7 59433 254140 4.3 0.0 for pipe in pipes:
8 54030 1539888 28.5 0.1 pipe_name = 'pipe_' + '{:03d}'.format(pipe_index)
9 54030 233666 4.3 0.0 folder_name = 'trends/' + pipe_name
10 54030 317760 5.9 0.0 local={}
11
12 270150 1168972 4.3 0.1 for var, pos in zip(output_variables_per_pipe, positions):
13 216120 1076875 5.0 0.1 data = getattr(pipe, var)
14 216120 1114198 5.2 0.1 local[var] = data[pos]
15
16 54030 246591786 4564.0 16.0 df = DataFrame(local, index=[time_step_index])
17 54030 319697 5.9 0.0 if time_step_index == 0:
18 10 214424 21442.4 0.0 hdf.put(folder_name, df, index=False)
19 else:
20 54020 1226677630 22707.8 79.4 hdf.append(folder_name, df, index=False)
21 54030 471009 8.7 0.0 pipe_index += 1
22
23
24 5403 38597675 7143.7 2.5 hdf.close()
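A sketch (my reconstruction from the description above, not the notebook's original code) of the per-time-step trend variant mentioned earlier: each trend sample gets its own group, mirroring the profile layout, so no append is needed, but the number of nodes and the file size grow quickly:

def write_trends_2_hdf_per_step(pipes, positions, output_file, time_step_index, time):
    hdf = HDFStore(output_file, complevel=9, complib='blosc')
    for pipe_index, pipe in enumerate(pipes):
        pipe_name = 'pipe_' + '{:03d}'.format(pipe_index)
        # one group per output time, as done for the profiles
        folder_name = 'trends/time_' + '{:06d}'.format(time_step_index) + '/' + pipe_name
        local = {}
        for var, pos in zip(output_variables_per_pipe, positions):
            local[var] = [getattr(pipe, var)[pos]]  # one-row frame
        hdf.put(folder_name, DataFrame(local))
    hdf.close()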
There is still the possibility of using numpy's compressed format (numpy.savez_compressed); in this case:
Before: 26.2 MB and 46 s
After: 13.9 MB and 104 s
If we use the compressed format only for the profiles, we achieve a better total file size (a 43% reduction) with less impact on time (a 14% increase):
14.9 MB and 52.4 s
Since we now have both .npz and .npy (compressed and uncompressed) files, it would be good to record this distinction in the metadata files as well, even though the same function (numpy.load) loads either format.
Although numpy.savez_compressed is convenient, its file format already embeds some metadata together with the data. The decision was therefore to use the rawest format possible, so ndarray.tofile was also tested:
11.3 MB and ~40 s
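One caveat worth noting (my addition): ndarray.tofile writes raw bytes with no dtype or shape header, so the reader must already know both; recording the dtype next to 'size' in the metadata would make the format self-describing. A minimal round trip, with a hypothetical file name:

data = np.random.rand(number_of_cells)  # float64 by construction
data.tofile('example.binary')           # raw bytes, no header
restored = np.fromfile('example.binary', dtype=np.float64)
assert np.array_equal(data, restored)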
In [164]:
def read_raw_profile(desired_properties):
    project_name = 'my_project_folder'
    project_path = os.path.join(root_path, project_name)
    meta_path = os.path.join(project_path, 'meta')
    if not os.path.exists(project_path):
        raise IOError('Cannot find the project folder: %s' % project_path)
    if not os.path.exists(meta_path):
        raise IOError('Cannot find the metadata folder: %s' % meta_path)
    # in this case there is a single metadata folder, but we could have more than one
    profiles_meta_file = os.path.join(meta_path, 'meta_00000000', 'profiles.json')
    with open(profiles_meta_file) as file:
        profiles_meta = json.load(file)
    for desired_property, desired_edge in desired_properties:
        meta = None
        for pf in profiles_meta:
            if pf['edge'] == desired_edge and pf['property_id'] == desired_property:
                meta = pf
                break
        data_fn = meta['fn']
        surface = []
        for subdir, dirs, files in os.walk(project_path):
            for dir in dirs:
                if dir.startswith('TS'):
                    profile_path = os.path.join(subdir, dir, 'profiles')
                    if os.path.exists(profile_path):
                        file_path = os.path.join(profile_path, data_fn)
                        surface.append(np.fromfile(file_path))
        # now use the array to do whatever you want...
In [165]:
single_property = [
    ('pressure', 'pipe_002'),
]
In [166]:
%%timeit
# time to read one profile
read_raw_profile(single_property)
In [167]:
#lprun -f read_raw_profile read_raw_profile(single_property)
In [168]:
all_properties = []
for i in range(number_of_pipes):
    pipe_name = 'pipe_' + '{:03d}'.format(i)
    for prop in output_variables_per_pipe:
        all_properties.append((prop, pipe_name))
In [169]:
%%timeit
# time to read several profiles
read_raw_profile(all_properties)
Reading one profile takes ~2 seconds, while reading 40 profiles (4 variables x 10 pipes) takes ~80 seconds: a fairly linear relationship, which is good...
In [170]:
def read_hdf_profile(desired_properties):
    from pandas import read_hdf
    import pandas
    with pandas.HDFStore('hdf_results_file.h5') as hdf:
        for desired_property, desired_edge in desired_properties:
            surface = []
            for key in hdf.keys():
                if 'profiles' in key and desired_edge in key:
                    data = read_hdf('hdf_results_file.h5', key)
                    surface.append(data[desired_property].values)
In [171]:
%%timeit
read_hdf_profile(single_property)
In [172]:
%%timeit
read_hdf_profile(all_properties)
In [173]:
#lprun -f read_hdf_profile read_hdf_profile(single_property)
Reading one property from the HDF file takes ~4.5 seconds and reading 40 properties takes ~190 seconds (~42 times more), also fairly linear...
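For completeness, a minimal sketch (my addition) of how the timeset could be read back using the metadata written by update_timeset_file:

def read_timeset(meta_folder_path):
    # the JSON side holds the unit, the value count and the values file name
    with open(os.path.join(meta_folder_path, 'timeset.json')) as file:
        meta = json.load(file)
    # the values were dumped with ndarray.tofile as raw float64
    values = np.fromfile(os.path.join(meta_folder_path, meta['fn']), dtype=np.float64)
    assert len(values) == meta['count']
    return values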
In [ ]: