Test Write Strategy

Currently, HDF5 is our standard file format for storing simulation results, and this notebook aims to check whether working with raw data yields better performance (in terms of both speed and file size).

The idea is to emulate an ALFASim simulation that generates a large amount of data (profiles and trends). We save all pipe variables as profiles and, for simplicity, define each pipe's trends as the values at its first position.

Set some constants that determine the problem size, the output frequencies, and where to store the data.


In [145]:
number_of_pipes = 10
number_of_cells = 500
total_simulation_time = 6 * 60 * 60  # [s], 6 hours of simulation

trend_output_frequency = 2  # [s]
profile_output_frequency = 5 * 60  # [s]

initial_time_step = 1.0e-2  # [s]
time_step_increase_factor = 1.1
maximum_time_step = 2  # [s]

import tempfile
import os
import json
import numpy

root_path = os.getcwd()

Let's create a list of pipes, each one with some output properties and random values. During a real simulation the data for each pipe would change, but for our purposes it can always stay the same.


In [146]:
from collections import namedtuple
import numpy as np

output_variables_per_pipe = [
    'pressure', 
    'temperature', 
    'gas_velocity', 
    'liquid_velocity'
]

# trend sampling position for each pipe (the first cell, for simplicity)
positions = number_of_pipes * [0]

class Pipe:
    __slots__ = list(output_variables_per_pipe)


pipes = []
for i in range(number_of_pipes):
    pipes.append(Pipe())
    for var in output_variables_per_pipe:
        setattr(pipes[-1], var, np.random.rand(number_of_cells))

Time step sizes are not constant during the simulation, so we emulate a time step that grows until it reaches the maximum value.


In [147]:
def new_time_step(time_step, time_step_increase_factor, maximum_time_step):
    new_time_step = time_step * time_step_increase_factor
    if new_time_step > maximum_time_step:
        return maximum_time_step
    else:
        return new_time_step

Raw data from NumPy and metadata

Create a folder to store the simulation results. In a real run this would be associated with the project name, but here a simple name is enough; just check whether it already exists and clear its contents if necessary. A folder inside the main one is also created to store the metadata.


In [148]:
def create_dirs():

    project_name = 'my_project_folder'
    project_path = os.path.join(root_path, project_name)
    meta_path = os.path.join(project_path, 'meta')

    if not os.path.exists(project_path):
        os.makedirs(project_path)
    else:
        # clear the directory content
        # (in real life we would need to ask the user what to do: delete and rerun the simulation, or stop)
        import shutil
        for file in os.listdir(project_path):
            file_path = os.path.join(project_path, file)

            if os.path.isfile(file_path):
                os.unlink(file_path)
            elif os.path.isdir(file_path): 
                shutil.rmtree(file_path, ignore_errors=True)
    
    os.makedirs(meta_path)
    
    return project_path, meta_path

Some auxiliary methods to actually write the data...


In [149]:
def write_profiles(pipes, tmp_path):
    
    profile_path = os.path.join(tmp_path, 'profiles')
    os.makedirs(profile_path)
    
    metadata = []
    pipe_index = 0
    profile_index = 0
    
    for pipe in pipes:
        pipe_name = 'pipe_' + '{:03d}'.format(pipe_index)
        
        for var in sorted(output_variables_per_pipe):
            md = {}
            output_file = 'profile_' + '{:03d}'.format(profile_index) + '.binary'
            data = getattr(pipe, var)
            data.tofile(os.path.join(profile_path, output_file))
            #np.savez_compressed(os.path.join(profile_path, output_file), a=data)
            
            md['property_id'] = var
            md['edge'] = pipe_name
            md['fn'] = output_file
            md['size'] = len(data)
            
            profile_index += 1
            metadata.append(md)
            
        pipe_index += 1

    return metadata

In [150]:
def write_trends(pipes, positions, tmp_path):
    from numpy import array
    
    trends_path = os.path.join(tmp_path, 'trends')
    os.makedirs(trends_path)
    
    metadata = []
    trend_data = []
    i = 0
    
    output_file = 'trends.binary'
    
    for pipe, position in zip(pipes, positions):
        pipe_name = 'pipe_' + '{:03d}'.format(i)
        
        for var in sorted(output_variables_per_pipe):
            md = {}
            
            full_data = getattr(pipe, var)
            trend_data.append(full_data[position])
            
            md['property_id'] = var
            md['edge'] = pipe_name
            md['fn'] = output_file
            md['position'] = position
            
            metadata.append(md)

        i += 1

    trend_data = array(trend_data)
    trend_data.tofile(os.path.join(trends_path, output_file))

    return metadata

In [151]:
def write_metadata(folder_path, metadata, filename):

    with open(os.path.join(folder_path, filename), 'w') as file:
        json.dump(obj=metadata, fp=file, indent=4)

In [152]:
def write_timeset(folder_path, timeset):
    ts = numpy.array(timeset)
    ts_values_file = os.path.join(folder_path, 'timeset.values.binary')
    ts.tofile(ts_values_file)
    
    content = dict({'unit': 's', 'count': len(timeset), 'fn': 'timeset.values.binary'})
    with open(os.path.join(folder_path, 'timeset.json'), 'w') as file:
        json.dump(obj=content, fp=file, indent=4)

In [153]:
def write_metadata_and_timeset(metadata_path, folder_name, profile_metadata, trend_metadata, timeset):
    tmp_path = tempfile.mkdtemp(dir=metadata_path)
    write_metadata(tmp_path, profile_metadata, 'profiles.json')
    write_metadata(tmp_path, trend_metadata, 'trends.json')
    write_timeset(tmp_path, timeset)
    os.rename(tmp_path, os.path.join(metadata_path, folder_name))

In [154]:
def update_timeset_file(timeset, meta_path):

    ts = numpy.array(timeset)
    tmp_values_file = os.path.join(meta_path, 'tmp.values.binary')
    ts.tofile(tmp_values_file)
    
    tmp_meta_file = os.path.join(meta_path, 'tmp.json')
    content = dict({'unit': 's', 'count': len(timeset), 'fn': 'timeset.values.binary'})
    with open(tmp_meta_file, 'w') as file:
        json.dump(obj=content, fp=file, indent=4)        
    
    ts_values_file = os.path.join(meta_path, 'timeset.values.binary')
    ts_meta_file = os.path.join(meta_path, 'timeset.json')
    
    if os.path.exists(ts_values_file):
        os.remove(ts_values_file)
    if os.path.exists(ts_meta_file):
        os.remove(ts_meta_file)
 
    os.rename(tmp_values_file, ts_values_file)
    os.rename(tmp_meta_file, ts_meta_file)

This is the actual simulation loop, encapsulated into a function just to make timing it below easier:


In [155]:
def solve():    
    
    time_step_index = 0
    t = 0
    time_step = initial_time_step

    elapsed_time_since_last_trend_output = 0.0
    elapsed_time_since_last_profile_output = 0.0

    timeset = []

    # create the basic directory structure if it doesn't exist; if it exists, clear its content
    project_path, meta_path = create_dirs()
    meta_folder_name = ''

    last_profile_metadata = profile_metadata = None
    last_trend_metadata = trend_metadata = None

    while t < total_simulation_time:

        ts_folder_name = 'TS_' + '{:08d}'.format(time_step_index)
        
        timeset.append(t)

        if time_step_index == 0:
            # always save initial condition, profiles and trends
            tmp_path = tempfile.mkdtemp(dir=project_path)

            profile_metadata = write_profiles(pipes, tmp_path)
            trend_metadata = write_trends(pipes, positions, tmp_path)
            os.rename(tmp_path, os.path.join(project_path, ts_folder_name))
            
            # also create the metadata folder if everything goes fine until here
            meta_folder_name = 'meta_' + '{:08d}'.format(time_step_index)
            write_metadata_and_timeset(meta_path, meta_folder_name, profile_metadata, trend_metadata, timeset)
            
            last_profile_metadata = profile_metadata
            last_trend_metadata = trend_metadata
            
        # call solve methods or in this case just use the values in the pipes list

        elapsed_time_since_last_trend_output += time_step
        elapsed_time_since_last_profile_output += time_step

        if elapsed_time_since_last_trend_output > trend_output_frequency or \
            elapsed_time_since_last_profile_output > profile_output_frequency:

            tmp_path = tempfile.mkdtemp(dir=project_path)

            if elapsed_time_since_last_trend_output > trend_output_frequency:
                trend_metadata = write_trends(pipes, positions, tmp_path)
                elapsed_time_since_last_trend_output = 0.0

            if elapsed_time_since_last_profile_output > profile_output_frequency:
                profile_metadata = write_profiles(pipes, tmp_path)
                elapsed_time_since_last_profile_output = 0.0

            os.rename(tmp_path, os.path.join(project_path, ts_folder_name))
            
            # update timeset file 
            update_timeset_file(timeset, os.path.join(meta_path, meta_folder_name))
            
            # if metadata changed, create a new directory and store the updated information
            if last_profile_metadata != profile_metadata or last_trend_metadata != trend_metadata:
                meta_folder_name = 'meta_' + '{:08d}'.format(time_step_index)
                write_metadata_and_timeset(meta_path, meta_folder_name, profile_metadata, trend_metadata, timeset)
                # keep track of the last written metadata, otherwise this branch
                # would re-trigger on every subsequent output
                last_profile_metadata = profile_metadata
                last_trend_metadata = trend_metadata

                
        time_step = new_time_step(time_step, time_step_increase_factor, maximum_time_step)
        t += time_step
        time_step_index += 1

In [156]:
%%timeit
solve()


36.3 s ± 420 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)

The first implementation used a .json file to store the timeset, but since the timeset can become quite big (it holds all simulated time steps, not only the saved ones), the dump to the .json file was a bottleneck: 93.7% of the time was spent in the update_timeset_file method alone:
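To see why in isolation, here is a minimal sketch (not one of the original notebook cells) that mimics the pattern: the timeset grows on every output and each output re-serializes the entire list, so the total serialized volume grows roughly as n²/2. The iteration count matches the ~5400 outputs seen in the profile, and the dump goes to an in-memory buffer to leave disk I/O out of the picture.

In [ ]:
import io
import json
import time

# emulate ~5400 outputs, each dumping the entire timeset accumulated so far
timeset = []
total = 0.0
for k in range(5437):
    timeset.append(float(k))
    start = time.perf_counter()
    json.dump(obj=timeset, fp=io.StringIO(), indent=4)  # dump to memory, not disk
    total += time.perf_counter() - start
print('total time spent in json.dump: %.1f s' % total)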

%lprun -f solve solve()

Timer unit: 3.01859e-07 s

Total time: 378.498 s
File: <ipython-input-38-26d1dd4204e0>
Function: solve at line 1

Line #      Hits         Time  Per Hit   % Time  Line Contents
==============================================================
     1                                           def solve():    
     2                                               
     3         1            6      6.0      0.0      time_step_index = 0
     4         1            3      3.0      0.0      t = 0
     5         1            3      3.0      0.0      time_step = initial_time_step
     6                                           
     7         1            2      2.0      0.0      elapsed_time_since_last_trend_output = 0.0
     8         1            2      2.0      0.0      elapsed_time_since_last_profile_output = 0.0
     9                                           
    10         1            3      3.0      0.0      timeset = []
    11                                           
    12         1     29782315 29782315.0      2.4      project_path, meta_path = create_dirs()
    13                                           
    14         1           10     10.0      0.0      last_profile_metadata = None
    15         1            5      5.0      0.0      last_trend_metadata = None
    16                                           
    17     10846        38824      3.6      0.0      while t < total_simulation_time:
    18                                           
    19     10845       210044     19.4      0.0          ts_folder_name = 'TS_' + '{:08d}'.format(time_step_index)
    20                                                   
    21     10845        50351      4.6      0.0          timeset.append(t)
    22                                           
    23     10845        32718      3.0      0.0          if time_step_index == 0:
    24                                                       # always save initial condition 
    25         1         2049   2049.0      0.0              tmp_path = tempfile.mkdtemp(dir=project_path)
    26                                           
    27         1       107427 107427.0      0.0              profile_metadata = write_profiles(pipes, tmp_path)
    28         1         4265   4265.0      0.0              trend_metadata = write_trends(pipes, positions, tmp_path)
    29         1         1919   1919.0      0.0              os.rename(tmp_path, os.path.join(project_path, ts_folder_name))
    30                                                       
    31                                                       # also create the metadata folder if everyting goes fine until here
    32         1           26     26.0      0.0              meta_folder_name = 'meta_' + '{:08d}'.format(time_step_index)
    33         1        42555  42555.0      0.0              write_metadata_and_timeset(meta_path, meta_folder_name, profile_metadata, trend_metadata, timeset)
    34                                                       
    35         1            7      7.0      0.0              last_profile_metadata = profile_metadata
    36         1            2      2.0      0.0              last_trend_metadata = trend_metadata
    37                                                       
    38                                                   # set up and run the model but here this is not necessary as there is no simulation,
    39                                                   # we just use the values in the pipes list
    40                                           
    41     10845        36874      3.4      0.0          elapsed_time_since_last_trend_output += time_step
    42     10845        33156      3.1      0.0          elapsed_time_since_last_profile_output += time_step
    43                                           
    44     10845        42624      3.9      0.0          if elapsed_time_since_last_trend_output > trend_output_frequency or             elapsed_time_since_last_profile_output > profile_output_frequency:
    45                                           
    46      5437      5030271    925.2      0.4              tmp_path = tempfile.mkdtemp(dir=project_path)
    47                                           
    48      5437        57617     10.6      0.0              if elapsed_time_since_last_trend_output > trend_output_frequency:
    49      5402     23472668   4345.2      1.9                  trend_metadata = write_trends(pipes, positions, tmp_path)
    50      5402        37765      7.0      0.0                  elapsed_time_since_last_trend_output = 0.0
    51                                           
    52      5437        34341      6.3      0.0              if elapsed_time_since_last_profile_output > profile_output_frequency:
    53        71      7417821 104476.4      0.6                  profile_metadata = write_profiles(pipes, tmp_path)
    54        71          685      9.6      0.0                  elapsed_time_since_last_profile_output = 0.0
    55                                           
    56      5437     11762379   2163.4      0.9              os.rename(tmp_path, os.path.join(project_path, ts_folder_name))
    57                                                       
    58                                                       # update timeset file 
    59      5437   1174911177 216095.5     93.7              update_timeset_file(timeset, os.path.join(meta_path, meta_folder_name))
    60                                                       
    61                                                       # if metadata changed, create a new directory and store the updated information
    62      5437       548502    100.9      0.0              if last_profile_metadata != profile_metadata or last_trend_metadata != trend_metadata:
    63                                                           meta_folder_name = 'meta_' + '{:08d}'.format(time_step_index)
    64                                                           write_metadata_and_timeset(meta_path, meta_folder_name, profile_metadata, trend_metadata, timeset)
    65                                           
    66                                                           
    67     10845       143380     13.2      0.0          time_step = new_time_step(time_step, time_step_increase_factor, maximum_time_step)
    68     10845        48121      4.4      0.0          t += time_step
    69     10845        39312      3.6      0.0          time_step_index += 1

In more detail:

%lprun -f update_timeset_file solve()

Timer unit: 3.01859e-07 s

Total time: 356.392 s
File: <ipython-input-37-7e67500ff9fe>
Function: update_timeset_file at line 1

Line #      Hits         Time  Per Hit   % Time  Line Contents
==============================================================
     1                                           def update_timeset_file(timeset, meta_path):
     2                                           
     3      5437       369593     68.0      0.0      tmp_file = os.path.join(meta_path, 'tmp.json')
     4      5437       320227     58.9      0.0      timeset_file = os.path.join(meta_path, 'timeset.json')
     5                                           
     6      5437        64538     11.9      0.0      content = dict({'unit': 's', 'values': timeset})
     7      5437      3746910    689.2      0.3      with open(tmp_file, 'w') as file:
     8      5437   1156926499 212787.7     98.0          json.dump(obj=content, fp=file, indent=4)
     9                                           
    10      5437      2704180    497.4      0.2      if os.path.exists(timeset_file):
    11      5437      6365348   1170.7      0.5          os.remove(timeset_file)
    12                                           
    13      5437     10157657   1868.2      0.9      os.rename(tmp_file, timeset_file)

The decision was to split the information and save two files: one containing the timeset metadata (timeset.json) and the other containing the values as a raw NumPy dump (timeset.values).

%lprun -f solve solve()

Timer unit: 3.01859e-07 s

Total time: 35.2779 s
File: <ipython-input-11-9d0838aa97cd>
Function: solve at line 1

Line #      Hits         Time  Per Hit   % Time  Line Contents
==============================================================
     1                                           def solve():    
     2                                               
     3         1            6      6.0      0.0      time_step_index = 0
     4         1            3      3.0      0.0      t = 0
     5         1            3      3.0      0.0      time_step = initial_time_step
     6                                           
     7         1            2      2.0      0.0      elapsed_time_since_last_trend_output = 0.0
     8         1            3      3.0      0.0      elapsed_time_since_last_profile_output = 0.0
     9                                           
    10         1            2      2.0      0.0      timeset = []
    11                                           
    12                                               # create basic directory structire if it doesn't exist, if it exits clear content
    13         1       197836 197836.0      0.2      project_path, meta_path = create_dirs()
    14         1            7      7.0      0.0      meta_folder_name = ''
    15                                           
    16         1            3      3.0      0.0      last_profile_metadata = profile_metadata = None
    17         1            2      2.0      0.0      last_trend_metadata = trend_metadata = None
    18                                           
    19     10846        39773      3.7      0.0      while t < total_simulation_time:
    20                                           
    21     10845       165287     15.2      0.1          ts_folder_name = 'TS_' + '{:08d}'.format(time_step_index)
    22                                                   
    23     10845        47141      4.3      0.0          timeset.append(t)
    24                                           
    25     10845        31701      2.9      0.0          if time_step_index == 0:
    26                                                       # always save initial condition, profiles and trends
    27         1          953    953.0      0.0              tmp_path = tempfile.mkdtemp(dir=project_path)
    28                                           
    29         1       103800 103800.0      0.1              profile_metadata = write_profiles(pipes, tmp_path)
    30         1         3713   3713.0      0.0              trend_metadata = write_trends(pipes, positions, tmp_path)
    31         1         1826   1826.0      0.0              os.rename(tmp_path, os.path.join(project_path, ts_folder_name))
    32                                                       
    33                                                       # also create the metadata folder if everyting goes fine until here
    34         1           18     18.0      0.0              meta_folder_name = 'meta_' + '{:08d}'.format(time_step_index)
    35         1        41405  41405.0      0.0              write_metadata_and_timeset(meta_path, meta_folder_name, profile_metadata, trend_metadata, timeset)
    36                                                       
    37         1            6      6.0      0.0              last_profile_metadata = profile_metadata
    38         1            3      3.0      0.0              last_trend_metadata = trend_metadata
    39                                                       
    40                                                   # call solve methods or in this case just use the values in the pipes list
    41                                           
    42     10845        37281      3.4      0.0          elapsed_time_since_last_trend_output += time_step
    43     10845        32066      3.0      0.0          elapsed_time_since_last_profile_output += time_step
    44                                           
    45     10845        41421      3.8      0.0          if elapsed_time_since_last_trend_output > trend_output_frequency or             elapsed_time_since_last_profile_output > profile_output_frequency:
    46                                           
    47      5437      4479454    823.9      3.8              tmp_path = tempfile.mkdtemp(dir=project_path)
    48                                           
    49      5437        46541      8.6      0.0              if elapsed_time_since_last_trend_output > trend_output_frequency:
    50      5402     22644276   4191.8     19.4                  trend_metadata = write_trends(pipes, positions, tmp_path)
    51      5402        36690      6.8      0.0                  elapsed_time_since_last_trend_output = 0.0
    52                                           
    53      5437        32757      6.0      0.0              if elapsed_time_since_last_profile_output > profile_output_frequency:
    54        71      7820215 110143.9      6.7                  profile_metadata = write_profiles(pipes, tmp_path)
    55        71          714     10.1      0.0                  elapsed_time_since_last_profile_output = 0.0
    56                                           
    57      5437     12242185   2251.6     10.5              os.rename(tmp_path, os.path.join(project_path, ts_folder_name))
    58                                                       
    59                                                       # update timeset file 
    60      5437     68224529  12548.2     58.4              update_timeset_file(timeset, os.path.join(meta_path, meta_folder_name))
    61                                                       
    62                                                       # if metadata changed, create a new directory and store the updated information
    63      5437       392792     72.2      0.3              if last_profile_metadata != profile_metadata or last_trend_metadata != trend_metadata:
    64                                                           meta_folder_name = 'meta_' + '{:08d}'.format(time_step_index)
    65                                                           write_metadata_and_timeset(meta_path, meta_folder_name, profile_metadata, trend_metadata, timeset)
    66                                           
    67                                                           
    68     10845       122973     11.3      0.1          time_step = new_time_step(time_step, time_step_increase_factor, maximum_time_step)
    69     10845        45225      4.2      0.0          t += time_step
    70     10845        36260      3.3      0.0          time_step_index += 1

Again in more detail:

%lprun -f update_timeset_file solve()    

Timer unit: 3.01859e-07 s

Total time: 20.5189 s
File: <ipython-input-10-1a69b20160e0>
Function: update_timeset_file at line 1

Line #      Hits         Time  Per Hit   % Time  Line Contents
==============================================================
     1                                           def update_timeset_file(timeset, meta_path):
     2                                           
     3      5437      6109988   1123.8      9.0      ts = numpy.array(timeset)
     4      5437       370292     68.1      0.5      tmp_values_file = os.path.join(meta_path, 'tmp.values')
     5      5437     14579577   2681.5     21.4      ts.dump(tmp_values_file)
     6                                               
     7      5437       455100     83.7      0.7      tmp_meta_file = os.path.join(meta_path, 'tmp.json')
     8      5437        63703     11.7      0.1      content = dict({'unit': 's', 'count': len(timeset), 'fn': 'timeset.values'})
     9      5437      3681250    677.1      5.4      with open(tmp_meta_file, 'w') as file:
    10      5437      9022392   1659.4     13.3          json.dump(obj=content, fp=file, indent=4)        
    11                                               
    12      5437       455519     83.8      0.7      ts_values_file = os.path.join(meta_path, 'timeset.values')
    13      5437       314816     57.9      0.5      ts_meta_file = os.path.join(meta_path, 'timeset.json')
    14                                               
    15      5437      2227507    409.7      3.3      if os.path.exists(ts_values_file):
    16      5437      5394110    992.1      7.9          os.remove(ts_values_file)
    17      5437      2048175    376.7      3.0      if os.path.exists(ts_meta_file):
    18      5437      4809399    884.6      7.1          os.remove(ts_meta_file)
    19                                            
    20      5437      9412867   1731.3     13.8      os.rename(tmp_values_file, ts_values_file)
    21      5437      9030373   1660.9     13.3      os.rename(tmp_meta_file, ts_meta_file)

Before: 378.498 s

After: 35.2779 s (roughly a 10.7x speedup)
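For completeness, reading the split timeset back is cheap. A minimal sketch, assuming the meta folder layout produced by write_metadata_and_timeset (read_timeset itself is not defined anywhere else in this notebook):

In [ ]:
def read_timeset(meta_folder):
    # the JSON part holds only the metadata; the values are a raw dump
    with open(os.path.join(meta_folder, 'timeset.json')) as file:
        meta = json.load(file)

    # float64 is implied: the timeset values are Python floats
    values = numpy.fromfile(os.path.join(meta_folder, meta['fn']))
    assert len(values) == meta['count']
    return values, meta['unit']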


In [157]:
#%load_ext line_profiler



HDF5 with Pandas

http://glowingpython.blogspot.com.br/2014/08/quick-hdf5-with-pandas.html

The inspiration comes from the link above: it seems pandas' DataFrame can be used to easily represent the HDF groups, creating and appending data where necessary. Let's try it then...


In [158]:
import numpy as np
from pandas import HDFStore, DataFrame, set_option

set_option('io.hdf.default_format', 'table')

In [159]:
def write_profiles_2_hdf(pipes, output_file, time_step_index, time):

    hdf = HDFStore(output_file, complevel=9, complib='blosc')
    
    pipe_index = 0
    
    for pipe in pipes:
        pipe_name = 'pipe_' + '{:03d}'.format(pipe_index)
        folder_name = 'profiles/time_' + '{:06d}'.format(time_step_index) + '/' + pipe_name
        local = {}
        
        for var in output_variables_per_pipe:
            data = getattr(pipe, var)
            local[var] = data

        df = DataFrame(local)
        hdf.put(folder_name, df)
        pipe_index += 1
  
    hdf.close()

In [160]:
def write_trends_2_hdf(pipes, positions, output_file, time_step_index, time):

    hdf = HDFStore(output_file, complevel=9, complib='blosc')
    
    pipe_index = 0

    for pipe, position in zip(pipes, positions):
        pipe_name = 'pipe_' + '{:03d}'.format(pipe_index)
        folder_name = 'trends/' + pipe_name
        local = {}
        
        # positions holds one sampling position per pipe, so the same
        # position is used for all variables of this pipe
        for var in output_variables_per_pipe:
            data = getattr(pipe, var)
            local[var] = data[position]

        df = DataFrame(local, index=[time_step_index])
        if time_step_index == 0:
            hdf.put(folder_name, df, index=False)
        else:
            hdf.append(folder_name, df, index=False)
        pipe_index += 1


    hdf.close()

In [161]:
def update_timeset_2_hdf(timeset, time_step_index, output_file):
    hdf = HDFStore(output_file, complevel=9, complib='blosc')
    folder_name = 'timeset'   
    
    hdf.put(folder_name, DataFrame(timeset))

    hdf.close()

In [162]:
def solve_hdf():    
    
    time_step_index = 0
    t = 0
    time_step = initial_time_step

    elapsed_time_since_last_trend_output = 0.0
    elapsed_time_since_last_profile_output = 0.0

    timeset = []

    output_file = os.path.join(root_path, 'hdf_results_file.h5')
    if os.path.exists(output_file):
        os.remove(output_file)


    while t < total_simulation_time:
      
        timeset.append(t)

        if time_step_index == 0:
            # always save initial condition, profiles and trends
            write_profiles_2_hdf(pipes, output_file, time_step_index, t)
            write_trends_2_hdf(pipes, positions, output_file, time_step_index, t)
            
        # call solve methods or in this case just use the values in the pipes list

        elapsed_time_since_last_trend_output += time_step
        elapsed_time_since_last_profile_output += time_step

        if elapsed_time_since_last_trend_output > trend_output_frequency or \
            elapsed_time_since_last_profile_output > profile_output_frequency:

            if elapsed_time_since_last_trend_output > trend_output_frequency:
                write_trends_2_hdf(pipes, positions, output_file, time_step_index, t)
                elapsed_time_since_last_trend_output = 0.0

            if elapsed_time_since_last_profile_output > profile_output_frequency:
                write_profiles_2_hdf(pipes, output_file, time_step_index, t)
                elapsed_time_since_last_profile_output = 0.0
            
            # update timeset file 
            update_timeset_2_hdf(timeset, time_step_index, output_file)
                
        time_step = new_time_step(time_step, time_step_increase_factor, maximum_time_step)
        t += time_step
        time_step_index += 1

In [163]:
%%timeit 
solve_hdf()

Using compression makes the file size drop from 62.7 MB to 24.3 MB.

Using index=False while appending data made the total time drop from ~780 to ~400 seconds.

Results are almost the same in terms of file size, but in terms of time, dumping the raw NumPy data proved much faster. The main cause is that appending trend data is an expensive operation. If, instead of appending, the trends are written with a structure similar to the profiles' (a full set of trend data saved at every output time), the file gets bigger, around 335 MB, and the time still increases as more elements are added to the HDF file. See the sketch after this paragraph.
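For reference, a sketch of that non-appending variant (hypothetical; write_trends_2_hdf_no_append is not part of the tested code and relies on the globals defined earlier). This is the layout that grew to ~335 MB:

In [ ]:
def write_trends_2_hdf_no_append(pipes, positions, output_file, time_step_index):
    hdf = HDFStore(output_file, complevel=9, complib='blosc')

    for pipe_index, (pipe, position) in enumerate(zip(pipes, positions)):
        # one new group per output time, mirroring the profiles layout
        folder_name = ('trends/time_' + '{:06d}'.format(time_step_index)
                       + '/pipe_' + '{:03d}'.format(pipe_index))
        local = {var: getattr(pipe, var)[position]
                 for var in output_variables_per_pipe}

        # put() always writes a fresh group, so no costly append is needed,
        # but every output adds a full set of groups to the file
        hdf.put(folder_name, DataFrame(local, index=[time_step_index]))

    hdf.close()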

%lprun -f solve_hdf solve_hdf()

Timer unit: 3.01859e-07 s

Total time: 561.247 s
File: <ipython-input-18-5225533419f9>
Function: solve_hdf at line 1

Line #      Hits         Time  Per Hit   % Time  Line Contents
==============================================================
     1                                           def solve_hdf():    
     2                                               
     3         1            7      7.0      0.0      time_step_index = 0
     4         1            3      3.0      0.0      t = 0
     5         1            3      3.0      0.0      time_step = initial_time_step
     6                                           
     7         1            4      4.0      0.0      elapsed_time_since_last_trend_output = 0.0
     8         1            2      2.0      0.0      elapsed_time_since_last_profile_output = 0.0
     9                                           
    10         1            3      3.0      0.0      timeset = []
    11                                           
    12         1          140    140.0      0.0      output_file = os.path.join(root_path, 'hdf_results_file.h5')
    13         1         2035   2035.0      0.0      if os.path.exists(output_file):
    14         1         9709   9709.0      0.0          os.remove(output_file)
    15                                           
    16         1            9      9.0      0.0      last_profile_metadata = profile_metadata = None
    17         1            5      5.0      0.0      last_trend_metadata = trend_metadata = None
    18                                           
    19     10846        38872      3.6      0.0      while t < total_simulation_time:
    20                                                 
    21     10845        39093      3.6      0.0          timeset.append(t)
    22                                           
    23     10845        29691      2.7      0.0          if time_step_index == 0:
    24                                                       # always save initial condition, profiles and trends
    25         1       615936 615936.0      0.0              write_profiles_2_hdf(pipes, output_file, time_step_index, t)
    26         1       250852 250852.0      0.0              write_trends_2_hdf(pipes, positions, output_file, time_step_index, t)
    27                                                       
    28                                                   # call solve methods or in this case just use the values in the pipes list
    29                                           
    30     10845        33713      3.1      0.0          elapsed_time_since_last_trend_output += time_step
    31     10845        31333      2.9      0.0          elapsed_time_since_last_profile_output += time_step
    32                                           
    33     10845        42973      4.0      0.0          if elapsed_time_since_last_trend_output > trend_output_frequency or             elapsed_time_since_last_profile_output > profile_output_frequency:
    34                                           
    35      5437        14523      2.7      0.0              if elapsed_time_since_last_trend_output > trend_output_frequency:
    36      5402   1437442723 266094.5     77.3                  write_trends_2_hdf(pipes, positions, output_file, time_step_index, t)
    37      5402        44686      8.3      0.0                  elapsed_time_since_last_trend_output = 0.0
    38                                           
    39      5437        50935      9.4      0.0              if elapsed_time_since_last_profile_output > profile_output_frequency:
    40        71     38007387 535315.3      2.0                  write_profiles_2_hdf(pipes, output_file, time_step_index, t)
    41        71          593      8.4      0.0                  elapsed_time_since_last_profile_output = 0.0
    42                                                       
    43                                                       # update timeset file 
    44      5437    382398143  70332.6     20.6              timeset = update_timeset_2_hdf(timeset, time_step_index, output_file)
    45                                                           
    46     10845       167670     15.5      0.0          time_step = new_time_step(time_step, time_step_increase_factor, maximum_time_step)
    47     10845        44826      4.1      0.0          t += time_step
    48     10845        34471      3.2      0.0          time_step_index += 1

%lprun -f write_trends_2_hdf solve_hdf()

Timer unit: 3.01859e-07 s

Total time: 466.429 s
File: <ipython-input-16-0960896438b6>
Function: write_trends_2_hdf at line 1

Line #      Hits         Time  Per Hit   % Time  Line Contents
==============================================================
     1                                           def write_trends_2_hdf(pipes, positions, output_file, time_step_index, time):
     2                                           
     3      5403     26576072   4918.8      1.7      hdf = HDFStore(output_file, complevel=9, complib='blosc')
     4                                               
     5      5403        33586      6.2      0.0      pipe_index = 0
     6                                           
     7     59433       254140      4.3      0.0      for pipe in pipes:
     8     54030      1539888     28.5      0.1          pipe_name = 'pipe_' + '{:03d}'.format(pipe_index)
     9     54030       233666      4.3      0.0          folder_name = 'trends/' + pipe_name
    10     54030       317760      5.9      0.0          local={}
    11                                                   
    12    270150      1168972      4.3      0.1          for var, pos in zip(output_variables_per_pipe, positions):
    13    216120      1076875      5.0      0.1              data = getattr(pipe, var)
    14    216120      1114198      5.2      0.1              local[var] = data[pos]
    15                                           
    16     54030    246591786   4564.0     16.0          df = DataFrame(local, index=[time_step_index])
    17     54030       319697      5.9      0.0          if time_step_index == 0:
    18        10       214424  21442.4      0.0              hdf.put(folder_name, df, index=False)
    19                                                   else:
    20     54020   1226677630  22707.8     79.4              hdf.append(folder_name, df, index=False)
    21     54030       471009      8.7      0.0          pipe_index += 1
    22                                           
    23                                           
    24      5403     38597675   7143.7      2.5      hdf.close()

There is still the possibility of using NumPy's compressed format, numpy.savez_compressed; in this case:

Before: 26.2 MB and 46 s

After: 13.9 MB and 104 s

If we use the compressed format only for the profiles, we achieve a better total file size (43% reduction) with less impact on time (14% increase):

14.9 MB and 52.4 s
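A minimal sketch of what the compressed variant looks like (cf. the commented-out line inside write_profiles; the file name here is just an example):

In [ ]:
data = np.random.rand(number_of_cells)
np.savez_compressed('profile_000.npz', a=data)  # compressed .npz archive

# numpy.load opens the archive; arrays are accessed by keyword name
restored = np.load('profile_000.npz')['a']
assert np.array_equal(data, restored)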

Since we would then have both .npz and .npy (compressed and uncompressed) files, it would be good to record this information as an item in the metadata files, even though the same method (numpy.load) can be used to load both.

Although numpy.savez_compressed is convenient, its file format already adds some metadata alongside the data. It was therefore decided to use the rawest format possible, so ndarray.tofile was also tested: 11.3 MB and ~40 s.
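One caveat worth illustrating: ndarray.tofile writes raw bytes only, so dtype and shape are not stored in the file and must come from the metadata (which is why write_profiles records the 'size' item, and why float64 is assumed below). A quick round-trip sketch with an example file name:

In [ ]:
data = np.random.rand(number_of_cells)  # float64
data.tofile('profile_000.binary')       # raw bytes, no header at all

restored = np.fromfile('profile_000.binary', dtype=np.float64)
assert np.array_equal(data, restored)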

Reading data

Just as we had questions about writing data, reading matters too, so let's try retrieving the data from the file(s)...


In [164]:
def read_raw_profile(desired_properties):
    project_name = 'my_project_folder'
    project_path = os.path.join(root_path, project_name)
    meta_path = os.path.join(project_path, 'meta')

    if not os.path.exists(project_path):
        raise IOError('Cannot find the project folder: %s' % project_path)
    if not os.path.exists(meta_path):
        raise IOError('Cannot find the metadata folder: %s' % meta_path)

    # in this case there is a single metafile, but we could have more than one
    profiles_meta_file = os.path.join(meta_path, 'meta_00000000', 'profiles.json')
    with open(profiles_meta_file) as file:    
        profiles_meta = json.load(file)

    for desired_property, desired_edge in desired_properties:
        meta = None

        for pf in profiles_meta:
            if pf['edge'] == desired_edge and pf['property_id'] == desired_property:
                meta = pf
                break

        data_fn = meta['fn']

        surface = []
        # iterate the time step folders in sorted (chronological) order;
        # os.walk gives no ordering guarantees
        for dir_name in sorted(os.listdir(project_path)):
            if dir_name.startswith('TS'):
                dir_path = os.path.join(project_path, dir_name)
                profile_path = os.path.join(dir_path, 'profiles')
                if os.path.exists(profile_path):
                    file_path = os.path.join(profile_path, data_fn)
                    surface.append(np.fromfile(file_path))
    
        # now use the array to do whatever you want...

In [165]:
single_property = [
    ('pressure', 'pipe_002')
]

In [166]:
%%timeit
# time to read one profile
read_raw_profile(single_property)


1.83 s ± 12.6 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)

In [167]:
#lprun -f read_raw_profile read_raw_profile(single_property)

In [168]:
all_properties = []

for i in range(number_of_pipes):
    pipe_name = 'pipe_' + '{:03d}'.format(i)
    for prop in output_variables_per_pipe:
        all_properties.append((prop, pipe_name))

In [169]:
%%timeit
# time to read several profiles
read_raw_profile(all_properties)


1min 13s ± 664 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)

Reading one profile takes ~2 seconds while reading 40 profiles (4 variables × 10 pipes) takes ~73 seconds: a fairly linear relation, which is good...


In [170]:
def read_hdf_profile(desired_properties):
    from pandas import read_hdf
    import pandas

    with pandas.HDFStore('hdf_results_file.h5') as hdf:
        for desired_property, desired_edge in desired_properties:
            surface = []
            for key in hdf.keys():
                if 'profiles' in key and desired_edge in key:
                    data = read_hdf('hdf_results_file.h5', key)
                    surface.append(data[desired_property].values)

In [171]:
%%timeit
read_hdf_profile(single_property)


4.58 s ± 20.8 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)

In [172]:
%%timeit
read_hdf_profile(all_properties)


3min 8s ± 5.36 s per loop (mean ± std. dev. of 7 runs, 1 loop each)

In [173]:
#lprun -f read_hdf_profile read_hdf_profile(single_property)

Reading one property from the HDF file takes ~4.5 seconds, and reading 40 properties takes ~190 seconds (~42 times more): also fairly linear...

