HDF5 is a great mechanism for storing large numerical arrays of homogeneous type, for data models that can be organized hierarchically and benefit from tagging of datasets with arbitrary metadata. It’s quite different from SQL-style relational databases. HDF5 has quite a few organizational tricks up its sleeve (see Chapter 8, for example), but if you find yourself needing to enforce relationships between values in various tables, or wanting to perform JOINs on your data, a relational database is probably more appropriate. Likewise, for tiny 1D datasets that you need to be able to read on machines without HDF5 installed, text formats like CSV (with all their warts) are a reasonable alternative.
HDF5 is just about perfect if you make minimal use of relational features and have a need for very high performance, partial I/O, hierarchical organization, and arbitrary metadata.
So what, specifically, is “HDF5”? I would argue it consists of three things:
- A file specification and associated data model.
- A standard library with API access available from C, C++, Java, Python, and others.
- A software ecosystem, consisting of both client programs using HDF5 and “analysis platforms” like MATLAB, IDL, and Python.
... the three main elements of the HDF5 data model: datasets, array-like objects that store your numerical data on disk; groups, hierarchical containers that store datasets and other groups; and attributes, user-defined bits of metadata that can be attached to datasets (and groups!). — "Python and HDF5" by Andrew Collette
In [1]:
#
# This example creates an HDF5 file dset.h5 and an empty dataset /dset in it.
#
import h5py

# Create a new file using default properties ('w' replaces an existing file).
# The context manager closes the file even if a statement below raises.
with h5py.File('output/dset.h5', 'w') as file:
    # Create a (4, 6) dataset of type STD_I32BE under the Root group.
    dataset = file.create_dataset("dset", (4, 6), h5py.h5t.STD_I32BE)

    # Report the basic properties of the new dataset while the file is open.
    print("Dataset dataspace is", dataset.shape)
    print("Dataset Numpy datatype is", dataset.dtype)
    print("Dataset name is", dataset.name)
    print("Dataset is a member of the group", dataset.parent)
    print("Dataset was created in the file", dataset.file)
In [2]:
#
# This example writes data to the existing empty dataset created by
# h5_crtdat.py and then reads it back.
#
import h5py
import numpy as np

# Open the existing file for read/write; the context manager guarantees the
# file is closed even if one of the statements below raises.
with h5py.File('output/dset.h5', 'r+') as file:
    # Open "dset" dataset under the root group.
    dataset = file['/dset']

    # Build the (4, 6) array holding 1..24 in row-major order, i.e.
    # data[i][j] == i*6 + j + 1.
    data = np.arange(1, 25, dtype=float).reshape(4, 6)

    # Write data
    print("Writing data...")
    dataset[...] = data

    # Read data back and print it.
    print("Reading data back...")
    data_read = dataset[...]
    print("Printing data...")
    print(data_read)

    # Print the value of every attribute attached to the dataset.
    for item in dataset.attrs:
        print(dataset.attrs[item])
In [3]:
#
# This example creates and writes two attributes on the "dset" dataset
# created by h5_crtdat.py.
#
import h5py
import numpy as np

# Open the existing file for read/write; the context manager closes it even
# if attribute creation fails.
with h5py.File('output/dset.h5', 'r+') as file:
    # Open "dset" dataset.
    dataset = file['/dset']

    # Create string attribute.
    attr_string = "Meter per second"
    dataset.attrs["Units"] = attr_string

    # Create a two-element array attribute, stored with type STD_I32BE.
    attr_data = np.array([100.0, 200.0])
    dataset.attrs.create("Speed", attr_data, (2,), h5py.h5t.STD_I32BE)

    # Print every attribute name together with its value.
    for item, value in dataset.attrs.items():
        print(item, value)
In [4]:
#
# This example creates an HDF5 file group.h5 and a group MyGroup in it
# using H5Py interfaces to the HDF5 library.
#
import sys
import h5py
# Uncomment the next line if you want to save the output from this script to a file named "out".
#sys.stdout = open('out', 'w')

# Use 'w' to remove existing file and create a new one; use 'w-' if
# create operation should fail when the file already exists.
print("Creating an HDF5 file with the name group.h5...")
# The context manager closes the file (and with it the group) even if a
# statement below raises.
with h5py.File('group.h5', 'w') as file:
    # Show the Root group which is created when the file is created.
    print("When an HDF5 file is created, it has a Root group with the name '",file.name,"'.")

    # Create a group with the name "MyGroup"
    print("Creating a group MyGroup in the file...")
    group = file.create_group("MyGroup")

    # Print the content of the Root group
    print("An HDF5 group is a container for other objects; a group is similar to Python dictionary with the keys being the links to the group members.")
    print("Show the members of the Root group using dictionary key method:", file.keys())

    # Another way to show the content of the Root group.
    print("Show the members of the Root group using the list function:", list(file))
In [5]:
#
# This example creates HDF5 file group.h5 and group MyGroup in it.
# Absolute and relative paths are used to create groups in MyGroup.
#
import sys
import h5py

# Use 'w' to remove existing file and create a new one; use 'w-' if
# create operation should fail when the file already exists.
print("Creating HDF5 file group.h5...")
# The context manager closes the file (and the groups created in it) even if
# a statement below raises.
with h5py.File('group.h5', 'w') as file:
    # Create a group with the name "MyGroup"
    print("Creating group MyGroup in the file...")
    group = file.create_group("MyGroup")

    # Create group "Group_A" in group MyGroup, addressed from the file root.
    print("Creating group Group_A in MyGroup using absolute path...")
    group_a = file.create_group("/MyGroup/Group_A")

    # Create group "Group_B" in group MyGroup, addressed relative to MyGroup.
    print("Creating group Group_B in MyGroup using relative path...")
    group_b = group.create_group("Group_B")

    # Print the contents of MyGroup group
    print("Printing members of MyGroup group:", group.keys())
In [6]:
#
# This example creates two datasets inside the groups of the existing file
# group.h5 (dset1 in /MyGroup and dset2 in /MyGroup/Group_A) and writes
# data to them.
#
import h5py
import numpy as np

# Open the existing file for read/write. The context manager closes it even
# if dataset creation fails (e.g. when the datasets already exist), so the
# file is not left open for the next run.
with h5py.File('group.h5', 'r+') as file:
    # Create dataset dset1 in "MyGroup" using its absolute path.
    print("Creating dataset dset1 in MyGroup group...")
    dataset1 = file.create_dataset("/MyGroup/dset1", (3, 3), dtype=h5py.h5t.STD_I32BE)

    # Every row of dset1 holds 1, 2, 3 (data[i][j] == j + 1).
    data = np.tile(np.arange(1, 4, dtype=float), (3, 1))
    print("Writing data to dset1...")
    dataset1[...] = data

    # Open "MyGroup/Group_A" group and create dataset dset2 in it.
    print("Creating dataset dset2 in /MyGroup/Group_A group...")
    group = file['/MyGroup/Group_A']
    dataset2 = group.create_dataset("dset2", (2, 10), dtype=h5py.h5t.STD_I16LE)

    # Every row of dset2 holds 1..10 (data[i][j] == j + 1).
    data = np.tile(np.arange(1, 11, dtype=float), (2, 1))
    print("Writing data to dset2...")
    dataset2[...] = data
In [7]:
"""
This example illustrates how to create a compressed dataset.
Tested with:
Fedora 18:
HDF5 1.8.9, Python 2.7.3, Numpy 1.7.1, h5py 2.1.3
Fedora 18:
HDF5 1.8.9, Python 3.3.0, Numpy 1.7.1, h5py 2.1.3
Mac OS X 10.6.8:
HDF5 1.8.10, Python 3.2.5, Numpy 1.7.1, h5py 2.1.3
"""
import sys
import numpy as np
import h5py
FILE = "output/cmprss.h5"
DATASET = "Compressed_Data"
# Strings are handled very differently between python2 and python3.
if sys.hexversion >= 0x03000000:
FILE = FILE.encode()
DATASET = DATASET.encode()
DIM0 = 100
DIM1 = 20
def run():
    """Write a chunked, DEFLATE-compressed dataset and read it back.

    Uses the module-level FILE, DATASET, DIM0 and DIM1 settings. After
    writing, every handle is released so that the second half really works
    on a reopened file. Prints the number of filters on the dataset, each
    filter's name, and the data read back.
    """
    # Create a file.
    fid = h5py.h5f.create(FILE)

    # Create dataset "Compressed Data" in the group using absolute names.
    dims = (DIM0, DIM1)
    space_id = h5py.h5s.create_simple(dims)
    dcpl = h5py.h5p.create(h5py.h5p.DATASET_CREATE)

    # Datasets must be chunked for compression.
    cdims = (20, 20)
    dcpl.set_chunk(cdims)

    # Set ZLIB / DEFLATE compression using compression level 6.
    dcpl.set_deflate(6)
    dset = h5py.h5d.create(fid, DATASET,
                           h5py.h5t.STD_I32BE,
                           space_id, dcpl, h5py.h5p.DEFAULT)

    # buf[i][j] == i + j, built by broadcasting a column against a row.
    buf = np.arange(DIM0, dtype=float)[:, np.newaxis] + np.arange(DIM1)
    dset.write(h5py.h5s.ALL, h5py.h5s.ALL, buf)

    # Drop every handle from the write phase so the file is actually
    # released before it is opened again.
    del dset, dcpl, space_id, fid

    # Now reopen the file and dataset.
    fid = h5py.h5f.open(FILE)
    dset = h5py.h5d.open(fid, DATASET)
    dcpl = dset.get_create_plist()

    numfilt = dcpl.get_nfilters()
    print("Number of filters associated with dataset: %d" % numfilt)
    for j in range(numfilt):
        # Only the filter name is reported; the other fields are unused.
        code, flags, values, name = dcpl.get_filter(j)
        print(name)

    newdata = np.zeros((DIM0, DIM1))
    dset.read(h5py.h5s.ALL, h5py.h5s.ALL, newdata)
    print(newdata)


run()
In [ ]:
In [8]:
#!/usr/bin/env python
'''uses h5py to build the verysimple.nx5 data file'''
import h5py

# Scan positions and the photodiode reading taken at each position.
angle = [18.9094, 18.9096, 18.9098, 18.91, 18.9102,
         18.9104, 18.9106, 18.9108, 18.911, 18.9112,
         18.9114, 18.9116, 18.9118, 18.912, 18.9122]
diode = [1193, 4474, 53220, 274310, 515430, 827880,
         1227100, 1434640, 1330280, 1037070, 598720,
         316460, 56677, 1000, 1000]

# The context manager closes the file even if a statement below raises.
with h5py.File('output/verysimple.nx5', 'w') as f:
    # The 'default' attributes point from the root to entry, and from
    # entry to data.
    f.attrs['default'] = 'entry'

    nxentry = f.create_group('entry')
    nxentry.attrs["NX_class"] = 'NXentry'
    nxentry.attrs['default'] = 'data'

    nxdata = nxentry.create_group('data')
    nxdata.attrs["NX_class"] = 'NXdata'
    nxdata.attrs['signal'] = 'counts'
    nxdata.attrs['axes'] = 'two_theta'
    nxdata.attrs['two_theta_indices'] = [0,]

    # Each long_name describes its own dataset (the two labels were
    # previously swapped).
    tth = nxdata.create_dataset('two_theta', data=angle)
    tth.attrs['units'] = 'degrees'
    tth.attrs['long_name'] = 'two_theta (degrees)'

    counts = nxdata.create_dataset('counts', data=diode)
    counts.attrs['units'] = 'counts'
    counts.attrs['long_name'] = 'photodiode counts'
In [9]:
#!/usr/bin/env python
'''
Writes the simplest NeXus HDF5 file using h5py
Uses method accepted at 2014NIAC
according to the example from Figure 1.3
in the Introduction chapter
'''
import h5py
import numpy

# Load the two-column text file; after transposing, row 0 holds the angles
# and row 1 the counts.
buffer = numpy.loadtxt('output/input.dat').T
tthData = buffer[0]                             # float[]
countsData = numpy.asarray(buffer[1],'int32')   # int[]

# Create the HDF5 NeXus file. The context manager closes it even if a
# statement below raises.
with h5py.File('output/writer_1_3.hdf5', "w") as f:
    # This simple example sets no attributes on the file root itself.
    nxentry = f.create_group('Scan')
    nxentry.attrs["NX_class"] = 'NXentry'

    # NOTE(review): 'blahblah' and its child 'here' carry no NX_class and no
    # data; presumably leftover experimentation. Confirm before removing.
    j = f.create_group('blahblah')
    j.create_group('here')

    nxdata = nxentry.create_group('data')
    nxdata.attrs["NX_class"] = 'NXdata'
    nxdata.attrs['signal'] = "counts"
    nxdata.attrs['axes'] = "two_theta"
    nxdata.attrs['two_theta_indices'] = [0,]

    tth = nxdata.create_dataset("two_theta", data=tthData)
    tth.attrs['units'] = "degrees"

    counts = nxdata.create_dataset("counts", data=countsData)
    counts.attrs['units'] = "counts"
In [10]:
# basic writer
'''Writes a NeXus HDF5 file using h5py and numpy'''
import h5py    # HDF5 support
import numpy

print("Write a NeXus HDF5 file")
fileName = "output/prj_test.nexus.hdf5"
timestamp = "2017-01-23T17:17:04-0500"

# load data from two column format
data = numpy.loadtxt('output/input.dat').T
mr_arr = data[0]
i00_arr = numpy.asarray(data[1],'int32')

# create the HDF5 NeXus file; the context manager closes it even if a
# statement below raises
with h5py.File(fileName, "w") as f:
    # point to the default data to be plotted
    f.attrs['default'] = 'entry'
    # give the HDF5 root some more attributes
    f.attrs['file_name'] = fileName
    f.attrs['file_time'] = timestamp
    f.attrs['instrument'] = 'APS USAXS at 32ID-B'
    f.attrs['creator'] = 'BasicWriter.py'
    f.attrs['NeXus_version'] = '4.3.0'
    f.attrs['HDF5_Version'] = h5py.version.hdf5_version
    f.attrs['h5py_version'] = h5py.version.version

    # create the NXentry group
    nxentry = f.create_group('entry')
    nxentry.attrs['NX_class'] = 'NXentry'
    nxentry.attrs['default'] = 'mr_scan'
    nxentry.create_dataset('title', data='1-D scan of I00 v. mr')

    # create the NXdata group
    nxdata = nxentry.create_group('mr_scan')
    nxdata.attrs['NX_class'] = 'NXdata'
    nxdata.attrs['signal'] = 'I00'      # Y axis of default plot
    nxdata.attrs['axes'] = 'mr'         # X axis of default plot
    nxdata.attrs['mr_indices'] = [0,]   # use "mr" as the first dimension of I00

    # X axis data
    ds = nxdata.create_dataset('mr', data=mr_arr)
    ds.attrs['units'] = 'degrees'
    ds.attrs['long_name'] = 'USAXS mr (degrees)'    # suggested X axis plot label

    # Y axis data
    ds = nxdata.create_dataset('I00', data=i00_arr)
    ds.attrs['units'] = 'counts'
    ds.attrs['long_name'] = 'USAXS I00 (counts)'    # suggested Y axis plot label

print("wrote file:", fileName)
In [11]:
#Basic Reader
'''Reads NeXus HDF5 files using h5py and prints the contents'''
import h5py    # HDF5 support

fileName = "output/prj_test.nexus.hdf5"

# Open read-only; the context manager closes the file even if a lookup fails.
with h5py.File(fileName, "r") as f:
    # Print every attribute of the file root.
    for item in f.attrs.keys():
        print(item + ":", f.attrs[item])

    # Read both columns into memory in one go instead of indexing the
    # on-disk datasets element by element inside the loop.
    mr = f['/entry/mr_scan/mr'][...]
    i00 = f['/entry/mr_scan/I00'][...]

# Print the data as a tab-separated table with a row index.
print("%s\t%s\t%s" % ("#", "mr", "I00"))
for i, (mr_value, i00_value) in enumerate(zip(mr, i00)):
    print("%d\t%g\t%d" % (i, mr_value, i00_value))
In [12]:
# Refer to the above picture
#!/usr/bin/env python
'''
Writes a simple NeXus HDF5 file using h5py with links
according to the example from Figure 2.1 in the Design chapter
'''
import h5py
import numpy

# Load the two-column text file; after transposing, row 0 holds the angles
# and row 1 the counts.
buffer = numpy.loadtxt('output/input.dat').T
tthData = buffer[0]                             # float[]
countsData = numpy.asarray(buffer[1],'int32')   # int[]

# Create the HDF5 NeXus file. The context manager closes it even if a
# statement below raises.
with h5py.File('output/writer_2_1.hdf5', "w") as f:
    f.attrs['default'] = 'entry'

    nxentry = f.create_group('entry')
    nxentry.attrs['NX_class'] = 'NXentry'
    nxentry.attrs['default'] = 'data'

    nxinstrument = nxentry.create_group('instrument')
    nxinstrument.attrs['NX_class'] = 'NXinstrument'

    nxdetector = nxinstrument.create_group('detector')
    nxdetector.attrs['NX_class'] = 'NXdetector'

    # store the data in the NXdetector group
    ds_tth = nxdetector.create_dataset('two_theta', data=tthData)
    ds_tth.attrs['units'] = 'degrees'
    ds_counts = nxdetector.create_dataset('counts', data=countsData)
    ds_counts.attrs['units'] = 'counts'

    # create the NXdata group to define the default plot
    nxdata = nxentry.create_group('data')
    nxdata.attrs['NX_class'] = 'NXdata'
    nxdata.attrs['signal'] = 'counts'
    nxdata.attrs['axes'] = 'two_theta'
    nxdata.attrs['two_theta_indices'] = [0,]

    # Hard-link both detector datasets into the NXdata group. Assigning an
    # existing object to a group key creates the hard link through the
    # public h5py API, replacing the private ``_id.link`` call.
    source_addr = '/entry/instrument/detector/two_theta'    # existing data
    target_addr = 'two_theta'                                # new location
    ds_tth.attrs['target'] = source_addr     # a NeXus API convention for links
    nxdata[target_addr] = f[source_addr]

    source_addr = '/entry/instrument/detector/counts'       # existing data
    target_addr = 'counts'                                   # new location
    ds_counts.attrs['target'] = source_addr  # a NeXus API convention for links
    nxdata[target_addr] = f[source_addr]
In [13]:
# external link
#!/usr/bin/env python
'''
Writes a NeXus HDF5 file using h5py with links to data in other HDF5 files.
This example is based on ``writer_2_1``.
'''
import os

import h5py
import numpy

FILE_HDF5_MASTER = 'output/external_master.hdf5'
FILE_HDF5_ANGLES = 'output/external_angles.hdf5'
FILE_HDF5_COUNTS = 'output/external_counts.hdf5'

# All three files live in the same directory, so the external links store
# only the sibling's base name. HDF5 resolves a relative link target against
# the directory of the file holding the link; storing the 'output/' prefix
# made the links resolve only from the original working directory.
LINK_HDF5_ANGLES = os.path.basename(FILE_HDF5_ANGLES)
LINK_HDF5_COUNTS = os.path.basename(FILE_HDF5_COUNTS)

#---------------------------

# get some data
buffer = numpy.loadtxt('output/input.dat').T
tthData = buffer[0]                             # float[]
countsData = numpy.asarray(buffer[1],'int32')   # int[]

# put the angle data in an external (non-NeXus) HDF5 data file
with h5py.File(FILE_HDF5_ANGLES, "w") as f:
    ds = f.create_dataset('angles', data=tthData)
    ds.attrs['units'] = 'degrees'

# put the detector counts in an external HDF5 data file
# with *incomplete* NeXus structure (no NXdata group)
with h5py.File(FILE_HDF5_COUNTS, "w") as f:
    nxentry = f.create_group('entry')
    nxentry.attrs['NX_class'] = 'NXentry'
    nxinstrument = nxentry.create_group('instrument')
    nxinstrument.attrs['NX_class'] = 'NXinstrument'
    nxdetector = nxinstrument.create_group('detector')
    nxdetector.attrs['NX_class'] = 'NXdetector'
    ds = nxdetector.create_dataset('counts', data=countsData)
    ds.attrs['units'] = 'counts'
    # link the "two_theta" data stored in separate file
    local_addr = nxdetector.name+'/two_theta'
    f[local_addr] = h5py.ExternalLink(LINK_HDF5_ANGLES, '/angles')

# create a master NeXus HDF5 file
with h5py.File(FILE_HDF5_MASTER, "w") as f:
    f.attrs['default'] = 'entry'
    nxentry = f.create_group('entry')
    nxentry.attrs['NX_class'] = 'NXentry'
    nxentry.attrs["default"] = 'data'
    nxdata = nxentry.create_group('data')
    nxdata.attrs['NX_class'] = 'NXdata'

    # link in the signal data
    local_addr = '/entry/data/counts'
    external_addr = '/entry/instrument/detector/counts'
    f[local_addr] = h5py.ExternalLink(LINK_HDF5_COUNTS, external_addr)
    nxdata.attrs['signal'] = 'counts'

    # link in the axes data
    local_addr = '/entry/data/two_theta'
    f[local_addr] = h5py.ExternalLink(LINK_HDF5_ANGLES, '/angles')
    nxdata.attrs['axes'] = 'two_theta'
    nxdata.attrs['two_theta_indices'] = [0,]

    # link in the whole instrument group from the counts file
    local_addr = '/entry/instrument'
    f[local_addr] = h5py.ExternalLink(LINK_HDF5_COUNTS, '/entry/instrument')
In [ ]: