Link: http://docs.h5py.org/en/latest/mpi.html
An HDF5 file is a container for two kinds of objects:
groups, which work like dictionaries, and datasets, which work like NumPy arrays.
In [17]:
import h5py
import numpy as np
In [27]:
!rm mytestfile.hdf5
In [28]:
# create a fresh hdf5 file, truncating any existing one ('w' mode)
f = h5py.File("mytestfile.hdf5", mode="w")
In [100]:
# show the file's path on disk and the name of the root group ("/")
f.filename, f.name
Out[100]:
In [29]:
# groups and datasets can be created directly on the file object
# 1) from existing data:
samples = np.random.logistic(size=100)
dsetdata = f.create_dataset("dsetdata", data=samples)
print(dsetdata)
# 2) empty, from a shape and a dtype ('i' = integer, 'f' = float, ...)
dsetname = f.create_dataset("dsetname", shape=(100,), dtype='i')
print(dsetname)
In [35]:
# basic dataset properties: element dtype, shape, and total element count
dsetdata.dtype, dsetdata.shape, dsetdata.size
Out[35]:
In [99]:
# list the root group's contents as (name, object) pairs
for name, obj in f.items():
    print((name, obj))
In [60]:
# datasets support NumPy-style slicing for both writes and reads
dsetname[0] = 5
dsetname[5:15] = np.random.uniform(low=0, high=5, size=10)
dsetname[10:20:2]
Out[60]:
In [58]:
# full contents, HDF5 path, fill value, and shuffle-filter flag of the dataset
# (Dataset.value was removed in h5py 3.0 — read everything with ds[()] instead)
dsetname[()], dsetname.name, dsetname.fillvalue, dsetname.shuffle
Out[58]:
In [106]:
# "HDF" stands for "Hierarchical Data Format": every object in the file
# has a name, arranged in a POSIX-style hierarchy with /-separators
print(f.name, dsetname.name, dsetdata.name)
# groups can nest: create a subgroup, then a dataset inside that group
grp = f.create_group("subgroup")
dset2 = grp.create_dataset("another_dataset", (50,), dtype='f')
dset2.name
Out[106]:
In [107]:
# giving a full path creates every missing intermediate group automatically
dset3 = f.create_dataset('subgroup2/dataset_three', shape=(10,), dtype='i')
dset3.name
Out[107]:
In [109]:
# get a dataset via dict-style indexing with its full POSIX-style path
dataset_three = f['subgroup2/dataset_three']
dataset_three
Out[109]:
In [113]:
# membership test: `in` checks whether a path exists in the file
"subgroup/another_dataset" in f
Out[113]:
In [114]:
# iterating a group yields only its direct children — not the full tree,
# so this is the wrong way to inspect the whole structure
for member in f:
    print(member)
In [132]:
# to walk the entire tree, use visit() or visititems(); both take a callable
def printname(name):
    """Callback for Group.visit(): print one object's path name."""
    print(name)

f.visit(printname)
print('')
grp.visit(printname)
In [87]:
# attrs is a dict-like store for metadata kept right beside the data
# (works on both groups and datasets)
for key, value in [('descrizione', 'dati a caso'),
                   ('data', '04/04/2014'),
                   ('pippo', 150)]:
    dsetname.attrs[key] = value
'data' in dsetname.attrs
Out[87]:
In [92]:
# dump every (name, value) metadata pair stored on the dataset
for pair in dsetname.attrs.items():
    print(pair)
In [166]:
# close the file; datasets/groups obtained from it become inaccessible
f.close()
In [239]:
!rm iris.hdf5
In [240]:
import pandas as pd

# write a DataFrame straight to HDF5 (pandas uses PyTables under the hood)
df = pd.read_csv('iris.csv')
df.to_hdf(path_or_buf='iris.hdf5', key='iris_', mode='w', format='fixed')
# 'fixed': fast writing/reading, but not appendable nor searchable
# 'table': a PyTables Table structure — may perform worse, but allows more
#          flexible operations such as searching/selecting subsets of the data
In [231]:
# reopen the file read-only and check how pandas stored the columns etc.
# NOTE: pass the mode explicitly — without it, older h5py versions defaulted
# to append mode ('a') and emitted a deprecation warning
newf = h5py.File('iris.hdf5', 'r')
newf.visit(printname)
In [232]:
# show every second-level object pandas created under the root groups
for top in newf.keys():
    for child in newf[top].keys():
        print(newf[top + '/' + child])
In [233]:
# print each stored object's path followed by its full contents
# (Dataset.value was removed in h5py 3.0; ds[()] reads the whole dataset)
for i in newf.keys():
    for j in newf[i].keys():
        node = newf[i + '/' + j]
        print(node.name)
        print(node[()], end='\n\n')
In [234]:
# done inspecting — release the h5py handle
newf.close()
In [235]:
# read an hdf5 file back into a pandas DataFrame
# (with a single stored key, the key= argument may be omitted)
df_hdf5 = pd.read_hdf('iris.hdf5')
df_hdf5.head()
Out[235]:
In [245]:
## append data under a new key; only the 'table' format is appendable
df.to_hdf('iris.hdf5', key='data', append=True, format='table')
df.to_hdf('iris.hdf5', key='data', append=True, format='table')  # appending again stacks a second copy of the rows
In [248]:
# to retrieve the data, select which key to load ('data' or 'iris_' here)
df_hdf5 = pd.read_hdf('iris.hdf5', key='data')
print(len(df_hdf5))
df_hdf5
Out[248]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]: