"""
ConsistentTreesHDF5Arbor class and member functions
"""
#-----------------------------------------------------------------------------
# Copyright (c) ytree development team. All rights reserved.
#
# Distributed under the terms of the Modified BSD License.
#
# The full license is in the file COPYING.txt, distributed with this software.
#-----------------------------------------------------------------------------
import h5py
import numpy as np
import re
from yt.funcs import \
get_pbar
from ytree.data_structures.arbor import \
SegmentedArbor
from ytree.frontends.consistent_trees.utilities import \
parse_ctrees_header
from ytree.frontends.consistent_trees_hdf5.fields import \
ConsistentTreesHDF5FieldInfo
from ytree.frontends.consistent_trees_hdf5.io import \
ConsistentTreesHDF5DataFile, \
ConsistentTreesHDF5RootFieldIO, \
ConsistentTreesHDF5TreeFieldIO
from ytree.utilities.exceptions import \
ArborDataFileEmpty
from ytree.utilities.logger import \
ytreeLogger as mylog
_access_names = {
'tree': {'group' : 'TreeInfo',
'host_id' : 'TreeRootID',
'offset' : 'TreeHalosOffset',
'size' : 'TreeNhalos',
'total' : 'TotNtrees',
'file_size' : 'Ntrees',
'host_attr' : 'tree_root_id'},
'forest': {'group' : 'ForestInfo',
'host_id' : 'ForestID',
'offset' : 'ForestHalosOffset',
'size' : 'ForestNhalos',
'total' : 'TotNforests',
'file_size' : 'Nforests',
'host_attr' : 'forest_id'}
}
class ConsistentTreesHDF5Arbor(SegmentedArbor):
    """
    Arbors loaded from consistent-trees data converted into HDF5.

    The data may be given as a single data file, a list of data files,
    or a virtual data set file pointing to multiple data files. Data
    files are recognized by a name ending in ``_<digits>.h5``; any other
    name is treated as a virtual data set file.
    """

    _parameter_file_is_data_file = True
    _field_info_class = ConsistentTreesHDF5FieldInfo
    _root_field_io_class = ConsistentTreesHDF5RootFieldIO
    _tree_field_io_class = ConsistentTreesHDF5TreeFieldIO
    _default_dtype = np.float32
    # Per-root bookkeeping filled in by _plant_trees: index of the data
    # file, and the start/end halo offsets of the tree within that file.
    _node_io_attrs = ('_fi', '_si', '_ei')
    # File names matching this pattern are individual data files.
    _data_file_regex = re.compile(r"\_\d+\.h5$")

    def __init__(self, filename, access='tree'):
        """
        Parameters
        ----------
        filename : str or list of str
            Path to the file(s) to load.
        access : str, optional
            Key of ``_access_names`` selecting which group/dataset names
            are used to enumerate roots: ``'tree'`` (the default) or
            ``'forest'``.

        Raises
        ------
        ValueError
            If ``access`` is not a key of ``_access_names``.
        """
        if access not in _access_names:
            raise ValueError(
                f"Invalid access value: {access}. "
                f"Valid options are: {list(_access_names)}.")
        self.access = access
        # Tuples are immutable, so this rebinds an extended tuple on the
        # instance and leaves the class-level default untouched.
        self._node_io_attrs += (_access_names[access]['host_attr'],)
        super().__init__(filename)

    def _node_io_loop_finish(self, data_file):
        """
        Reset the field cache of ``data_file`` and close it.
        """
        data_file._field_cache.reset()
        data_file.close()

    @property
    def _virtual_dataset(self):
        """
        True if the parameter file name does not look like a data file.
        """
        return self._data_file_regex.search(self.parameter_filename) is None

    def _get_data_files(self):
        """
        Create the data file objects and count the roots in each one.

        Sets ``data_files``, ``_file_count`` (one entry per data file,
        read from the attribute named by the ``'file_size'`` access
        name), and ``_size`` (the sum of ``_file_count``).
        """
        aname = _access_names[self.access]['file_size']

        if self._virtual_dataset:
            # Each top-level group of the virtual file is one data file.
            with h5py.File(self.filename, mode='r') as f:
                self.data_files = [
                    ConsistentTreesHDF5DataFile(self.filename, lname)
                    for lname in f]
                self._file_count = np.array(
                    [f[lname].attrs[aname] for lname in f])
        else:
            if isinstance(self.filename, list):
                fns = self.filename
            else:
                fns = [self.filename]
            self.data_files = [
                ConsistentTreesHDF5DataFile(fn, None) for fn in fns]

            # Open each file only long enough to read its count so that
            # no file handles are left open.
            counts = []
            for fn in fns:
                with h5py.File(fn, mode='r') as f:
                    counts.append(f.attrs[aname])
            self._file_count = np.array(counts)

        self._size = sum(self._file_count)

    def _parse_parameter_file(self):
        """
        Read the field list and header metadata from the parameter file.

        Sets ``_aos`` according to the on-disk layout, sets ``_size``
        for virtual data set files, and populates ``field_list`` and
        ``field_info``.

        Raises
        ------
        ArborDataFileEmpty
            If a virtual data set file has no ``File0`` group.
        """
        with h5py.File(self.parameter_filename, mode='r') as f:
            # Is the file a collection of virtual data sets
            # pointing to multiple data files?
            virtual = self._virtual_dataset
            if virtual:
                fgroup = f.get('File0')
                if fgroup is None:
                    raise ArborDataFileEmpty(self.filename)
            else:
                fgroup = f

            forests = fgroup['Forests']
            if 'halos' in forests:
                # array of structs layout
                mylog.warning(
                    "This dataset was written in array of structs format. "
                    "Field access will be significantly slower than struct "
                    "of arrays format.")
                self._aos = True
                ftypes = forests['halos'].dtype
                my_fi = {name: {'dtype': ftypes[name]}
                         for name in ftypes.names}
            else:
                # struct of arrays layout
                self._aos = False
                my_fi = {field: {'dtype': data.dtype}
                         for field, data in forests.items()}

            if virtual:
                aname = _access_names[self.access]['total']
                self._size = f.attrs[aname]

            # Convert to plain Python strings while the file is open.
            header = fgroup.attrs['Consistent Trees_metadata']
            header = header.astype(str).tolist()

        header_fi = parse_ctrees_header(
            self, header, ntrees_in_file=False)

        # Do some string manipulation to match the header with
        # altered names in the hdf5 file.
        new_fi = {}
        for field, info in header_fi.items():
            entry = info.copy()
            entry.pop('column', None)
            new_fi[self._normalize_field_name(field)] = entry

        for field, info in my_fi.items():
            info.update(new_fi.get(field, {}))

        self.field_list = list(my_fi.keys())
        self.field_info.update(my_fi)

    @staticmethod
    def _normalize_field_name(field):
        """
        Convert a header field name into the form used in the hdf5 file.
        """
        # remove ?| characters
        name = re.sub(r'[?|]', '', field)
        # replace []/() characters with _
        name = re.sub(r'[\[\]\/\(\)]', '_', name)
        # remove leading/trailing underscores
        name = name.strip('_')
        # replace double underscore with single underscore
        return name.replace('__', '_')

    def _plant_trees(self):
        """
        Fill the node info arrays with the location of every root.

        For each data file, reads the root ids, starting offsets, and
        sizes from the group selected by ``access`` and stores them,
        along with the data file index, in ``_node_info``. Does nothing
        if the arbor is already planted or has size zero.
        """
        if self.is_planted or self._size == 0:
            return

        my_access = _access_names[self.access]
        groupname = my_access['group']
        hostname = my_access['host_id']
        hostattr = my_access['host_attr']
        offsetname = my_access['offset']
        sizename = my_access['size']

        # Index of the first root of each data file in the global arrays.
        file_offsets = self._file_count.cumsum() - self._file_count

        planted = 0
        pbar = get_pbar(f'Planting {self.access}s', self._size)
        for idf, data_file in enumerate(self.data_files):
            data_file.open()
            try:
                group = data_file.fh[groupname]
                hostids = group[hostname][()]
                offsets = group[offsetname][()]
                tree_sizes = group[sizename][()]
            finally:
                # Close the file even if one of the reads fails.
                data_file.close()

            istart = file_offsets[idf]
            iend = istart + offsets.size
            self._node_info[hostattr][istart:iend] = hostids
            self._node_info['_fi'][istart:iend] = idf
            self._node_info['_si'][istart:iend] = offsets
            self._node_info['_ei'][istart:iend] = offsets + tree_sizes
            self._node_info['_tree_size'][istart:iend] = tree_sizes

            planted += offsets.size
            pbar.update(planted)
        pbar.finish()

        self._node_info['uid'] = self['uid']

    @classmethod
    def _is_valid(cls, *args, **kwargs):
        """
        Should be an hdf5 file with a few key attributes.

        The first positional argument is a file name or a list of file
        names. Returns True only if every file is an hdf5 file carrying
        the attributes expected for its kind.

        Raises
        ------
        RuntimeError
            If a virtual data set file is given in a list with other
            files.
        """
        fns = args[0]
        if not isinstance(fns, list):
            fns = [fns]

        for fn in fns:
            if not h5py.is_hdf5(fn):
                return False

            # single data file
            if cls._data_file_regex.search(fn):
                attrs = ['Nforests', 'Ntrees', 'Nhalos']
            # virtual data set file
            else:
                if len(fns) > 1:
                    raise RuntimeError(
                        'Virtual data set file cannot be given in a list.')
                attrs = ['Nfiles', 'TotNforests', 'TotNhalos', 'TotNtrees']

            with h5py.File(fn, mode='r') as f:
                if any(attr not in f.attrs for attr in attrs):
                    return False

        return True