# Source code for ytree.frontends.consistent_trees_hdf5.arbor

"""
ConsistentTreesHDF5Arbor class and member functions



"""

# -----------------------------------------------------------------------------
# Copyright (c) ytree development team. All rights reserved.
#
# Distributed under the terms of the Modified BSD License.
#
# The full license is in the file COPYING.txt, distributed with this software.
# -----------------------------------------------------------------------------

import h5py
import numpy as np
import re

from yt.funcs import get_pbar

from ytree.data_structures.arbor import SegmentedArbor
from ytree.frontends.consistent_trees.utilities import parse_ctrees_header
from ytree.frontends.consistent_trees_hdf5.fields import ConsistentTreesHDF5FieldInfo
from ytree.frontends.consistent_trees_hdf5.io import (
    ConsistentTreesHDF5DataFile,
    ConsistentTreesHDF5RootFieldIO,
    ConsistentTreesHDF5TreeFieldIO,
)
from ytree.utilities.exceptions import ArborDataFileEmpty
from ytree.utilities.logger import ytreeLogger as mylog

# Lookup table of HDF5 names, keyed by the arbor's ``access`` mode
# ("tree" or "forest"). For each mode:
#   group     - HDF5 group holding the per-tree/per-forest index datasets
#   host_id   - dataset in that group with the root/forest ids
#   offset    - dataset in that group with each entry's starting halo offset
#   size      - dataset in that group with each entry's halo count
#   total     - attribute on a virtual data set file giving the overall count
#   file_size - attribute on a data file (or file group) giving its count
#   host_attr - name of the node info field / node io attribute that
#               stores the host id
_access_names = {
    "tree": {
        "group": "TreeInfo",
        "host_id": "TreeRootID",
        "offset": "TreeHalosOffset",
        "size": "TreeNhalos",
        "total": "TotNtrees",
        "file_size": "Ntrees",
        "host_attr": "tree_root_id",
    },
    "forest": {
        "group": "ForestInfo",
        "host_id": "ForestID",
        "offset": "ForestHalosOffset",
        "size": "ForestNhalos",
        "total": "TotNforests",
        "file_size": "Nforests",
        "host_attr": "forest_id",
    },
}


class ConsistentTreesHDF5Arbor(SegmentedArbor):
    """
    Arbors loaded from consistent-trees data converted into HDF5.
    """

    _parameter_file_is_data_file = True
    _field_info_class = ConsistentTreesHDF5FieldInfo
    _root_field_io_class = ConsistentTreesHDF5RootFieldIO
    _tree_field_io_class = ConsistentTreesHDF5TreeFieldIO
    _default_dtype = np.float32
    _node_io_attrs = ("_fi", "_si", "_ei")

    # File names ending in "_<digits>.h5" are individual data files;
    # anything else is treated as a virtual data set file.
    _data_file_regex = re.compile(r"\_\d+\.h5$")

    def __init__(self, filename, access="tree"):
        """
        Initialize the arbor.

        Parameters
        ----------
        filename : str or list of str
            The file(s) to be loaded.
        access : str
            How the data is accessed. Must be one of the keys of
            ``_access_names`` ("tree" or "forest"). Default: "tree".

        Raises
        ------
        ValueError
            If ``access`` is not a valid option.
        """
        if access not in _access_names:
            raise ValueError(
                f"Invalid access value: {access}. "
                f"Valid options are: {list(_access_names)}."
            )
        self.access = access
        # Build a new, instance-level tuple so the class-level default
        # is left untouched.
        self._node_io_attrs = self._node_io_attrs + (
            _access_names[access]["host_attr"],
        )
        super().__init__(filename)

    def _node_io_loop_finish(self, data_file):
        """
        Reset the field cache of a data file and close it.
        """
        data_file._field_cache.reset()
        data_file.close()

    @property
    def _virtual_dataset(self):
        """
        True if the parameter file name does not look like a single
        data file (i.e., does not end in "_<digits>.h5").
        """
        return self._data_file_regex.search(self.parameter_filename) is None

    def _get_data_files(self):
        """
        Create the list of data files and count the trees/forests in each.

        Sets ``data_files``, ``_file_count`` (per-file counts), and
        ``_size`` (the sum of the per-file counts).
        """
        aname = _access_names[self.access]["file_size"]
        if self._virtual_dataset:
            # Each top-level member of the virtual file is one data file.
            with h5py.File(self.filename, mode="r") as f:
                self.data_files = [
                    ConsistentTreesHDF5DataFile(self.filename, lname) for lname in f
                ]
                self._file_count = np.array([f[lname].attrs[aname] for lname in f])
        else:
            if isinstance(self.filename, list):
                fns = self.filename
            else:
                fns = [self.filename]
            self.data_files = [ConsistentTreesHDF5DataFile(fn, None) for fn in fns]
            # Open each file in a context manager so the handle is
            # released as soon as the attribute has been read.
            counts = []
            for fn in fns:
                with h5py.File(fn, mode="r") as f:
                    counts.append(f.attrs[aname])
            self._file_count = np.array(counts)
        self._size = sum(self._file_count)

    @staticmethod
    def _hdf5_field_name(field):
        """
        Convert a consistent-trees header field name into the altered
        name used in the hdf5 file.
        """
        # remove ?| characters
        name = re.sub(r"[?|]", "", field)
        # replace []/() characters with _
        name = re.sub(r"[\[\]\/\(\)]", "_", name)
        # remove leading/trailing underscores
        name = name.strip("_")
        # replace double underscore with single underscore
        return name.replace("__", "_")

    def _parse_parameter_file(self):
        """
        Read the field list, field dtypes, and header from the file.

        Sets ``_aos``, ``field_list``, and updates ``field_info``. For
        virtual data set files, also sets ``_size`` from the file
        attributes.

        Raises
        ------
        ArborDataFileEmpty
            If a virtual data set file has no "File0" group.
        """
        # The context manager guarantees the file is closed even when
        # an exception is raised part way through.
        with h5py.File(self.parameter_filename, mode="r") as f:
            # Is the file a collection of virtual data sets
            # pointing to multiple data files?
            virtual = self._virtual_dataset
            if virtual:
                fgroup = f.get("File0")
                if fgroup is None:
                    raise ArborDataFileEmpty(self.filename)
            else:
                fgroup = f

            if "halos" in fgroup["Forests"]:
                # array of structs layout
                mylog.warning(
                    "This dataset was written in array of structs format. "
                    "Field access will be significantly slower than struct "
                    "of arrays format."
                )
                self._aos = True
                ftypes = fgroup["Forests/halos"].dtype
                my_fi = {name: {"dtype": ftypes[name]} for name in ftypes.names}
            else:
                # struct of arrays layout
                self._aos = False
                my_fi = {
                    field: {"dtype": data.dtype}
                    for field, data in fgroup["Forests"].items()
                }

            if virtual:
                aname = _access_names[self.access]["total"]
                self._size = f.attrs[aname]

            header = fgroup.attrs["Consistent Trees_metadata"].astype(str)
            header = header.tolist()

        header_fi = parse_ctrees_header(self, header, ntrees_in_file=False)

        # Do some string manipulation to match the header with
        # altered names in the hdf5 file.
        new_fi = {}
        for field in header_fi:
            entry = header_fi[field].copy()
            entry.pop("column", None)
            new_fi[self._hdf5_field_name(field)] = entry

        for field in my_fi:
            my_fi[field].update(new_fi.get(field, {}))

        self.field_list = list(my_fi.keys())
        self.field_info.update(my_fi)

    def _plant_trees(self):
        """
        Fill the node info arrays from the index datasets of each data file.

        Does nothing if the arbor is already planted or has size zero.
        """
        if self.is_planted or self._size == 0:
            return

        my_access = _access_names[self.access]
        groupname = my_access["group"]
        hostname = my_access["host_id"]
        hostattr = my_access["host_attr"]
        offsetname = my_access["offset"]
        sizename = my_access["size"]

        c = 0
        # Index of the first tree/forest of each file in the node info arrays.
        file_offsets = self._file_count.cumsum() - self._file_count
        pbar = get_pbar(f"Planting {self.access}s", self._size)
        for idf, data_file in enumerate(self.data_files):
            data_file.open()
            try:
                group = data_file.fh[groupname]
                hostids = group[hostname][()]
                offsets = group[offsetname][()]
                tree_sizes = group[sizename][()]
            finally:
                # Always release the file, even if a read fails.
                data_file.close()

            istart = file_offsets[idf]
            iend = istart + offsets.size
            self._node_info[hostattr][istart:iend] = hostids
            self._node_info["_fi"][istart:iend] = idf
            self._node_info["_si"][istart:iend] = offsets
            self._node_info["_ei"][istart:iend] = offsets + tree_sizes
            self._node_info["_tree_size"][istart:iend] = tree_sizes
            c += offsets.size
            pbar.update(c)
        pbar.finish()

        self._node_info["uid"] = self["uid"]

    @classmethod
    def _is_valid(cls, *args, **kwargs):
        """
        Should be an hdf5 file with a few key attributes.

        The first positional argument is a file name or list of file
        names. Returns True only if every file is an hdf5 file with the
        expected attributes.

        Raises
        ------
        RuntimeError
            If a virtual data set file is given in a list with other files.
        """
        fns = args[0]
        if not isinstance(fns, list):
            fns = [fns]

        for fn in fns:
            if not h5py.is_hdf5(fn):
                return False

            # single data file
            if cls._data_file_regex.search(fn):
                attrs = ["Nforests", "Ntrees", "Nhalos"]
            # virtual data set file
            else:
                if len(fns) > 1:
                    raise RuntimeError(
                        "Virtual data set file cannot be given in a list."
                    )
                attrs = ["Nfiles", "TotNforests", "TotNhalos", "TotNtrees"]

            with h5py.File(fn, mode="r") as f:
                if any(attr not in f.attrs for attr in attrs):
                    return False

        return True