# Source code for ytree.frontends.consistent_trees_hdf5.arbor

"""
ConsistentTreesHDF5Arbor class and member functions



"""

#-----------------------------------------------------------------------------
# Copyright (c) ytree development team. All rights reserved.
#
# Distributed under the terms of the Modified BSD License.
#
# The full license is in the file COPYING.txt, distributed with this software.
#-----------------------------------------------------------------------------

import h5py
import numpy as np
import re

from yt.funcs import \
    get_pbar

from ytree.data_structures.arbor import \
    SegmentedArbor
from ytree.frontends.consistent_trees.utilities import \
    parse_ctrees_header
from ytree.frontends.consistent_trees_hdf5.fields import \
    ConsistentTreesHDF5FieldInfo
from ytree.frontends.consistent_trees_hdf5.io import \
    ConsistentTreesHDF5DataFile, \
    ConsistentTreesHDF5RootFieldIO, \
    ConsistentTreesHDF5TreeFieldIO
from ytree.utilities.exceptions import \
    ArborDataFileEmpty
from ytree.utilities.logger import \
    ytreeLogger as mylog

# Lookup table of the HDF5 names used for each supported ``access`` mode.
# The outer key is the value accepted by ConsistentTreesHDF5Arbor's
# ``access`` argument; the inner keys are used as follows:
#   group     - group in each data file holding the per-root datasets
#   host_id   - dataset (inside ``group``) of root/host ids
#   offset    - dataset (inside ``group``) of starting halo offsets
#   size      - dataset (inside ``group``) of halo counts
#   total     - attribute of a virtual data set file giving the arbor size
#   file_size - attribute of each data file giving its number of roots
#   host_attr - name under which the host ids are stored on the arbor
_access_names = {
    'tree': {
        'group': 'TreeInfo',
        'host_id': 'TreeRootID',
        'offset': 'TreeHalosOffset',
        'size': 'TreeNhalos',
        'total': 'TotNtrees',
        'file_size': 'Ntrees',
        'host_attr': 'tree_root_id',
    },
    'forest': {
        'group': 'ForestInfo',
        'host_id': 'ForestID',
        'offset': 'ForestHalosOffset',
        'size': 'ForestNhalos',
        'total': 'TotNforests',
        'file_size': 'Nforests',
        'host_attr': 'forest_id',
    },
}

class ConsistentTreesHDF5Arbor(SegmentedArbor):
    """
    Arbors loaded from consistent-trees data converted into HDF5.

    Parameters
    ----------
    filename : str or list of str
        Either a single file name or a list of data file names. A name
        ending in ``_<digits>.h5`` is treated as an individual data file;
        any other name is treated as a virtual data set file whose
        top-level groups each correspond to one data file.
    access : str, optional
        Which entries of ``_access_names`` to use when reading the data,
        i.e. whether the arbor is organized by ``'tree'`` or ``'forest'``.
        Default: ``'tree'``.

    Raises
    ------
    ValueError
        If ``access`` is not a key of ``_access_names``.
    """

    _parameter_file_is_data_file = True
    _field_info_class = ConsistentTreesHDF5FieldInfo
    _root_field_io_class = ConsistentTreesHDF5RootFieldIO
    _tree_field_io_class = ConsistentTreesHDF5TreeFieldIO
    _default_dtype = np.float32
    # _fi: index of the data file, _si/_ei: start/end halo offsets of the
    # tree within that file (filled in by _plant_trees).
    _node_io_attrs = ('_fi', '_si', '_ei')
    # Matches names of individual data files ("..._<digits>.h5"). Shared by
    # _virtual_dataset and _is_valid so both use the same rule.
    _data_file_regex = re.compile(r"\_\d+\.h5$")

    def __init__(self, filename, access='tree'):
        """
        Validate ``access``, record it, and initialize the arbor.

        The host id attribute matching the access mode is appended to this
        instance's ``_node_io_attrs`` before the parent initializer runs.
        """
        if access not in _access_names:
            raise ValueError(
                f"Invalid access value: {access}. "
                f"Valid options are: {list(_access_names)}.")
        self.access = access
        # Builds a new tuple on the instance; the class-level tuple is
        # left untouched.
        self._node_io_attrs = \
          self._node_io_attrs + (_access_names[access]['host_attr'],)
        super().__init__(filename)

    def _node_io_loop_finish(self, data_file):
        """
        Reset the field cache of ``data_file`` and close it.
        """
        data_file._field_cache.reset()
        data_file.close()

    @property
    def _virtual_dataset(self):
        """
        True if the parameter file name does not look like an individual
        data file (i.e., does not end in ``_<digits>.h5``).
        """
        return self._data_file_regex.search(self.parameter_filename) is None

    def _get_data_files(self):
        """
        Create the data file objects and count the roots in each of them.

        Sets ``self.data_files`` and ``self._file_count``. For
        non-virtual data, also sets ``self._size`` to the sum of the
        per-file counts.
        """
        aname = _access_names[self.access]['file_size']

        if self._virtual_dataset:
            # Every top-level group of the virtual file is one data file.
            with h5py.File(self.filename, mode='r') as f:
                lnames = list(f)
                self.data_files = \
                  [ConsistentTreesHDF5DataFile(self.filename, lname)
                   for lname in lnames]
                self._file_count = \
                  np.array([f[lname].attrs[aname] for lname in lnames])
        else:
            if isinstance(self.filename, list):
                fns = self.filename
            else:
                fns = [self.filename]
            self.data_files = \
              [ConsistentTreesHDF5DataFile(fn, None) for fn in fns]

            # Open each file in a context manager so no handle is left
            # open after its attribute has been read.
            counts = []
            for fn in fns:
                with h5py.File(fn, mode='r') as f:
                    counts.append(f.attrs[aname])
            self._file_count = np.array(counts)
            self._size = sum(self._file_count)

    def _parse_parameter_file(self):
        """
        Read the field list and header metadata from the parameter file.

        Determines whether the halo data is stored as an array of structs
        or a struct of arrays (``self._aos``), collects field dtypes,
        merges in the information parsed from the consistent-trees header,
        and populates ``self.field_list`` and ``self.field_info``. For
        virtual data sets, ``self._size`` is read from the file attributes.

        Raises
        ------
        ArborDataFileEmpty
            If the file is a virtual data set with no ``File0`` group.
        """
        # Is the file a collection of virtual data sets
        # pointing to multiple data files?
        virtual = self._virtual_dataset

        # The context manager guarantees the file is closed even when an
        # exception is raised part way through.
        with h5py.File(self.parameter_filename, mode='r') as f:
            if virtual:
                fgroup = f.get('File0')
                if fgroup is None:
                    raise ArborDataFileEmpty(self.filename)
            else:
                fgroup = f

            if 'halos' in fgroup['Forests']:
                # array of structs layout
                mylog.warning(
                    "This dataset was written in array of structs format. "
                    "Field access will be significantly slower than struct "
                    "of arrays format.")
                self._aos = True
                ftypes = fgroup['Forests/halos'].dtype
                # names is None for a non-structured dtype; treat that
                # as having no fields.
                my_fi = {name: {'dtype': ftypes[name]}
                         for name in (ftypes.names or ())}
            else:
                # struct of arrays layout
                self._aos = False
                my_fi = {field: {'dtype': data.dtype}
                         for field, data in fgroup['Forests'].items()}

            if virtual:
                aname = _access_names[self.access]['total']
                self._size = f.attrs[aname]
            header = fgroup.attrs['Consistent Trees_metadata'].astype(str)
            header = header.tolist()

        header_fi = parse_ctrees_header(
            self, header, ntrees_in_file=False)
        # Do some string manipulation to match the header with
        # altered names in the hdf5 file.
        new_fi = {}
        for field, info in header_fi.items():
            # remove ?| characters
            new_field = re.sub(r'[?|]', '', field)
            # replace []/() characters with _
            new_field = re.sub(r'[\[\]\/\(\)]', '_', new_field)
            # remove leading/trailing underscores
            new_field = new_field.strip('_')
            # replace double underscore with single underscore
            new_field = new_field.replace('__', '_')

            new_info = info.copy()
            # Drop the 'column' entry carried over from the header, if any.
            new_info.pop('column', None)
            new_fi[new_field] = new_info

        for field in my_fi:
            my_fi[field].update(new_fi.get(field, {}))

        self.field_list = list(my_fi.keys())
        self.field_info.update(my_fi)

    def _plant_trees(self):
        """
        Fill ``self._node_info`` with the location of every tree/forest.

        For each data file, the host ids, halo offsets, and halo counts
        are read from the access group and written into the slice of
        ``self._node_info`` belonging to that file. Does nothing if the
        arbor is already planted or is empty.
        """
        if self.is_planted or self._size == 0:
            return

        my_access  = _access_names[self.access]
        groupname  = my_access['group']
        hostname   = my_access['host_id']
        hostattr   = my_access['host_attr']
        offsetname = my_access['offset']
        sizename   = my_access['size']

        c = 0
        # Index of the first root of each file within the full arbor.
        file_offsets = self._file_count.cumsum() - self._file_count
        pbar = get_pbar(f'Planting {self.access}s', self._size)
        for idf, data_file in enumerate(self.data_files):
            data_file.open()
            try:
                group = data_file.fh[groupname]
                hostids = group[hostname][()]
                offsets = group[offsetname][()]
                tree_sizes = group[sizename][()]
            finally:
                # Close the file even if one of the reads fails.
                data_file.close()

            istart = file_offsets[idf]
            iend = istart + offsets.size
            self._node_info[hostattr][istart:iend] = hostids
            self._node_info['_fi'][istart:iend] = idf
            self._node_info['_si'][istart:iend] = offsets
            self._node_info['_ei'][istart:iend] = offsets + tree_sizes
            self._node_info['_tree_size'][istart:iend] = tree_sizes
            c += offsets.size
            pbar.update(c)
        pbar.finish()
        self._node_info['uid'] = self['uid']

    @classmethod
    def _is_valid(cls, *args, **kwargs):
        """
        Should be an hdf5 file with a few key attributes.

        The first positional argument is a file name or a list of file
        names. Every file must be HDF5 and carry the attributes expected
        for its kind (individual data file or virtual data set file).

        Raises
        ------
        RuntimeError
            If a virtual data set file is given as part of a list with
            more than one entry.
        """
        fns = args[0]
        if not isinstance(fns, list):
            fns = [fns]

        for fn in fns:
            if not h5py.is_hdf5(fn):
                return False
            # single data file
            if cls._data_file_regex.search(fn):
                attrs = ['Nforests', 'Ntrees', 'Nhalos']
            # virtual data set file
            else:
                if len(fns) > 1:
                    raise RuntimeError(
                        'Virtual data set file cannot be given in a list.')
                attrs = ['Nfiles', 'TotNforests', 'TotNhalos', 'TotNtrees']

            with h5py.File(fn, mode='r') as f:
                if any(attr not in f.attrs for attr in attrs):
                    return False
        return True