From d31ac29294a9e4b8c91811cfb4904d0a598f69be Mon Sep 17 00:00:00 2001 From: sneakers-the-rat Date: Wed, 11 Sep 2024 19:02:15 -0700 Subject: [PATCH] clean up old hdf5 reader methods, fix truncate_hdf5 method, make proper'd test data files with working references --- nwb_linkml/src/nwb_linkml/io/hdf5.py | 95 ++- nwb_linkml/src/nwb_linkml/maps/hdf5.py | 840 +---------------------- nwb_linkml/tests/data/aibs.nwb | Bin 974974 -> 976999 bytes nwb_linkml/tests/data/aibs_ecephys.nwb | Bin 288732 -> 288036 bytes nwb_linkml/tests/fixtures/paths.py | 9 +- nwb_linkml/tests/test_io/test_io_hdf5.py | 82 ++- 6 files changed, 173 insertions(+), 853 deletions(-) diff --git a/nwb_linkml/src/nwb_linkml/io/hdf5.py b/nwb_linkml/src/nwb_linkml/io/hdf5.py index 750a337..bf4fbe6 100644 --- a/nwb_linkml/src/nwb_linkml/io/hdf5.py +++ b/nwb_linkml/src/nwb_linkml/io/hdf5.py @@ -22,7 +22,6 @@ Other TODO: import json import os -import pdb import re import shutil import subprocess @@ -39,7 +38,12 @@ from numpydantic.interface.hdf5 import H5ArrayPath from pydantic import BaseModel from tqdm import tqdm -from nwb_linkml.maps.hdf5 import get_references, resolve_hardlink +from nwb_linkml.maps.hdf5 import ( + get_attr_references, + get_dataset_references, + get_references, + resolve_hardlink, +) if TYPE_CHECKING: from nwb_linkml.providers.schema import SchemaProvider @@ -342,8 +346,6 @@ class HDF5IO: path = "/" return context[path] - pdb.set_trace() - def write(self, path: Path) -> Never: """ Write to NWB file @@ -396,7 +398,7 @@ class HDF5IO: provider = SchemaProvider(versions=versions) # build schema so we have them cached - # provider.build_from_dicts(schema) + provider.build_from_dicts(schema) h5f.close() return provider @@ -484,7 +486,7 @@ def find_references(h5f: h5py.File, path: str) -> List[str]: return references -def truncate_file(source: Path, target: Optional[Path] = None, n: int = 10) -> Path: +def truncate_file(source: Path, target: Optional[Path] = None, n: int = 10) -> Path | None: """ Create a truncated HDF5 file where only the first few samples are kept. 
@@ -500,6 +502,14 @@ def truncate_file(source: Path, target: Optional[Path] = None, n: int = 10) -> P Returns: :class:`pathlib.Path` path of the truncated file """ + if shutil.which("h5repack") is None: + warnings.warn( + "Truncation requires h5repack to be available, " + "or else the truncated files will be no smaller than the originals", + stacklevel=2, + ) + return + target = source.parent / (source.stem + "_truncated.hdf5") if target is None else Path(target) source = Path(source) @@ -515,17 +525,34 @@ def truncate_file(source: Path, target: Optional[Path] = None, n: int = 10) -> P os.chmod(target, 0o774) to_resize = [] + attr_refs = {} + dataset_refs = {} def _need_resizing(name: str, obj: h5py.Dataset | h5py.Group) -> None: if isinstance(obj, h5py.Dataset) and obj.size > n: to_resize.append(name) - print("Resizing datasets...") + def _find_attr_refs(name: str, obj: h5py.Dataset | h5py.Group) -> None: + """Find all references in object attrs""" + refs = get_attr_references(obj) + if refs: + attr_refs[name] = refs + + def _find_dataset_refs(name: str, obj: h5py.Dataset | h5py.Group) -> None: + """Find all references in datasets themselves""" + refs = get_dataset_references(obj) + if refs: + dataset_refs[name] = refs + # first we get the items that need to be resized and then resize them below # problems with writing to the file from within the visititems call + print("Planning resize...") h5f_target = h5py.File(str(target), "r+") h5f_target.visititems(_need_resizing) + h5f_target.visititems(_find_attr_refs) + h5f_target.visititems(_find_dataset_refs) + print("Resizing datasets...") for resize in to_resize: obj = h5f_target.get(resize) try: @@ -535,10 +562,14 @@ def truncate_file(source: Path, target: Optional[Path] = None, n: int = 10) -> P # so we have to copy and create a new dataset tmp_name = obj.name + "__tmp" original_name = obj.name + obj.parent.move(obj.name, tmp_name) old_obj = obj.parent.get(tmp_name) - new_obj = obj.parent.create_dataset(original_name, data=old_obj[0:n]) + new_obj = obj.parent.create_dataset( + original_name, data=old_obj[0:n], dtype=old_obj.dtype + ) for k, v in old_obj.attrs.items(): + new_obj.attrs[k] = v del new_obj.parent[tmp_name] @@ -546,16 +577,18 @@ def truncate_file(source: Path, target: Optional[Path] = None, n: int = 10) -> P h5f_target.close() # use h5repack to actually remove the items from the dataset - if shutil.which("h5repack") is None: - warnings.warn( - "Truncated file made, but since h5repack not found in path, file won't be any smaller", - stacklevel=2, - ) - return target - print("Repacking hdf5...") res = subprocess.run( - ["h5repack", "-f", "GZIP=9", str(target), str(target_tmp)], capture_output=True + [ + "h5repack", + "--verbose=2", + "--enable-error-stack", + "-f", + "GZIP=9", + str(target), + str(target_tmp), + ], + capture_output=True, ) if res.returncode != 0: warnings.warn(f"h5repack did not return 0: {res.stderr} {res.stdout}", stacklevel=2) @@ -563,6 +596,36 @@ def truncate_file(source: Path, target: Optional[Path] = None, n: int = 10) -> P target_tmp.unlink() return target + h5f_target = h5py.File(str(target_tmp), "r+") + + # recreate references after repacking, because repacking ruins them if they + # are in a compound dtype + for obj_name, obj_refs in attr_refs.items(): + obj = h5f_target.get(obj_name) + for attr_name, ref_target in obj_refs.items(): + ref_target = h5f_target.get(ref_target) + obj.attrs[attr_name] = ref_target.ref + + for obj_name, obj_refs in dataset_refs.items(): + obj = h5f_target.get(obj_name) + if 
isinstance(obj_refs, list): + if len(obj_refs) == 1: + ref_target = h5f_target.get(obj_refs[0]) + obj[()] = ref_target.ref + else: + targets = [h5f_target.get(ref).ref for ref in obj_refs[:n]] + obj[:] = targets + else: + # dict for a compound dataset + for col_name, column_refs in obj_refs.items(): + targets = [h5f_target.get(ref).ref for ref in column_refs[:n]] + data = obj[:] + data[col_name] = targets + obj[:] = data + + h5f_target.flush() + h5f_target.close() + target.unlink() target_tmp.rename(target) diff --git a/nwb_linkml/src/nwb_linkml/maps/hdf5.py b/nwb_linkml/src/nwb_linkml/maps/hdf5.py index a215b02..a507678 100644 --- a/nwb_linkml/src/nwb_linkml/maps/hdf5.py +++ b/nwb_linkml/src/nwb_linkml/maps/hdf5.py @@ -5,772 +5,47 @@ We have sort of diverged from the initial idea of a generalized map as in :class so we will make our own mapping class here and re-evaluate whether they should be unified later """ -# FIXME: return and document whatever is left of this godforsaken module after refactoring # ruff: noqa: D102 # ruff: noqa: D101 -import contextlib -import datetime -import inspect -import sys -from abc import abstractmethod -from pathlib import Path -from typing import TYPE_CHECKING, Dict, List, Literal, Optional, Tuple, Type, Union +from typing import List, Union import h5py -from numpydantic.interface.hdf5 import H5ArrayPath -from pydantic import BaseModel, ConfigDict, Field - -from nwb_linkml.annotations import unwrap_optional -from nwb_linkml.maps import Map -from nwb_linkml.types.hdf5 import HDF5_Path - -if sys.version_info.minor >= 11: - from enum import StrEnum -else: - from enum import Enum - - class StrEnum(str, Enum): - """StrEnum-ish class for python 3.10""" -if TYPE_CHECKING: - from nwb_linkml.providers.schema import SchemaProvider +def get_attr_references(obj: h5py.Dataset | h5py.Group) -> dict[str, str]: + """ + Get any references in object attributes + """ + refs = { + k: obj.file.get(ref).name + for k, ref in obj.attrs.items() + if isinstance(ref, h5py.h5r.Reference) + } + return refs -class ReadPhases(StrEnum): - plan = "plan" - """Before reading starts, building an index of objects to read""" - read = "read" - """Main reading operation""" - construct = "construct" - """After reading, casting the results of the read into their models""" - - -class H5SourceItem(BaseModel): +def get_dataset_references(obj: h5py.Dataset | h5py.Group) -> list[str] | dict[str, str]: """ - Descriptor of items for each element when :func:`.flatten_hdf` flattens an hdf5 file. - - Consumed by :class:`.HDF5Map` classes, orchestrated by :class:`.ReadQueue` - """ - - path: str - """Absolute hdf5 path of element""" - h5f_path: str - """Path to the source hdf5 file""" - leaf: bool - """ - If ``True``, this item has no children - (and thus we should start instantiating it before ascending to parent classes) - """ - h5_type: Literal["group", "dataset"] - """What kind of hdf5 element this is""" - depends: List[str] = Field(default_factory=list) - """ - Paths of other source items that this item depends on before it can be instantiated. - eg. 
from softlinks - """ - attrs: dict = Field(default_factory=dict) - """Any static attrs that can be had from the element""" - namespace: Optional[str] = None - """Optional: The namespace that the neurodata type belongs to""" - neurodata_type: Optional[str] = None - """Optional: the neurodata type for this dataset or group""" - - model_config = ConfigDict(arbitrary_types_allowed=True) - - @property - def parts(self) -> List[str]: - """path split by /""" - return self.path.split("/") - - -class H5ReadResult(BaseModel): - """ - Result returned by each of our mapping operations. - - Also used as the source for operations in the ``construct`` :class:`.ReadPhases` - """ - - path: str - """absolute hdf5 path of element""" - source: Union[H5SourceItem, "H5ReadResult"] - """ - Source that this result is based on. - The map can modify this item, so the container should update the source - queue on each pass - """ - completed: bool = False - """ - Was this item completed by this map step? False for cases where eg. - we still have dependencies that need to be completed before this one - """ - result: Optional[dict | str | int | float | BaseModel] = None - """ - If completed, built result. A dict that can be instantiated into the model. - If completed is True and result is None, then remove this object - """ - model: Optional[Type[BaseModel]] = None - """ - The model that this item should be cast into - """ - completes: List[HDF5_Path] = Field(default_factory=list) - """ - If this result completes any other fields, we remove them from the build queue. - """ - namespace: Optional[str] = None - """ - Optional: the namespace of the neurodata type for this object - """ - neurodata_type: Optional[str] = None - """ - Optional: The neurodata type to use for this object - """ - applied: List[str] = Field(default_factory=list) - """ - Which map operations were applied to this item - """ - errors: List[str] = Field(default_factory=list) - """ - Problems that occurred during resolution - """ - depends: List[HDF5_Path] = Field(default_factory=list) - """ - Other items that the final resolution of this item depends on - """ - - -FlatH5 = Dict[str, H5SourceItem] - - -class HDF5Map(Map): - phase: ReadPhases - priority: int = 0 - """ - Within a phase, sort mapping operations from low to high priority - (maybe this should be renamed because highest priority last doesn't make a lot of sense) - """ - - @classmethod - @abstractmethod - def check( - cls, - src: H5SourceItem | H5ReadResult, - provider: "SchemaProvider", - completed: Dict[str, H5ReadResult], - ) -> bool: - """Check if this map applies to the given item to read""" - - @classmethod - @abstractmethod - def apply( - cls, - src: H5SourceItem | H5ReadResult, - provider: "SchemaProvider", - completed: Dict[str, H5ReadResult], - ) -> H5ReadResult: - """Actually apply the map!""" - - -# -------------------------------------------------- -# Planning maps -# -------------------------------------------------- - - -def check_empty(obj: h5py.Group) -> bool: - """ - Check if a group has no attrs or children OR has no attrs and all its children - also have no attrs and no children - - Returns: - bool + Get references in datasets """ + refs = [] + # For datasets, apply checks depending on shape of data. 
if isinstance(obj, h5py.Dataset): - return False - - # check if we are empty - no_attrs = False - if len(obj.attrs) == 0: - no_attrs = True - - no_children = False - if len(obj.keys()) == 0: - no_children = True - - # check if immediate children are empty - # handles empty groups of empty groups - children_empty = False - if all( - [ - isinstance(item, h5py.Group) and len(item.keys()) == 0 and len(item.attrs) == 0 - for item in obj.values() - ] - ): - children_empty = True - - # if we have no attrs and we are a leaf OR our children are empty, remove us - return bool(no_attrs and (no_children or children_empty)) - - -class PruneEmpty(HDF5Map): - """Remove groups with no attrs""" - - phase = ReadPhases.plan - - @classmethod - def check( - cls, src: H5SourceItem, provider: "SchemaProvider", completed: Dict[str, H5ReadResult] - ) -> bool: - if src.h5_type == "group": - with h5py.File(src.h5f_path, "r") as h5f: - obj = h5f.get(src.path) - return check_empty(obj) - - @classmethod - def apply( - cls, src: H5SourceItem, provider: "SchemaProvider", completed: Dict[str, H5ReadResult] - ) -> H5ReadResult: - return H5ReadResult.model_construct(path=src.path, source=src, completed=True) - - -class ResolveModelGroup(HDF5Map): - """ - HDF5 Groups that have a model, as indicated by ``neurodata_type`` in their attrs. - We use the model to determine what fields we should get, and then stash references - to the children to process later as :class:`.HDF5_Path` - - **Special Case:** Some groups like ``ProcessingGroup`` and others that have an arbitrary - number of named children have a special ``children`` field that is a dictionary mapping - names to the objects themselves. - - So for example, this: - - /processing/ - eye_tracking/ - cr_ellipse_fits/ - center_x - center_y - ... - eye_ellipse_fits/ - ... - pupil_ellipse_fits/ - ... - eye_tracking_rig_metadata/ - ... - - would pack the ``eye_tracking`` group (a ``ProcessingModule`` ) as: - - { - "name": "eye_tracking", - "children": { - "cr_ellipse_fits": HDF5_Path('/processing/eye_tracking/cr_ellipse_fits'), - "eye_ellipse_fits" : HDF5_Path('/processing/eye_tracking/eye_ellipse_fits'), - ... 
- } - } - - We will do some nice things in the model metaclass to make it possible to access the children - like ``nwbfile.processing.cr_ellipse_fits.center_x`` - rather than having to switch between indexing and attribute access :) - """ - - phase = ReadPhases.read - priority = 10 # do this generally last - - @classmethod - def check( - cls, src: H5SourceItem, provider: "SchemaProvider", completed: Dict[str, H5ReadResult] - ) -> bool: - return bool("neurodata_type" in src.attrs and src.h5_type == "group") - - @classmethod - def apply( - cls, src: H5SourceItem, provider: "SchemaProvider", completed: Dict[str, H5ReadResult] - ) -> H5ReadResult: - model = provider.get_class(src.namespace, src.neurodata_type) - res = {} - depends = [] - with h5py.File(src.h5f_path, "r") as h5f: - obj = h5f.get(src.path) - for key in model.model_fields: - if key == "children": - res[key] = {name: resolve_hardlink(child) for name, child in obj.items()} - depends.extend([resolve_hardlink(child) for child in obj.values()]) - elif key in obj.attrs: - res[key] = obj.attrs[key] - continue - elif key in obj: - # make sure it's not empty - if check_empty(obj[key]): - continue - # stash a reference to this, we'll compile it at the end - depends.append(resolve_hardlink(obj[key])) - res[key] = resolve_hardlink(obj[key]) - - res["hdf5_path"] = src.path - res["name"] = src.parts[-1] - return H5ReadResult( - path=src.path, - source=src, - completed=True, - result=res, - model=model, - namespace=src.namespace, - neurodata_type=src.neurodata_type, - applied=["ResolveModelGroup"], - depends=depends, - ) - - -class ResolveDatasetAsDict(HDF5Map): - """ - Resolve datasets that do not have a ``neurodata_type`` of their own as a dictionary - that will be packaged into a model in the next step. Grabs the array in an - :class:`~numpydantic.interface.hdf5.H5ArrayPath` - under an ``array`` key, and then grabs any additional ``attrs`` as well. - - Mutually exclusive with :class:`.ResolveScalars` - this only applies to datasets that are larger - than a single entry. 
- """ - - phase = ReadPhases.read - priority = 11 - - @classmethod - def check( - cls, src: H5SourceItem, provider: "SchemaProvider", completed: Dict[str, H5ReadResult] - ) -> bool: - if src.h5_type == "dataset" and "neurodata_type" not in src.attrs: - with h5py.File(src.h5f_path, "r") as h5f: - obj = h5f.get(src.path) - return obj.shape != () - else: - return False - - @classmethod - def apply( - cls, src: H5SourceItem, provider: "SchemaProvider", completed: Dict[str, H5ReadResult] - ) -> H5ReadResult: - - res = { - "array": H5ArrayPath(file=src.h5f_path, path=src.path), - "hdf5_path": src.path, - "name": src.parts[-1], - **src.attrs, - } - return H5ReadResult( - path=src.path, source=src, completed=True, result=res, applied=["ResolveDatasetAsDict"] - ) - - -class ResolveScalars(HDF5Map): - phase = ReadPhases.read - priority = 11 # catchall - - @classmethod - def check( - cls, src: H5SourceItem, provider: "SchemaProvider", completed: Dict[str, H5ReadResult] - ) -> bool: - if src.h5_type == "dataset" and "neurodata_type" not in src.attrs: - with h5py.File(src.h5f_path, "r") as h5f: - obj = h5f.get(src.path) - return obj.shape == () - else: - return False - - @classmethod - def apply( - cls, src: H5SourceItem, provider: "SchemaProvider", completed: Dict[str, H5ReadResult] - ) -> H5ReadResult: - with h5py.File(src.h5f_path, "r") as h5f: - obj = h5f.get(src.path) - res = obj[()] - return H5ReadResult( - path=src.path, source=src, completed=True, result=res, applied=["ResolveScalars"] - ) - - -class ResolveContainerGroups(HDF5Map): - """ - Groups like ``/acquisition``` and others that have no ``neurodata_type`` - (and thus no model) are returned as a dictionary with :class:`.HDF5_Path` references to - the children they contain - """ - - phase = ReadPhases.read - priority = 9 - - @classmethod - def check( - cls, src: H5SourceItem, provider: "SchemaProvider", completed: Dict[str, H5ReadResult] - ) -> bool: - if src.h5_type == "group" and "neurodata_type" not in src.attrs and len(src.attrs) == 0: - with h5py.File(src.h5f_path, "r") as h5f: - obj = h5f.get(src.path) - return len(obj.keys()) > 0 - else: - return False - - @classmethod - def apply( - cls, src: H5SourceItem, provider: "SchemaProvider", completed: Dict[str, H5ReadResult] - ) -> H5ReadResult: - """Simple, just return a dict with references to its children""" - depends = [] - with h5py.File(src.h5f_path, "r") as h5f: - obj = h5f.get(src.path) - children = {} - for k, v in obj.items(): - children[k] = HDF5_Path(v.name) - depends.append(HDF5_Path(v.name)) - - # res = { - # 'name': src.parts[-1], - # 'hdf5_path': src.path, - # **children - # } - - return H5ReadResult( - path=src.path, - source=src, - completed=True, - result=children, - depends=depends, - applied=["ResolveContainerGroups"], - ) - - -# -------------------------------------------------- -# Completion Steps -# -------------------------------------------------- - - -class CompletePassThrough(HDF5Map): - """ - Passthrough map for the construction phase for models that don't need any more work done - - - :class:`.ResolveDynamicTable` - - :class:`.ResolveDatasetAsDict` - - :class:`.ResolveScalars` - """ - - phase = ReadPhases.construct - priority = 1 - - @classmethod - def check( - cls, src: H5ReadResult, provider: "SchemaProvider", completed: Dict[str, H5ReadResult] - ) -> bool: - passthrough_ops = ("ResolveDynamicTable", "ResolveDatasetAsDict", "ResolveScalars") - - return any(hasattr(src, "applied") and op in src.applied for op in passthrough_ops) - - @classmethod - def apply( 
- cls, src: H5ReadResult, provider: "SchemaProvider", completed: Dict[str, H5ReadResult] - ) -> H5ReadResult: - return src - - -class CompleteContainerGroups(HDF5Map): - """ - Complete container groups (usually top-level groups like /acquisition) - that do not have a ndueodata type of their own by resolving them as dictionaries - of values (that will then be given to their parent model) - - """ - - phase = ReadPhases.construct - priority = 3 - - @classmethod - def check( - cls, src: H5ReadResult, provider: "SchemaProvider", completed: Dict[str, H5ReadResult] - ) -> bool: - return ( - src.model is None - and src.neurodata_type is None - and src.source.h5_type == "group" - and all([depend in completed for depend in src.depends]) - ) - - @classmethod - def apply( - cls, src: H5ReadResult, provider: "SchemaProvider", completed: Dict[str, H5ReadResult] - ) -> H5ReadResult: - res, errors, completes = resolve_references(src.result, completed) - - return H5ReadResult( - result=res, - errors=errors, - completes=completes, - **src.model_dump(exclude={"result", "errors", "completes"}), - ) - - -class CompleteModelGroups(HDF5Map): - phase = ReadPhases.construct - priority = 4 - - @classmethod - def check( - cls, src: H5ReadResult, provider: "SchemaProvider", completed: Dict[str, H5ReadResult] - ) -> bool: - return ( - src.model is not None - and src.source.h5_type == "group" - and src.neurodata_type != "NWBFile" - and all([depend in completed for depend in src.depends]) - ) - - @classmethod - def apply( - cls, src: H5ReadResult, provider: "SchemaProvider", completed: Dict[str, H5ReadResult] - ) -> H5ReadResult: - # gather any results that were left for completion elsewhere - # first get all already-completed items - res = {k: v for k, v in src.result.items() if not isinstance(v, HDF5_Path)} - unpacked_results, errors, completes = resolve_references(src.result, completed) - res.update(unpacked_results) - - # now that we have the model in hand, we can solve any datasets that had an array - # but whose attributes are fixed (and thus should just be an array, rather than a subclass) - for k, v in src.model.model_fields.items(): - annotation = unwrap_optional(v.annotation) - if ( - inspect.isclass(annotation) - and not issubclass(annotation, BaseModel) - and isinstance(res, dict) - and k in res - and isinstance(res[k], dict) - and "array" in res[k] - ): - res[k] = res[k]["array"] - - instance = src.model(**res) - return H5ReadResult( - path=src.path, - source=src, - result=instance, - model=src.model, - completed=True, - completes=completes, - neurodata_type=src.neurodata_type, - namespace=src.namespace, - applied=src.applied + ["CompleteModelGroups"], - errors=errors, - ) - - -class CompleteNWBFile(HDF5Map): - """ - The Top-Level NWBFile class is so special cased we just make its own completion special case! - - .. 
todo:: - - This is truly hideous, just meant as a way to get to the finish line on a late night, - will be cleaned up later - - """ - - phase = ReadPhases.construct - priority = 11 - - @classmethod - def check( - cls, src: H5ReadResult, provider: "SchemaProvider", completed: Dict[str, H5ReadResult] - ) -> bool: - return src.neurodata_type == "NWBFile" and all( - [depend in completed for depend in src.depends] - ) - - @classmethod - def apply( - cls, src: H5ReadResult, provider: "SchemaProvider", completed: Dict[str, H5ReadResult] - ) -> H5ReadResult: - res = {k: v for k, v in src.result.items() if not isinstance(v, HDF5_Path)} - unpacked_results, errors, completes = resolve_references(src.result, completed) - res.update(unpacked_results) - - res["name"] = "root" - res["file_create_date"] = [ - datetime.datetime.fromisoformat(ts.decode("utf-8")) - for ts in res["file_create_date"]["array"][:] - ] - if "stimulus" not in res: - res["stimulus"] = provider.get_class("core", "NWBFileStimulus")() - electrode_groups = [] - egroup_keys = list(res["general"].get("extracellular_ephys", {}).keys()) - egroup_dict = {} - for k in egroup_keys: - if k != "electrodes": - egroup = res["general"]["extracellular_ephys"][k] - electrode_groups.append(egroup) - egroup_dict[egroup.hdf5_path] = egroup - del res["general"]["extracellular_ephys"][k] - if len(electrode_groups) > 0: - res["general"]["extracellular_ephys"]["electrode_group"] = electrode_groups - trode_type = provider.get_class("core", "NWBFileGeneralExtracellularEphysElectrodes") - # anmro = list(type(res['general']['extracellular_ephys']['electrodes']).__mro__) - # anmro.insert(1, trode_type) - trodes_original = res["general"]["extracellular_ephys"]["electrodes"] - trodes = trode_type.model_construct(trodes_original.model_dump()) - res["general"]["extracellular_ephys"]["electrodes"] = trodes - - instance = src.model(**res) - return H5ReadResult( - path=src.path, - source=src, - result=instance, - model=src.model, - completed=True, - completes=completes, - neurodata_type=src.neurodata_type, - namespace=src.namespace, - applied=src.applied + ["CompleteModelGroups"], - errors=errors, - ) - - -class ReadQueue(BaseModel): - """Container model to store items as they are built""" - - h5f: Path = Field( - description=( - "Path to the source hdf5 file used when resolving the queue! 
" - "Each translation step should handle opening and closing the file, " - "rather than passing a handle around" - ) - ) - provider: "SchemaProvider" = Field( - description="SchemaProvider used by each of the items in the read queue" - ) - queue: Dict[str, H5SourceItem | H5ReadResult] = Field( - default_factory=dict, - description="Items left to be instantiated, keyed by hdf5 path", - ) - completed: Dict[str, H5ReadResult] = Field( - default_factory=dict, - description="Items that have already been instantiated, keyed by hdf5 path", - ) - model_config = ConfigDict(arbitrary_types_allowed=True) - phases_completed: List[ReadPhases] = Field( - default_factory=list, description="Phases that have already been completed" - ) - - def apply_phase(self, phase: ReadPhases, max_passes: int = 5) -> None: - phase_maps = [m for m in HDF5Map.__subclasses__() if m.phase == phase] - phase_maps = sorted(phase_maps, key=lambda x: x.priority) - - results = [] - - # TODO: Thread/multiprocess this - for item in self.queue.values(): - for op in phase_maps: - if op.check(item, self.provider, self.completed): - # Formerly there was an "exclusive" property in the maps which let - # potentially multiple operations be applied per stage, - # except if an operation was `exclusive` which would break - # iteration over the operations. - # This was removed because it was badly implemented, - # but if there is ever a need to do that, - # then we would need to decide what to do with the multiple results. - results.append(op.apply(item, self.provider, self.completed)) - break # out of inner iteration - - # remake the source queue and save results - completes = [] - for res in results: - # remove the original item - del self.queue[res.path] - if res.completed: - # if the item has been finished and there is some result, add it to the results - if res.result is not None: - self.completed[res.path] = res - # otherwise if the item has been completed and there was no result, - # just drop it. - - # if we have completed other things, delete them from the queue - completes.extend(res.completes) - - else: - # if we didn't complete the item (eg. we found we needed more dependencies), - # add the updated source to the queue again - if phase != ReadPhases.construct: - self.queue[res.path] = res.source - else: - self.queue[res.path] = res - - # delete the ones that were already completed but might have been - # incorrectly added back in the pile - for c in completes: - with contextlib.suppress(KeyError): - del self.queue[c] - - # if we have nothing left in our queue, we have completed this phase - # and prepare only ever has one pass - if phase == ReadPhases.plan: - self.phases_completed.append(phase) - return - - if len(self.queue) == 0: - self.phases_completed.append(phase) - if phase != ReadPhases.construct: - # if we're not in the last phase, move our completed to our queue - self.queue = self.completed - self.completed = {} - elif max_passes > 0: - self.apply_phase(phase, max_passes=max_passes - 1) - - -def flatten_hdf( - h5f: h5py.File | h5py.Group, skip: str = "specifications" -) -> Dict[str, H5SourceItem]: - """ - Flatten all child elements of hdf element into a dict of :class:`.H5SourceItem` s - keyed by their path - - Args: - h5f (:class:`h5py.File` | :class:`h5py.Group`): HDF file or group to flatten! 
- """ - items = {} - - def _itemize(name: str, obj: h5py.Dataset | h5py.Group) -> None: - if skip in name: - return - - leaf = isinstance(obj, h5py.Dataset) or len(obj.keys()) == 0 - - if isinstance(obj, h5py.Dataset): - h5_type = "dataset" - elif isinstance(obj, h5py.Group): - h5_type = "group" - else: - raise ValueError(f"Object must be a dataset or group! {obj}") - - # get references in attrs and datasets to populate dependencies - # depends = get_references(obj) - - if not name.startswith("/"): - name = "/" + name - - attrs = dict(obj.attrs.items()) - - items[name] = H5SourceItem.model_construct( - path=name, - h5f_path=h5f.file.filename, - leaf=leaf, - # depends = depends, - h5_type=h5_type, - attrs=attrs, - namespace=attrs.get("namespace"), - neurodata_type=attrs.get("neurodata_type"), - ) - - h5f.visititems(_itemize) - # then add the root item - _itemize(h5f.name, h5f) - return items + if obj.shape == (): + # scalar + if isinstance(obj[()], h5py.h5r.Reference): + refs = [obj.file.get(obj[()]).name] + elif len(obj) > 0 and isinstance(obj[0], h5py.h5r.Reference): + # single-column + refs = [obj.file.get(ref).name for ref in obj[:]] + elif len(obj.dtype) > 1: + # "compound" datasets + refs = {} + for name in obj.dtype.names: + if isinstance(obj[name][0], h5py.h5r.Reference): + refs[name] = [obj.file.get(ref).name for ref in obj[name]] + return refs def get_references(obj: h5py.Dataset | h5py.Group) -> List[str]: @@ -791,57 +66,18 @@ def get_references(obj: h5py.Dataset | h5py.Group) -> List[str]: List[str]: List of paths that are referenced within this object """ # Find references in attrs - refs = [ref for ref in obj.attrs.values() if isinstance(ref, h5py.h5r.Reference)] + attr_refs = get_attr_references(obj) + dataset_refs = get_dataset_references(obj) - # For datasets, apply checks depending on shape of data. 
-    if isinstance(obj, h5py.Dataset):
-        if obj.shape == ():
-            # scalar
-            if isinstance(obj[()], h5py.h5r.Reference):
-                refs.append(obj[()])
-        elif len(obj) > 0 and isinstance(obj[0], h5py.h5r.Reference):
-            # single-column
-            refs.extend(obj[:].tolist())
-        elif len(obj.dtype) > 1:
-            # "compound" datasets
-            for name in obj.dtype.names:
-                if isinstance(obj[name][0], h5py.h5r.Reference):
-                    refs.extend(obj[name].tolist())
-
-    # dereference and get name of reference
-    if isinstance(obj, h5py.Dataset):
-        depends = list(set([obj.parent.get(i).name for i in refs]))
+    # flatten to list
+    refs = [ref for ref in attr_refs.values()]
+    if isinstance(dataset_refs, list):
+        refs.extend(dataset_refs)
     else:
-        depends = list(set([obj.get(i).name for i in refs]))
-    return depends
+        for v in dataset_refs.values():
+            refs.extend(v)
 
-
-def resolve_references(
-    src: dict, completed: Dict[str, H5ReadResult]
-) -> Tuple[dict, List[str], List[HDF5_Path]]:
-    """
-    Recursively replace references to other completed items with their results
-
-    """
-    completes = []
-    errors = []
-    res = {}
-    for path, item in src.items():
-        if isinstance(item, HDF5_Path):
-            other_item = completed.get(item)
-            if other_item is None:
-                errors.append(f"Couldn't find: {item}")
-            res[path] = other_item.result
-            completes.append(item)
-
-        elif isinstance(item, dict):
-            inner_res, inner_error, inner_completes = resolve_references(item, completed)
-            res[path] = inner_res
-            errors.extend(inner_error)
-            completes.extend(inner_completes)
-        else:
-            res[path] = item
-    return res, errors, completes
+    return refs
 
 
 def resolve_hardlink(obj: Union[h5py.Group, h5py.Dataset]) -> str:
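A minimal sketch (not part of the patch) of what the two new helpers return, exercised against a throwaway file rather than the committed fixtures; the scratch filename and dataset names are illustrative:

import h5py

from nwb_linkml.maps.hdf5 import get_attr_references, get_dataset_references

with h5py.File("scratch.h5", "w") as h5f:
    target = h5f.create_dataset("target", data=[1, 2, 3])
    source = h5f.create_dataset("source", data=[4, 5, 6])
    # an object reference stored in an attribute
    source.attrs["linked"] = target.ref
    # object references stored in a single-column dataset
    refs = h5f.create_dataset("refs", (2,), dtype=h5py.ref_dtype)
    refs[0], refs[1] = target.ref, source.ref

    assert get_attr_references(source) == {"linked": "/target"}
    assert get_dataset_references(refs) == ["/target", "/source"]

Both helpers return HDF5 paths rather than raw h5py reference objects, which is what lets truncate_file re-create the references after h5repack has rewritten the file.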
diff --git a/nwb_linkml/tests/data/aibs.nwb b/nwb_linkml/tests/data/aibs.nwb
index 6000e09d073fcd332f3cda053b1e95f5ef7b68ff..1380c551a19ea4f58961f7b5e79af01b6045b378 100644
GIT binary patch
[base85 binary delta omitted: aibs.nwb, 974974 -> 976999 bytes]
diff --git a/nwb_linkml/tests/data/aibs_ecephys.nwb b/nwb_linkml/tests/data/aibs_ecephys.nwb
GIT binary patch
[base85 binary delta omitted: aibs_ecephys.nwb, 288732 -> 288036 bytes]
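The two truncated fixtures above were regenerated with the repaired truncate_file. A rough sketch of the call, assuming the full-size source file is available locally (the source path below is illustrative):

from pathlib import Path

from nwb_linkml.io.hdf5 import truncate_file

source = Path("sub-738651046_ses-760693773.nwb")  # full-size source file, illustrative path
target = Path("nwb_linkml/tests/data/aibs.nwb")   # truncated fixture committed above

if source.exists():
    # keep the first 10 rows of each large dataset, repack with h5repack,
    # then re-create the attr/dataset references that repacking breaks;
    # returns None (with a warning) if h5repack is not on PATH
    truncate_file(source, target, n=10)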
znT`@#?+lK7q+fr7<=WQa=3p2u z;*l|H&_dTRl3!#?ckmeGYZxbIB#LOjeHD{>3W)ZK5YB+BBg1r+??YHxW^nYl_?6vL?>2`CK#M~f}ZhO zjm!fB$|0ZTDb@4OYbtsvd6*a+i9%W-`5>S>!`1WY#&W$kmahzs1R_x|K1%kfLd6^$ z;L*o8%5xPrLy6gKwY!WwP@&WsBOekX=@s_T3MB)eyn+O6QL1yUYrRm1?-tZ-YE?eH zS?#9m0Q^wo5f*q)n_;C^d_y3Dwz=4MMfW0R)OV161H90 zV))O1Shbj}+=IgP*Q7sxy%i|*trd>9#HZ3%ExiY6zeg$JB-f}}>?AkobL_{j{{ap^ Bz2yJ^ delta 5505 zcma(V3vg7`_1*Ix$?j%D776*WY(9j5VF`gIHp&VyBpB%i14O|XBS-*0At+5DFlitd zHFhu(Zpf7yNDze7rj%vVuEmI^lL4U%CM8k}tD-|4?X0#{iXCRFy`T5??Xob_ok{kd z?>(P;&OLVzo*CzRf85S#;9J|C-6mgI0A%Yj<#(>(0)*d+XWj`yoOD(4O&*av*jZ)_`C3l(>7DrUd|&8baSxpj{?l=ac4){#--?F zfuAP6q}o5U58Cjyq$HCr(LQ(x+fSvrA5Ts*$>u#0d3ird#ozl1@Mn`Vl%8zcYFu@^ z02gMK!oVIXZc0;?^}X6Y___O)DIJT^emrB-RP-8tfc`&#e@_1&#IMu;ue;A~nwO3< z_Z^EiQwJsKGX7}aOylTkh3hEAXTMnp=3n8J-Kl)1T^WUVsM~`3FFA3;(L>P&E|g1! zQgN~CDR8*3%OzOxsH@Ojbv#!FUpH=hyPC$+f~!s&WN#C~@0^}wGPjF3+A(fBZG*g5 z@R8FwChw52-hYxs*SNzgz@6ANB@qwL9OE9hXZ#p6&TMN zHITqgWJITMmbY$6Uk}}xeuQh1q{l%9BO`2>KLVSMs~jc{cY&%Kb7QTw6Ll`Foadb=>F zL4~iZb+=u(hUB~@2+zA%5<8pXwUmt^BPGhjBA@EK=7};QWHZl;?<7^#!NhQnYc~jZ zM?Lk}(>^?pd!kFb=U!|Axbi7J;oA)Tf5jtT=;hG$(KM*~3$42tH5QU}jnY0i37bDQ zz}3$vfjC-`0gAt(OI$^fj_=wBJ8|ISTqvL?>1N#eZ83Blrib&K0U9N|FR&S${|U0Oif#OFDI7oS8M_KS zX24G$HIUpWTq)&jiPN%WqG>>qiFH$>@-q?oow(VNO6-$R zE==rYd(Sn>RoTKqdq%jjimBAX+c_7t(2z@=@s|kJff-?}v3Jw0v~3X{=^jr8W+4mg zK*35FjR{r)WG3*nV>Zo~HC>zqUh%~GBa{SldU*>VeamEn$1w+0SY)ajzc1ex&0J#Z zRHe+Q?Wf@VzVXEAL>QD(@~IQ0j%6LfB8eIKZ8}QbVV}&!oYZJjjHrrJ z-sN~n8eKy(*hzfL>6!Y)jdtA@ZrAuQyW+{LLdb+oD6&`Mr=tzfS5Lm)jMkX+zCo(D zpoQSiCvR*)i|DuK`WAEt3ObsI2md@jHuO?T3fnN5%<4enJ^n_t3&AgY$+jkxrd}tl z?Sn6o0Vh<#8@Iq8kHl9r)8pE>mDDw(9H`&!+1E@D8q_y=e*Y{=KqkGzlM5}Vh}oU; z9O?|3Ys0Qf^YP^jC5TDo4#d&OYDXw7%z<=*UPRR>!jfG{QTX^rCa7s|K7Zwt>L?~=afI3H`1{^xo6xa$`q6pN7j{TBG^H^<9?7Z7P~LfTc$Vw(^; zI%H7?1&l4f-nUirwSc0NLwvuJu3w~6p+>)p!m;#n@h#CtMM>I4Ve?OQ_!8}N8vh*1 zzcOE^Gp>bEIxJJTY_7C5sDO=QBSn%_8^=guI(R5Yl4>e-80YCQQ@D-xB#q9vAcW5J zH6-YCHXfBy12ukXi`0s?>WFBHxSO8ez;q@m43I+vSUZ|ImU_5Qr}K!ZFf&NIb^q`0 zWc8Yk6kx5{z4O=)3O@$2xiuoKeA@DN-mHQLJ}`h2leV2`BMe~@)s8lrI)1_|l%)PL zdJiRV;mod^2oo=+EvTcDT)?OC#c+FsO@Qom2^+|eVx z(?q1Ct{F7I?1^Il-yL*Gr>&VhO1c0!(k-HNy*ng2x#ZK`A-CbIa4TmsD_I}AYd=~d z?>LIsD``1Y93o%R*Ajob5c6vk6F8tXB8blv`!6G12T+0Ra3MVNlL;RG0rZ8Dm{+0< zaxM+F2h)@18Tk#mw1HHm1BT2g<^g}fHJR91ftKcS#Dfw_+ej&(s!U@aezR2TF239JS@<1ek zA7$r4AiphwDX!tm2(t%=bw0fW!`_;TOhmQh7+wn(nJ~`SJUV%WN9RQ zR0e$a@U`8h7?yfp6@219?Fae+<&wRsyuVWK6ATsquz>R(5#MjBeDyVSmM-B=*cX9* z9n*V-RPIBjC$HYHp@m1X;_&PR-!;(mR zrI41kOOGP7Y5TMkb=CR!e^tIxBKxUO06S5!qN2OvjWeUC44S=@}PfD?> ztwY;4tRc}NSt~l7H!{P!k}ObvKT83Ba(EZ>Z1CIE{1jjF$Dg%UI1nuq`63mGxz=6| z+mmGfO~~vfmIJpUm|u+O!D1(CB!U07LeC+#dJRy!o*b!#ji#_jCkR=4q^uw~A_+mI zY(jHvAWiFGBlK_ZTw4#8%d#_4U{{jCRI297zK%zV)c`o^*cv@fzFH57_+^KM@CSkU znXmv>MJU|uu`ZR5A-Oh!A_h3Q@m7Q#5ro|y`p8&uQm$Dd)3+pHwIvT!uyu@9LiS~x z)JJDx*|D6H-c!ACK19LYylNa-ie5Dw(=km^o56<&m#0}$0ymU~HFz|I!|KuNiZJpe ztqSFqsoIlQ_TA2wF=?tP{%i{VO(d-^QiC82+=Rd%Sdkm{An zo3cOB33eHi(Z`fRnf7uLXF;jtj#Ww)_=ZXKDrJQnpdy2-l;H0L(foM8I3XWm7v{@svA@hG{^|x36 diff --git a/nwb_linkml/tests/fixtures/paths.py b/nwb_linkml/tests/fixtures/paths.py index d7f5f0c..f2d0e1e 100644 --- a/nwb_linkml/tests/fixtures/paths.py +++ b/nwb_linkml/tests/fixtures/paths.py @@ -15,7 +15,12 @@ def tmp_output_dir(request: pytest.FixtureRequest) -> Path: if subdir.name == "git": # don't wipe out git repos every time, they don't rly change continue - elif subdir.is_file() and subdir.parent != path: + elif ( + subdir.is_file() + and subdir.parent != path + or subdir.is_file() + and subdir.suffix == ".nwb" + ): continue elif subdir.is_file(): subdir.unlink(missing_ok=True) 
@@ -54,5 +59,5 @@ def tmp_output_dir_mod(tmp_output_dir) -> Path: @pytest.fixture(scope="session") def data_dir() -> Path: - path = Path(__file__).parent.resolve() / "data" + path = Path(__file__).parents[1].resolve() / "data" return path diff --git a/nwb_linkml/tests/test_io/test_io_hdf5.py b/nwb_linkml/tests/test_io/test_io_hdf5.py index 1b2a623..4222a2c 100644 --- a/nwb_linkml/tests/test_io/test_io_hdf5.py +++ b/nwb_linkml/tests/test_io/test_io_hdf5.py @@ -1,11 +1,10 @@ -import pdb - import h5py import networkx as nx import numpy as np import pytest from nwb_linkml.io.hdf5 import HDF5IO, filter_dependency_graph, hdf_dependency_graph, truncate_file +from nwb_linkml.maps.hdf5 import resolve_hardlink @pytest.mark.skip() @@ -14,7 +13,7 @@ def test_hdf_read(data_dir, dset): NWBFILE = data_dir / dset io = HDF5IO(path=NWBFILE) # the test for now is just whether we can read it lol - model = io.read() + _ = io.read() def test_truncate_file(tmp_output_dir): @@ -87,35 +86,6 @@ def test_truncate_file(tmp_output_dir): assert target_h5f["data"]["dataset_contig"].attrs["anattr"] == 1 -@pytest.mark.skip() -def test_flatten_hdf(): - from nwb_linkml.maps.hdf5 import flatten_hdf - - path = "/Users/jonny/Dropbox/lab/p2p_ld/data/nwb/sub-738651046_ses-760693773.nwb" - import h5py - - h5f = h5py.File(path) - flat = flatten_hdf(h5f) - assert not any(["specifications" in v.path for v in flat.values()]) - pdb.set_trace() - raise NotImplementedError("Just a stub for local testing for now, finish me!") - - -@pytest.mark.dev -def test_dependency_graph(nwb_file, tmp_output_dir): - """ - dependency graph is correctly constructed from an HDF5 file - """ - graph = hdf_dependency_graph(nwb_file) - A_unfiltered = nx.nx_agraph.to_agraph(graph) - A_unfiltered.draw(tmp_output_dir / "test_nwb_unfiltered.png", prog="dot") - graph = filter_dependency_graph(graph) - A_filtered = nx.nx_agraph.to_agraph(graph) - A_filtered.draw(tmp_output_dir / "test_nwb_filtered.png", prog="dot") - pass - - -@pytest.mark.skip def test_dependencies_hardlink(nwb_file): """ Test that hardlinks are resolved (eg. 
from /processing/ecephys/LFP/ElectricalSeries/electrodes @@ -126,4 +96,50 @@ def test_dependencies_hardlink(nwb_file): Returns: """ - pass + parent = "/processing/ecephys/LFP/ElectricalSeries" + source = "/processing/ecephys/LFP/ElectricalSeries/electrodes" + target = "/acquisition/ElectricalSeries/electrodes" + + # assert that the hardlink exists in the test file + with h5py.File(str(nwb_file), "r") as h5f: + node = h5f.get(source) + linked_node = resolve_hardlink(node) + assert linked_node == target + + graph = hdf_dependency_graph(nwb_file) + # the parent should link to the target as a child + assert (parent, target) in graph.edges([parent]) + assert graph.edges[parent, target]["label"] == "child" + + +@pytest.mark.dev +def test_dependency_graph_images(nwb_file, tmp_output_dir): + """ + Generate images of the dependency graph + """ + graph = hdf_dependency_graph(nwb_file) + A_unfiltered = nx.nx_agraph.to_agraph(graph) + A_unfiltered.draw(tmp_output_dir / "test_nwb_unfiltered.png", prog="dot") + graph = filter_dependency_graph(graph) + A_filtered = nx.nx_agraph.to_agraph(graph) + A_filtered.draw(tmp_output_dir / "test_nwb_filtered.png", prog="dot") + + +@pytest.mark.parametrize( + "dset", + [ + {"name": "aibs.nwb", "source": "sub-738651046_ses-760693773.nwb"}, + { + "name": "aibs_ecephys.nwb", + "source": "sub-738651046_ses-760693773_probe-769322820_ecephys.nwb", + }, + ], +) +@pytest.mark.dev +def test_make_truncated_datasets(tmp_output_dir, data_dir, dset): + input_file = tmp_output_dir / dset["source"] + output_file = data_dir / dset["name"] + if not input_file.exists(): + return + + truncate_file(input_file, output_file, 10)
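Since the point of rebuilding the fixtures was to end up with working references, a quick manual spot-check with plain h5py (a sketch, not part of the test suite; it only walks references stored in attributes):

import h5py

def check_refs(name: str, obj: h5py.Group | h5py.Dataset) -> None:
    # dereference every object-reference attribute; h5py raises if one is dangling
    for key, val in obj.attrs.items():
        if isinstance(val, h5py.h5r.Reference):
            print(f"{name}.{key} -> {obj.file[val].name}")

with h5py.File("nwb_linkml/tests/data/aibs_ecephys.nwb", "r") as h5f:
    h5f.visititems(check_refs)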