Like 15 validation errors away from fully reading a file.

Just going to make a special completion phase for the top-level NWBFile because it's so special-cased that it's not worth trying to generalize.

- Resolving hardlinked datasets in read maps
- Resolving 2nd-order empty groups
- Completing container groups
- Move fixing references to recursive function
- Multiple passes for a given phase (see the sketch below)
- NDArrayProxies should resolve dask or h5py reading on their own
- Correctly build split schema from PydanticProvider instead of needing external logic
sneakers-the-rat 2023-10-02 22:19:11 -07:00
parent eca7a5ec2e
commit 34f8969fa9
5 changed files with 213 additions and 80 deletions
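
The "multiple passes for a given phase" note above boils down to: re-run a phase's maps over whatever is still queued, because some items only become completable after their dependencies finished in an earlier pass (see the `max_passes` recursion added to `apply_phase` in the diff). A minimal, self-contained sketch of that control flow — the names here (`apply_phase`, `MapFn`) are illustrative, not the actual `ReadQueue` API:

```python
from typing import Callable, Dict, List, Optional

# Illustrative only: each "map" inspects an item and, if it can handle it
# given what has already completed, returns a finished result.
MapFn = Callable[[str, Dict[str, str]], Optional[str]]

def apply_phase(queue: Dict[str, str], maps: List[MapFn], max_passes: int = 5) -> Dict[str, str]:
    """Run every map over every queued item; items whose dependencies only
    completed during this pass get another chance on the next pass."""
    completed: Dict[str, str] = {}
    for path, item in list(queue.items()):
        for apply_map in maps:
            result = apply_map(item, completed)
            if result is not None:
                completed[path] = result
                del queue[path]
                break
    if queue and max_passes > 0:
        # something is still unfinished - recurse with one fewer pass allowed
        completed.update(apply_phase(queue, maps, max_passes - 1))
    return completed
```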

View file

@@ -247,7 +247,6 @@ class NWBPydanticGenerator(PydanticGenerator):
             # Don't get classes that are defined in this schema!
             if module_name == self.schema.name:
                 continue
-            # pdb.set_trace()
             schema_name = module_name.split('.')[0]
             if self.versions and schema_name != self.schema.name.split('.')[0] and schema_name in self.versions:
                 version = version_module_case(self.versions[schema_name])
@@ -255,6 +254,7 @@ class NWBPydanticGenerator(PydanticGenerator):
                     local_mod_name = '...' + module_case(schema_name) + '.' + version + '.' + module_case(module_name)
                 else:
                     local_mod_name = '...' + module_case(schema_name) + '.' + version + '.' + 'namespace'
             else:
                 local_mod_name = '.' + module_case(module_name)
@@ -709,6 +709,14 @@ class NWBPydanticGenerator(PydanticGenerator):
             self.generate_python_range(slot_range, s, class_def)
             for slot_range in slot_ranges
         ]
+        # --------------------------------------------------
+        # Special Case - since we get abstract classes from
+        # potentially multiple versions (which are then different
+        # model classes), we allow container classes to also
+        # be generic descendants of BaseModel
+        # --------------------------------------------------
+        if 'DynamicTable' in pyranges:
+            pyranges.append('BaseModel')
         pyranges = list(set(pyranges)) # remove duplicates
         pyranges.sort()
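
For context on that special case: the generated pydantic models put a slot's possible classes into a Union, so appending `BaseModel` effectively means "any model is acceptable here," which is what lets a table class generated from a different schema version (a distinct class object) still validate. A hypothetical generated field might end up looking roughly like this — illustrative only, not literal generator output:

```python
from typing import Dict, Optional, Union
from pydantic import BaseModel

class DynamicTable(BaseModel):
    name: str

class ProcessingModule(BaseModel):
    # DynamicTable *or* any BaseModel, so tables built from another
    # schema version's classes still pass validation
    children: Optional[Dict[str, Union[DynamicTable, BaseModel]]] = None
```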

View file

@@ -80,7 +80,6 @@ class HDF5IO():
             provider=provider
         )
-        #pdb.set_trace()
         # Apply initial planning phase of reading
         queue.apply_phase(ReadPhases.plan)
         print('phase - plan completed')
@@ -89,23 +88,14 @@
         print('phase - read completed')
-        # pdb.set_trace()
         # if len(queue.queue)> 0:
         #     warnings.warn('Did not complete all items during read phase!')
         queue.apply_phase(ReadPhases.construct)
         pdb.set_trace()
-        # --------------------------------------------------
-        # FIXME: Hardcoding top-level file reading just for the win
-        # --------------------------------------------------
-        root = finish_root_hackily(queue)
-        file = NWBFile(**root)
-        pdb.set_trace()

View file

@@ -4,9 +4,10 @@ Maps for reading and writing from HDF5
 We have sort of diverged from the initial idea of a generalized map as in :class:`linkml.map.Map` ,
 so we will make our own mapping class here and re-evaluate whether they should be unified later
 """
+import pdb
 from abc import ABC, abstractmethod
 from pathlib import Path
-from typing import Literal, List, Dict, Optional, Type, Union
+from typing import Literal, List, Dict, Optional, Type, Union, Tuple
 import h5py
 from enum import StrEnum
@@ -135,16 +136,50 @@ class HDF5Map(ABC):
 # --------------------------------------------------
 # Planning maps
 # --------------------------------------------------
+def check_empty(obj: h5py.Group) -> bool:
+    """
+    Check if a group has no attrs or children, OR has no attrs and all of its
+    children also have no attrs and no children.
+
+    Returns:
+        bool
+    """
+    if isinstance(obj, h5py.Dataset):
+        return False
+
+    # check if we are empty
+    no_attrs = False
+    if len(obj.attrs) == 0:
+        no_attrs = True
+
+    no_children = False
+    if len(obj.keys()) == 0:
+        no_children = True
+
+    # check if immediate children are empty
+    # (handles empty groups of empty groups)
+    children_empty = False
+    if all([isinstance(item, h5py.Group) and \
+            len(item.keys()) == 0 and \
+            len(item.attrs) == 0 \
+            for item in obj.values()]):
+        children_empty = True
+
+    # if we have no attrs and we are a leaf OR our children are empty, remove us
+    if no_attrs and (no_children or children_empty):
+        return True
+    else:
+        return False
+
+
 class PruneEmpty(HDF5Map):
     """Remove groups with no attrs """
     phase = ReadPhases.plan
     @classmethod
     def check(cls, src: H5SourceItem, provider:SchemaProvider, completed: Dict[str, H5ReadResult]) -> bool:
-        if src.leaf and src.h5_type == 'group':
+        if src.h5_type == 'group':
             with h5py.File(src.h5f_path, 'r') as h5f:
                 obj = h5f.get(src.path)
-                if len(obj.attrs) == 0:
-                    return True
+                return check_empty(obj)

     @classmethod
     def apply(cls, src: H5SourceItem, provider:SchemaProvider, completed: Dict[str, H5ReadResult]) -> H5ReadResult:
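
To make the "2nd-order empty groups" idea concrete, here is a small, hypothetical HDF5 layout and what `check_empty` (from the hunk above, assumed in scope) would say about it — the file name and paths are made up for illustration:

```python
import h5py

with h5py.File("empty_demo.h5", "w") as f:
    f.create_group("/empty")                    # no attrs, no children
    f.create_group("/second_order/also_empty")  # parent whose only child is an empty group
    grp = f.create_group("/not_empty")
    grp.attrs["neurodata_type"] = "ProcessingModule"

with h5py.File("empty_demo.h5", "r") as f:
    print(check_empty(f["/empty"]))         # True - leaf group with no attrs
    print(check_empty(f["/second_order"]))  # True - only contains attr-less, child-less groups
    print(check_empty(f["/not_empty"]))     # False - it has attrs
```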
@@ -268,15 +303,18 @@ class ResolveModelGroup(HDF5Map):
             obj = h5f.get(src.path)
             for key, type in model.model_fields.items():
                 if key == 'children':
-                    res[key] = {name: HDF5_Path(child.name) for name, child in obj.items()}
-                    depends.extend([HDF5_Path(child.name) for child in obj.values()])
+                    res[key] = {name: resolve_hardlink(child) for name, child in obj.items()}
+                    depends.extend([resolve_hardlink(child) for child in obj.values()])
                 elif key in obj.attrs:
                     res[key] = obj.attrs[key]
                     continue
                 elif key in obj.keys():
+                    # make sure it's not empty
+                    if check_empty(obj[key]):
+                        continue
                     # stash a reference to this, we'll compile it at the end
-                    depends.append(HDF5_Path(obj[key].name))
-                    res[key] = HDF5_Path(obj[key].name)
+                    depends.append(resolve_hardlink(obj[key]))
+                    res[key] = resolve_hardlink(obj[key])

         res['hdf5_path'] = src.path
@@ -393,16 +431,18 @@ class ResolveContainerGroups(HDF5Map):
                 children[k] = HDF5_Path(v.name)
                 depends.append(HDF5_Path(v.name))

-        res = {
-            'name': src.parts[-1],
-            **children
-        }
+        # res = {
+        #     'name': src.parts[-1],
+        #     'hdf5_path': src.path,
+        #     **children
+        # }

         return H5ReadResult(
             path=src.path,
             source=src,
             completed=True,
-            result=res,
+            result=children,
+            depends=depends,
             applied=['ResolveContainerGroups']
         )
@@ -411,24 +451,64 @@ class ResolveContainerGroups(HDF5Map):
 # --------------------------------------------------
 # Completion Steps
 # --------------------------------------------------
-class CompleteDynamicTables(HDF5Map):
-    """Nothing to do! already done!"""
+class CompletePassThrough(HDF5Map):
+    """
+    Passthrough map for the construction phase for models that don't need any more work done
+
+    - :class:`.ResolveDynamicTable`
+    - :class:`.ResolveDatasetAsDict`
+    - :class:`.ResolveScalars`
+    """
     phase = ReadPhases.construct
     priority = 1

     @classmethod
     def check(cls, src: H5ReadResult, provider:SchemaProvider, completed: Dict[str, H5ReadResult]) -> bool:
-        if 'ResolveDynamicTable' in src.applied:
-            return True
-        else:
-            return False
+        passthrough_ops = ('ResolveDynamicTable', 'ResolveDatasetAsDict', 'ResolveScalars')
+        for op in passthrough_ops:
+            if hasattr(src, 'applied') and op in src.applied:
+                return True
+        return False

     @classmethod
     def apply(cls, src: H5ReadResult, provider:SchemaProvider, completed: Dict[str, H5ReadResult]) -> H5ReadResult:
         return src


+class CompleteContainerGroups(HDF5Map):
+    """
+    Complete container groups (usually top-level groups like /acquisition)
+    that do not have a neurodata type of their own by resolving them as dictionaries
+    of values (that will then be given to their parent model)
+    """
+    phase = ReadPhases.construct
+    priority = 3
+
+    @classmethod
+    def check(cls, src: H5ReadResult, provider:SchemaProvider, completed: Dict[str, H5ReadResult]) -> bool:
+        if src.model is None and \
+                src.neurodata_type is None and \
+                src.source.h5_type == 'group' and \
+                all([depend in completed.keys() for depend in src.depends]):
+            return True
+        else:
+            return False
+
+    @classmethod
+    def apply(cls, src: H5ReadResult, provider:SchemaProvider, completed: Dict[str, H5ReadResult]) -> H5ReadResult:
+        res, errors, completes = resolve_references(src.result, completed)
+
+        return H5ReadResult(
+            result=res,
+            errors=errors,
+            completes=completes,
+            **src.model_dump(exclude={'result', 'errors', 'completes'})
+        )
+
+
 class CompleteModelGroups(HDF5Map):
     phase = ReadPhases.construct
-    priority = 2
+    priority = 4

     @classmethod
     def check(cls, src: H5ReadResult, provider:SchemaProvider, completed: Dict[str, H5ReadResult]) -> bool:
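
A note on how these maps get picked up (visible in the ReadQueue hunk further down): every subclass of `HDF5Map` is collected via `__subclasses__()`, filtered by phase, and applied in ascending `priority` order — so `CompletePassThrough` (1) runs before `CompleteContainerGroups` (3), which runs before `CompleteModelGroups` (4). A stripped-down sketch of that dispatch with placeholder classes, not the real map classes:

```python
class Map:
    phase: str
    priority: int

class PassThrough(Map):
    phase, priority = "construct", 1

class ContainerGroups(Map):
    phase, priority = "construct", 3

class ModelGroups(Map):
    phase, priority = "construct", 4

def maps_for(phase: str) -> list[type]:
    # subclass registration "for free": defining the class is enough to enroll it
    maps = [m for m in Map.__subclasses__() if m.phase == phase]
    return sorted(maps, key=lambda m: m.priority)

print([m.__name__ for m in maps_for("construct")])
# ['PassThrough', 'ContainerGroups', 'ModelGroups']
```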
@@ -442,31 +522,15 @@ class CompleteModelGroups(HDF5Map):
     @classmethod
     def apply(cls, src: H5ReadResult, provider:SchemaProvider, completed: Dict[str, H5ReadResult]) -> H5ReadResult:
         # gather any results that were left for completion elsewhere
+        # first get all already-completed items
         res = {k:v for k,v in src.result.items() if not isinstance(v, HDF5_Path)}
-        errors = []
-        completes = []
-        for path, item in src.result.items():
-            if isinstance(item, HDF5_Path):
-                other_item = completed.get(item, None)
-                if other_item is None:
-                    errors.append(f'Couldnt find {item}')
-                    continue
-                if isinstance(other_item.result, dict):
-                    # resolve any other children that it might have...
-                    # FIXME: refactor this lmao so bad
-                    for k,v in other_item.result.items():
-                        if isinstance(v, HDF5_Path):
-                            inner_result = completed.get(v, None)
-                            if inner_result is None:
-                                errors.append(f'Couldnt find inner item {v}')
-                                continue
-                            other_item.result[k] = inner_result.result
-                            completes.append(v)
-                    res[other_item.result['name']] = other_item.result
-                else:
-                    res[path] = other_item.result
-                completes.append(other_item.path)
+        unpacked_results, errors, completes = resolve_references(src.result, completed)
+        res.update(unpacked_results)
+
+        # # final cleanups
+        # for key, val in res.items():
+        #     # if we're supposed to be a list, but instead we're an array, fix that!
+        #

         #try:
         instance = src.model(**res)
@@ -523,11 +587,10 @@ class ReadQueue(BaseModel):
         default_factory=list,
         description="Phases that have already been completed")

-    def apply_phase(self, phase:ReadPhases):
+    def apply_phase(self, phase:ReadPhases, max_passes=5):
         phase_maps = [m for m in HDF5Map.__subclasses__() if m.phase == phase]
         phase_maps = sorted(phase_maps, key=lambda x: x.priority)

         results = []
         # TODO: Thread/multiprocess this
@@ -585,6 +648,8 @@ class ReadQueue(BaseModel):
             # if we're not in the last phase, move our completed to our queue
             self.queue = self.completed
             self.completed = {}
+        elif max_passes>0:
+            self.apply_phase(phase, max_passes=max_passes-1)
@@ -606,6 +671,7 @@ def flatten_hdf(h5f:h5py.File | h5py.Group, skip='specifications') -> Dict[str,
     def _itemize(name: str, obj: h5py.Dataset | h5py.Group):
         if skip in name:
             return
         leaf = isinstance(obj, h5py.Dataset) or len(obj.keys()) == 0
         if isinstance(obj, h5py.Dataset):
@@ -635,8 +701,8 @@ def flatten_hdf(h5f:h5py.File | h5py.Group, skip='specifications') -> Dict[str,
         )
     h5f.visititems(_itemize)

-    # # then add the root item
-    # _itemize(h5f.name, h5f)
+    # then add the root item
+    _itemize(h5f.name, h5f)

     return items
@@ -681,3 +747,45 @@ def get_references(obj: h5py.Dataset | h5py.Group) -> List[str]:
     else:
         depends = list(set([obj.get(i).name for i in refs]))
     return depends
+
+
+def resolve_references(src: dict, completed: Dict[str, H5ReadResult]) -> Tuple[dict, List[str], List[HDF5_Path]]:
+    """
+    Recursively replace references to other completed items with their results
+    """
+    completes = []
+    errors = []
+    res = {}
+    for path, item in src.items():
+        if isinstance(item, HDF5_Path):
+            other_item = completed.get(item, None)
+            if other_item is None:
+                errors.append(f"Couldnt find: {item}")
+            res[path] = other_item.result
+            completes.append(item)
+        elif isinstance(item, dict):
+            inner_res, inner_error, inner_completes = resolve_references(item, completed)
+            res[path] = inner_res
+            errors.extend(inner_error)
+            completes.extend(inner_completes)
+        else:
+            res[path] = item
+
+    return res, errors, completes
+
+
+def resolve_hardlink(obj: Union[h5py.Group, h5py.Dataset]) -> HDF5_Path:
+    """
+    Unhelpfully, hardlinks are pretty challenging to detect with h5py, so we have
+    to do extra work to check if an item is "real" or a hardlink to another item.
+
+    Particularly, an item will be excluded from the ``visititems`` method used by
+    :func:`.flatten_hdf` if it is a hardlink rather than an "original" dataset,
+    meaning that we don't even have them in our sources list when we start reading.
+
+    We basically dereference the object and return that path instead of the path
+    given by the object's ``name``
+    """
+    return HDF5_Path(obj.file[obj.ref].name)
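
To see why `resolve_hardlink` is needed at all, here is a small standalone demonstration of the h5py behavior the docstring describes: visiting is per-object, so a hard-linked alias never shows up as a separate item, and dereferencing through `obj.ref` recovers a resolvable path for the shared object. The file name and paths below are made up for the example:

```python
import h5py
import numpy as np

# build a file where /linked/alias is a hard link to /original/data
with h5py.File("hardlink_demo.h5", "w") as f:
    f.create_dataset("/original/data", data=np.arange(3))
    f.require_group("/linked")
    f["/linked/alias"] = f["/original/data"]   # hard link, not a copy

with h5py.File("hardlink_demo.h5", "r") as f:
    visited = []
    f.visititems(lambda name, obj: visited.append(name))
    # each object is visited once, so the shared dataset appears under
    # exactly one of its two paths
    print(visited)

    alias = f["/linked/alias"]
    # dereferencing through the object reference yields a path that reaches
    # the underlying object, which is what resolve_hardlink() stores
    print(f[alias.ref].name)
```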

View file

@@ -106,11 +106,11 @@ def dynamictable_to_model(
             try:
                 items[col] = da.from_array(group[col])
             except NotImplementedError:
-                if str in get_inner_types(col_type.annotation):
-                    # dask can't handle this, we just arrayproxy it
+                # if str in get_inner_types(col_type.annotation):
+                #     # dask can't handle this, we just arrayproxy it
                     items[col] = NDArrayProxy(h5f_file=group.file.filename, path=group[col].name)
-                else:
-                    warnings.warn(f"Dask cant handle object type arrays like {col} in {group.name}. Skipping")
+                #else:
+                #    warnings.warn(f"Dask cant handle object type arrays like {col} in {group.name}. Skipping")
                 # pdb.set_trace()
                 # # can't auto-chunk with "object" type
                 # items[col] = da.from_array(group[col], chunks=-1)
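
`NDArrayProxy` here is nwb_linkml's own class; the commit note about proxies "resolving dask or h5py reading on their own" amounts to deferred reading. A minimal sketch of that pattern (not the actual `NDArrayProxy` implementation, class name is hypothetical):

```python
from pathlib import Path
import h5py

class LazyH5Proxy:
    """Stand-in sketch of a proxy that defers reading until indexed."""
    def __init__(self, h5f_file: str | Path, path: str):
        self.h5f_file = Path(h5f_file)
        self.path = path

    def __getitem__(self, item):
        # open the file only when data is actually requested
        with h5py.File(self.h5f_file, "r") as h5f:
            return h5f[self.path][item]

    @property
    def shape(self) -> tuple:
        with h5py.File(self.h5f_file, "r") as h5f:
            return h5f[self.path].shape
```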

View file

@@ -468,11 +468,11 @@ class PydanticProvider(Provider):
         out_file: Optional[Path] = None,
         version: Optional[str] = None,
         versions: Optional[dict] = None,
-        split: bool = False,
+        split: bool = True,
         dump: bool = True,
         force: bool = False,
         **kwargs
-    ) -> str:
+    ) -> str | List[str]:
         """
         Notes:
@@ -528,12 +528,6 @@ class PydanticProvider(Provider):
             fn = module_case(fn) + '.py'
             out_file = self.path / name / version / fn

-        if out_file.exists() and not force:
-            with open(out_file, 'r') as ofile:
-                serialized = ofile.read()
-            return serialized
-
         default_kwargs = {
             'split': split,
             'emit_metadata': True,
@@ -541,6 +535,17 @@ class PydanticProvider(Provider):
             'pydantic_version': '2'
         }
         default_kwargs.update(kwargs)
+
+        if split:
+            return self._build_split(path, versions, default_kwargs, dump, out_file, force)
+        else:
+            return self._build_unsplit(path, versions, default_kwargs, dump, out_file, force)
+
+    def _build_unsplit(self, path, versions, default_kwargs, dump, out_file, force):
+        if out_file.exists() and not force:
+            with open(out_file, 'r') as ofile:
+                serialized = ofile.read()
+            return serialized

         generator = NWBPydanticGenerator(
             str(path),
@@ -552,19 +557,28 @@ class PydanticProvider(Provider):
             out_file.parent.mkdir(parents=True,exist_ok=True)
             with open(out_file, 'w') as ofile:
                 ofile.write(serialized)
-            with open(out_file.parent / '__init__.py', 'w') as initfile:
-                initfile.write(' ')
-            # make parent file, being a bit more careful because it could be for another module
+
+            # make initfiles for this directory and parent,
+            initfile = out_file.parent / '__init__.py'
             parent_init = out_file.parent.parent / '__init__.py'
-            if not parent_init.exists():
-                with open(parent_init, 'w') as initfile:
-                    initfile.write(' ')
+            for ifile in (initfile, parent_init):
+                if not ifile.exists():
+                    with open(ifile, 'w') as ifile_open:
+                        ifile_open.write(' ')

         return serialized

+    def _build_split(self, path:Path, versions, default_kwargs, dump, out_file, force) -> List[str]:
+        serialized = []
+        for schema_file in path.parent.glob('*.yaml'):
+            this_out = out_file.parent / (module_case(schema_file.stem) + '.py')
+            serialized.append(self._build_unsplit(schema_file, versions, default_kwargs, dump, this_out, force))
+
+        return serialized
+
     @classmethod
     def module_name(self, namespace:str, version: str) -> str:
-        name_pieces = ['nwb_linkml', 'models', 'pydantic', namespace, version_module_case(version), 'namespace']
+        name_pieces = ['nwb_linkml', 'models', 'pydantic', module_case(namespace), version_module_case(version)]
         module_name = '.'.join(name_pieces)
         return module_name

     def import_module(
@@ -594,6 +608,19 @@ class PydanticProvider(Provider):
         if not path.exists():
             raise ImportError(f'Module has not been built yet {path}')
         module_name = self.module_name(namespace, version)
+
+        # import module level first - when python does relative imports,
+        # it needs to have the parent modules imported separately
+        # this breaks split model creation when they are outside of the
+        # package repository (ie. loaded from an nwb file) because it tries
+        # to look for the containing namespace folder within the nwb_linkml package and fails
+        init_spec = importlib.util.spec_from_file_location(module_name, path.parent / '__init__.py')
+        init_module = importlib.util.module_from_spec(init_spec)
+        sys.modules[module_name] = init_module
+        init_spec.loader.exec_module(init_module)
+
+        # then the namespace package
+        module_name = module_name + '.namespace'
         spec = importlib.util.spec_from_file_location(module_name, path)
         module = importlib.util.module_from_spec(spec)
         sys.modules[module_name] = module
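
The import dance above — execute the package's `__init__.py` under the package name first, then load `namespace.py` as a submodule of it — is what lets the generated models' relative imports resolve even when they live outside the installed nwb_linkml tree. A simplified, self-contained version of the same idea; the helper name, directory layout, and `namespace.py` filename are assumptions for the example:

```python
import importlib.util
import sys
from pathlib import Path

def import_generated_package(pkg_name: str, pkg_dir: Path):
    """Load pkg_dir/__init__.py as ``pkg_name`` first so Python has a parent
    package registered, then load pkg_dir/namespace.py as ``pkg_name.namespace``
    so relative imports inside it can find their sibling modules."""
    init_spec = importlib.util.spec_from_file_location(pkg_name, pkg_dir / "__init__.py")
    package = importlib.util.module_from_spec(init_spec)
    sys.modules[pkg_name] = package
    init_spec.loader.exec_module(package)

    mod_name = pkg_name + ".namespace"
    spec = importlib.util.spec_from_file_location(mod_name, pkg_dir / "namespace.py")
    module = importlib.util.module_from_spec(spec)
    sys.modules[mod_name] = module
    spec.loader.exec_module(module)
    return module

# hypothetical usage, pointing at a directory of generated model files:
# core = import_generated_package("core", Path("/some/cache/dir/core/v2_6_0"))
```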
@@ -638,7 +665,7 @@ class PydanticProvider(Provider):
         if version is None:
             version = self.available_versions[namespace][-1]

-        module_name = self.module_name(namespace, version)
+        module_name = self.module_name(namespace, version) + '.namespace'
         if module_name in sys.modules:
             return sys.modules[module_name]
@@ -745,10 +772,10 @@ class SchemaProvider(Provider):
         linkml_provider = LinkMLProvider(path=self.path, verbose=verbose)
         pydantic_provider = PydanticProvider(path=self.path, verbose=verbose)

-        linkml_res = linkml_provider.build(ns_adapter=ns_adapter, **linkml_kwargs)
+        linkml_res = linkml_provider.build(ns_adapter=ns_adapter, versions=self.versions, **linkml_kwargs)
         results = {}
         for ns, ns_result in linkml_res.items():
-            results[ns] = pydantic_provider.build(ns_result['namespace'], **pydantic_kwargs)
+            results[ns] = pydantic_provider.build(ns_result['namespace'], versions=self.versions, **pydantic_kwargs)
         return results

     def get(self, namespace: str, version: Optional[str] = None) -> ModuleType: