mirror of https://github.com/p2p-ld/nwb-linkml.git
synced 2025-01-10 14:14:27 +00:00
nearing the end - need to do final top-level packing but we're almost there!
This commit is contained in:
parent a2da236b2b
commit 9fcc1458fb
7 changed files with 338 additions and 62 deletions
@@ -118,7 +118,7 @@ class DatasetAdapter(ClassAdapter):
             - null

         """
-        if self.cls.name and ((
+        if self.cls.name and len(self.cls.attributes) == 0 and ((
                 # single-layer list
                 not any([isinstance(dim, list) for dim in self.cls.dims]) and
                 len(self.cls.dims) == 1
@@ -209,27 +209,36 @@ class DatasetAdapter(ClassAdapter):
         dims_shape = tuple(dict.fromkeys(dims_shape).keys())

         # if we only have one possible dimension, it's equivalent to a list, so we just return the slot
-        if len(dims_shape) == 1 and self.parent:
-            quantity = QUANTITY_MAP[dataset.quantity]
-            slot = SlotDefinition(
-                name=dataset.name,
-                range=dtype,
-                description=dataset.doc,
-                required=quantity['required'],
-                multivalued=True
-            )
-            res.classes[0].attributes.update({dataset.name: slot})
-            self._handlers.append('arraylike-1d')
-            return res
+        # if len(dims_shape) == 1 and self.parent:
+        #     quantity = QUANTITY_MAP[dataset.quantity]
+        #     slot = SlotDefinition(
+        #         name=dataset.name,
+        #         range=dtype,
+        #         description=dataset.doc,
+        #         required=quantity['required'],
+        #         multivalued=True
+        #     )
+        #     res.classes[0].attributes.update({dataset.name: slot})
+        #     self._handlers.append('arraylike-1d')
+        #     return res

+        # --------------------------------------------------
+        # SPECIAL CASE - allen institute's ndx-aibs-ecephys.extension
+        # confuses "dims" with "shape" , eg shape = [None], dims = [3].
+        # So we hardcode that here...
+        # --------------------------------------------------
+        if len(dims_shape) == 1 and isinstance(dims_shape[0][0], int) and dims_shape[0][1] is None:
+            dims_shape = (('dim', dims_shape[0][0]),)
+
+
         # now make slots for each of them
         slots = []
         for dims, shape in dims_shape:
-            # if a dim is present in all possible combinations of dims, make it required
-            if all([dims in inner_dim for inner_dim in dataset.dims]):
+            # if there is just a single list of possible dimensions, it's required
+            if not any([isinstance(inner_dim, list) for inner_dim in dataset.dims]):
                 required = True
-            # or if there is just a single list of possible dimensions
-            elif not any([isinstance(inner_dim, list) for inner_dim in dataset.dims]):
+            # if a dim is present in all possible combinations of dims, make it required
+            elif all([dims in inner_dim for inner_dim in dataset.dims]):
                 required = True
             else:
                 required = False
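Note: the special case above only fires when the zipped (dims, shape) pairs look swapped, i.e. an integer where the dim name should be and None where the length should be. A minimal sketch of that normalization on a hypothetical input (not taken from the commit):

# hypothetical (dims, shape) pair as built by zipping the spec; the
# ndx-aibs-ecephys extension yields dims = [3], shape = [None], i.e. (3, None)
dims_shape = ((3, None),)
if len(dims_shape) == 1 and isinstance(dims_shape[0][0], int) and dims_shape[0][1] is None:
    # rewrite it as a generic 'dim' axis of length 3, as the hardcoded fix above does
    dims_shape = (('dim', dims_shape[0][0]),)
assert dims_shape == (('dim', 3),)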
@@ -217,6 +217,7 @@ class NWBPydanticGenerator(PydanticGenerator):
     SKIP_CLASSES=('',)
     INJECTED_FIELDS = (
         'hdf5_path: Optional[str] = Field(None, description="The absolute path that this object is stored in an NWB file")',
+        'object_id: Optional[str] = Field(None, description="Unique UUID for each object")'
     )
     # SKIP_CLASSES=('VectorData','VectorIndex')
     split:bool=True
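Note: INJECTED_FIELDS appear to be spliced into every pydantic class the generator emits, so each generated model carries its file location and UUID. Roughly what a generated class gains from the two entries above (class name hypothetical):

from typing import Optional
from pydantic import BaseModel, Field

class ExampleContainer(BaseModel):  # stand-in for any generated model
    name: str
    hdf5_path: Optional[str] = Field(None, description="The absolute path that this object is stored in an NWB file")
    object_id: Optional[str] = Field(None, description="Unique UUID for each object")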
@@ -37,6 +37,7 @@ from nwb_linkml.translate import generate_from_nwbfile
 if TYPE_CHECKING:
     from nwb_linkml.models import NWBFile
     from nwb_linkml.providers.schema import SchemaProvider
+    from nwb_linkml.types.hdf5 import HDF5_Path


 class HDF5IO():
@@ -58,8 +59,9 @@ class HDF5IO():
     def read(self, path:str) -> BaseModel | Dict[str, BaseModel]: ...

     def read(self, path:Optional[str] = None):
+        print('starting read')
         provider = self.make_provider()
+        print('provider made')
         h5f = h5py.File(str(self.path))
         if path:
             src = h5f.get(path)
@@ -71,7 +73,7 @@ class HDF5IO():
             children = flatten_hdf(src)
         else:
             raise NotImplementedError('directly read individual datasets')
+        print('hdf flattened')
         queue = ReadQueue(
             h5f=self.path,
             queue=children,
@@ -81,11 +83,30 @@ class HDF5IO():
         #pdb.set_trace()
         # Apply initial planning phase of reading
         queue.apply_phase(ReadPhases.plan)
+        print('phase - plan completed')
         # Now do read operations until we're finished
         queue.apply_phase(ReadPhases.read)

+        print('phase - read completed')
+
+        # if len(queue.queue)> 0:
+        #     warnings.warn('Did not complete all items during read phase!')
+
+
+        queue.apply_phase(ReadPhases.construct)
+        # --------------------------------------------------
+        # FIXME: Hardcoding top-level file reading just for the win
+        # --------------------------------------------------
+        root = finish_root_hackily(queue)
+
+        file = NWBFile(**root)
+
+
         pdb.set_trace()


         #
         #
         # data = {}
@@ -169,6 +190,22 @@ class HDF5IO():
         return list(data[:])


+def finish_root_hackily(queue: ReadQueue) -> dict:
+    root = {'name': 'root'}
+    for k, v in queue.queue.items():
+        if isinstance(v.result, dict):
+            res_dict = {}
+            for inner_k, inner_v in v.result.items():
+                if isinstance(inner_v, HDF5_Path):
+                    inner_res = queue.completed.get(inner_v)
+                    if inner_res is not None:
+                        res_dict[inner_k] = inner_res.result
+                else:
+                    res_dict[inner_k] = inner_v
+            root[res_dict['name']] = res_dict
+        else:
+            root[v.path.split('/')[-1]] = v.result
+    return root
+
 def read_specs_as_dicts(group: h5py.Group) -> dict:
     """
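Note: finish_root_hackily is the interim "top-level packing" the commit message refers to: whatever is still sitting in the queue after the construct phase gets folded into a single root dict, with HDF5_Path placeholders swapped for their already-completed results. A toy illustration of that resolution step using plain dicts (names and paths are hypothetical):

from nwb_linkml.types.hdf5 import HDF5_Path

# a result that finished during the read/construct phases, keyed by its hdf5 path
completed = {'/acquisition/test_series': {'name': 'test_series', 'data': [1, 2, 3]}}
# an unfinished result that still refers to its child by path
unfinished = {'name': 'acquisition', 'test_series': HDF5_Path('/acquisition/test_series')}

root = {'name': 'root'}
resolved = {k: (completed.get(v, v) if isinstance(v, HDF5_Path) else v)
            for k, v in unfinished.items()}
root[resolved['name']] = resolved
# root['acquisition']['test_series'] is now the completed dict rather than a path placeholder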
@@ -71,3 +71,21 @@ np_to_python = {
     **{n:float for n in (np.float16, np.float32, np.floating, np.float32, np.float64, np.single, np.double, np.float_)},
     **{n:str for n in (np.character, np.str_, np.string_, np.unicode_)}
 }
+
+allowed_precisions = {
+    'float': ['double'],
+    'int8': ['short', 'int', 'long', 'int16', 'int32', 'int64'],
+    'short': ['int', 'long'],
+    'int': ['long'],
+    'uint8': ['uint8', 'uint16', 'uint32', 'uint64'],
+    'uint16': ['uint16', 'uint32', 'uint64'],
+    'uint32': ['uint32', 'uint64'],
+    'float16': ['float16', 'float32', 'float64'],
+    'float32': ['float32', 'float64'],
+    'utf': ['ascii']
+}
+"""
+Following HDMF, it turns out that specifying precision actually specifies minimum precision
+https://github.com/hdmf-dev/hdmf/blob/ddc842b5c81d96e0b957b96e88533b16c137e206/src/hdmf/validate/validator.py#L22
+https://github.com/hdmf-dev/hdmf/blob/ddc842b5c81d96e0b957b96e88533b16c137e206/src/hdmf/spec/spec.py#L694-L714
+"""
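Note: allowed_precisions encodes HDMF's "minimum precision" rule quoted above: a dtype named in the spec also accepts the wider dtypes listed for it. A small sketch of how the table reads (satisfies_precision is a hypothetical helper, not part of the package):

import numpy as np
from nwb_linkml.maps.dtype import allowed_precisions

def satisfies_precision(spec_dtype: str, value_dtype: np.dtype) -> bool:
    # a value matches if it is exactly the spec dtype or one of its allowed wider dtypes
    return value_dtype.name == spec_dtype or value_dtype.name in allowed_precisions.get(spec_dtype, [])

assert satisfies_precision('float32', np.dtype('float64'))      # widening is allowed
assert not satisfies_precision('float32', np.dtype('float16'))  # narrowing is not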
@@ -4,18 +4,21 @@ Maps for reading and writing from HDF5
 We have sort of diverged from the initial idea of a generalized map as in :class:`linkml.map.Map` ,
 so we will make our own mapping class here and re-evaluate whether they should be unified later
 """
+import pdb
 from abc import ABC, abstractmethod
 from pathlib import Path
-from typing import Literal, List, Dict, Optional, Type
+from typing import Literal, List, Dict, Optional, Type, Union

 import h5py
 from enum import StrEnum

-from pydantic import BaseModel, Field, ConfigDict
+from pydantic import BaseModel, Field, ConfigDict, ValidationError
+import dask.array as da

 from nwb_linkml.providers.schema import SchemaProvider
 from nwb_linkml.maps.hdmf import dynamictable_to_model
 from nwb_linkml.types.hdf5 import HDF5_Path
+from nwb_linkml.types.ndarray import NDArrayProxy


 class ReadPhases(StrEnum):
@@ -55,7 +58,7 @@ class H5ReadResult(BaseModel):
     """Result returned by each of our mapping operations"""
     path: str
     """absolute hdf5 path of element"""
-    source: H5SourceItem
+    source: Union[H5SourceItem, 'H5ReadResult']
     """
     Source that this result is based on.
     The map can modify this item, so the container should update the source
@@ -66,7 +69,7 @@ class H5ReadResult(BaseModel):
     Was this item completed by this map step? False for cases where eg.
     we still have dependencies that need to be completed before this one
     """
-    result: Optional[BaseModel | dict | str | int | float] = None
+    result: Optional[dict | str | int | float | BaseModel] = None
     """
     If completed, built result. A dict that can be instantiated into the model.
     If completed is True and result is None, then remove this object
@@ -87,6 +90,14 @@ class H5ReadResult(BaseModel):
     """
     Optional: The neurodata type to use for this object
     """
+    applied: List[str] = Field(default_factory=list)
+    """
+    Which stages were applied to this item
+    """
+    errors: List[str] = Field(default_factory=list)
+    """
+    Problems that occurred during resolution
+    """


 FlatH5 = Dict[str, H5SourceItem]
@@ -133,6 +144,8 @@ class PruneEmpty(HDF5Map):
             completed=True
         )

+
+
 # class ResolveVectorData(HDF5Map):
 #     """
 #     We will load vanilla VectorData as part of :class:`.ResolveDynamicTable`
@@ -194,7 +207,8 @@ class ResolveDynamicTable(HDF5Map):
             source=src,
             result=model,
             completes=completes,
-            completed = True
+            completed = True,
+            applied=['ResolveDynamicTable']
         )

@@ -212,6 +226,8 @@ class ResolveModelGroup(HDF5Map):

     @classmethod
     def apply(cls, src: H5SourceItem, provider:SchemaProvider, completed: Dict[str, H5ReadResult]) -> H5ReadResult:
+
+
         model = provider.get_class(src.namespace, src.neurodata_type)
         res = {}
         with h5py.File(src.h5f_path, 'r') as h5f:
@@ -222,7 +238,12 @@ class ResolveModelGroup(HDF5Map):
                 continue
             if key in obj.keys():
                 # stash a reference to this, we'll compile it at the end
-                res[key] = HDF5_Path('/'.join([src.path, key]))
+                if src.path == '/':
+                    target_path = '/' + key
+                else:
+                    target_path = '/'.join([src.path, key])
+
+                res[key] = HDF5_Path(target_path)

         res['hdf5_path'] = src.path
         res['name'] = src.parts[-1]
@@ -233,25 +254,44 @@ class ResolveModelGroup(HDF5Map):
             result = res,
             model = model,
             namespace=src.namespace,
-            neurodata_type=src.neurodata_type
+            neurodata_type=src.neurodata_type,
+            applied=['ResolveModelGroup']
         )

+class ResolveDatasetAsDict(HDF5Map):
+    """Mutually exclusive with :class:`.ResolveScalars`"""
+    phase = ReadPhases.read
+    priority = 11
+    exclusive = True
+
+    @classmethod
+    def check(cls, src: H5SourceItem, provider:SchemaProvider, completed: Dict[str, H5ReadResult]) -> bool:
+        if src.h5_type == 'dataset' and 'neurodata_type' not in src.attrs:
+            with h5py.File(src.h5f_path, 'r') as h5f:
+                obj = h5f.get(src.path)
+                if obj.shape != ():
+                    return True
+                else: return False
+        else:
+            return False
+
+    @classmethod
+    def apply(cls, src: H5SourceItem, provider:SchemaProvider, completed: Dict[str, H5ReadResult]) -> H5ReadResult:
+
+        res = {
+            'array': NDArrayProxy(h5f_file=src.h5f_path, path=src.path),
+            'hdf5_path' : src.path,
+            'name': src.parts[-1],
+            **src.attrs
+        }
+        return H5ReadResult(
+            path = src.path,
+            source=src,
+            completed=True,
+            result=res,
+            applied=['ResolveDatasetAsDict']
+        )
+
-#
-# class ResolveModelDataset(HDF5Map):
-#     phase = ReadPhases.read
-#     priority = 10
-#     exclusive = True
-#
-#     @classmethod
-#     def check(cls, src: H5SourceItem, provider:SchemaProvider, completed: Dict[str, H5ReadResult]) -> bool:
-#         if 'neurodata_type' in src.attrs and src.h5_type == 'dataset':
-#             return True
-#         else:
-#             return False
-#
-#     def apply(cls, src: H5SourceItem, provider:SchemaProvider, completed: Dict[str, H5ReadResult]) -> H5ReadResult:
-#
 class ResolveScalars(HDF5Map):
     phase = ReadPhases.read
     priority = 11 #catchall
@@ -266,6 +306,8 @@ class ResolveScalars(HDF5Map):
                     return True
                 else:
                     return False
+        else:
+            return False
     @classmethod
     def apply(cls, src: H5SourceItem, provider:SchemaProvider, completed: Dict[str, H5ReadResult]) -> H5ReadResult:
         with h5py.File(src.h5f_path, 'r') as h5f:
@@ -275,9 +317,138 @@
             path=src.path,
             source = src,
             completed=True,
-            result = res
+            result = res,
+            applied=['ResolveScalars']
         )

+class ResolveContainerGroups(HDF5Map):
+    phase = ReadPhases.read
+    priority = 9
+
+    @classmethod
+    def check(cls, src: H5SourceItem, provider:SchemaProvider, completed: Dict[str, H5ReadResult]) -> bool:
+        if src.h5_type == 'group' and 'neurodata_type' not in src.attrs and len(src.attrs) == 0:
+            with h5py.File(src.h5f_path, 'r') as h5f:
+                obj = h5f.get(src.path)
+                if len(obj.keys()) > 0:
+                    return True
+                else:
+                    return False
+        else:
+            return False
+
+    @classmethod
+    def apply(cls, src: H5SourceItem, provider:SchemaProvider, completed: Dict[str, H5ReadResult]) -> H5ReadResult:
+        """Simple, just return a dict with references to its children"""
+        with h5py.File(src.h5f_path, 'r') as h5f:
+            obj = h5f.get(src.path)
+            children = {}
+            for k, v in obj.items():
+                children[k] = HDF5_Path(v.name)
+
+        res = {
+            'name': src.parts[-1],
+            **children
+        }
+
+        return H5ReadResult(
+            path=src.path,
+            source=src,
+            completed=True,
+            result=res,
+            applied=['ResolveContainerGroups']
+        )
+
+
+# --------------------------------------------------
+# Completion Steps
+# --------------------------------------------------
+
+class CompleteDynamicTables(HDF5Map):
+    """Nothing to do! already done!"""
+    phase = ReadPhases.construct
+    priority = 1
+    exclusive = True
+    @classmethod
+    def check(cls, src: H5ReadResult, provider:SchemaProvider, completed: Dict[str, H5ReadResult]) -> bool:
+        if 'ResolveDynamicTable' in src.applied:
+            return True
+        else:
+            return False
+
+    @classmethod
+    def apply(cls, src: H5ReadResult, provider:SchemaProvider, completed: Dict[str, H5ReadResult]) -> H5ReadResult:
+        return src
+
+class CompleteModelGroups(HDF5Map):
+    phase = ReadPhases.construct
+    priority = 2
+
+    @classmethod
+    def check(cls, src: H5ReadResult, provider:SchemaProvider, completed: Dict[str, H5ReadResult]) -> bool:
+        if src.model is not None:
+            return True
+        else:
+            return False
+
+    @classmethod
+    def apply(cls, src: H5ReadResult, provider:SchemaProvider, completed: Dict[str, H5ReadResult]) -> H5ReadResult:
+        # gather any results that were left for completion elsewhere
+        res = {k:v for k,v in src.result.items() if not isinstance(v, HDF5_Path)}
+        errors = []
+        completes = []
+        for path, item in src.result.items():
+            if isinstance(item, HDF5_Path):
+                other_item = completed.get(item, None)
+                if other_item is None:
+                    errors.append(f'Couldnt find {item}')
+                    continue
+                if isinstance(other_item.result, dict):
+                    # resolve any other children that it might have...
+                    # FIXME: refactor this lmao so bad
+                    for k,v in other_item.result.items():
+                        if isinstance(v, HDF5_Path):
+                            inner_result = completed.get(v, None)
+                            if inner_result is None:
+                                errors.append(f'Couldnt find inner item {v}')
+                                continue
+                            other_item.result[k] = inner_result.result
+                            completes.append(v)
+                    res[other_item.result['name']] = other_item.result
+                else:
+                    res[path] = other_item.result
+
+                completes.append(other_item.path)
+
+        #try:
+        instance = src.model(**res)
+        return H5ReadResult(
+            path=src.path,
+            source=src,
+            result=instance,
+            model=src.model,
+            completed=True,
+            completes=completes,
+            neurodata_type=src.neurodata_type,
+            namespace=src.namespace,
+            applied=src.applied + ['CompleteModelGroups'],
+            errors=errors
+        )
+        # except ValidationError:
+        #     # didn't get it! try again next time
+        #     return H5ReadResult(
+        #         path=src.path,
+        #         source=src,
+        #         result=src,
+        #         model=src.model,
+        #         completed=True,
+        #         completes=completes,
+        #         neurodata_type=src.neurodata_type,
+        #         namespace=src.namespace,
+        #         applied=src.applied + ['CompleteModelGroups']
+        #     )
@@ -291,7 +462,7 @@ class ReadQueue(BaseModel):
     provider: SchemaProvider = Field(
         description="SchemaProvider used by each of the items in the read queue"
     )
-    queue: Dict[str,H5SourceItem] = Field(
+    queue: Dict[str,H5SourceItem|H5ReadResult] = Field(
         default_factory=dict,
         description="Items left to be instantiated, keyed by hdf5 path",
     )
@@ -300,11 +471,16 @@ class ReadQueue(BaseModel):
         description="Items that have already been instantiated, keyed by hdf5 path"
     )
     model_config = ConfigDict(arbitrary_types_allowed=True)
+    phases_completed: List[ReadPhases] = Field(
+        default_factory=list,
+        description="Phases that have already been completed")

     def apply_phase(self, phase:ReadPhases):
         phase_maps = [m for m in HDF5Map.__subclasses__() if m.phase == phase]
         phase_maps = sorted(phase_maps, key=lambda x: x.priority)
+
+        # if we've moved to the
+
         results = []

         # TODO: Thread/multiprocess this
@@ -316,6 +492,7 @@ class ReadQueue(BaseModel):
                     break # out of inner iteration

         # remake the source queue and save results
+        completes = []
         for res in results:
             # remove the original item
             del self.queue[res.path]
@@ -327,16 +504,42 @@
                     # just drop it.

                 # if we have completed other things, delete them from the queue
-                for also_completed in res.completes:
-                    try:
-                        del self.queue[also_completed]
-                    except KeyError:
-                        # normal, we might have already deleted this in a previous step
-                        pass
+                completes.extend(res.completes)
+                # for also_completed in res.completes:
+                #     try:
+                #         del self.queue[also_completed]
+                #     except KeyError:
+                #         # normal, we might have already deleted this in a previous step
+                #         pass
             else:
                 # if we didn't complete the item (eg. we found we needed more dependencies),
                 # add the updated source to the queue again
+                if phase != ReadPhases.construct:
                     self.queue[res.path] = res.source
+                else:
+                    self.queue[res.path] = res
+
+        # delete the ones that were already completed but might have been
+        # incorrectly added back in the pile
+        for c in completes:
+            try:
+                del self.queue[c]
+            except KeyError:
+                pass
+
+        # if we have nothing left in our queue, we have completed this phase
+        # and prepare only ever has one pass
+        if phase == ReadPhases.plan:
+            self.phases_completed.append(phase)
+            return
+
+        if len(self.queue) == 0:
+            self.phases_completed.append(phase)
+            if phase != ReadPhases.construct:
+                # if we're not in the last phase, move our completed to our queue
+                self.queue = self.completed.copy()
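Note: apply_phase collects every HDF5Map subclass registered for the current phase and offers each queue item to them in ascending priority order, so lower-numbered maps like ResolveContainerGroups (priority 9) get a chance before the priority-11 catchalls ResolveDatasetAsDict and ResolveScalars. A sketch of that selection step (module path assumed to be nwb_linkml.maps.hdf5):

from nwb_linkml.maps.hdf5 import HDF5Map, ReadPhases

phase = ReadPhases.read
phase_maps = sorted(
    (m for m in HDF5Map.__subclasses__() if m.phase == phase),
    key=lambda m: m.priority,
)
# lowest priority number runs first when maps are offered an item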
@@ -367,8 +570,8 @@ def flatten_hdf(h5f:h5py.File | h5py.Group, skip='specifications') -> Dict[str,
         # get references in attrs and datasets to populate dependencies
         #depends = get_references(obj)

-        #if not name.startswith('/'):
-        #    name = '/' + name
+        if not name.startswith('/'):
+            name = '/' + name

         attrs = dict(obj.attrs.items())

@@ -384,6 +587,8 @@ def flatten_hdf(h5f:h5py.File | h5py.Group, skip='specifications') -> Dict[str,
         )

     h5f.visititems(_itemize)
+    # # then add the root item
+    # _itemize(h5f.name, h5f)
     return items

@@ -1,3 +1,5 @@
 from typing import Annotated

-HDF5_Path = Annotated[str, """Trivial subclass of string to indicate that it is a reference to a location within an HDF5 file"""]
+class HDF5_Path(str):
+    """Trivial subclass of string to indicate that it is a reference to a location within an HDF5 file"""
+    pass
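Note: turning HDF5_Path from an Annotated alias into a real str subclass is what lets the read maps above use isinstance checks on placeholder values (the Annotated form cannot be used with isinstance), while the value still behaves as an ordinary string. For example (path hypothetical):

from nwb_linkml.types.hdf5 import HDF5_Path

p = HDF5_Path('/acquisition/test_series')
assert isinstance(p, str)        # still usable anywhere a plain string is expected
assert isinstance(p, HDF5_Path)  # but distinguishable from an ordinary str
assert not isinstance('/acquisition/test_series', HDF5_Path)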
@@ -35,7 +35,8 @@ from nptyping.ndarray import NDArrayMeta
 from nptyping import Shape, Number
 from nptyping.shape_expression import check_shape

-from nwb_linkml.maps.dtype import np_to_python
+from nwb_linkml.maps.dtype import np_to_python, allowed_precisions
+

 class NDArray(_NDArray):
@@ -59,12 +60,14 @@ class NDArray(_NDArray):
         def validate_dtype(value: np.ndarray) -> np.ndarray:
             if dtype is Any:
                 return value
-            assert value.dtype == dtype, f"Invalid dtype! expected {dtype}, got {value.dtype}"
+            assert value.dtype == dtype or value.dtype.name in allowed_precisions[dtype.__name__], f"Invalid dtype! expected {dtype}, got {value.dtype}"
             return value
         def validate_array(value: Any) -> np.ndarray:
-            if isinstance(value, np.ndarray):
-                assert cls.__instancecheck__(value), f'Invalid shape! expected shape {shape.prepared_args}, got shape {value.shape}'
-            elif isinstance(value, DaskArray):
+            # not using instancecheck because nwb doesnt actually validate precision
+            # this step is now just validating shape
+            # if isinstance(value, np.ndarray):
+            #     assert cls.__instancecheck__(value), f'Invalid shape! expected shape {shape.prepared_args}, got shape {value.shape}'
+            # elif isinstance(value, DaskArray):
             assert shape is Any or check_shape(value.shape, shape), f'Invalid shape! expected shape {shape.prepared_args}, got shape {value.shape}'

             return value
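Note: with the relaxed assert above, an NDArray annotated with one dtype now also accepts data stored at any of the wider precisions listed in allowed_precisions. The same check outside the validator, on a hypothetical array:

import numpy as np
from nwb_linkml.maps.dtype import allowed_precisions

dtype = np.float32                          # dtype declared on the NDArray annotation
value = np.zeros((2, 3), dtype=np.float64)  # data stored at a higher precision
assert value.dtype == dtype or value.dtype.name in allowed_precisions[dtype.__name__]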
@@ -146,7 +149,8 @@ class NDArray(_NDArray):
                 core_schema.no_info_plain_validator_function(coerce_list),
                 core_schema.union_schema([
                     core_schema.is_instance_schema(cls=np.ndarray),
-                    core_schema.is_instance_schema(cls=DaskArray)
+                    core_schema.is_instance_schema(cls=DaskArray),
+                    core_schema.is_instance_schema(cls=NDArrayProxy)
                 ]),
                 core_schema.no_info_plain_validator_function(validate_dtype),
                 core_schema.no_info_plain_validator_function(validate_array)