handle compound dtype

sneakers-the-rat 2024-07-03 20:39:49 -07:00
parent c9a2423a9d
commit ec81032ae8
Signed by untrusted user who does not match committer: jonny
GPG key ID: 6DCB96EF1E4D232D
9 changed files with 164 additions and 125 deletions

View file

@@ -1,5 +1,13 @@
# TODO
## v0.2 - update to linkml-arrays and formal release
NWB schema translation
- handle `links` field in groups
- handle compound `dtype` like in ophys.PlaneSegmentation.pixel_mask
- handle compound `dtype` like in TimeSeriesReferenceVectorData
- Create a validator that checks that all the lists in a compound dtype dataset are the same length (see the sketch below)
Important things that are not implemented yet!
- {meth}`nwb_linkml.adapters.classes.ClassAdapter.handle_dtype` does not yet handle compound dtypes,
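
That validator might look something like this pydantic v2 sketch (illustrative only, not part of this commit; the model and field names are borrowed from the `TimeSeriesReferenceVectorData` example later in this diff):

```python
from pydantic import BaseModel, model_validator

class TimeSeriesReferenceVectorData(BaseModel):
    # hypothetical columns of a compound-dtype dataset
    idx_start: list[int]
    count: list[int]

    @model_validator(mode="after")
    def columns_same_length(self) -> "TimeSeriesReferenceVectorData":
        # every list in a compound dtype is one column of the same table,
        # so they must all have the same number of rows
        lengths = {len(self.idx_start), len(self.count)}
        if len(lengths) > 1:
            raise ValueError(f"Compound dtype columns differ in length: {lengths}")
        return self
```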

View file

@@ -5,7 +5,7 @@
groups = ["default", "dev", "tests"]
strategy = ["cross_platform", "inherit_metadata"]
lock_version = "4.4.2"
content_hash = "sha256:8cb98f940354e71443df87fd1702300e1c793627b0a016a6233811a640f47c18"
content_hash = "sha256:6819a20ed9759b784908ad417ee371b941cfb1d9fbeebc8b29f25038f49ea544"
[[package]]
name = "annotated-types"
@@ -769,18 +769,21 @@ files = [
[[package]]
name = "linkml"
version = "1.7.10"
version = "0.0.0"
requires_python = "<4.0.0,>=3.8.1"
git = "https://github.com/sneakers-the-rat/linkml"
ref = "arrays-numpydantic"
revision = "a3cfb2b82d7519cf9c64d113250c1714db2b3f6e"
summary = "Linked Open Data Modeling Language"
groups = ["default", "dev"]
dependencies = [
"antlr4-python3-runtime<4.10,>=4.9.0",
"antlr4-python3-runtime<4.10,==4.*,>=4.9.0",
"click>=7.0",
"graphviz>=0.10.1",
"hbreader",
"isodate>=0.6.0",
"jinja2>=3.1.0",
"jsonasobj2<2.0.0,>=1.0.3",
"jsonasobj2==1.*,>=1.0.0,>=1.0.3",
"jsonschema[format]>=4.0.0",
"linkml-dataops",
"linkml-runtime>=1.7.4",
@@ -799,10 +802,6 @@ dependencies = [
"sqlalchemy>=1.4.31",
"watchdog>=0.9.0",
]
files = [
{file = "linkml-1.7.10-py3-none-any.whl", hash = "sha256:bf21cce814e9d1509489f1e6e15a7e86e4f11d949490d9a7a5c3f6b5b412ec62"},
{file = "linkml-1.7.10.tar.gz", hash = "sha256:1c38601c3cd495e34490b8cf7277fd3674ec68dcbe9f5efcec2658093801ce91"},
]
[[package]]
name = "linkml-dataops"
@@ -1085,6 +1084,22 @@ files = [
{file = "numpy-1.26.4.tar.gz", hash = "sha256:2a02aba9ed12e4ac4eb3ea9421c420301a0c6460d9830d74a9df87efa4912010"},
]
[[package]]
name = "numpydantic"
version = "1.2.1"
requires_python = "<4.0,>=3.9"
summary = "Type and shape validation and serialization for numpy arrays in pydantic models"
groups = ["default", "dev"]
dependencies = [
"nptyping>=2.5.0",
"numpy>=1.24.0",
"pydantic>=2.3.0",
]
files = [
{file = "numpydantic-1.2.1-py3-none-any.whl", hash = "sha256:e21d7e272410b3a2013d2a6aeec2ed6efd13ea171b0200e2029d7c2f1453def0"},
{file = "numpydantic-1.2.1.tar.gz", hash = "sha256:d8a3e7371d78b99fa4a4733a5b873046f064993431ae63f97edcf9bda4dd5c7f"},
]
[[package]]
name = "nwb-schema-language"
version = "0.1.3"

View file

@@ -23,6 +23,7 @@ dependencies = [
"blosc2>=2.2.7",
"tqdm>=4.66.1",
'typing-extensions>=4.12.2;python_version<"3.11"',
"numpydantic>=1.2.1",
]
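
numpydantic, newly added above, provides typed, shape-checked numpy arrays as pydantic fields (per its summary in the lockfile). A minimal sketch of its use, adapted from the library's documented `NDArray`/`Shape` API rather than from this diff:

```python
import numpy as np
from pydantic import BaseModel
from numpydantic import NDArray, Shape

class Image(BaseModel):
    # any width/height, but must be 2-D uint8; validated by pydantic
    array: NDArray[Shape["* x, * y"], np.uint8]

img = Image(array=np.zeros((128, 128), dtype=np.uint8))
```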
[project.urls]

View file

@@ -123,6 +123,7 @@ class ClassAdapter(Adapter):
            name_parts.append(self.cls.name)
            name = "__".join(name_parts)
        elif self.cls.neurodata_type_inc is not None:
            # again, this is against the schema, but is common
            name = self.cls.neurodata_type_inc
@@ -206,12 +207,20 @@ class ClassAdapter(Adapter):
        Returns:
        """
        if self.cls.name:
        if self.cls.name or self.cls.default_name:
            if self.cls.name:
                # name overrides default_name
                name = self.cls.name
                equals_string = name
            else:
                name = self.cls.default_name
                equals_string = None

            name_slot = SlotDefinition(
                name="name",
                required=True,
                ifabsent=f"string({self.cls.name})",
                equals_string=self.cls.name,
                ifabsent=f"string({name})",
                equals_string=equals_string,
                range="string",
                identifier=True,
            )
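
Illustrative results of the two branches above (assumed values; `start_time` and `timeseries` are names used elsewhere in this diff):

```python
from linkml_runtime.linkml_model.meta import SlotDefinition

# a dataset with a fixed name: the slot is pre-filled AND constrained
SlotDefinition(
    name="name", required=True, range="string", identifier=True,
    ifabsent="string(start_time)", equals_string="start_time",
)

# a dataset with only a default_name: pre-filled, but overridable
SlotDefinition(
    name="name", required=True, range="string", identifier=True,
    ifabsent="string(timeseries)", equals_string=None,
)
```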

View file

@@ -1,9 +1,9 @@
"""
Adapter for NWB datasets to linkml Classes
"""

import pdb
from abc import abstractmethod
from typing import Optional
from typing import Optional, Type

from linkml_runtime.linkml_model.meta import (
    ClassDefinition,
@@ -16,7 +16,7 @@ from nwb_linkml.adapters.classes import ClassAdapter
from nwb_linkml.maps import QUANTITY_MAP, Map
from nwb_linkml.maps.dtype import flat_to_linkml
from nwb_linkml.maps.naming import camel_to_snake
from nwb_schema_language import Dataset
from nwb_schema_language import Dataset, CompoundDtype
class DatasetMap(Map):

@@ -113,6 +113,7 @@ class MapScalar(DatasetMap):
            and not cls.attributes
            and not cls.dims
            and not cls.shape
            and not is_compound(cls)
            and cls.name
        )
@@ -228,7 +229,7 @@ class MapArraylike(DatasetMap):
        """
        Check if we're a plain array
        """
        return cls.name and all([cls.dims, cls.shape]) and not has_attrs(cls)
        return cls.name and all([cls.dims, cls.shape]) and not has_attrs(cls) and not is_compound(cls)

    @classmethod
    def apply(
@@ -259,15 +260,6 @@ class MapArrayLikeAttributes(DatasetMap):
    """
    The most general case - treat everything that isn't handled by one of the special cases
    as an array!

    Specifically, we make an ``Arraylike`` class such that:
    - Each slot within a subclass indicates a possible dimension.
    - Only dimensions that are present in all the dimension specifiers in the
      original schema are required.
    - Shape requirements are indicated using max/min cardinalities on the slot.
    - The arraylike object should be stored in the `array` slot on the containing class
      (since there are already properties named `data`)
    """

    NEEDS_NAME = True

@@ -282,6 +274,7 @@
            all([cls.dims, cls.shape])
            and cls.neurodata_type_inc != "VectorData"
            and has_attrs(cls)
            and not is_compound(cls)
            and (dtype == "AnyType" or dtype in flat_to_linkml)
        )
@@ -311,6 +304,22 @@ class Map1DVector(DatasetMap):
    """
    ``VectorData`` is subclassed with a name but without dims or attributes,
    treat this as a normal 1D array slot that replaces any class that would be built for this,
    e.g. all the datasets in epoch.TimeIntervals:

    .. code-block:: yaml

        groups:
        - neurodata_type_def: TimeIntervals
          neurodata_type_inc: DynamicTable
          doc: A container for aggregating epoch data and the TimeSeries that each epoch applies
            to.
          datasets:
          - name: start_time
            neurodata_type_inc: VectorData
            dtype: float32
            doc: Start time of epoch, in seconds.
    """
    @classmethod
@@ -323,6 +332,8 @@ class Map1DVector(DatasetMap):
            and not cls.dims
            and not cls.shape
            and not cls.attributes
            and not cls.neurodata_type_def
            and not is_compound(cls)
            and cls.name
        )
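
For reference, the `start_time` dataset in the docstring above would collapse to roughly this slot on the containing class (an assumed illustration, not output from the commit; the real builder may set more fields):

```python
from linkml_runtime.linkml_model.meta import SlotDefinition

SlotDefinition(
    name="start_time",
    description="Start time of epoch, in seconds.",
    range="float32",   # assuming the flat dtype maps straight through
    multivalued=True,  # a 1D vector rather than a scalar
)
```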
@@ -381,6 +392,72 @@ class MapNVectors(DatasetMap):
        res = BuildResult(slots=[this_slot])
        return res


class MapCompoundDtype(DatasetMap):
    """
    A ``dtype`` declared as an array of types that function effectively as a row in a table.

    We render them just as a class with each of the dtypes as slots - they are
    typically used by other datasets to create a table.

    E.g. ``base.TimeSeriesReferenceVectorData``:

    .. code-block:: yaml

        datasets:
        - neurodata_type_def: TimeSeriesReferenceVectorData
          neurodata_type_inc: VectorData
          default_name: timeseries
          dtype:
          - name: idx_start
            dtype: int32
            doc: Start index into the TimeSeries 'data' and 'timestamp' datasets of the referenced
              TimeSeries. The first dimension of those arrays is always time.
          - name: count
            dtype: int32
            doc: Number of data samples available in this time series, during this epoch
          - name: timeseries
            dtype:
              target_type: TimeSeries
              reftype: object
            doc: The TimeSeries that this index applies to
          doc: Column storing references to a TimeSeries (rows). For each TimeSeries this
            VectorData column stores the start_index and count to indicate the range in time
            to be selected as well as an object reference to the TimeSeries.
    """

    @classmethod
    def check(c, cls: Dataset) -> bool:
        """
        Check that we're a dataset with a compound dtype
        """
        return is_compound(cls)

    @classmethod
    def apply(
        c, cls: Dataset, res: Optional[BuildResult] = None, name: Optional[str] = None
    ) -> BuildResult:
        """
        Make a new class for this dtype, using its sub-dtypes as fields,
        and use it as the range for the parent class
        """
        slots = {}
        for a_dtype in cls.dtype:
            slots[a_dtype.name] = SlotDefinition(
                name=a_dtype.name,
                description=a_dtype.doc,
                range=ClassAdapter.handle_dtype(a_dtype.dtype),
                **QUANTITY_MAP[cls.quantity],
            )
        res.classes[0].attributes.update(slots)
        return res
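
To make the mapping concrete, here is roughly what `apply` would build for the `TimeSeriesReferenceVectorData` example above (an illustrative sketch, not output from the commit; the exact ranges depend on what `ClassAdapter.handle_dtype` and `QUANTITY_MAP` return):

```python
from linkml_runtime.linkml_model.meta import SlotDefinition

slots = {
    "idx_start": SlotDefinition(
        name="idx_start",
        description="Start index into the TimeSeries 'data' and 'timestamp' datasets...",
        range="int32",  # assuming handle_dtype passes flat dtypes through
    ),
    "count": SlotDefinition(
        name="count",
        description="Number of data samples available in this time series, during this epoch",
        range="int32",
    ),
    "timeseries": SlotDefinition(
        name="timeseries",
        description="The TimeSeries that this index applies to",
        range="TimeSeries",  # assuming object references resolve to their target_type
    ),
}
# these slots are then merged into the class built by build_base()
```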
class DatasetAdapter(ClassAdapter):
    """

@@ -395,6 +472,25 @@ class DatasetAdapter(ClassAdapter):
        """
        res = self.build_base()

        # find a map to use
        map = self.match()

        # apply matching maps
        if map is not None:
            res = map.apply(self.cls, res, self._get_full_name())

        return res

    def match(self) -> Optional[Type[DatasetMap]]:
        """
        Find the map class that applies to this class

        Returns:
            :class:`.DatasetMap`

        Raises:
            RuntimeError - if more than one map matches
        """
        # find a map to use
        matches = [m for m in DatasetMap.__subclasses__() if m.check(self.cls)]
@@ -403,91 +499,10 @@
        if len(matches) > 1:
            raise RuntimeError(
                "Only one map should apply to a dataset, you need to refactor the maps! Got maps:"
                f" {matches}"
            )
        elif len(matches) == 0:
            return None
        else:
            return matches[0]

        # apply matching maps
        for m in matches:
            res = m.apply(self.cls, res, self._get_full_name())

        return res


def make_array_range(cls: Dataset, name: Optional[str] = None) -> ClassDefinition:
    """
    Create a containing arraylike class

    This is likely deprecated so this docstring is a placeholder to satisfy the linter...
    """
    # The schema language doesn't have a way of specifying a dataset/group is "abstract"
    # and yet hdmf-common says you don't need a dtype if the dataset is "abstract"
    # so....
    dtype = ClassAdapter.handle_dtype(cls.dtype)

    # dims and shape are lists of lists. First we couple them
    # (so each dim has its corresponding shape)..
    # and then we take unique
    # (dicts are ordered by default in recent pythons,
    # while set() doesn't preserve order)
    dims_shape = []
    for inner_dim, inner_shape in zip(cls.dims, cls.shape):
        if isinstance(inner_dim, list):
            # list of lists
            dims_shape.extend([(dim, shape) for dim, shape in zip(inner_dim, inner_shape)])
        elif isinstance(inner_shape, list):
            # Some badly formatted schema will have the shape be a LoL but the dims won't be...
            dims_shape.extend([(inner_dim, shape) for shape in inner_shape])
        else:
            # single-layer list
            dims_shape.append((inner_dim, inner_shape))

    dims_shape = tuple(dict.fromkeys(dims_shape).keys())

    # --------------------------------------------------
    # SPECIAL CASE - allen institute's ndx-aibs-ecephys.extension
    # confuses "dims" with "shape", e.g. shape = [None], dims = [3].
    # So we hardcode that here...
    # --------------------------------------------------
    if len(dims_shape) == 1 and isinstance(dims_shape[0][0], int) and dims_shape[0][1] is None:
        dims_shape = (("dim", dims_shape[0][0]),)

    # now make slots for each of them
    slots = []
    for dims, shape in dims_shape:
        # if there is just a single list of possible dimensions, it's required
        if not any([isinstance(inner_dim, list) for inner_dim in cls.dims]) or all(
            [dims in inner_dim for inner_dim in cls.dims]
        ):
            required = True
        else:
            required = False

        # use cardinality to do shape
        cardinality = None if shape == "null" else shape
        slots.append(
            SlotDefinition(
                name=dims,
                required=required,
                maximum_cardinality=cardinality,
                minimum_cardinality=cardinality,
                range=dtype,
            )
        )

    # and then the class is just a subclass of `Arraylike`
    # (which is imported by default from `nwb.language.yaml`)
    if name:
        pass
    elif cls.neurodata_type_def:
        name = cls.neurodata_type_def
    elif cls.name:
        name = cls.name
    else:
        raise ValueError("Dataset has no name or type definition, what do we call it?")

    name = "__".join([name, "Arraylike"])
    array_class = ClassDefinition(name=name, is_a="Arraylike", attributes=slots)
    return array_class
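
A hypothetical use of the new `match` dispatch; the `Dataset`/`CompoundDtype` constructor fields and the `cls=` keyword are assumptions based on the surrounding code, not verified against the commit:

```python
from nwb_linkml.adapters.dataset import DatasetAdapter, MapCompoundDtype
from nwb_schema_language import Dataset, CompoundDtype

dset = Dataset(
    neurodata_type_def="TimeSeriesReferenceVectorData",
    neurodata_type_inc="VectorData",
    doc="Column storing references to a TimeSeries",
    dtype=[
        CompoundDtype(name="idx_start", dtype="int32", doc="start index"),
        CompoundDtype(name="count", dtype="int32", doc="number of samples"),
    ],
)

# exactly one DatasetMap subclass should claim the dataset
assert DatasetAdapter(cls=dset).match() is MapCompoundDtype
```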
def is_1d(cls: Dataset) -> bool:

@@ -502,6 +517,8 @@ def is_1d(cls: Dataset) -> bool:
        and len(cls.dims[0]) == 1
    )


def is_compound(cls: Dataset) -> bool:
    """Check whether a dataset declares a compound (row-like) dtype"""
    return (
        isinstance(cls.dtype, list)
        and len(cls.dtype) > 0
        and isinstance(cls.dtype[0], CompoundDtype)
    )
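
A quick illustration of the predicate (a sketch; the constructor fields are assumed from their use elsewhere in this diff):

```python
from nwb_schema_language import Dataset, CompoundDtype

plain = Dataset(name="start_time", doc="Start time of epoch", dtype="float32")
compound = Dataset(
    name="timeseries",
    doc="Column storing references to a TimeSeries",
    dtype=[CompoundDtype(name="idx_start", dtype="int32", doc="start index")],
)

assert not is_compound(plain)  # flat dtype: handled by the other maps
assert is_compound(compound)   # list of CompoundDtype: MapCompoundDtype applies
```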
def has_attrs(cls: Dataset) -> bool:
"""

View file

@@ -34,20 +34,6 @@ for nwbtype, linkmltype in flat_to_linkml.items():
    atype = TypeDefinition(name=nwbtype, minimum_value=amin, typeof=linkmltype)
    DTypeTypes.append(atype)

Arraylike = ClassDefinition(
    name="Arraylike",
    description=(
        "Container for arraylike information held in the dims, shape, and dtype properties."
        "this is a special case to be interpreted by downstream i/o. this class has no slots"
        "and is abstract by default."
        "- Each slot within a subclass indicates a possible dimension."
        "- Only dimensions that are present in all the dimension specifiers in the"
        " original schema are required."
        "- Shape requirements are indicated using max/min cardinalities on the slot."
    ),
    abstract=True,
)

AnyType = ClassDefinition(
    name="AnyType",
    class_uri="linkml:Any",

@@ -60,7 +46,7 @@ NwbLangSchema = SchemaDefinition(
    id="nwb.language",
    description="Adapter objects to mimic the behavior of elements in the nwb-schema-language",
    enums=[FlatDType],
    classes=[Arraylike, AnyType],
    classes=[AnyType],
    types=DTypeTypes,
    imports=["linkml:types"],
    prefixes={"linkml": Prefix("linkml", "https://w3id.org/linkml")},

View file

@@ -49,7 +49,7 @@ NWB_CORE_REPO = NamespaceRepo(
    name="core",
    repository="https://github.com/NeurodataWithoutBorders/nwb-schema",
    path=Path("core/nwb.namespace.yaml"),
    versions=["2.2.0", "2.2.1", "2.2.2", "2.2.4", "2.2.5", "2.3.0", "2.4.0", "2.5.0", "2.6.0"],
    versions=["2.2.0", "2.2.1", "2.2.2", "2.2.4", "2.2.5", "2.3.0", "2.4.0", "2.5.0", "2.6.0", "2.7.0"],
)
HDMF_COMMON_REPO = NamespaceRepo(

View file

@@ -67,7 +67,7 @@ def tmp_output_dir_mod(tmp_output_dir) -> Path:
    return subpath


@pytest.fixture(scope="session", params=[{"core_version": "2.6.0", "hdmf_version": "1.5.0"}])
@pytest.fixture(scope="session", params=[{"core_version": "2.7.0", "hdmf_version": "1.8.0"}])
def nwb_core_fixture(request) -> NamespacesAdapter:
    nwb_core = io.load_nwb_core(**request.param)
    nwb_core.populate_imports()

View file

@@ -1,6 +1,9 @@
from nwb_linkml.adapters.dataset import (
    MapScalar,
    DatasetAdapter,
)
from nwb_linkml.adapters import NamespacesAdapter
from nwb_schema_language import Dataset