handle compound dtype

This commit is contained in:
sneakers-the-rat 2024-07-03 20:39:49 -07:00
parent c9a2423a9d
commit ec81032ae8
Signed by untrusted user who does not match committer: jonny
GPG key ID: 6DCB96EF1E4D232D
9 changed files with 164 additions and 125 deletions

View file

@ -1,5 +1,13 @@
# TODO
## v0.2 - update to linkml-arrays and formal release
NWB schema translation
- handle `links` field in groups
- handle compound `dtype` like in ophys.PlaneSegmentation.pixel_mask
- handle compound `dtype` like in TimeSeriesReferenceVectorData
- Create a validator that checks that all the lists in a compound dtype dataset are the same length (see the sketch below)
Important things that are not implemented yet!
- {meth}`nwb_linkml.adapters.classes.ClassAdapter.handle_dtype` does not yet handle compound dtypes,
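A rough sketch of the validator mentioned in the TODO above (hypothetical model and column names, assuming the generated pydantic v2 models expose compound-dtype columns as list-valued fields):

from typing import List

from pydantic import BaseModel, model_validator


class CompoundRowExample(BaseModel):
    # hypothetical columns generated for a compound dtype dataset
    idx_start: List[int]
    count: List[int]

    @model_validator(mode="after")
    def columns_same_length(self) -> "CompoundRowExample":
        # every column of a compound dtype dataset must have the same number of rows
        if len(self.idx_start) != len(self.count):
            raise ValueError("compound dtype columns must all be the same length")
        return self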

View file

@ -5,7 +5,7 @@
groups = ["default", "dev", "tests"] groups = ["default", "dev", "tests"]
strategy = ["cross_platform", "inherit_metadata"] strategy = ["cross_platform", "inherit_metadata"]
lock_version = "4.4.2" lock_version = "4.4.2"
content_hash = "sha256:8cb98f940354e71443df87fd1702300e1c793627b0a016a6233811a640f47c18" content_hash = "sha256:6819a20ed9759b784908ad417ee371b941cfb1d9fbeebc8b29f25038f49ea544"
[[package]] [[package]]
name = "annotated-types" name = "annotated-types"
@ -769,18 +769,21 @@ files = [
[[package]]
name = "linkml"
-version = "1.7.10"
+version = "0.0.0"
requires_python = "<4.0.0,>=3.8.1"
git = "https://github.com/sneakers-the-rat/linkml"
ref = "arrays-numpydantic"
revision = "a3cfb2b82d7519cf9c64d113250c1714db2b3f6e"
summary = "Linked Open Data Modeling Language" summary = "Linked Open Data Modeling Language"
groups = ["default", "dev"] groups = ["default", "dev"]
dependencies = [ dependencies = [
"antlr4-python3-runtime<4.10,>=4.9.0", "antlr4-python3-runtime<4.10,==4.*,>=4.9.0",
"click>=7.0", "click>=7.0",
"graphviz>=0.10.1", "graphviz>=0.10.1",
"hbreader", "hbreader",
"isodate>=0.6.0", "isodate>=0.6.0",
"jinja2>=3.1.0", "jinja2>=3.1.0",
"jsonasobj2<2.0.0,>=1.0.3", "jsonasobj2==1.*,>=1.0.0,>=1.0.3",
"jsonschema[format]>=4.0.0", "jsonschema[format]>=4.0.0",
"linkml-dataops", "linkml-dataops",
"linkml-runtime>=1.7.4", "linkml-runtime>=1.7.4",
@ -799,10 +802,6 @@ dependencies = [
"sqlalchemy>=1.4.31", "sqlalchemy>=1.4.31",
"watchdog>=0.9.0", "watchdog>=0.9.0",
] ]
files = [
{file = "linkml-1.7.10-py3-none-any.whl", hash = "sha256:bf21cce814e9d1509489f1e6e15a7e86e4f11d949490d9a7a5c3f6b5b412ec62"},
{file = "linkml-1.7.10.tar.gz", hash = "sha256:1c38601c3cd495e34490b8cf7277fd3674ec68dcbe9f5efcec2658093801ce91"},
]
[[package]]
name = "linkml-dataops"
@ -1085,6 +1084,22 @@ files = [
{file = "numpy-1.26.4.tar.gz", hash = "sha256:2a02aba9ed12e4ac4eb3ea9421c420301a0c6460d9830d74a9df87efa4912010"}, {file = "numpy-1.26.4.tar.gz", hash = "sha256:2a02aba9ed12e4ac4eb3ea9421c420301a0c6460d9830d74a9df87efa4912010"},
] ]
[[package]]
name = "numpydantic"
version = "1.2.1"
requires_python = "<4.0,>=3.9"
summary = "Type and shape validation and serialization for numpy arrays in pydantic models"
groups = ["default", "dev"]
dependencies = [
"nptyping>=2.5.0",
"numpy>=1.24.0",
"pydantic>=2.3.0",
]
files = [
{file = "numpydantic-1.2.1-py3-none-any.whl", hash = "sha256:e21d7e272410b3a2013d2a6aeec2ed6efd13ea171b0200e2029d7c2f1453def0"},
{file = "numpydantic-1.2.1.tar.gz", hash = "sha256:d8a3e7371d78b99fa4a4733a5b873046f064993431ae63f97edcf9bda4dd5c7f"},
]
[[package]]
name = "nwb-schema-language"
version = "0.1.3"

View file

@ -23,6 +23,7 @@ dependencies = [
"blosc2>=2.2.7", "blosc2>=2.2.7",
"tqdm>=4.66.1", "tqdm>=4.66.1",
'typing-extensions>=4.12.2;python_version<"3.11"', 'typing-extensions>=4.12.2;python_version<"3.11"',
"numpydantic>=1.2.1",
]
[project.urls]

View file

@ -123,6 +123,7 @@ class ClassAdapter(Adapter):
name_parts.append(self.cls.name)
name = "__".join(name_parts)
elif self.cls.neurodata_type_inc is not None:
# again, this is against the schema, but is common
name = self.cls.neurodata_type_inc
@ -206,12 +207,20 @@ class ClassAdapter(Adapter):
Returns:
"""
if self.cls.name or self.cls.default_name:
if self.cls.name:
# name overrides default_name
name = self.cls.name
equals_string = name
else:
name = self.cls.default_name
equals_string = None
name_slot = SlotDefinition(
name="name",
required=True,
-ifabsent=f"string({self.cls.name})",
-equals_string=self.cls.name,
+ifabsent=f"string({name})",
+equals_string=equals_string,
range="string",
identifier=True,
)

View file

@ -1,9 +1,9 @@
""" """
Adapter for NWB datasets to linkml Classes Adapter for NWB datasets to linkml Classes
""" """
import pdb
from abc import abstractmethod
-from typing import Optional
+from typing import Optional, Type
from linkml_runtime.linkml_model.meta import (
ClassDefinition,
@ -16,7 +16,7 @@ from nwb_linkml.adapters.classes import ClassAdapter
from nwb_linkml.maps import QUANTITY_MAP, Map
from nwb_linkml.maps.dtype import flat_to_linkml
from nwb_linkml.maps.naming import camel_to_snake
-from nwb_schema_language import Dataset
+from nwb_schema_language import Dataset, CompoundDtype
class DatasetMap(Map):
@ -113,6 +113,7 @@ class MapScalar(DatasetMap):
and not cls.attributes
and not cls.dims
and not cls.shape
and not is_compound(cls)
and cls.name
)
@ -228,7 +229,7 @@ class MapArraylike(DatasetMap):
""" """
Check if we're a plain array Check if we're a plain array
""" """
return cls.name and all([cls.dims, cls.shape]) and not has_attrs(cls) return cls.name and all([cls.dims, cls.shape]) and not has_attrs(cls) and not is_compound(cls)
@classmethod @classmethod
def apply( def apply(
@ -259,15 +260,6 @@ class MapArrayLikeAttributes(DatasetMap):
""" """
The most general case - treat everything that isn't handled by one of the special cases The most general case - treat everything that isn't handled by one of the special cases
as an array! as an array!
Specifically, we make an ``Arraylike`` class such that:
- Each slot within a subclass indicates a possible dimension.
- Only dimensions that are present in all the dimension specifiers in the
original schema are required.
- Shape requirements are indicated using max/min cardinalities on the slot.
- The arraylike object should be stored in the `array` slot on the containing class
(since there are already properties named `data`)
""" """
NEEDS_NAME = True NEEDS_NAME = True
@ -282,6 +274,7 @@ class MapArrayLikeAttributes(DatasetMap):
all([cls.dims, cls.shape])
and cls.neurodata_type_inc != "VectorData"
and has_attrs(cls)
and not is_compound(cls)
and (dtype == "AnyType" or dtype in flat_to_linkml)
)
@ -311,6 +304,22 @@ class Map1DVector(DatasetMap):
""" """
``VectorData`` is subclassed with a name but without dims or attributes, ``VectorData`` is subclassed with a name but without dims or attributes,
treat this as a normal 1D array slot that replaces any class that would be built for this treat this as a normal 1D array slot that replaces any class that would be built for this
eg. all the datasets in epoch.TimeIntervals:
.. code-block:: yaml
groups:
- neurodata_type_def: TimeIntervals
neurodata_type_inc: DynamicTable
doc: A container for aggregating epoch data and the TimeSeries that each epoch applies
to.
datasets:
- name: start_time
neurodata_type_inc: VectorData
dtype: float32
doc: Start time of epoch, in seconds.
""" """
@classmethod @classmethod
@ -323,6 +332,8 @@ class Map1DVector(DatasetMap):
and not cls.dims
and not cls.shape
and not cls.attributes
and not cls.neurodata_type_def
and not is_compound(cls)
and cls.name
)
@ -381,6 +392,72 @@ class MapNVectors(DatasetMap):
res = BuildResult(slots=[this_slot])
return res
class MapCompoundDtype(DatasetMap):
"""
A ``dtype`` declared as an array of types that function effectively as a row in a table.
We render them just as a class with each of the dtypes as slots - they are
typically used by other datasets to create a table.
Eg. ``base.TimeSeriesReferenceVectorData``
.. code-block:: yaml
datasets:
- neurodata_type_def: TimeSeriesReferenceVectorData
neurodata_type_inc: VectorData
default_name: timeseries
dtype:
- name: idx_start
dtype: int32
doc: Start index into the TimeSeries 'data' and 'timestamp' datasets of the referenced
TimeSeries. The first dimension of those arrays is always time.
- name: count
dtype: int32
doc: Number of data samples available in this time series, during this epoch
- name: timeseries
dtype:
target_type: TimeSeries
reftype: object
doc: The TimeSeries that this index applies to
doc: Column storing references to a TimeSeries (rows). For each TimeSeries this
VectorData column stores the start_index and count to indicate the range in time
to be selected as well as an object reference to the TimeSeries.
"""
@classmethod
def check(c, cls: Dataset) -> bool:
"""
Check that we're a dataset with a compound dtype
"""
return is_compound(cls)
@classmethod
def apply(
c, cls: Dataset, res: Optional[BuildResult] = None, name: Optional[str] = None
) -> BuildResult:
"""
Make a new class for this dtype, using its sub-dtypes as fields,
and use it as the range for the parent class
"""
slots = {}
for a_dtype in cls.dtype:
slots[a_dtype.name] = SlotDefinition(
name=a_dtype.name,
description=a_dtype.doc,
range=ClassAdapter.handle_dtype(a_dtype.dtype),
**QUANTITY_MAP[cls.quantity]
)
res.classes[0].attributes.update(slots)
return res
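# Illustrative note (not part of this commit): for the TimeSeriesReferenceVectorData example
# in the docstring above, ``apply`` would attach slots roughly like
#   idx_start  -> SlotDefinition(name="idx_start", range="int32", description=...)
#   count      -> SlotDefinition(name="count", range="int32", description=...)
#   timeseries -> SlotDefinition(name="timeseries", range="TimeSeries", description=...)
# to the class already present in ``res``; exact generator output may differ.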
class DatasetAdapter(ClassAdapter):
"""
@ -395,6 +472,25 @@ class DatasetAdapter(ClassAdapter):
""" """
res = self.build_base() res = self.build_base()
# find a map to use
map = self.match()
# apply matching maps
if map is not None:
res = map.apply(self.cls, res, self._get_full_name())
return res
def match(self) -> Optional[Type[DatasetMap]]:
"""
Find the map class that applies to this class
Returns:
:class:`.DatasetMap`
Raises:
RuntimeError - if more than one map matches
"""
# find a map to use
matches = [m for m in DatasetMap.__subclasses__() if m.check(self.cls)]
@ -403,91 +499,10 @@ class DatasetAdapter(ClassAdapter):
"Only one map should apply to a dataset, you need to refactor the maps! Got maps:" "Only one map should apply to a dataset, you need to refactor the maps! Got maps:"
f" {matches}" f" {matches}"
) )
elif len(matches) == 0:
-# apply matching maps
+return None
for m in matches:
res = m.apply(self.cls, res, self._get_full_name())
return res
def make_array_range(cls: Dataset, name: Optional[str] = None) -> ClassDefinition:
"""
Create a containing arraylike class
This is likely deprecated so this docstring is a placeholder to satisfy the linter...
"""
# The schema language doesn't have a way of specifying a dataset/group is "abstract"
# and yet hdmf-common says you don't need a dtype if the dataset is "abstract"
# so....
dtype = ClassAdapter.handle_dtype(cls.dtype)
# dims and shape are lists of lists. First we couple them
# (so each dim has its corresponding shape)..
# and then we take unique
# (dicts are ordered by default in recent pythons,
# while set() doesn't preserve order)
dims_shape = []
for inner_dim, inner_shape in zip(cls.dims, cls.shape):
if isinstance(inner_dim, list):
# list of lists
dims_shape.extend([(dim, shape) for dim, shape in zip(inner_dim, inner_shape)])
elif isinstance(inner_shape, list):
# Some badly formatted schema will have the shape be a LoL but the dims won't be...
dims_shape.extend([(inner_dim, shape) for shape in inner_shape])
else:
-# single-layer list
+return matches[0]
dims_shape.append((inner_dim, inner_shape))
dims_shape = tuple(dict.fromkeys(dims_shape).keys())
# --------------------------------------------------
# SPECIAL CASE - allen institute's ndx-aibs-ecephys.extension
# confuses "dims" with "shape" , eg shape = [None], dims = [3].
# So we hardcode that here...
# --------------------------------------------------
if len(dims_shape) == 1 and isinstance(dims_shape[0][0], int) and dims_shape[0][1] is None:
dims_shape = (("dim", dims_shape[0][0]),)
# now make slots for each of them
slots = []
for dims, shape in dims_shape:
# if there is just a single list of possible dimensions, it's required
if not any([isinstance(inner_dim, list) for inner_dim in cls.dims]) or all(
[dims in inner_dim for inner_dim in cls.dims]
):
required = True
else:
required = False
# use cardinality to do shape
cardinality = None if shape == "null" else shape
slots.append(
SlotDefinition(
name=dims,
required=required,
maximum_cardinality=cardinality,
minimum_cardinality=cardinality,
range=dtype,
)
)
# and then the class is just a subclass of `Arraylist`
# (which is imported by default from `nwb.language.yaml`)
if name:
pass
elif cls.neurodata_type_def:
name = cls.neurodata_type_def
elif cls.name:
name = cls.name
else:
raise ValueError("Dataset has no name or type definition, what do call it?")
name = "__".join([name, "Arraylike"])
array_class = ClassDefinition(name=name, is_a="Arraylike", attributes=slots)
return array_class
def is_1d(cls: Dataset) -> bool:
@ -502,6 +517,8 @@ def is_1d(cls: Dataset) -> bool:
and len(cls.dims[0]) == 1
)
def is_compound(cls: Dataset) -> bool:
"""Check whether this dataset declares a compound dtype (a list of ``CompoundDtype`` entries)"""
return isinstance(cls.dtype, list) and len(cls.dtype) > 0 and isinstance(cls.dtype[0], CompoundDtype)
def has_attrs(cls: Dataset) -> bool:
"""

View file

@ -34,20 +34,6 @@ for nwbtype, linkmltype in flat_to_linkml.items():
atype = TypeDefinition(name=nwbtype, minimum_value=amin, typeof=linkmltype)
DTypeTypes.append(atype)
Arraylike = ClassDefinition(
name="Arraylike",
description=(
"Container for arraylike information held in the dims, shape, and dtype properties."
"this is a special case to be interpreted by downstream i/o. this class has no slots"
"and is abstract by default."
"- Each slot within a subclass indicates a possible dimension."
"- Only dimensions that are present in all the dimension specifiers in the"
" original schema are required."
"- Shape requirements are indicated using max/min cardinalities on the slot."
),
abstract=True,
)
AnyType = ClassDefinition(
name="AnyType",
class_uri="linkml:Any",
@ -60,7 +46,7 @@ NwbLangSchema = SchemaDefinition(
id="nwb.language", id="nwb.language",
description="Adapter objects to mimic the behavior of elements in the nwb-schema-language", description="Adapter objects to mimic the behavior of elements in the nwb-schema-language",
enums=[FlatDType], enums=[FlatDType],
classes=[Arraylike, AnyType], classes=[AnyType],
types=DTypeTypes, types=DTypeTypes,
imports=["linkml:types"], imports=["linkml:types"],
prefixes={"linkml": Prefix("linkml", "https://w3id.org/linkml")}, prefixes={"linkml": Prefix("linkml", "https://w3id.org/linkml")},

View file

@ -49,7 +49,7 @@ NWB_CORE_REPO = NamespaceRepo(
name="core", name="core",
repository="https://github.com/NeurodataWithoutBorders/nwb-schema", repository="https://github.com/NeurodataWithoutBorders/nwb-schema",
path=Path("core/nwb.namespace.yaml"), path=Path("core/nwb.namespace.yaml"),
versions=["2.2.0", "2.2.1", "2.2.2", "2.2.4", "2.2.5", "2.3.0", "2.4.0", "2.5.0", "2.6.0"], versions=["2.2.0", "2.2.1", "2.2.2", "2.2.4", "2.2.5", "2.3.0", "2.4.0", "2.5.0", "2.6.0", "2.7.0"],
) )
HDMF_COMMON_REPO = NamespaceRepo( HDMF_COMMON_REPO = NamespaceRepo(

View file

@ -67,7 +67,7 @@ def tmp_output_dir_mod(tmp_output_dir) -> Path:
return subpath
-@pytest.fixture(scope="session", params=[{"core_version": "2.6.0", "hdmf_version": "1.5.0"}])
+@pytest.fixture(scope="session", params=[{"core_version": "2.7.0", "hdmf_version": "1.8.0"}])
def nwb_core_fixture(request) -> NamespacesAdapter:
nwb_core = io.load_nwb_core(**request.param)
nwb_core.populate_imports()

View file

@ -1,6 +1,9 @@
from nwb_linkml.adapters.dataset import (
MapScalar,
DatasetAdapter
)
from nwb_linkml.adapters import NamespacesAdapter
from nwb_schema_language import Dataset
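A hedged sketch of how the new map selection might be exercised from these imports (the constructor fields and the expected match are assumptions for illustration, not verbatim from this commit's tests):

from nwb_linkml.adapters.dataset import DatasetAdapter, MapCompoundDtype
from nwb_schema_language import CompoundDtype, Dataset

# a dataset whose dtype is a list of CompoundDtype entries, mirroring the
# TimeSeriesReferenceVectorData example earlier in the diff
dset = Dataset(
    neurodata_type_def="TimeSeriesReferenceVectorData",
    doc="compound dtype example",
    dtype=[
        CompoundDtype(name="idx_start", dtype="int32", doc="start index"),
        CompoundDtype(name="count", dtype="int32", doc="number of samples"),
    ],
)

# is_compound() should be true here, so MapCompoundDtype ought to be the single matching map
assert DatasetAdapter(cls=dset).match() is MapCompoundDtype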