From ec81032ae8be58755933d4d80cd2dd9a01931e7a Mon Sep 17 00:00:00 2001
From: sneakers-the-rat
Date: Wed, 3 Jul 2024 20:39:49 -0700
Subject: [PATCH] handle compound dtype

---
 docs/meta/todo.md                             |   8 +
 nwb_linkml/pdm.lock                           |  31 ++-
 nwb_linkml/pyproject.toml                     |   1 +
 nwb_linkml/src/nwb_linkml/adapters/classes.py |  15 +-
 nwb_linkml/src/nwb_linkml/adapters/dataset.py | 211 ++++++++++--------
 nwb_linkml/src/nwb_linkml/lang_elements.py    |  16 +-
 nwb_linkml/src/nwb_linkml/providers/git.py    |   2 +-
 nwb_linkml/tests/fixtures.py                  |   2 +-
 .../test_adapters/test_adapter_dataset.py     |   3 +
 9 files changed, 164 insertions(+), 125 deletions(-)

diff --git a/docs/meta/todo.md b/docs/meta/todo.md
index f6cdd6f..cf6cf6e 100644
--- a/docs/meta/todo.md
+++ b/docs/meta/todo.md
@@ -1,5 +1,13 @@
 # TODO
 
+## v0.2 - update to linkml-arrays and formal release
+
+NWB schema translation
+- handle `links` field in groups
+- handle compound `dtype` like in ophys.PlaneSegmentation.pixel_mask
+- handle compound `dtype` like in TimeSeriesReferenceVectorData
+- Create a validator that checks if all the lists in a compound dtype dataset are the same length
+
 Important things that are not implemented yet!
 
 - {meth}`nwb_linkml.adapters.classes.ClassAdapter.handle_dtype` does not yet handle compound dtypes,
diff --git a/nwb_linkml/pdm.lock b/nwb_linkml/pdm.lock
index 1d543d2..0269026 100644
--- a/nwb_linkml/pdm.lock
+++ b/nwb_linkml/pdm.lock
@@ -5,7 +5,7 @@
 groups = ["default", "dev", "tests"]
 strategy = ["cross_platform", "inherit_metadata"]
 lock_version = "4.4.2"
-content_hash = "sha256:8cb98f940354e71443df87fd1702300e1c793627b0a016a6233811a640f47c18"
+content_hash = "sha256:6819a20ed9759b784908ad417ee371b941cfb1d9fbeebc8b29f25038f49ea544"
 
 [[package]]
 name = "annotated-types"
@@ -769,18 +769,21 @@ files = [
 
 [[package]]
 name = "linkml"
-version = "1.7.10"
+version = "0.0.0"
 requires_python = "<4.0.0,>=3.8.1"
+git = "https://github.com/sneakers-the-rat/linkml"
+ref = "arrays-numpydantic"
+revision = "a3cfb2b82d7519cf9c64d113250c1714db2b3f6e"
 summary = "Linked Open Data Modeling Language"
 groups = ["default", "dev"]
 dependencies = [
-    "antlr4-python3-runtime<4.10,>=4.9.0",
+    "antlr4-python3-runtime<4.10,==4.*,>=4.9.0",
     "click>=7.0",
     "graphviz>=0.10.1",
     "hbreader",
     "isodate>=0.6.0",
     "jinja2>=3.1.0",
-    "jsonasobj2<2.0.0,>=1.0.3",
+    "jsonasobj2==1.*,>=1.0.0,>=1.0.3",
     "jsonschema[format]>=4.0.0",
     "linkml-dataops",
    "linkml-runtime>=1.7.4",
@@ -799,10 +802,6 @@ dependencies = [
     "sqlalchemy>=1.4.31",
     "watchdog>=0.9.0",
 ]
-files = [
-    {file = "linkml-1.7.10-py3-none-any.whl", hash = "sha256:bf21cce814e9d1509489f1e6e15a7e86e4f11d949490d9a7a5c3f6b5b412ec62"},
-    {file = "linkml-1.7.10.tar.gz", hash = "sha256:1c38601c3cd495e34490b8cf7277fd3674ec68dcbe9f5efcec2658093801ce91"},
-]
 
 [[package]]
 name = "linkml-dataops"
@@ -1085,6 +1084,22 @@ files = [
     {file = "numpy-1.26.4.tar.gz", hash = "sha256:2a02aba9ed12e4ac4eb3ea9421c420301a0c6460d9830d74a9df87efa4912010"},
 ]
 
+[[package]]
+name = "numpydantic"
+version = "1.2.1"
+requires_python = "<4.0,>=3.9"
+summary = "Type and shape validation and serialization for numpy arrays in pydantic models"
+groups = ["default", "dev"]
+dependencies = [
+    "nptyping>=2.5.0",
+    "numpy>=1.24.0",
+    "pydantic>=2.3.0",
+]
+files = [
+    {file = "numpydantic-1.2.1-py3-none-any.whl", hash = "sha256:e21d7e272410b3a2013d2a6aeec2ed6efd13ea171b0200e2029d7c2f1453def0"},
+    {file = "numpydantic-1.2.1.tar.gz", hash = "sha256:d8a3e7371d78b99fa4a4733a5b873046f064993431ae63f97edcf9bda4dd5c7f"},
+]
+
 [[package]]
 name = "nwb-schema-language"
 version = "0.1.3"
diff --git a/nwb_linkml/pyproject.toml b/nwb_linkml/pyproject.toml
index d478065..d996f88 100644
--- a/nwb_linkml/pyproject.toml
+++ b/nwb_linkml/pyproject.toml
@@ -23,6 +23,7 @@ dependencies = [
     "blosc2>=2.2.7",
     "tqdm>=4.66.1",
     'typing-extensions>=4.12.2;python_version<"3.11"',
+    "numpydantic>=1.2.1",
 ]
 
 [project.urls]
diff --git a/nwb_linkml/src/nwb_linkml/adapters/classes.py b/nwb_linkml/src/nwb_linkml/adapters/classes.py
index 5870d61..bd252b2 100644
--- a/nwb_linkml/src/nwb_linkml/adapters/classes.py
+++ b/nwb_linkml/src/nwb_linkml/adapters/classes.py
@@ -123,6 +123,7 @@ class ClassAdapter(Adapter):
 
             name_parts.append(self.cls.name)
             name = "__".join(name_parts)
+
         elif self.cls.neurodata_type_inc is not None:
             # again, this is against the schema, but is common
             name = self.cls.neurodata_type_inc
@@ -206,12 +207,20 @@ class ClassAdapter(Adapter):
         Returns:
 
         """
-        if self.cls.name:
+        if self.cls.name or self.cls.default_name:
+            if self.cls.name:
+                # name overrides default_name
+                name = self.cls.name
+                equals_string = name
+            else:
+                name = self.cls.default_name
+                equals_string = None
+
             name_slot = SlotDefinition(
                 name="name",
                 required=True,
-                ifabsent=f"string({self.cls.name})",
-                equals_string=self.cls.name,
+                ifabsent=f"string({name})",
+                equals_string=equals_string,
                 range="string",
                 identifier=True,
             )
diff --git a/nwb_linkml/src/nwb_linkml/adapters/dataset.py b/nwb_linkml/src/nwb_linkml/adapters/dataset.py
index 5c8419d..f6422d1 100644
--- a/nwb_linkml/src/nwb_linkml/adapters/dataset.py
+++ b/nwb_linkml/src/nwb_linkml/adapters/dataset.py
@@ -1,9 +1,9 @@
 """
 Adapter for NWB datasets to linkml Classes
 """
 
 from abc import abstractmethod
-from typing import Optional
+from typing import Optional, Type
 
 from linkml_runtime.linkml_model.meta import (
     ClassDefinition,
@@ -16,7 +16,7 @@ from nwb_linkml.adapters.classes import ClassAdapter
 from nwb_linkml.maps import QUANTITY_MAP, Map
 from nwb_linkml.maps.dtype import flat_to_linkml
 from nwb_linkml.maps.naming import camel_to_snake
-from nwb_schema_language import Dataset
+from nwb_schema_language import Dataset, CompoundDtype
 
 
 class DatasetMap(Map):
@@ -113,6 +113,7 @@ class MapScalar(DatasetMap):
             and not cls.attributes
             and not cls.dims
             and not cls.shape
+            and not is_compound(cls)
             and cls.name
         )
 
@@ -228,7 +229,7 @@ class MapArraylike(DatasetMap):
         """
         Check if we're a plain array
         """
-        return cls.name and all([cls.dims, cls.shape]) and not has_attrs(cls)
+        return cls.name and all([cls.dims, cls.shape]) and not has_attrs(cls) and not is_compound(cls)
 
     @classmethod
     def apply(
@@ -259,15 +260,6 @@ class MapArrayLikeAttributes(DatasetMap):
     """
     The most general case - treat everything that isn't handled by one of the special cases as
    an array!
-
-    Specifically, we make an ``Arraylike`` class such that:
-
-    - Each slot within a subclass indicates a possible dimension.
-    - Only dimensions that are present in all the dimension specifiers in the
-      original schema are required.
-    - Shape requirements are indicated using max/min cardinalities on the slot.
-    - The arraylike object should be stored in the `array` slot on the containing class
-      (since there are already properties named `data`)
     """
 
     NEEDS_NAME = True
@@ -282,6 +274,7 @@ class MapArrayLikeAttributes(DatasetMap):
             all([cls.dims, cls.shape])
             and cls.neurodata_type_inc != "VectorData"
             and has_attrs(cls)
+            and not is_compound(cls)
             and (dtype == "AnyType" or dtype in flat_to_linkml)
         )
 
@@ -311,6 +304,22 @@ class Map1DVector(DatasetMap):
     """
     ``VectorData`` is subclassed with a name but without dims or attributes,
     treat this as a normal 1D array slot that replaces any class that would be built for this
+
+    e.g., all the datasets in epoch.TimeIntervals:
+
+    .. code-block:: yaml
+
+        groups:
+        - neurodata_type_def: TimeIntervals
+          neurodata_type_inc: DynamicTable
+          doc: A container for aggregating epoch data and the TimeSeries that each epoch applies
+            to.
+          datasets:
+          - name: start_time
+            neurodata_type_inc: VectorData
+            dtype: float32
+            doc: Start time of epoch, in seconds.
+
     """
 
     @classmethod
@@ -323,6 +332,8 @@ class Map1DVector(DatasetMap):
             and not cls.dims
             and not cls.shape
             and not cls.attributes
+            and not cls.neurodata_type_def
+            and not is_compound(cls)
             and cls.name
         )
 
@@ -381,6 +392,72 @@ class MapNVectors(DatasetMap):
         res = BuildResult(slots=[this_slot])
         return res
 
+
+class MapCompoundDtype(DatasetMap):
+    """
+    A ``dtype`` declared as an array of types that, together, function effectively as a row in a table.
+
+    We render them just as a class with each of the dtypes as slots - they are
+    typically used by other datasets to create a table.
+
+    E.g. ``base.TimeSeriesReferenceVectorData``
+
+    .. code-block:: yaml
+
+        datasets:
+        - neurodata_type_def: TimeSeriesReferenceVectorData
+          neurodata_type_inc: VectorData
+          default_name: timeseries
+          dtype:
+          - name: idx_start
+            dtype: int32
+            doc: Start index into the TimeSeries 'data' and 'timestamp' datasets of the referenced
+              TimeSeries. The first dimension of those arrays is always time.
+          - name: count
+            dtype: int32
+            doc: Number of data samples available in this time series, during this epoch
+          - name: timeseries
+            dtype:
+              target_type: TimeSeries
+              reftype: object
+            doc: The TimeSeries that this index applies to
+          doc: Column storing references to a TimeSeries (rows). For each TimeSeries this
+            VectorData column stores the start_index and count to indicate the range in time
+            to be selected as well as an object reference to the TimeSeries.
+
+    """
+
+    @classmethod
+    def check(c, cls: Dataset) -> bool:
+        """
+        Check that we're a dataset with a compound dtype
+        """
+        return is_compound(cls)
+
+    @classmethod
+    def apply(
+        c, cls: Dataset, res: Optional[BuildResult] = None, name: Optional[str] = None
+    ) -> BuildResult:
+        """
+        Make a new class for this dtype, using its sub-dtypes as fields,
+        and use it as the range for the parent class
+        """
+        slots = {}
+        for a_dtype in cls.dtype:
+            slots[a_dtype.name] = SlotDefinition(
+                name=a_dtype.name,
+                description=a_dtype.doc,
+                range=ClassAdapter.handle_dtype(a_dtype.dtype),
+                **QUANTITY_MAP[cls.quantity]
+            )
+        res.classes[0].attributes.update(slots)
+        return res
+
+
+
+
+
+
 
 class DatasetAdapter(ClassAdapter):
     """
@@ -395,6 +472,25 @@ class DatasetAdapter(ClassAdapter):
         """
         res = self.build_base()
 
+        # find a map to use
+        map = self.match()
+
+        # apply matching maps
+        if map is not None:
+            res = map.apply(self.cls, res, self._get_full_name())
+
+        return res
+
+    def match(self) -> Optional[Type[DatasetMap]]:
+        """
+        Find the map class that applies to this class
+
+        Returns:
+            :class:`.DatasetMap`
+
+        Raises:
+            RuntimeError - if more than one map matches
+        """
         # find a map to use
         matches = [m for m in DatasetMap.__subclasses__() if m.check(self.cls)]
 
@@ -403,91 +499,10 @@ class DatasetAdapter(ClassAdapter):
                 "Only one map should apply to a dataset, you need to refactor the maps! Got maps:"
                 f" {matches}"
             )
-
-        # apply matching maps
-        for m in matches:
-            res = m.apply(self.cls, res, self._get_full_name())
-
-        return res
-
-
-def make_array_range(cls: Dataset, name: Optional[str] = None) -> ClassDefinition:
-    """
-    Create a containing arraylike class
-
-    This is likely deprecated so this docstring is a placeholder to satisfy the linter...
-    """
-    # The schema language doesn't have a way of specifying a dataset/group is "abstract"
-    # and yet hdmf-common says you don't need a dtype if the dataset is "abstract"
-    # so....
-    dtype = ClassAdapter.handle_dtype(cls.dtype)
-
-    # dims and shape are lists of lists. First we couple them
-    # (so each dim has its corresponding shape)..
-    # and then we take unique
-    # (dicts are ordered by default in recent pythons,
-    # while set() doesn't preserve order)
-    dims_shape = []
-    for inner_dim, inner_shape in zip(cls.dims, cls.shape):
-        if isinstance(inner_dim, list):
-            # list of lists
-            dims_shape.extend([(dim, shape) for dim, shape in zip(inner_dim, inner_shape)])
-        elif isinstance(inner_shape, list):
-            # Some badly formatted schema will have the shape be a LoL but the dims won't be...
-            dims_shape.extend([(inner_dim, shape) for shape in inner_shape])
+        elif len(matches) == 0:
+            return None
         else:
-            # single-layer list
-            dims_shape.append((inner_dim, inner_shape))
-
-    dims_shape = tuple(dict.fromkeys(dims_shape).keys())
-
-    # --------------------------------------------------
-    # SPECIAL CASE - allen institute's ndx-aibs-ecephys.extension
-    # confuses "dims" with "shape" , eg shape = [None], dims = [3].
-    # So we hardcode that here...
-    # --------------------------------------------------
-    if len(dims_shape) == 1 and isinstance(dims_shape[0][0], int) and dims_shape[0][1] is None:
-        dims_shape = (("dim", dims_shape[0][0]),)
-
-    # now make slots for each of them
-    slots = []
-    for dims, shape in dims_shape:
-        # if there is just a single list of possible dimensions, it's required
-        if not any([isinstance(inner_dim, list) for inner_dim in cls.dims]) or all(
-            [dims in inner_dim for inner_dim in cls.dims]
-        ):
-            required = True
-        else:
-            required = False
-
-        # use cardinality to do shape
-        cardinality = None if shape == "null" else shape
-
-        slots.append(
-            SlotDefinition(
-                name=dims,
-                required=required,
-                maximum_cardinality=cardinality,
-                minimum_cardinality=cardinality,
-                range=dtype,
-            )
-        )
-
-    # and then the class is just a subclass of `Arraylist`
-    # (which is imported by default from `nwb.language.yaml`)
-    if name:
-        pass
-    elif cls.neurodata_type_def:
-        name = cls.neurodata_type_def
-    elif cls.name:
-        name = cls.name
-    else:
-        raise ValueError("Dataset has no name or type definition, what do call it?")
-
-    name = "__".join([name, "Arraylike"])
-
-    array_class = ClassDefinition(name=name, is_a="Arraylike", attributes=slots)
-    return array_class
+            return matches[0]
 
 
 def is_1d(cls: Dataset) -> bool:
@@ -502,6 +517,9 @@ def is_1d(cls: Dataset) -> bool:
         and len(cls.dims[0]) == 1
     )
 
+def is_compound(cls: Dataset) -> bool:
+    """Check if a dataset has a compound dtype"""
+    return isinstance(cls.dtype, list) and len(cls.dtype) > 0 and isinstance(cls.dtype[0], CompoundDtype)
 
 def has_attrs(cls: Dataset) -> bool:
     """
diff --git a/nwb_linkml/src/nwb_linkml/lang_elements.py b/nwb_linkml/src/nwb_linkml/lang_elements.py
index fa919d3..d038c7e 100644
--- a/nwb_linkml/src/nwb_linkml/lang_elements.py
+++ b/nwb_linkml/src/nwb_linkml/lang_elements.py
@@ -34,20 +34,6 @@ for nwbtype, linkmltype in flat_to_linkml.items():
     atype = TypeDefinition(name=nwbtype, minimum_value=amin, typeof=linkmltype)
     DTypeTypes.append(atype)
 
-Arraylike = ClassDefinition(
-    name="Arraylike",
-    description=(
-        "Container for arraylike information held in the dims, shape, and dtype properties."
-        "this is a special case to be interpreted by downstream i/o. this class has no slots"
-        "and is abstract by default."
-        "- Each slot within a subclass indicates a possible dimension."
-        "- Only dimensions that are present in all the dimension specifiers in the"
-        " original schema are required."
-        "- Shape requirements are indicated using max/min cardinalities on the slot."
-    ),
-    abstract=True,
-)
-
 AnyType = ClassDefinition(
     name="AnyType",
     class_uri="linkml:Any",
@@ -60,7 +46,7 @@ NwbLangSchema = SchemaDefinition(
     id="nwb.language",
     description="Adapter objects to mimic the behavior of elements in the nwb-schema-language",
     enums=[FlatDType],
-    classes=[Arraylike, AnyType],
+    classes=[AnyType],
     types=DTypeTypes,
     imports=["linkml:types"],
     prefixes={"linkml": Prefix("linkml", "https://w3id.org/linkml")},
diff --git a/nwb_linkml/src/nwb_linkml/providers/git.py b/nwb_linkml/src/nwb_linkml/providers/git.py
index 4edc851..73525c2 100644
--- a/nwb_linkml/src/nwb_linkml/providers/git.py
+++ b/nwb_linkml/src/nwb_linkml/providers/git.py
@@ -49,7 +49,7 @@ NWB_CORE_REPO = NamespaceRepo(
     name="core",
     repository="https://github.com/NeurodataWithoutBorders/nwb-schema",
     path=Path("core/nwb.namespace.yaml"),
-    versions=["2.2.0", "2.2.1", "2.2.2", "2.2.4", "2.2.5", "2.3.0", "2.4.0", "2.5.0", "2.6.0"],
+    versions=["2.2.0", "2.2.1", "2.2.2", "2.2.4", "2.2.5", "2.3.0", "2.4.0", "2.5.0", "2.6.0", "2.7.0"],
 )
 
 HDMF_COMMON_REPO = NamespaceRepo(
diff --git a/nwb_linkml/tests/fixtures.py b/nwb_linkml/tests/fixtures.py
index 0490d89..a232122 100644
--- a/nwb_linkml/tests/fixtures.py
+++ b/nwb_linkml/tests/fixtures.py
@@ -67,7 +67,7 @@ def tmp_output_dir_mod(tmp_output_dir) -> Path:
     return subpath
 
 
-@pytest.fixture(scope="session", params=[{"core_version": "2.6.0", "hdmf_version": "1.5.0"}])
+@pytest.fixture(scope="session", params=[{"core_version": "2.7.0", "hdmf_version": "1.8.0"}])
 def nwb_core_fixture(request) -> NamespacesAdapter:
     nwb_core = io.load_nwb_core(**request.param)
     nwb_core.populate_imports()
diff --git a/nwb_linkml/tests/test_adapters/test_adapter_dataset.py b/nwb_linkml/tests/test_adapters/test_adapter_dataset.py
index f961464..7f986de 100644
--- a/nwb_linkml/tests/test_adapters/test_adapter_dataset.py
+++ b/nwb_linkml/tests/test_adapters/test_adapter_dataset.py
@@ -1,6 +1,9 @@
 from nwb_linkml.adapters.dataset import (
     MapScalar,
+    DatasetAdapter
 )
+from nwb_linkml.adapters import NamespacesAdapter
+
 from nwb_schema_language import Dataset
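
A minimal sketch, not part of the patch, of how the new compound-dtype path is meant to be driven; the Dataset/CompoundDtype constructor fields and the DatasetAdapter(cls=...) call below are assumptions based on the surrounding code rather than verified API.

# Hypothetical usage sketch -- constructor fields below are assumptions, not from the patch.
from nwb_linkml.adapters.dataset import DatasetAdapter, MapCompoundDtype
from nwb_schema_language import CompoundDtype, Dataset

ts_ref = Dataset(
    neurodata_type_def="TimeSeriesReferenceVectorData",
    neurodata_type_inc="VectorData",
    default_name="timeseries",
    doc="Column storing references to a TimeSeries (rows).",
    dtype=[
        CompoundDtype(name="idx_start", dtype="int32", doc="Start index into the referenced TimeSeries"),
        CompoundDtype(name="count", dtype="int32", doc="Number of samples in the referenced TimeSeries"),
    ],
)

# is_compound() sees a list of CompoundDtype entries, so only MapCompoundDtype
# should check() true, and match() should return it rather than one of the array maps.
assert MapCompoundDtype.check(ts_ref)
assert DatasetAdapter(cls=ts_ref).match() is MapCompoundDtype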