handle compound dtype

This commit is contained in:
sneakers-the-rat 2024-07-03 20:39:49 -07:00
parent c9a2423a9d
commit ec81032ae8
Signed by untrusted user who does not match committer: jonny
GPG key ID: 6DCB96EF1E4D232D
9 changed files with 164 additions and 125 deletions

View file

@ -1,5 +1,13 @@
# TODO
## v0.2 - update to linkml-arrays and formal release
NWB schema translation
- handle `links` field in groups
- handle compound `dtype` like in ophys.PlaneSegmentation.pixel_mask
- handle compound `dtype` like in TimeSeriesReferenceVectorData
- Create a validator that checks that all the lists in a compound dtype dataset are the same length (see the sketch below)
Important things that are not implemented yet!
- {meth}`nwb_linkml.adapters.classes.ClassAdapter.handle_dtype` does not yet handle compound dtypes,
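A rough sketch of the validator mentioned in the TODO above (hypothetical model and column names, assuming the generated pydantic v2 models expose compound-dtype columns as list-valued fields):

from typing import List

from pydantic import BaseModel, model_validator


class CompoundRowExample(BaseModel):
    # hypothetical columns generated for a compound dtype dataset
    idx_start: List[int]
    count: List[int]

    @model_validator(mode="after")
    def columns_same_length(self) -> "CompoundRowExample":
        # every column of a compound dtype dataset must have the same number of rows
        if len(self.idx_start) != len(self.count):
            raise ValueError("compound dtype columns must all be the same length")
        return self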

View file

@ -5,7 +5,7 @@
groups = ["default", "dev", "tests"] groups = ["default", "dev", "tests"]
strategy = ["cross_platform", "inherit_metadata"] strategy = ["cross_platform", "inherit_metadata"]
lock_version = "4.4.2" lock_version = "4.4.2"
content_hash = "sha256:8cb98f940354e71443df87fd1702300e1c793627b0a016a6233811a640f47c18" content_hash = "sha256:6819a20ed9759b784908ad417ee371b941cfb1d9fbeebc8b29f25038f49ea544"
[[package]] [[package]]
name = "annotated-types" name = "annotated-types"
@ -769,18 +769,21 @@ files = [
[[package]]
name = "linkml"
-version = "1.7.10"
+version = "0.0.0"
requires_python = "<4.0.0,>=3.8.1"
git = "https://github.com/sneakers-the-rat/linkml"
ref = "arrays-numpydantic"
revision = "a3cfb2b82d7519cf9c64d113250c1714db2b3f6e"
summary = "Linked Open Data Modeling Language" summary = "Linked Open Data Modeling Language"
groups = ["default", "dev"] groups = ["default", "dev"]
dependencies = [ dependencies = [
"antlr4-python3-runtime<4.10,>=4.9.0", "antlr4-python3-runtime<4.10,==4.*,>=4.9.0",
"click>=7.0", "click>=7.0",
"graphviz>=0.10.1", "graphviz>=0.10.1",
"hbreader", "hbreader",
"isodate>=0.6.0", "isodate>=0.6.0",
"jinja2>=3.1.0", "jinja2>=3.1.0",
"jsonasobj2<2.0.0,>=1.0.3", "jsonasobj2==1.*,>=1.0.0,>=1.0.3",
"jsonschema[format]>=4.0.0", "jsonschema[format]>=4.0.0",
"linkml-dataops", "linkml-dataops",
"linkml-runtime>=1.7.4", "linkml-runtime>=1.7.4",
@ -799,10 +802,6 @@ dependencies = [
"sqlalchemy>=1.4.31", "sqlalchemy>=1.4.31",
"watchdog>=0.9.0", "watchdog>=0.9.0",
] ]
files = [
{file = "linkml-1.7.10-py3-none-any.whl", hash = "sha256:bf21cce814e9d1509489f1e6e15a7e86e4f11d949490d9a7a5c3f6b5b412ec62"},
{file = "linkml-1.7.10.tar.gz", hash = "sha256:1c38601c3cd495e34490b8cf7277fd3674ec68dcbe9f5efcec2658093801ce91"},
]
[[package]]
name = "linkml-dataops"
@ -1085,6 +1084,22 @@ files = [
{file = "numpy-1.26.4.tar.gz", hash = "sha256:2a02aba9ed12e4ac4eb3ea9421c420301a0c6460d9830d74a9df87efa4912010"}, {file = "numpy-1.26.4.tar.gz", hash = "sha256:2a02aba9ed12e4ac4eb3ea9421c420301a0c6460d9830d74a9df87efa4912010"},
] ]
[[package]]
name = "numpydantic"
version = "1.2.1"
requires_python = "<4.0,>=3.9"
summary = "Type and shape validation and serialization for numpy arrays in pydantic models"
groups = ["default", "dev"]
dependencies = [
"nptyping>=2.5.0",
"numpy>=1.24.0",
"pydantic>=2.3.0",
]
files = [
{file = "numpydantic-1.2.1-py3-none-any.whl", hash = "sha256:e21d7e272410b3a2013d2a6aeec2ed6efd13ea171b0200e2029d7c2f1453def0"},
{file = "numpydantic-1.2.1.tar.gz", hash = "sha256:d8a3e7371d78b99fa4a4733a5b873046f064993431ae63f97edcf9bda4dd5c7f"},
]
[[package]]
name = "nwb-schema-language"
version = "0.1.3"

View file

@ -23,6 +23,7 @@ dependencies = [
"blosc2>=2.2.7", "blosc2>=2.2.7",
"tqdm>=4.66.1", "tqdm>=4.66.1",
'typing-extensions>=4.12.2;python_version<"3.11"', 'typing-extensions>=4.12.2;python_version<"3.11"',
"numpydantic>=1.2.1",
]
[project.urls]

View file

@ -123,6 +123,7 @@ class ClassAdapter(Adapter):
name_parts.append(self.cls.name)
name = "__".join(name_parts)
elif self.cls.neurodata_type_inc is not None:
# again, this is against the schema, but is common
name = self.cls.neurodata_type_inc
@ -206,12 +207,20 @@ class ClassAdapter(Adapter):
Returns:
"""
if self.cls.name or self.cls.default_name:
if self.cls.name:
# name overrides default_name
name = self.cls.name
equals_string = name
else:
name = self.cls.default_name
equals_string = None
name_slot = SlotDefinition(
name="name",
required=True,
-ifabsent=f"string({self.cls.name})",
-equals_string=self.cls.name,
+ifabsent=f"string({name})",
+equals_string=equals_string,
range="string",
identifier=True,
)

View file

@ -1,9 +1,9 @@
""" """
Adapter for NWB datasets to linkml Classes Adapter for NWB datasets to linkml Classes
""" """
import pdb
from abc import abstractmethod
-from typing import Optional
+from typing import Optional, Type
from linkml_runtime.linkml_model.meta import (
ClassDefinition,
@ -16,7 +16,7 @@ from nwb_linkml.adapters.classes import ClassAdapter
from nwb_linkml.maps import QUANTITY_MAP, Map
from nwb_linkml.maps.dtype import flat_to_linkml
from nwb_linkml.maps.naming import camel_to_snake
-from nwb_schema_language import Dataset
+from nwb_schema_language import Dataset, CompoundDtype
class DatasetMap(Map):
@ -113,6 +113,7 @@ class MapScalar(DatasetMap):
and not cls.attributes
and not cls.dims
and not cls.shape
and not is_compound(cls)
and cls.name
)
@ -228,7 +229,7 @@ class MapArraylike(DatasetMap):
""" """
Check if we're a plain array Check if we're a plain array
""" """
return cls.name and all([cls.dims, cls.shape]) and not has_attrs(cls) return cls.name and all([cls.dims, cls.shape]) and not has_attrs(cls) and not is_compound(cls)
@classmethod @classmethod
def apply( def apply(
@ -259,15 +260,6 @@ class MapArrayLikeAttributes(DatasetMap):
""" """
The most general case - treat everything that isn't handled by one of the special cases The most general case - treat everything that isn't handled by one of the special cases
as an array! as an array!
Specifically, we make an ``Arraylike`` class such that:
- Each slot within a subclass indicates a possible dimension.
- Only dimensions that are present in all the dimension specifiers in the
original schema are required.
- Shape requirements are indicated using max/min cardinalities on the slot.
- The arraylike object should be stored in the `array` slot on the containing class
(since there are already properties named `data`)
""" """
NEEDS_NAME = True NEEDS_NAME = True
@ -282,6 +274,7 @@ class MapArrayLikeAttributes(DatasetMap):
all([cls.dims, cls.shape])
and cls.neurodata_type_inc != "VectorData"
and has_attrs(cls)
and not is_compound(cls)
and (dtype == "AnyType" or dtype in flat_to_linkml)
)
@ -311,6 +304,22 @@ class Map1DVector(DatasetMap):
""" """
``VectorData`` is subclassed with a name but without dims or attributes, ``VectorData`` is subclassed with a name but without dims or attributes,
treat this as a normal 1D array slot that replaces any class that would be built for this treat this as a normal 1D array slot that replaces any class that would be built for this
eg. all the datasets in epoch.TimeIntervals:
.. code-block:: yaml
groups:
- neurodata_type_def: TimeIntervals
neurodata_type_inc: DynamicTable
doc: A container for aggregating epoch data and the TimeSeries that each epoch applies
to.
datasets:
- name: start_time
neurodata_type_inc: VectorData
dtype: float32
doc: Start time of epoch, in seconds.
""" """
@classmethod @classmethod
@ -323,6 +332,8 @@ class Map1DVector(DatasetMap):
and not cls.dims
and not cls.shape
and not cls.attributes
and not cls.neurodata_type_def
and not is_compound(cls)
and cls.name
)
@ -381,6 +392,72 @@ class MapNVectors(DatasetMap):
res = BuildResult(slots=[this_slot])
return res
class MapCompoundDtype(DatasetMap):
"""
A ``dtype`` declared as an array of types that function effectively as a row in a table.
We render them just as a class with each of the dtypes as slots - they are
typically used by other datasets to create a table.
Eg. ``base.TimeSeriesReferenceVectorData``
.. code-block:: yaml
datasets:
- neurodata_type_def: TimeSeriesReferenceVectorData
neurodata_type_inc: VectorData
default_name: timeseries
dtype:
- name: idx_start
dtype: int32
doc: Start index into the TimeSeries 'data' and 'timestamp' datasets of the referenced
TimeSeries. The first dimension of those arrays is always time.
- name: count
dtype: int32
doc: Number of data samples available in this time series, during this epoch
- name: timeseries
dtype:
target_type: TimeSeries
reftype: object
doc: The TimeSeries that this index applies to
doc: Column storing references to a TimeSeries (rows). For each TimeSeries this
VectorData column stores the start_index and count to indicate the range in time
to be selected as well as an object reference to the TimeSeries.
"""
@classmethod
def check(c, cls: Dataset) -> bool:
"""
Check that we're a dataset with a compound dtype
"""
return is_compound(cls)
@classmethod
def apply(
c, cls: Dataset, res: Optional[BuildResult] = None, name: Optional[str] = None
) -> BuildResult:
"""
Make a new class for this dtype, using its sub-dtypes as fields,
and use it as the range for the parent class
"""
slots = {}
for a_dtype in cls.dtype:
slots[a_dtype.name] = SlotDefinition(
name=a_dtype.name,
description=a_dtype.doc,
range=ClassAdapter.handle_dtype(a_dtype.dtype),
**QUANTITY_MAP[cls.quantity]
)
res.classes[0].attributes.update(slots)
return res
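# Illustrative note (not part of this commit): for the TimeSeriesReferenceVectorData example
# in the docstring above, ``apply`` would attach slots roughly like
#   idx_start  -> SlotDefinition(name="idx_start", range="int32", description=...)
#   count      -> SlotDefinition(name="count", range="int32", description=...)
#   timeseries -> SlotDefinition(name="timeseries", range="TimeSeries", description=...)
# to the class already present in ``res``; exact generator output may differ.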
class DatasetAdapter(ClassAdapter):
"""
@ -395,6 +472,25 @@ class DatasetAdapter(ClassAdapter):
""" """
res = self.build_base() res = self.build_base()
# find a map to use
map = self.match()
# apply matching maps
if map is not None:
res = map.apply(self.cls, res, self._get_full_name())
return res
def match(self) -> Optional[Type[DatasetMap]]:
"""
Find the map class that applies to this class
Returns:
:class:`.DatasetMap`
Raises:
RuntimeError - if more than one map matches
"""
# find a map to use
matches = [m for m in DatasetMap.__subclasses__() if m.check(self.cls)]
@ -403,91 +499,10 @@ class DatasetAdapter(ClassAdapter):
"Only one map should apply to a dataset, you need to refactor the maps! Got maps:" "Only one map should apply to a dataset, you need to refactor the maps! Got maps:"
f" {matches}" f" {matches}"
) )
elif len(matches) == 0:
-# apply matching maps
+return None
for m in matches:
res = m.apply(self.cls, res, self._get_full_name())
return res
def make_array_range(cls: Dataset, name: Optional[str] = None) -> ClassDefinition:
"""
Create a containing arraylike class
This is likely deprecated so this docstring is a placeholder to satisfy the linter...
"""
# The schema language doesn't have a way of specifying a dataset/group is "abstract"
# and yet hdmf-common says you don't need a dtype if the dataset is "abstract"
# so....
dtype = ClassAdapter.handle_dtype(cls.dtype)
# dims and shape are lists of lists. First we couple them
# (so each dim has its corresponding shape)..
# and then we take unique
# (dicts are ordered by default in recent pythons,
# while set() doesn't preserve order)
dims_shape = []
for inner_dim, inner_shape in zip(cls.dims, cls.shape):
if isinstance(inner_dim, list):
# list of lists
dims_shape.extend([(dim, shape) for dim, shape in zip(inner_dim, inner_shape)])
elif isinstance(inner_shape, list):
# Some badly formatted schema will have the shape be a LoL but the dims won't be...
dims_shape.extend([(inner_dim, shape) for shape in inner_shape])
else:
-# single-layer list
+return matches[0]
dims_shape.append((inner_dim, inner_shape))
dims_shape = tuple(dict.fromkeys(dims_shape).keys())
# --------------------------------------------------
# SPECIAL CASE - allen institute's ndx-aibs-ecephys.extension
# confuses "dims" with "shape" , eg shape = [None], dims = [3].
# So we hardcode that here...
# --------------------------------------------------
if len(dims_shape) == 1 and isinstance(dims_shape[0][0], int) and dims_shape[0][1] is None:
dims_shape = (("dim", dims_shape[0][0]),)
# now make slots for each of them
slots = []
for dims, shape in dims_shape:
# if there is just a single list of possible dimensions, it's required
if not any([isinstance(inner_dim, list) for inner_dim in cls.dims]) or all(
[dims in inner_dim for inner_dim in cls.dims]
):
required = True
else:
required = False
# use cardinality to do shape
cardinality = None if shape == "null" else shape
slots.append(
SlotDefinition(
name=dims,
required=required,
maximum_cardinality=cardinality,
minimum_cardinality=cardinality,
range=dtype,
)
)
# and then the class is just a subclass of `Arraylist`
# (which is imported by default from `nwb.language.yaml`)
if name:
pass
elif cls.neurodata_type_def:
name = cls.neurodata_type_def
elif cls.name:
name = cls.name
else:
raise ValueError("Dataset has no name or type definition, what do call it?")
name = "__".join([name, "Arraylike"])
array_class = ClassDefinition(name=name, is_a="Arraylike", attributes=slots)
return array_class
def is_1d(cls: Dataset) -> bool:
@ -502,6 +517,8 @@ def is_1d(cls: Dataset) -> bool:
and len(cls.dims[0]) == 1
)
def is_compound(cls: Dataset) -> bool:
"""Check whether this dataset declares a compound dtype (a list of ``CompoundDtype`` entries)"""
return isinstance(cls.dtype, list) and len(cls.dtype) > 0 and isinstance(cls.dtype[0], CompoundDtype)
def has_attrs(cls: Dataset) -> bool:
"""

View file

@ -34,20 +34,6 @@ for nwbtype, linkmltype in flat_to_linkml.items():
atype = TypeDefinition(name=nwbtype, minimum_value=amin, typeof=linkmltype)
DTypeTypes.append(atype)
Arraylike = ClassDefinition(
name="Arraylike",
description=(
"Container for arraylike information held in the dims, shape, and dtype properties."
"this is a special case to be interpreted by downstream i/o. this class has no slots"
"and is abstract by default."
"- Each slot within a subclass indicates a possible dimension."
"- Only dimensions that are present in all the dimension specifiers in the"
" original schema are required."
"- Shape requirements are indicated using max/min cardinalities on the slot."
),
abstract=True,
)
AnyType = ClassDefinition(
name="AnyType",
class_uri="linkml:Any",
@ -60,7 +46,7 @@ NwbLangSchema = SchemaDefinition(
id="nwb.language", id="nwb.language",
description="Adapter objects to mimic the behavior of elements in the nwb-schema-language", description="Adapter objects to mimic the behavior of elements in the nwb-schema-language",
enums=[FlatDType], enums=[FlatDType],
classes=[Arraylike, AnyType], classes=[AnyType],
types=DTypeTypes, types=DTypeTypes,
imports=["linkml:types"], imports=["linkml:types"],
prefixes={"linkml": Prefix("linkml", "https://w3id.org/linkml")}, prefixes={"linkml": Prefix("linkml", "https://w3id.org/linkml")},

View file

@ -49,7 +49,7 @@ NWB_CORE_REPO = NamespaceRepo(
name="core", name="core",
repository="https://github.com/NeurodataWithoutBorders/nwb-schema", repository="https://github.com/NeurodataWithoutBorders/nwb-schema",
path=Path("core/nwb.namespace.yaml"), path=Path("core/nwb.namespace.yaml"),
versions=["2.2.0", "2.2.1", "2.2.2", "2.2.4", "2.2.5", "2.3.0", "2.4.0", "2.5.0", "2.6.0"], versions=["2.2.0", "2.2.1", "2.2.2", "2.2.4", "2.2.5", "2.3.0", "2.4.0", "2.5.0", "2.6.0", "2.7.0"],
) )
HDMF_COMMON_REPO = NamespaceRepo( HDMF_COMMON_REPO = NamespaceRepo(

View file

@ -67,7 +67,7 @@ def tmp_output_dir_mod(tmp_output_dir) -> Path:
return subpath
-@pytest.fixture(scope="session", params=[{"core_version": "2.6.0", "hdmf_version": "1.5.0"}])
+@pytest.fixture(scope="session", params=[{"core_version": "2.7.0", "hdmf_version": "1.8.0"}])
def nwb_core_fixture(request) -> NamespacesAdapter:
nwb_core = io.load_nwb_core(**request.param)
nwb_core.populate_imports()

View file

@ -1,6 +1,9 @@
from nwb_linkml.adapters.dataset import (
MapScalar,
DatasetAdapter
)
from nwb_linkml.adapters import NamespacesAdapter
from nwb_schema_language import Dataset
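A hedged sketch of how the new map selection might be exercised from these imports (the constructor fields and the expected match are assumptions for illustration, not verbatim from this commit's tests):

from nwb_linkml.adapters.dataset import DatasetAdapter, MapCompoundDtype
from nwb_schema_language import CompoundDtype, Dataset

# a dataset whose dtype is a list of CompoundDtype entries, mirroring the
# TimeSeriesReferenceVectorData example earlier in the diff
dset = Dataset(
    neurodata_type_def="TimeSeriesReferenceVectorData",
    doc="compound dtype example",
    dtype=[
        CompoundDtype(name="idx_start", dtype="int32", doc="start index"),
        CompoundDtype(name="count", dtype="int32", doc="number of samples"),
    ],
)

# is_compound() should be true here, so MapCompoundDtype ought to be the single matching map
assert DatasetAdapter(cls=dset).match() is MapCompoundDtype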