handle compound dtype

sneakers-the-rat 2024-07-03 20:39:49 -07:00
parent c9a2423a9d
commit ec81032ae8
Signed by untrusted user who does not match committer: jonny
GPG key ID: 6DCB96EF1E4D232D
9 changed files with 164 additions and 125 deletions

View file

@@ -1,5 +1,13 @@
# TODO
## v0.2 - update to linkml-arrays and formal release
NWB schema translation
- handle `links` field in groups
- handle compound `dtype` like in ophys.PlaneSegmentation.pixel_mask
- handle compound `dtype` like in TimeSeriesReferenceVectorData
- Create a validator that checks that all the lists in a compound dtype dataset are the same length (see the sketch below)
Important things that are not implemented yet!
- {meth}`nwb_linkml.adapters.classes.ClassAdapter.handle_dtype` does not yet handle compound dtypes,
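
That validator might look something like this pydantic v2 sketch (illustrative only, not part of this commit; the model and field names are borrowed from the `TimeSeriesReferenceVectorData` example later in this diff):

```python
from pydantic import BaseModel, model_validator

class TimeSeriesReferenceVectorData(BaseModel):
    # hypothetical columns of a compound-dtype dataset
    idx_start: list[int]
    count: list[int]

    @model_validator(mode="after")
    def columns_same_length(self) -> "TimeSeriesReferenceVectorData":
        # every list in a compound dtype is one column of the same table,
        # so they must all have the same number of rows
        lengths = {len(self.idx_start), len(self.count)}
        if len(lengths) > 1:
            raise ValueError(f"Compound dtype columns differ in length: {lengths}")
        return self
```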

View file

@@ -5,7 +5,7 @@
groups = ["default", "dev", "tests"]
strategy = ["cross_platform", "inherit_metadata"]
lock_version = "4.4.2"
content_hash = "sha256:8cb98f940354e71443df87fd1702300e1c793627b0a016a6233811a640f47c18"
content_hash = "sha256:6819a20ed9759b784908ad417ee371b941cfb1d9fbeebc8b29f25038f49ea544"
[[package]]
name = "annotated-types"
@@ -769,18 +769,21 @@ files = [
[[package]]
name = "linkml"
version = "1.7.10"
version = "0.0.0"
requires_python = "<4.0.0,>=3.8.1"
git = "https://github.com/sneakers-the-rat/linkml"
ref = "arrays-numpydantic"
revision = "a3cfb2b82d7519cf9c64d113250c1714db2b3f6e"
summary = "Linked Open Data Modeling Language"
groups = ["default", "dev"]
dependencies = [
"antlr4-python3-runtime<4.10,>=4.9.0",
"antlr4-python3-runtime<4.10,==4.*,>=4.9.0",
"click>=7.0",
"graphviz>=0.10.1",
"hbreader",
"isodate>=0.6.0",
"jinja2>=3.1.0",
"jsonasobj2<2.0.0,>=1.0.3",
"jsonasobj2==1.*,>=1.0.0,>=1.0.3",
"jsonschema[format]>=4.0.0",
"linkml-dataops",
"linkml-runtime>=1.7.4",
@@ -799,10 +802,6 @@ dependencies = [
"sqlalchemy>=1.4.31",
"watchdog>=0.9.0",
]
files = [
{file = "linkml-1.7.10-py3-none-any.whl", hash = "sha256:bf21cce814e9d1509489f1e6e15a7e86e4f11d949490d9a7a5c3f6b5b412ec62"},
{file = "linkml-1.7.10.tar.gz", hash = "sha256:1c38601c3cd495e34490b8cf7277fd3674ec68dcbe9f5efcec2658093801ce91"},
]
[[package]]
name = "linkml-dataops"
@@ -1085,6 +1084,22 @@ files = [
{file = "numpy-1.26.4.tar.gz", hash = "sha256:2a02aba9ed12e4ac4eb3ea9421c420301a0c6460d9830d74a9df87efa4912010"},
]
[[package]]
name = "numpydantic"
version = "1.2.1"
requires_python = "<4.0,>=3.9"
summary = "Type and shape validation and serialization for numpy arrays in pydantic models"
groups = ["default", "dev"]
dependencies = [
"nptyping>=2.5.0",
"numpy>=1.24.0",
"pydantic>=2.3.0",
]
files = [
{file = "numpydantic-1.2.1-py3-none-any.whl", hash = "sha256:e21d7e272410b3a2013d2a6aeec2ed6efd13ea171b0200e2029d7c2f1453def0"},
{file = "numpydantic-1.2.1.tar.gz", hash = "sha256:d8a3e7371d78b99fa4a4733a5b873046f064993431ae63f97edcf9bda4dd5c7f"},
]
[[package]]
name = "nwb-schema-language"
version = "0.1.3"

View file

@@ -23,6 +23,7 @@ dependencies = [
"blosc2>=2.2.7",
"tqdm>=4.66.1",
'typing-extensions>=4.12.2;python_version<"3.11"',
"numpydantic>=1.2.1",
]
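
numpydantic, newly added above, provides typed, shape-checked numpy arrays as pydantic fields (per its summary in the lockfile). A minimal sketch of its use, adapted from the library's documented `NDArray`/`Shape` API rather than from this diff:

```python
import numpy as np
from pydantic import BaseModel
from numpydantic import NDArray, Shape

class Image(BaseModel):
    # any width/height, but must be 2-D uint8; validated by pydantic
    array: NDArray[Shape["* x, * y"], np.uint8]

img = Image(array=np.zeros((128, 128), dtype=np.uint8))
```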
[project.urls]

View file

@@ -123,6 +123,7 @@ class ClassAdapter(Adapter):
            name_parts.append(self.cls.name)
            name = "__".join(name_parts)
        elif self.cls.neurodata_type_inc is not None:
            # again, this is against the schema, but is common
            name = self.cls.neurodata_type_inc
@@ -206,12 +207,20 @@ class ClassAdapter(Adapter):
        Returns:
        """
        if self.cls.name:
        if self.cls.name or self.cls.default_name:
            if self.cls.name:
                # name overrides default_name
                name = self.cls.name
                equals_string = name
            else:
                name = self.cls.default_name
                equals_string = None

            name_slot = SlotDefinition(
                name="name",
                required=True,
                ifabsent=f"string({self.cls.name})",
                equals_string=self.cls.name,
                ifabsent=f"string({name})",
                equals_string=equals_string,
                range="string",
                identifier=True,
            )
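
Illustrative results of the two branches above (assumed values; `start_time` and `timeseries` are names used elsewhere in this diff):

```python
from linkml_runtime.linkml_model.meta import SlotDefinition

# a dataset with a fixed name: the slot is pre-filled AND constrained
SlotDefinition(
    name="name", required=True, range="string", identifier=True,
    ifabsent="string(start_time)", equals_string="start_time",
)

# a dataset with only a default_name: pre-filled, but overridable
SlotDefinition(
    name="name", required=True, range="string", identifier=True,
    ifabsent="string(timeseries)", equals_string=None,
)
```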

View file

@@ -1,9 +1,9 @@
"""
Adapter for NWB datasets to linkml Classes
"""

import pdb
from abc import abstractmethod
from typing import Optional
from typing import Optional, Type

from linkml_runtime.linkml_model.meta import (
    ClassDefinition,
@@ -16,7 +16,7 @@ from nwb_linkml.adapters.classes import ClassAdapter
from nwb_linkml.maps import QUANTITY_MAP, Map
from nwb_linkml.maps.dtype import flat_to_linkml
from nwb_linkml.maps.naming import camel_to_snake
from nwb_schema_language import Dataset
from nwb_schema_language import Dataset, CompoundDtype
class DatasetMap(Map):

@@ -113,6 +113,7 @@ class MapScalar(DatasetMap):
            and not cls.attributes
            and not cls.dims
            and not cls.shape
            and not is_compound(cls)
            and cls.name
        )
@@ -228,7 +229,7 @@ class MapArraylike(DatasetMap):
        """
        Check if we're a plain array
        """
        return cls.name and all([cls.dims, cls.shape]) and not has_attrs(cls)
        return cls.name and all([cls.dims, cls.shape]) and not has_attrs(cls) and not is_compound(cls)

    @classmethod
    def apply(
@@ -259,15 +260,6 @@ class MapArrayLikeAttributes(DatasetMap):
    """
    The most general case - treat everything that isn't handled by one of the special cases
    as an array!

    Specifically, we make an ``Arraylike`` class such that:
    - Each slot within a subclass indicates a possible dimension.
    - Only dimensions that are present in all the dimension specifiers in the
      original schema are required.
    - Shape requirements are indicated using max/min cardinalities on the slot.
    - The arraylike object should be stored in the `array` slot on the containing class
      (since there are already properties named `data`)
    """

    NEEDS_NAME = True

@@ -282,6 +274,7 @@
            all([cls.dims, cls.shape])
            and cls.neurodata_type_inc != "VectorData"
            and has_attrs(cls)
            and not is_compound(cls)
            and (dtype == "AnyType" or dtype in flat_to_linkml)
        )
@@ -311,6 +304,22 @@ class Map1DVector(DatasetMap):
    """
    ``VectorData`` is subclassed with a name but without dims or attributes,
    treat this as a normal 1D array slot that replaces any class that would be built for this,
    e.g. all the datasets in epoch.TimeIntervals:

    .. code-block:: yaml

        groups:
        - neurodata_type_def: TimeIntervals
          neurodata_type_inc: DynamicTable
          doc: A container for aggregating epoch data and the TimeSeries that each epoch applies
            to.
          datasets:
          - name: start_time
            neurodata_type_inc: VectorData
            dtype: float32
            doc: Start time of epoch, in seconds.
    """
    @classmethod
@@ -323,6 +332,8 @@ class Map1DVector(DatasetMap):
            and not cls.dims
            and not cls.shape
            and not cls.attributes
            and not cls.neurodata_type_def
            and not is_compound(cls)
            and cls.name
        )
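
For reference, the `start_time` dataset in the docstring above would collapse to roughly this slot on the containing class (an assumed illustration, not output from the commit; the real builder may set more fields):

```python
from linkml_runtime.linkml_model.meta import SlotDefinition

SlotDefinition(
    name="start_time",
    description="Start time of epoch, in seconds.",
    range="float32",   # assuming the flat dtype maps straight through
    multivalued=True,  # a 1D vector rather than a scalar
)
```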
@@ -381,6 +392,72 @@ class MapNVectors(DatasetMap):
        res = BuildResult(slots=[this_slot])
        return res


class MapCompoundDtype(DatasetMap):
    """
    A ``dtype`` declared as an array of types that function effectively as a row in a table.

    We render them just as a class with each of the dtypes as slots - they are
    typically used by other datasets to create a table.

    E.g. ``base.TimeSeriesReferenceVectorData``:

    .. code-block:: yaml

        datasets:
        - neurodata_type_def: TimeSeriesReferenceVectorData
          neurodata_type_inc: VectorData
          default_name: timeseries
          dtype:
          - name: idx_start
            dtype: int32
            doc: Start index into the TimeSeries 'data' and 'timestamp' datasets of the referenced
              TimeSeries. The first dimension of those arrays is always time.
          - name: count
            dtype: int32
            doc: Number of data samples available in this time series, during this epoch
          - name: timeseries
            dtype:
              target_type: TimeSeries
              reftype: object
            doc: The TimeSeries that this index applies to
          doc: Column storing references to a TimeSeries (rows). For each TimeSeries this
            VectorData column stores the start_index and count to indicate the range in time
            to be selected as well as an object reference to the TimeSeries.
    """

    @classmethod
    def check(c, cls: Dataset) -> bool:
        """
        Check that we're a dataset with a compound dtype
        """
        return is_compound(cls)

    @classmethod
    def apply(
        c, cls: Dataset, res: Optional[BuildResult] = None, name: Optional[str] = None
    ) -> BuildResult:
        """
        Make a new class for this dtype, using its sub-dtypes as fields,
        and use it as the range for the parent class
        """
        slots = {}
        for a_dtype in cls.dtype:
            slots[a_dtype.name] = SlotDefinition(
                name=a_dtype.name,
                description=a_dtype.doc,
                range=ClassAdapter.handle_dtype(a_dtype.dtype),
                **QUANTITY_MAP[cls.quantity],
            )
        res.classes[0].attributes.update(slots)
        return res
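
To make the mapping concrete, here is roughly what `apply` would build for the `TimeSeriesReferenceVectorData` example above (an illustrative sketch, not output from the commit; the exact ranges depend on what `ClassAdapter.handle_dtype` and `QUANTITY_MAP` return):

```python
from linkml_runtime.linkml_model.meta import SlotDefinition

slots = {
    "idx_start": SlotDefinition(
        name="idx_start",
        description="Start index into the TimeSeries 'data' and 'timestamp' datasets...",
        range="int32",  # assuming handle_dtype passes flat dtypes through
    ),
    "count": SlotDefinition(
        name="count",
        description="Number of data samples available in this time series, during this epoch",
        range="int32",
    ),
    "timeseries": SlotDefinition(
        name="timeseries",
        description="The TimeSeries that this index applies to",
        range="TimeSeries",  # assuming object references resolve to their target_type
    ),
}
# these slots are then merged into the class built by build_base()
```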
class DatasetAdapter(ClassAdapter):
    """

@@ -395,6 +472,25 @@ class DatasetAdapter(ClassAdapter):
        """
        res = self.build_base()

        # find a map to use
        map = self.match()

        # apply matching maps
        if map is not None:
            res = map.apply(self.cls, res, self._get_full_name())

        return res

    def match(self) -> Optional[Type[DatasetMap]]:
        """
        Find the map class that applies to this class

        Returns:
            :class:`.DatasetMap`

        Raises:
            RuntimeError - if more than one map matches
        """
        # find a map to use
        matches = [m for m in DatasetMap.__subclasses__() if m.check(self.cls)]
@@ -403,91 +499,10 @@
        if len(matches) > 1:
            raise RuntimeError(
                "Only one map should apply to a dataset, you need to refactor the maps! Got maps:"
                f" {matches}"
            )
        elif len(matches) == 0:
            return None
        else:
            return matches[0]

        # apply matching maps
        for m in matches:
            res = m.apply(self.cls, res, self._get_full_name())

        return res


def make_array_range(cls: Dataset, name: Optional[str] = None) -> ClassDefinition:
    """
    Create a containing arraylike class

    This is likely deprecated so this docstring is a placeholder to satisfy the linter...
    """
    # The schema language doesn't have a way of specifying a dataset/group is "abstract"
    # and yet hdmf-common says you don't need a dtype if the dataset is "abstract"
    # so....
    dtype = ClassAdapter.handle_dtype(cls.dtype)

    # dims and shape are lists of lists. First we couple them
    # (so each dim has its corresponding shape)..
    # and then we take unique
    # (dicts are ordered by default in recent pythons,
    # while set() doesn't preserve order)
    dims_shape = []
    for inner_dim, inner_shape in zip(cls.dims, cls.shape):
        if isinstance(inner_dim, list):
            # list of lists
            dims_shape.extend([(dim, shape) for dim, shape in zip(inner_dim, inner_shape)])
        elif isinstance(inner_shape, list):
            # Some badly formatted schema will have the shape be a LoL but the dims won't be...
            dims_shape.extend([(inner_dim, shape) for shape in inner_shape])
        else:
            # single-layer list
            dims_shape.append((inner_dim, inner_shape))

    dims_shape = tuple(dict.fromkeys(dims_shape).keys())

    # --------------------------------------------------
    # SPECIAL CASE - allen institute's ndx-aibs-ecephys.extension
    # confuses "dims" with "shape", e.g. shape = [None], dims = [3].
    # So we hardcode that here...
    # --------------------------------------------------
    if len(dims_shape) == 1 and isinstance(dims_shape[0][0], int) and dims_shape[0][1] is None:
        dims_shape = (("dim", dims_shape[0][0]),)

    # now make slots for each of them
    slots = []
    for dims, shape in dims_shape:
        # if there is just a single list of possible dimensions, it's required
        if not any([isinstance(inner_dim, list) for inner_dim in cls.dims]) or all(
            [dims in inner_dim for inner_dim in cls.dims]
        ):
            required = True
        else:
            required = False

        # use cardinality to do shape
        cardinality = None if shape == "null" else shape
        slots.append(
            SlotDefinition(
                name=dims,
                required=required,
                maximum_cardinality=cardinality,
                minimum_cardinality=cardinality,
                range=dtype,
            )
        )

    # and then the class is just a subclass of `Arraylike`
    # (which is imported by default from `nwb.language.yaml`)
    if name:
        pass
    elif cls.neurodata_type_def:
        name = cls.neurodata_type_def
    elif cls.name:
        name = cls.name
    else:
        raise ValueError("Dataset has no name or type definition, what do we call it?")

    name = "__".join([name, "Arraylike"])
    array_class = ClassDefinition(name=name, is_a="Arraylike", attributes=slots)
    return array_class
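
A hypothetical use of the new `match` dispatch; the `Dataset`/`CompoundDtype` constructor fields and the `cls=` keyword are assumptions based on the surrounding code, not verified against the commit:

```python
from nwb_linkml.adapters.dataset import DatasetAdapter, MapCompoundDtype
from nwb_schema_language import Dataset, CompoundDtype

dset = Dataset(
    neurodata_type_def="TimeSeriesReferenceVectorData",
    neurodata_type_inc="VectorData",
    doc="Column storing references to a TimeSeries",
    dtype=[
        CompoundDtype(name="idx_start", dtype="int32", doc="start index"),
        CompoundDtype(name="count", dtype="int32", doc="number of samples"),
    ],
)

# exactly one DatasetMap subclass should claim the dataset
assert DatasetAdapter(cls=dset).match() is MapCompoundDtype
```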
def is_1d(cls: Dataset) -> bool:

@@ -502,6 +517,8 @@ def is_1d(cls: Dataset) -> bool:
        and len(cls.dims[0]) == 1
    )


def is_compound(cls: Dataset) -> bool:
    """Check whether a dataset declares a compound (row-like) dtype"""
    return (
        isinstance(cls.dtype, list)
        and len(cls.dtype) > 0
        and isinstance(cls.dtype[0], CompoundDtype)
    )
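
A quick illustration of the predicate (a sketch; the constructor fields are assumed from their use elsewhere in this diff):

```python
from nwb_schema_language import Dataset, CompoundDtype

plain = Dataset(name="start_time", doc="Start time of epoch", dtype="float32")
compound = Dataset(
    name="timeseries",
    doc="Column storing references to a TimeSeries",
    dtype=[CompoundDtype(name="idx_start", dtype="int32", doc="start index")],
)

assert not is_compound(plain)  # flat dtype: handled by the other maps
assert is_compound(compound)   # list of CompoundDtype: MapCompoundDtype applies
```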
def has_attrs(cls: Dataset) -> bool:
"""

View file

@@ -34,20 +34,6 @@ for nwbtype, linkmltype in flat_to_linkml.items():
    atype = TypeDefinition(name=nwbtype, minimum_value=amin, typeof=linkmltype)
    DTypeTypes.append(atype)

Arraylike = ClassDefinition(
    name="Arraylike",
    description=(
        "Container for arraylike information held in the dims, shape, and dtype properties."
        "this is a special case to be interpreted by downstream i/o. this class has no slots"
        "and is abstract by default."
        "- Each slot within a subclass indicates a possible dimension."
        "- Only dimensions that are present in all the dimension specifiers in the"
        " original schema are required."
        "- Shape requirements are indicated using max/min cardinalities on the slot."
    ),
    abstract=True,
)

AnyType = ClassDefinition(
    name="AnyType",
    class_uri="linkml:Any",

@@ -60,7 +46,7 @@ NwbLangSchema = SchemaDefinition(
    id="nwb.language",
    description="Adapter objects to mimic the behavior of elements in the nwb-schema-language",
    enums=[FlatDType],
    classes=[Arraylike, AnyType],
    classes=[AnyType],
    types=DTypeTypes,
    imports=["linkml:types"],
    prefixes={"linkml": Prefix("linkml", "https://w3id.org/linkml")},

View file

@@ -49,7 +49,7 @@ NWB_CORE_REPO = NamespaceRepo(
    name="core",
    repository="https://github.com/NeurodataWithoutBorders/nwb-schema",
    path=Path("core/nwb.namespace.yaml"),
    versions=["2.2.0", "2.2.1", "2.2.2", "2.2.4", "2.2.5", "2.3.0", "2.4.0", "2.5.0", "2.6.0"],
    versions=["2.2.0", "2.2.1", "2.2.2", "2.2.4", "2.2.5", "2.3.0", "2.4.0", "2.5.0", "2.6.0", "2.7.0"],
)
HDMF_COMMON_REPO = NamespaceRepo(

View file

@@ -67,7 +67,7 @@ def tmp_output_dir_mod(tmp_output_dir) -> Path:
    return subpath


@pytest.fixture(scope="session", params=[{"core_version": "2.6.0", "hdmf_version": "1.5.0"}])
@pytest.fixture(scope="session", params=[{"core_version": "2.7.0", "hdmf_version": "1.8.0"}])
def nwb_core_fixture(request) -> NamespacesAdapter:
    nwb_core = io.load_nwb_core(**request.param)
    nwb_core.populate_imports()

View file

@@ -1,6 +1,9 @@
from nwb_linkml.adapters.dataset import (
    MapScalar,
    DatasetAdapter,
)
from nwb_linkml.adapters import NamespacesAdapter
from nwb_schema_language import Dataset