mirror of
https://github.com/p2p-ld/nwb-linkml.git
synced 2025-01-09 13:44:27 +00:00
handle compound dtype
This commit is contained in:
parent
c9a2423a9d
commit
ec81032ae8
9 changed files with 164 additions and 125 deletions
|
@ -1,5 +1,13 @@
|
|||
# TODO
|
||||
|
||||
## v0.2 - update to linkml-arrays and formal release
|
||||
|
||||
NWB schema translation
|
||||
- handle `links` field in groups
|
||||
- handle compound `dtype` like in ophys.PlaneSegmentation.pixel_mask
|
||||
- handle compound `dtype` like in TimeSeriesReferenceVectorData
|
||||
- Create a validator that checks if all the lists in a compound dtype dataset are same length
|
||||
|
||||
Important things that are not implemented yet!
|
||||
|
||||
- {meth}`nwb_linkml.adapters.classes.ClassAdapter.handle_dtype` does not yet handle compound dtypes,
|
||||
|
|
|
@ -5,7 +5,7 @@
|
|||
groups = ["default", "dev", "tests"]
|
||||
strategy = ["cross_platform", "inherit_metadata"]
|
||||
lock_version = "4.4.2"
|
||||
content_hash = "sha256:8cb98f940354e71443df87fd1702300e1c793627b0a016a6233811a640f47c18"
|
||||
content_hash = "sha256:6819a20ed9759b784908ad417ee371b941cfb1d9fbeebc8b29f25038f49ea544"
|
||||
|
||||
[[package]]
|
||||
name = "annotated-types"
|
||||
|
@ -769,18 +769,21 @@ files = [
|
|||
|
||||
[[package]]
|
||||
name = "linkml"
|
||||
version = "1.7.10"
|
||||
version = "0.0.0"
|
||||
requires_python = "<4.0.0,>=3.8.1"
|
||||
git = "https://github.com/sneakers-the-rat/linkml"
|
||||
ref = "arrays-numpydantic"
|
||||
revision = "a3cfb2b82d7519cf9c64d113250c1714db2b3f6e"
|
||||
summary = "Linked Open Data Modeling Language"
|
||||
groups = ["default", "dev"]
|
||||
dependencies = [
|
||||
"antlr4-python3-runtime<4.10,>=4.9.0",
|
||||
"antlr4-python3-runtime<4.10,==4.*,>=4.9.0",
|
||||
"click>=7.0",
|
||||
"graphviz>=0.10.1",
|
||||
"hbreader",
|
||||
"isodate>=0.6.0",
|
||||
"jinja2>=3.1.0",
|
||||
"jsonasobj2<2.0.0,>=1.0.3",
|
||||
"jsonasobj2==1.*,>=1.0.0,>=1.0.3",
|
||||
"jsonschema[format]>=4.0.0",
|
||||
"linkml-dataops",
|
||||
"linkml-runtime>=1.7.4",
|
||||
|
@ -799,10 +802,6 @@ dependencies = [
|
|||
"sqlalchemy>=1.4.31",
|
||||
"watchdog>=0.9.0",
|
||||
]
|
||||
files = [
|
||||
{file = "linkml-1.7.10-py3-none-any.whl", hash = "sha256:bf21cce814e9d1509489f1e6e15a7e86e4f11d949490d9a7a5c3f6b5b412ec62"},
|
||||
{file = "linkml-1.7.10.tar.gz", hash = "sha256:1c38601c3cd495e34490b8cf7277fd3674ec68dcbe9f5efcec2658093801ce91"},
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "linkml-dataops"
|
||||
|
@ -1085,6 +1084,22 @@ files = [
|
|||
{file = "numpy-1.26.4.tar.gz", hash = "sha256:2a02aba9ed12e4ac4eb3ea9421c420301a0c6460d9830d74a9df87efa4912010"},
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "numpydantic"
|
||||
version = "1.2.1"
|
||||
requires_python = "<4.0,>=3.9"
|
||||
summary = "Type and shape validation and serialization for numpy arrays in pydantic models"
|
||||
groups = ["default", "dev"]
|
||||
dependencies = [
|
||||
"nptyping>=2.5.0",
|
||||
"numpy>=1.24.0",
|
||||
"pydantic>=2.3.0",
|
||||
]
|
||||
files = [
|
||||
{file = "numpydantic-1.2.1-py3-none-any.whl", hash = "sha256:e21d7e272410b3a2013d2a6aeec2ed6efd13ea171b0200e2029d7c2f1453def0"},
|
||||
{file = "numpydantic-1.2.1.tar.gz", hash = "sha256:d8a3e7371d78b99fa4a4733a5b873046f064993431ae63f97edcf9bda4dd5c7f"},
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "nwb-schema-language"
|
||||
version = "0.1.3"
|
||||
|
|
|
@ -23,6 +23,7 @@ dependencies = [
|
|||
"blosc2>=2.2.7",
|
||||
"tqdm>=4.66.1",
|
||||
'typing-extensions>=4.12.2;python_version<"3.11"',
|
||||
"numpydantic>=1.2.1",
|
||||
]
|
||||
|
||||
[project.urls]
|
||||
|
|
|
@ -123,6 +123,7 @@ class ClassAdapter(Adapter):
|
|||
|
||||
name_parts.append(self.cls.name)
|
||||
name = "__".join(name_parts)
|
||||
|
||||
elif self.cls.neurodata_type_inc is not None:
|
||||
# again, this is against the schema, but is common
|
||||
name = self.cls.neurodata_type_inc
|
||||
|
@ -206,12 +207,20 @@ class ClassAdapter(Adapter):
|
|||
Returns:
|
||||
|
||||
"""
|
||||
if self.cls.name:
|
||||
if self.cls.name or self.cls.default_name:
|
||||
if self.cls.name:
|
||||
# name overrides default_name
|
||||
name = self.cls.name
|
||||
equals_string = name
|
||||
else:
|
||||
name = self.cls.default_name
|
||||
equals_string = None
|
||||
|
||||
name_slot = SlotDefinition(
|
||||
name="name",
|
||||
required=True,
|
||||
ifabsent=f"string({self.cls.name})",
|
||||
equals_string=self.cls.name,
|
||||
ifabsent=f"string({name})",
|
||||
equals_string=equals_string,
|
||||
range="string",
|
||||
identifier=True,
|
||||
)
|
||||
|
|
|
@ -1,9 +1,9 @@
|
|||
"""
|
||||
Adapter for NWB datasets to linkml Classes
|
||||
"""
|
||||
|
||||
import pdb
|
||||
from abc import abstractmethod
|
||||
from typing import Optional
|
||||
from typing import Optional, Type
|
||||
|
||||
from linkml_runtime.linkml_model.meta import (
|
||||
ClassDefinition,
|
||||
|
@ -16,7 +16,7 @@ from nwb_linkml.adapters.classes import ClassAdapter
|
|||
from nwb_linkml.maps import QUANTITY_MAP, Map
|
||||
from nwb_linkml.maps.dtype import flat_to_linkml
|
||||
from nwb_linkml.maps.naming import camel_to_snake
|
||||
from nwb_schema_language import Dataset
|
||||
from nwb_schema_language import Dataset, CompoundDtype
|
||||
|
||||
|
||||
class DatasetMap(Map):
|
||||
|
@ -113,6 +113,7 @@ class MapScalar(DatasetMap):
|
|||
and not cls.attributes
|
||||
and not cls.dims
|
||||
and not cls.shape
|
||||
and not is_compound(cls)
|
||||
and cls.name
|
||||
)
|
||||
|
||||
|
@ -228,7 +229,7 @@ class MapArraylike(DatasetMap):
|
|||
"""
|
||||
Check if we're a plain array
|
||||
"""
|
||||
return cls.name and all([cls.dims, cls.shape]) and not has_attrs(cls)
|
||||
return cls.name and all([cls.dims, cls.shape]) and not has_attrs(cls) and not is_compound(cls)
|
||||
|
||||
@classmethod
|
||||
def apply(
|
||||
|
@ -259,15 +260,6 @@ class MapArrayLikeAttributes(DatasetMap):
|
|||
"""
|
||||
The most general case - treat everything that isn't handled by one of the special cases
|
||||
as an array!
|
||||
|
||||
Specifically, we make an ``Arraylike`` class such that:
|
||||
|
||||
- Each slot within a subclass indicates a possible dimension.
|
||||
- Only dimensions that are present in all the dimension specifiers in the
|
||||
original schema are required.
|
||||
- Shape requirements are indicated using max/min cardinalities on the slot.
|
||||
- The arraylike object should be stored in the `array` slot on the containing class
|
||||
(since there are already properties named `data`)
|
||||
"""
|
||||
|
||||
NEEDS_NAME = True
|
||||
|
@ -282,6 +274,7 @@ class MapArrayLikeAttributes(DatasetMap):
|
|||
all([cls.dims, cls.shape])
|
||||
and cls.neurodata_type_inc != "VectorData"
|
||||
and has_attrs(cls)
|
||||
and not is_compound(cls)
|
||||
and (dtype == "AnyType" or dtype in flat_to_linkml)
|
||||
)
|
||||
|
||||
|
@ -311,6 +304,22 @@ class Map1DVector(DatasetMap):
|
|||
"""
|
||||
``VectorData`` is subclassed with a name but without dims or attributes,
|
||||
treat this as a normal 1D array slot that replaces any class that would be built for this
|
||||
|
||||
eg. all the datasets in epoch.TimeIntervals:
|
||||
|
||||
.. code-block:: yaml
|
||||
|
||||
groups:
|
||||
- neurodata_type_def: TimeIntervals
|
||||
neurodata_type_inc: DynamicTable
|
||||
doc: A container for aggregating epoch data and the TimeSeries that each epoch applies
|
||||
to.
|
||||
datasets:
|
||||
- name: start_time
|
||||
neurodata_type_inc: VectorData
|
||||
dtype: float32
|
||||
doc: Start time of epoch, in seconds.
|
||||
|
||||
"""
|
||||
|
||||
@classmethod
|
||||
|
@ -323,6 +332,8 @@ class Map1DVector(DatasetMap):
|
|||
and not cls.dims
|
||||
and not cls.shape
|
||||
and not cls.attributes
|
||||
and not cls.neurodata_type_def
|
||||
and not is_compound(cls)
|
||||
and cls.name
|
||||
)
|
||||
|
||||
|
@ -381,6 +392,72 @@ class MapNVectors(DatasetMap):
|
|||
res = BuildResult(slots=[this_slot])
|
||||
return res
|
||||
|
||||
class MapCompoundDtype(DatasetMap):
|
||||
"""
|
||||
A ``dtype`` declared as an array of types that function effectively as a row in a table.
|
||||
|
||||
We render them just as a class with each of the dtypes as slots - they are
|
||||
typically used by other datasets to create a table.
|
||||
|
||||
Eg. ``base.TimeSeriesReferenceVectorData``
|
||||
|
||||
.. code-block:: yaml
|
||||
|
||||
datasets:
|
||||
- neurodata_type_def: TimeSeriesReferenceVectorData
|
||||
neurodata_type_inc: VectorData
|
||||
default_name: timeseries
|
||||
dtype:
|
||||
- name: idx_start
|
||||
dtype: int32
|
||||
doc: Start index into the TimeSeries 'data' and 'timestamp' datasets of the referenced
|
||||
TimeSeries. The first dimension of those arrays is always time.
|
||||
- name: count
|
||||
dtype: int32
|
||||
doc: Number of data samples available in this time series, during this epoch
|
||||
- name: timeseries
|
||||
dtype:
|
||||
target_type: TimeSeries
|
||||
reftype: object
|
||||
doc: The TimeSeries that this index applies to
|
||||
doc: Column storing references to a TimeSeries (rows). For each TimeSeries this
|
||||
VectorData column stores the start_index and count to indicate the range in time
|
||||
to be selected as well as an object reference to the TimeSeries.
|
||||
|
||||
"""
|
||||
|
||||
@classmethod
|
||||
def check(c, cls: Dataset) -> bool:
|
||||
"""
|
||||
Check that we're a dataset with a compound dtype
|
||||
"""
|
||||
return is_compound(cls)
|
||||
|
||||
@classmethod
|
||||
def apply(
|
||||
c, cls: Dataset, res: Optional[BuildResult] = None, name: Optional[str] = None
|
||||
) -> BuildResult:
|
||||
"""
|
||||
Make a new class for this dtype, using its sub-dtypes as fields,
|
||||
and use it as the range for the parent class
|
||||
"""
|
||||
slots = {}
|
||||
for a_dtype in cls.dtype:
|
||||
slots[a_dtype.name] = SlotDefinition(
|
||||
name=a_dtype.name,
|
||||
description=a_dtype.doc,
|
||||
range=ClassAdapter.handle_dtype(a_dtype.dtype),
|
||||
**QUANTITY_MAP[cls.quantity]
|
||||
)
|
||||
res.classes[0].attributes.update(slots)
|
||||
return res
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
class DatasetAdapter(ClassAdapter):
|
||||
"""
|
||||
|
@ -395,6 +472,25 @@ class DatasetAdapter(ClassAdapter):
|
|||
"""
|
||||
res = self.build_base()
|
||||
|
||||
# find a map to use
|
||||
map = self.match()
|
||||
|
||||
# apply matching maps
|
||||
if map is not None:
|
||||
res = map.apply(self.cls, res, self._get_full_name())
|
||||
|
||||
return res
|
||||
|
||||
def match(self) -> Optional[Type[DatasetMap]]:
|
||||
"""
|
||||
Find the map class that applies to this class
|
||||
|
||||
Returns:
|
||||
:class:`.DatasetMap`
|
||||
|
||||
Raises:
|
||||
RuntimeError - if more than one map matches
|
||||
"""
|
||||
# find a map to use
|
||||
matches = [m for m in DatasetMap.__subclasses__() if m.check(self.cls)]
|
||||
|
||||
|
@ -403,91 +499,10 @@ class DatasetAdapter(ClassAdapter):
|
|||
"Only one map should apply to a dataset, you need to refactor the maps! Got maps:"
|
||||
f" {matches}"
|
||||
)
|
||||
|
||||
# apply matching maps
|
||||
for m in matches:
|
||||
res = m.apply(self.cls, res, self._get_full_name())
|
||||
|
||||
return res
|
||||
|
||||
|
||||
def make_array_range(cls: Dataset, name: Optional[str] = None) -> ClassDefinition:
|
||||
"""
|
||||
Create a containing arraylike class
|
||||
|
||||
This is likely deprecated so this docstring is a placeholder to satisfy the linter...
|
||||
"""
|
||||
# The schema language doesn't have a way of specifying a dataset/group is "abstract"
|
||||
# and yet hdmf-common says you don't need a dtype if the dataset is "abstract"
|
||||
# so....
|
||||
dtype = ClassAdapter.handle_dtype(cls.dtype)
|
||||
|
||||
# dims and shape are lists of lists. First we couple them
|
||||
# (so each dim has its corresponding shape)..
|
||||
# and then we take unique
|
||||
# (dicts are ordered by default in recent pythons,
|
||||
# while set() doesn't preserve order)
|
||||
dims_shape = []
|
||||
for inner_dim, inner_shape in zip(cls.dims, cls.shape):
|
||||
if isinstance(inner_dim, list):
|
||||
# list of lists
|
||||
dims_shape.extend([(dim, shape) for dim, shape in zip(inner_dim, inner_shape)])
|
||||
elif isinstance(inner_shape, list):
|
||||
# Some badly formatted schema will have the shape be a LoL but the dims won't be...
|
||||
dims_shape.extend([(inner_dim, shape) for shape in inner_shape])
|
||||
elif len(matches) == 0:
|
||||
return None
|
||||
else:
|
||||
# single-layer list
|
||||
dims_shape.append((inner_dim, inner_shape))
|
||||
|
||||
dims_shape = tuple(dict.fromkeys(dims_shape).keys())
|
||||
|
||||
# --------------------------------------------------
|
||||
# SPECIAL CASE - allen institute's ndx-aibs-ecephys.extension
|
||||
# confuses "dims" with "shape" , eg shape = [None], dims = [3].
|
||||
# So we hardcode that here...
|
||||
# --------------------------------------------------
|
||||
if len(dims_shape) == 1 and isinstance(dims_shape[0][0], int) and dims_shape[0][1] is None:
|
||||
dims_shape = (("dim", dims_shape[0][0]),)
|
||||
|
||||
# now make slots for each of them
|
||||
slots = []
|
||||
for dims, shape in dims_shape:
|
||||
# if there is just a single list of possible dimensions, it's required
|
||||
if not any([isinstance(inner_dim, list) for inner_dim in cls.dims]) or all(
|
||||
[dims in inner_dim for inner_dim in cls.dims]
|
||||
):
|
||||
required = True
|
||||
else:
|
||||
required = False
|
||||
|
||||
# use cardinality to do shape
|
||||
cardinality = None if shape == "null" else shape
|
||||
|
||||
slots.append(
|
||||
SlotDefinition(
|
||||
name=dims,
|
||||
required=required,
|
||||
maximum_cardinality=cardinality,
|
||||
minimum_cardinality=cardinality,
|
||||
range=dtype,
|
||||
)
|
||||
)
|
||||
|
||||
# and then the class is just a subclass of `Arraylist`
|
||||
# (which is imported by default from `nwb.language.yaml`)
|
||||
if name:
|
||||
pass
|
||||
elif cls.neurodata_type_def:
|
||||
name = cls.neurodata_type_def
|
||||
elif cls.name:
|
||||
name = cls.name
|
||||
else:
|
||||
raise ValueError("Dataset has no name or type definition, what do call it?")
|
||||
|
||||
name = "__".join([name, "Arraylike"])
|
||||
|
||||
array_class = ClassDefinition(name=name, is_a="Arraylike", attributes=slots)
|
||||
return array_class
|
||||
return matches[0]
|
||||
|
||||
|
||||
def is_1d(cls: Dataset) -> bool:
|
||||
|
@ -502,6 +517,8 @@ def is_1d(cls: Dataset) -> bool:
|
|||
and len(cls.dims[0]) == 1
|
||||
)
|
||||
|
||||
def is_compound(cls: Dataset) -> bool:
|
||||
return isinstance(cls.dtype, list) and len(cls.dtype)>0 and isinstance(cls.dtype[0], CompoundDtype)
|
||||
|
||||
def has_attrs(cls: Dataset) -> bool:
|
||||
"""
|
||||
|
|
|
@ -34,20 +34,6 @@ for nwbtype, linkmltype in flat_to_linkml.items():
|
|||
atype = TypeDefinition(name=nwbtype, minimum_value=amin, typeof=linkmltype)
|
||||
DTypeTypes.append(atype)
|
||||
|
||||
Arraylike = ClassDefinition(
|
||||
name="Arraylike",
|
||||
description=(
|
||||
"Container for arraylike information held in the dims, shape, and dtype properties."
|
||||
"this is a special case to be interpreted by downstream i/o. this class has no slots"
|
||||
"and is abstract by default."
|
||||
"- Each slot within a subclass indicates a possible dimension."
|
||||
"- Only dimensions that are present in all the dimension specifiers in the"
|
||||
" original schema are required."
|
||||
"- Shape requirements are indicated using max/min cardinalities on the slot."
|
||||
),
|
||||
abstract=True,
|
||||
)
|
||||
|
||||
AnyType = ClassDefinition(
|
||||
name="AnyType",
|
||||
class_uri="linkml:Any",
|
||||
|
@ -60,7 +46,7 @@ NwbLangSchema = SchemaDefinition(
|
|||
id="nwb.language",
|
||||
description="Adapter objects to mimic the behavior of elements in the nwb-schema-language",
|
||||
enums=[FlatDType],
|
||||
classes=[Arraylike, AnyType],
|
||||
classes=[AnyType],
|
||||
types=DTypeTypes,
|
||||
imports=["linkml:types"],
|
||||
prefixes={"linkml": Prefix("linkml", "https://w3id.org/linkml")},
|
||||
|
|
|
@ -49,7 +49,7 @@ NWB_CORE_REPO = NamespaceRepo(
|
|||
name="core",
|
||||
repository="https://github.com/NeurodataWithoutBorders/nwb-schema",
|
||||
path=Path("core/nwb.namespace.yaml"),
|
||||
versions=["2.2.0", "2.2.1", "2.2.2", "2.2.4", "2.2.5", "2.3.0", "2.4.0", "2.5.0", "2.6.0"],
|
||||
versions=["2.2.0", "2.2.1", "2.2.2", "2.2.4", "2.2.5", "2.3.0", "2.4.0", "2.5.0", "2.6.0", "2.7.0"],
|
||||
)
|
||||
|
||||
HDMF_COMMON_REPO = NamespaceRepo(
|
||||
|
|
|
@ -67,7 +67,7 @@ def tmp_output_dir_mod(tmp_output_dir) -> Path:
|
|||
return subpath
|
||||
|
||||
|
||||
@pytest.fixture(scope="session", params=[{"core_version": "2.6.0", "hdmf_version": "1.5.0"}])
|
||||
@pytest.fixture(scope="session", params=[{"core_version": "2.7.0", "hdmf_version": "1.8.0"}])
|
||||
def nwb_core_fixture(request) -> NamespacesAdapter:
|
||||
nwb_core = io.load_nwb_core(**request.param)
|
||||
nwb_core.populate_imports()
|
||||
|
|
|
@ -1,6 +1,9 @@
|
|||
from nwb_linkml.adapters.dataset import (
|
||||
MapScalar,
|
||||
DatasetAdapter
|
||||
)
|
||||
from nwb_linkml.adapters import NamespacesAdapter
|
||||
|
||||
from nwb_schema_language import Dataset
|
||||
|
||||
|
||||
|
|
Loading…
Reference in a new issue