# nwb-linkml/nwb_linkml/adapters/dataset.py
"""
Adapter for NWB datasets to linkml Classes
"""
from typing import Optional, List
from linkml_runtime.linkml_model import ClassDefinition, SlotDefinition
from pydantic import PrivateAttr
from nwb_schema_language import Dataset, ReferenceDtype, CompoundDtype, DTypeType
from nwb_linkml.adapters.classes import ClassAdapter
from nwb_linkml.adapters.adapter import BuildResult
from nwb_linkml.maps import QUANTITY_MAP
class DatasetAdapter(ClassAdapter):
cls: Dataset
_handlers: List[str] = PrivateAttr(default_factory=list)
"""Keep track of which handlers have been called"""
    def build(self) -> BuildResult:
        res = self.build_base()

        # Each handler matches one pattern of name/dims/shape/attributes,
        # so at most one of them should fire for a given dataset
        res = self.handle_arraylike(res, self.cls, self._get_full_name())
        res = self.handle_1d_vector(res)
        res = self.handle_listlike(res)
        res = self.handle_scalar(res)

        if len(self._handlers) > 1:
            raise RuntimeError(
                f"Only one handler should have been triggered, instead triggered {self._handlers}"
            )

        return res
    def handle_scalar(self, res: BuildResult) -> BuildResult:
        # Simplify datasets that are just a single value
        if self.cls.neurodata_type_inc != 'VectorData' and \
                not self.cls.neurodata_type_inc and \
                not self.cls.attributes and \
                not self.cls.dims and \
                not self.cls.shape and \
                self.cls.name:
            self._handlers.append('scalar')

            # throw out the class that would have been made for us
            # we just need a slot
            this_slot = SlotDefinition(
                name=self.cls.name,
                description=self.cls.doc,
                range=self.handle_dtype(self.cls.dtype),
                **QUANTITY_MAP[self.cls.quantity]
            )
            res = BuildResult(slots=[this_slot])

        return res
    def handle_1d_vector(self, res: BuildResult) -> BuildResult:
        # handle the special case where `VectorData` is subclassed without any dims or attributes,
        # which just gets instantiated as a 1-d array in HDF5
        if self.cls.neurodata_type_inc == 'VectorData' and \
                not self.cls.dims and \
                not self.cls.shape and \
                not self.cls.attributes and \
                self.cls.name:
            self._handlers.append('1d_vector')

            this_slot = SlotDefinition(
                name=self.cls.name,
                description=self.cls.doc,
                range=self.handle_dtype(self.cls.dtype),
                multivalued=True
            )
            # No need to make a class for us, so we replace the existing build results
            res = BuildResult(slots=[this_slot])

        return res
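    # For example, a hypothetical table column (field values invented):
    #
    #     datasets:
    #     - name: group_name
    #       neurodata_type_inc: VectorData
    #       dtype: text
    #       doc: the name of the group this entry applies to
    #
    # would become a multivalued `group_name` slot instead of a subclass of
    # VectorData.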
    def handle_listlike(self, res: BuildResult) -> BuildResult:
        """
        Handle cases where the dataset is just a list of a specific type.

        Examples:

            datasets:
            - name: file_create_date
              dtype: isodatetime
              dims:
              - num_modifications
              shape:
              - null
        """
        # guard against unset dims so the comprehensions below don't iterate over None
        if self.cls.name and self.cls.dims and (
            (
                # single-layer list
                not any([isinstance(dim, list) for dim in self.cls.dims]) and
                len(self.cls.dims) == 1
            ) or (
                # nested list
                all([isinstance(dim, list) for dim in self.cls.dims]) and
                len(self.cls.dims) == 1 and
                len(self.cls.dims[0]) == 1
            )
        ):
            res = BuildResult(
                slots=[
                    SlotDefinition(
                        name=self.cls.name,
                        multivalued=True,
                        range=self.handle_dtype(self.cls.dtype),
                        description=self.cls.doc,
                        required=self.cls.quantity not in ('*', '?')
                    )
                ]
            )
        return res
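    # The `file_create_date` example in the docstring above builds to roughly:
    #
    #     SlotDefinition(name='file_create_date', multivalued=True,
    #                    required=True, range=...)  # range mapped by handle_dtype
    #
    # i.e. a plain list-valued slot, with no extra class emitted.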
    def handle_arraylike(self, res: BuildResult, dataset: Dataset, name: Optional[str] = None) -> BuildResult:
        """
        Handle the

        - dims
        - shape
        - dtype

        fields as they are used in datasets. We'll use the :class:`.Arraylike` class to imitate them.

        Specifically:

        - Each slot within a subclass indicates a possible dimension.
        - Only dimensions that are present in all the dimension specifiers in the
          original schema are required.
        - Shape requirements are indicated using max/min cardinalities on the slot.
        - The arraylike object should be stored in the `array` slot on the containing class
          (since there are already properties named `data`)

        If both `dims` and `shape` are undefined, the build result is returned unchanged.

        Args:
            res (:class:`.BuildResult`): The build result to modify
            dataset (:class:`nwb_schema_language.Dataset`): The dataset defining the arraylike
            name (str): If present, override the name of the class before appending `__Array`
                (we don't use `_get_full_name` here because we want to eventually decouple these
                functions from this adapter class, which is sort of a development crutch. Ideally
                all these methods would just work on base nwb schema language types.)
        """
        if not any((dataset.dims, dataset.shape)):
            # none of the required properties are defined, that's fine.
            return res
        elif not all((dataset.dims, dataset.shape)):
            # need to have both if one is present!
            raise ValueError("A dataset needs both dims and shape to define an arraylike object")
        # Special cases
        if dataset.neurodata_type_inc == 'VectorData':
            # Handled in `handle_1d_vector` instead
            return res

        # The schema language doesn't have a way of specifying a dataset/group is "abstract",
        # and yet hdmf-common says you don't need a dtype if the dataset is "abstract"
        # so....
        dtype = self.handle_dtype(dataset.dtype)

        # dims and shape are lists of lists. First we couple them
        # (so each dim has its corresponding shape)
        # and then we take unique
        # (dicts are ordered by default in recent pythons,
        # while set() doesn't preserve order)
        dims_shape = []
        for inner_dim, inner_shape in zip(dataset.dims, dataset.shape):
            if isinstance(inner_dim, list):
                # list of lists
                dims_shape.extend([(dim, shape) for dim, shape in zip(inner_dim, inner_shape)])
            else:
                # single-layer list
                dims_shape.append((inner_dim, inner_shape))
        dims_shape = tuple(dict.fromkeys(dims_shape).keys())
        # e.g. dims [['x'], ['x', 'y']] with shape [[None], [None, None]]
        # couples and dedupes to (('x', None), ('y', None))

        # if we only have one possible dimension, it's equivalent to a list, so we just return the slot
        if len(dims_shape) == 1 and self.parent:
            quantity = QUANTITY_MAP[dataset.quantity]
            slot = SlotDefinition(
                name=dataset.name,
                range=dtype,
                description=dataset.doc,
                required=quantity['required'],
                multivalued=True
            )
            res.classes[0].attributes.update({dataset.name: slot})
            self._handlers.append('arraylike-1d')
            return res
        # now make slots for each of them
        slots = []
        for dims, shape in dims_shape:
            # if a dim is present in all possible combinations of dims, make it required
            if all([dims in inner_dim for inner_dim in dataset.dims]):
                required = True
            # or if there is just a single list of possible dimensions
            elif not any([isinstance(inner_dim, list) for inner_dim in dataset.dims]):
                required = True
            else:
                required = False

            # use cardinality to do shape; 'null' (or None) means "any length"
            cardinality = None if shape in ('null', None) else shape

            slots.append(SlotDefinition(
                name=dims,
                required=required,
                maximum_cardinality=cardinality,
                minimum_cardinality=cardinality,
                range=dtype
            ))

        # and then the class is just a subclass of `Arraylike`
        # (which is imported by default from `nwb.language.yaml`)
        if name:
            pass
        elif dataset.neurodata_type_def:
            name = dataset.neurodata_type_def
        elif dataset.name:
            name = dataset.name
        else:
            raise ValueError("Dataset has no name or type definition, what do we call it?")
        name = '__'.join([name, 'Array'])

        array_class = ClassDefinition(
            name=name,
            is_a="Arraylike",
            attributes=slots
        )

        # make a slot for the arraylike class
        array_slot = SlotDefinition(
            name='array',
            range=array_class.name
        )

        res.classes.append(array_class)
        res.classes[0].attributes.update({'array': array_slot})
        self._handlers.append('arraylike')

        return res
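

if __name__ == '__main__':
    # Hypothetical smoke test, not part of the adapter API: all field values
    # below are invented for illustration.
    example = Dataset(
        neurodata_type_def='ExampleImage',
        doc='A 2-d example image',
        dtype='float32',
        dims=['x', 'y'],
        shape=[None, None],  # None is the schema language's null: any length
    )
    # should take the `arraylike` path and emit an `ExampleImage__Array` class
    print(DatasetAdapter(cls=example).build())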