# nwb_linkml/nwb_linkml/adapters/classes.py
"""
Adapters to linkML classes
"""
import pdb
from typing import List, Optional
from nwb_schema_language import Dataset, Group, ReferenceDtype, CompoundDtype, DTypeType
from nwb_linkml.adapters.adapter import Adapter, BuildResult
from linkml_runtime.linkml_model import ClassDefinition, SlotDefinition
from nwb_linkml.maps import QUANTITY_MAP
from nwb_linkml.lang_elements import Arraylike


class ClassAdapter(Adapter):
    """
    Adapter for class-like things in LinkML, including datasets and groups
    """
    cls: Dataset | Group
    parent: Optional['ClassAdapter'] = None

    def _get_full_name(self) -> str:
        """
        The full name of the object in the generated LinkML.

        Distinct from the "regular" name from :meth:`._get_name`, which is
        used as the name of the slot that refers to this class.
        """
if self.cls.neurodata_type_def:
name = self.cls.neurodata_type_def
elif self.cls.name is not None:
# not necessarily a unique name, so we combine parent names
name_parts = []
if self.parent is not None:
name_parts.append(self.parent._get_full_name())
name_parts.append(self.cls.name)
name = '__'.join(name_parts)
        elif self.cls.neurodata_type_inc is not None:
            # nameless members that only `inc` a type violate the schema, but are common
            name = self.cls.neurodata_type_inc
        else:
            raise ValueError(f'Class has no name, type definition, or type inclusion: {self.cls}')
return name
    def _get_name(self) -> str:
        """
        Get the "regular" name, which is used as the name of the slot
        that refers to this class.

        Returns:
            str: The type definition name, the class name, or the included type
            name, in that order of preference.
        """
        name = None
if self.cls.neurodata_type_def:
name = self.cls.neurodata_type_def
elif self.cls.name is not None:
# we do have a unique name
name = self.cls.name
        elif self.cls.neurodata_type_inc:
            # anonymous members that only `inc` a type violate the schema, but are common
            name = self.cls.neurodata_type_inc
if name is None:
raise ValueError(f'Class has no name!: {self.cls}')
return name
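    # Illustrative example (hypothetical names, not from a real schema):
    # for a dataset named "timestamps" nested in a group defining "MyContainer",
    #   _get_name()      -> "timestamps"              (the slot name)
    #   _get_full_name() -> "MyContainer__timestamps" (the generated class name:
    #                        parent full names joined with "__")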
    def handle_arraylike(self, dataset: Dataset, name: Optional[str] = None) -> Optional[ClassDefinition | SlotDefinition]:
        """
        Handle the ``dims``, ``shape``, and ``dtype`` fields as they are used in
        datasets, using the :class:`.Arraylike` class to imitate them. Specifically:

        - Each slot within a subclass indicates a possible dimension.
        - Only dimensions that are present in all the dimension specifiers in the
          original schema are required.
        - Shape requirements are indicated using max/min cardinalities on the slot.
        - The arraylike object should be stored in the ``array`` slot on the
          containing class (since there are already properties named ``data``).

        If both ``dims`` and ``shape`` are undefined, return ``None``.

        Args:
            dataset (:class:`nwb_schema_language.Dataset`): The dataset defining the arraylike
            name (str): If present, override the name of the class before appending ``__Array``.
                (We don't use :meth:`._get_full_name` here because we want to eventually
                decouple these functions from this adapter class, which is sort of a
                development crutch. Ideally all these methods would just work on base
                nwb schema language types.)
        """
        if not any((dataset.dims, dataset.shape)):
            # none of the required properties are defined, that's fine.
            return None
        elif not all((dataset.dims, dataset.shape)):
            # need to have both if one is present!
            raise ValueError("A dataset needs both dims and shape to define an arraylike object")
# Special cases
if dataset.neurodata_type_inc == 'VectorData':
# Handle this in `handle_vectorlike` instead
return None
        # The schema language doesn't have a way of specifying that a dataset/group is
        # "abstract," and yet hdmf-common says you don't need a dtype if the dataset is
        # "abstract" -- so we fall back to `AnyType` for a missing dtype (see `handle_dtype`)
        dtype = self.handle_dtype(dataset.dtype)
# dims and shape are lists of lists. First we couple them
# (so each dim has its corresponding shape)..
# and then we take unique
# (dicts are ordered by default in recent pythons,
# while set() doesn't preserve order)
dims_shape = []
for inner_dim, inner_shape in zip(dataset.dims, dataset.shape):
if isinstance(inner_dim, list):
# list of lists
dims_shape.extend([(dim, shape) for dim, shape in zip(inner_dim, inner_shape)])
else:
# single-layer list
dims_shape.append((inner_dim, inner_shape))
dims_shape = tuple(dict.fromkeys(dims_shape).keys())
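        # Illustrative example (hypothetical dims/shapes, not from a real schema):
        #   dataset.dims  == [['x'], ['x', 'y']]
        #   dataset.shape == [['null'], ['null', 3]]
        # couples to [('x', 'null'), ('x', 'null'), ('y', 3)]
        # and deduplicates to (('x', 'null'), ('y', 3))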
# if we only have one possible dimension, it's equivalent to a list, so we just return the slot
if len(dims_shape) == 1 and self.parent:
quantity = QUANTITY_MAP[dataset.quantity]
slot = SlotDefinition(
name=dataset.name,
range = dtype,
description=dataset.doc,
required=quantity['required'],
multivalued=True
)
return slot
# now make slots for each of them
slots = []
        for dims, shape in dims_shape:
            # if a dim is present in all possible combinations of dims, make it required
            if not isinstance(dataset.dims[0], list):
                # a flat dims list is a single specification, so every dim in it is required
                # (checking `dims in inner_dim` on strings would do substring matching)
                required = True
            else:
                required = all([dims in inner_dim for inner_dim in dataset.dims])
            # use cardinality to do shape: an unbounded ('null') dimension gets no
            # cardinality constraint, a fixed size pins both min and max
            if shape in ('null', None):
                cardinality = None
            else:
                cardinality = shape
slots.append(SlotDefinition(
name=dims,
required=required,
maximum_cardinality=cardinality,
minimum_cardinality=cardinality,
range=dtype
))
        # and then the class is just a subclass of `Arraylike`
        # (which is imported by default from `nwb.language.yaml`)
        if name:
            pass
        elif dataset.neurodata_type_def:
            name = dataset.neurodata_type_def
        elif dataset.name:
            name = dataset.name
        else:
            raise ValueError("Dataset has no name or type definition, so the array class can't be named")
        name = '__'.join([name, 'Array'])
array_class = ClassDefinition(
name=name,
is_a="Arraylike",
attributes=slots
)
return array_class
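    # Continuing the illustrative example above, a dataset "MyData" with
    #   dims == [['x'], ['x', 'y']] and shape == [['null'], ['null', 3]]
    # would build a class roughly like:
    #   MyData__Array (is_a: Arraylike)
    #     x: required, no cardinality constraint
    #     y: optional, minimum_cardinality == maximum_cardinality == 3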
    def handle_dtype(self, dtype: DTypeType | None) -> str:
        """
        Resolve a dtype to the name of the LinkML range it should become
        """
if isinstance(dtype, ReferenceDtype):
return dtype.target_type
elif dtype is None or dtype == []:
# Some ill-defined datasets are "abstract" despite that not being in the schema language
return 'AnyType'
        elif isinstance(dtype, list) and isinstance(dtype[0], CompoundDtype):
            # There is precisely one class that uses compound dtypes:
            # TimeSeriesReferenceVectorData.
            # Compound dtypes are able to define a ragged table according to the schema,
            # but in this single case they are used equivalently to attributes,
            # so we'll... uh... treat them as slots.
            # TODO: handle compound dtypes properly
            return 'AnyType'
        else:
            # flat dtype
            return dtype
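    # Illustrative mappings (hypothetical values):
    #   handle_dtype('int32')                                  -> 'int32'
    #   handle_dtype(ReferenceDtype(target_type='TimeSeries')) -> 'TimeSeries'
    #   handle_dtype(None)                                     -> 'AnyType'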
    def build_attrs(self, cls: Dataset | Group) -> List[SlotDefinition]:
        """
        Make a slot for each of the class's attributes
        """
attrs = [
SlotDefinition(
name=attr.name,
description=attr.doc,
range=self.handle_dtype(attr.dtype),
) for attr in cls.attributes
]
return attrs
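    # For example (hypothetical attribute, field names as used above), an attribute like
    #   Attribute(name='unit', doc='unit of measurement', dtype='text')
    # becomes
    #   SlotDefinition(name='unit', description='unit of measurement', range='text')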
def build_subclasses(self, cls: Dataset | Group) -> BuildResult:
"""
Build nested groups and datasets
Create ClassDefinitions for each, but then also create SlotDefinitions that
will be used as attributes linking the main class to the subclasses
"""
# build and flatten nested classes
nested_classes = [ClassAdapter(cls=dset, parent=self) for dset in cls.datasets]
nested_classes.extend([ClassAdapter(cls=grp, parent=self) for grp in cls.groups])
nested_res = BuildResult()
        for subclass in nested_classes:
            # handle the special case where `VectorData` is subclassed without any
            # dims or attributes, which just gets instantiated as a 1-d array in HDF5
            if subclass.cls.neurodata_type_inc == 'VectorData' and \
                    not subclass.cls.dims and \
                    not subclass.cls.shape and \
                    not subclass.cls.attributes and \
                    subclass.cls.name:
this_slot = SlotDefinition(
name=subclass.cls.name,
description=subclass.cls.doc,
range=self.handle_dtype(subclass.cls.dtype),
multivalued=True
)
nested_res.slots.append(this_slot)
continue
# Simplify datasets that are just a single value
elif isinstance(subclass.cls, Dataset) and \
not subclass.cls.neurodata_type_inc and \
not subclass.cls.attributes and \
not subclass.cls.dims and \
not subclass.cls.shape and \
subclass.cls.name:
this_slot = SlotDefinition(
name=subclass.cls.name,
description=subclass.cls.doc,
range=self.handle_dtype(subclass.cls.dtype),
**QUANTITY_MAP[subclass.cls.quantity]
)
nested_res.slots.append(this_slot)
continue
else:
this_slot = SlotDefinition(
name=subclass._get_name(),
description=subclass.cls.doc,
range=subclass._get_full_name(),
**QUANTITY_MAP[subclass.cls.quantity]
)
nested_res.slots.append(this_slot)
if subclass.cls.name is None and subclass.cls.neurodata_type_def is None:
# anonymous group that's just an inc, we only need the slot since the class is defined elsewhere
continue
this_build = subclass.build()
nested_res += this_build
return nested_res
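    # Summarizing the three cases above:
    #   - a named `VectorData` child with no dims/shape/attributes
    #       -> a multivalued slot with the child's dtype
    #   - a named, untyped scalar dataset with no attributes
    #       -> a plain slot with the child's dtype
    #   - anything else
    #       -> a slot whose range is the child's own (separately built) class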
    def build(self) -> BuildResult:
        """
        Build this class, its attributes, and any nested classes into a
        :class:`.BuildResult`
        """
        # Build this class
        if self.parent is not None:
            name = self._get_full_name()
        else:
            name = self._get_name()
# Get vanilla top-level attributes
attrs = self.build_attrs(self.cls)
# unnest and build subclasses in datasets and groups
        if isinstance(self.cls, Group):
            # Only groups have sub-datasets and sub-groups.
            # The recursion step is split out rather than made purely recursive because
            # top-level datasets and groups are handled differently: they have names,
            # so we need to decide which things to unnest and which can just be slots
            # (because they are already defined) without knowing the global state
            # of the schema build.
            nested_res = self.build_subclasses(self.cls)
            attrs.extend(nested_res.slots)
else:
# must be a dataset
nested_res = BuildResult()
arraylike = self.handle_arraylike(self.cls, self._get_full_name())
if arraylike:
# if the arraylike thing can only have one dimension, it's equivalent to a list, so
# we just add a multivalued slot
if isinstance(arraylike, SlotDefinition):
attrs.append(arraylike)
else:
# make a slot for the arraylike class
attrs.append(
SlotDefinition(
name='array',
range=arraylike.name
)
)
nested_res.classes.append(arraylike)
cls = ClassDefinition(
name = name,
is_a = self.cls.neurodata_type_inc,
description=self.cls.doc,
attributes=attrs,
)
res = BuildResult(
classes = [cls, *nested_res.classes]
)
return res
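

# A minimal usage sketch (hypothetical dataset; field names are assumed from the
# nwb_schema_language models as they are used above):
#
#   dset = Dataset(
#       neurodata_type_def='MyData',
#       doc='An example dataset',
#       dtype='int32',
#       dims=[['x'], ['x', 'y']],
#       shape=[['null'], ['null', 3]],
#   )
#   res = ClassAdapter(cls=dset).build()
#   # res.classes contains the `MyData` ClassDefinition (with an `array` slot)
#   # and the `MyData__Array` Arraylike subclass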