"""
Adapter for NWB datasets to linkml Classes
"""
from typing import Optional, List
import warnings

from linkml_runtime.linkml_model import ClassDefinition, SlotDefinition
from pydantic import PrivateAttr

from nwb_schema_language import Dataset, ReferenceDtype, CompoundDtype, DTypeType
from nwb_linkml.adapters.classes import ClassAdapter, camel_to_snake
from nwb_linkml.adapters.adapter import BuildResult
from nwb_linkml.maps import QUANTITY_MAP


class DatasetAdapter(ClassAdapter):
    cls: Dataset

    _handlers: List[str] = PrivateAttr(default_factory=list)
    """Keep track of which handlers have been called"""

    def build(self) -> BuildResult:
        """
        Build the LinkML representation of this dataset, trying each handler
        in turn; at most one should apply.
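
        Examples:
            A minimal sketch, assuming ``dataset`` is a
            :class:`~nwb_schema_language.Dataset` parsed from a schema file:

                adapter = DatasetAdapter(cls=dataset)
                result = adapter.build()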
        """
        res = self.build_base()

        res = self.drop_dynamic_table(res)
        res = self.handle_arraylike(res, self.cls, self._get_full_name())
        res = self.handle_1d_vector(res)
        res = self.handle_listlike(res)
        res = self.handle_scalar(res)

        if len(self._handlers) > 1:
            raise RuntimeError(
                f"Only one handler should have been triggered, instead triggered {self._handlers}"
            )

        return res

    def handle_scalar(self, res: BuildResult) -> BuildResult:
        """
        Simplify datasets that are just a single value into a single slot.
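
        Examples:
            A named dataset with no dims, shape, or attributes, e.g. this
            illustrative sketch (not an excerpt from the NWB schema):

                datasets:
                - name: session_description
                  doc: A description of the session
                  dtype: text

            builds to a single ``session_description`` slot rather than a class.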
        """
        if (
            self.cls.neurodata_type_inc != 'VectorData'
            and not self.cls.neurodata_type_inc
            and not self.cls.attributes
            and not self.cls.dims
            and not self.cls.shape
            and self.cls.name
        ):
            self._handlers.append('scalar')

            # throw out the class that would have been made for us,
            # we just need a slot
            this_slot = SlotDefinition(
                name=self.cls.name,
                description=self.cls.doc,
                range=self.handle_dtype(self.cls.dtype),
                **QUANTITY_MAP[self.cls.quantity]
            )
            res = BuildResult(slots=[this_slot])

        # if the scalar-valued class has attributes, append a
        # 'value' slot that holds the (scalar) value of the dataset
        elif (
            self.cls.neurodata_type_inc != 'VectorData'
            and not self.cls.neurodata_type_inc
            and self.cls.attributes
            and not self.cls.dims
            and not self.cls.shape
            and self.cls.name
        ):
            self._handlers.append('scalar_class')

            # quantity (including requirement) is handled by the
            # parent slot - the value is required if the value class is
            # supplied, i.e.:
            #   Optional[ScalarClass] = None
            #   class ScalarClass:
            #       value: dtype
            value_slot = SlotDefinition(
                name='value',
                range=self.handle_dtype(self.cls.dtype),
                required=True
            )
            res.classes[0].attributes['value'] = value_slot

        return res

    def handle_1d_vector(self, res: BuildResult) -> BuildResult:
        """
        Handle the special case where ``VectorData`` is subclassed without any
        dims or attributes, which just gets instantiated as a 1-D array in HDF5.
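
        Examples:
            An illustrative sketch (not an excerpt from the NWB schema) of a
            named column that only narrows the dtype:

                datasets:
                - name: spike_times
                  neurodata_type_inc: VectorData
                  doc: Spike times for each unit
                  dtype: float64

            builds to a single multivalued ``spike_times`` slot.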
        """
        if (
            self.cls.neurodata_type_inc == 'VectorData'
            and not self.cls.dims
            and not self.cls.shape
            and not self.cls.attributes
            and self.cls.name
        ):
            self._handlers.append('1d_vector')

            this_slot = SlotDefinition(
                name=self.cls.name,
                description=self.cls.doc,
                range=self.handle_dtype(self.cls.dtype),
                multivalued=True
            )
            # No need to make a class for us, so we replace the existing build results
            res = BuildResult(slots=[this_slot])

        return res

    def handle_listlike(self, res: BuildResult) -> BuildResult:
        """
        Handle cases where the dataset is just a list of a specific type.

        Examples:

            datasets:
            - name: file_create_date
              dtype: isodatetime
              dims:
              - num_modifications
              shape:
              - null
        """
        if self.cls.name and self.cls.dims and (
            (
                # single-layer list
                not any([isinstance(dim, list) for dim in self.cls.dims])
                and len(self.cls.dims) == 1
            ) or (
                # nested list
                all([isinstance(dim, list) for dim in self.cls.dims])
                and len(self.cls.dims) == 1
                and len(self.cls.dims[0]) == 1
            )
        ):
            res = BuildResult(
                slots=[
                    SlotDefinition(
                        name=self.cls.name,
                        multivalued=True,
                        range=self.handle_dtype(self.cls.dtype),
                        description=self.cls.doc,
                        required=self.cls.quantity not in ('*', '?')
                    )
                ]
            )
            return res
        else:
            return res

    def handle_arraylike(self, res: BuildResult, dataset: Dataset, name: Optional[str] = None) -> BuildResult:
        """
        Handle the ``dims``, ``shape``, and ``dtype`` fields as they are used in
        datasets. We'll use the :class:`.Arraylike` class to imitate them.

        Specifically:

        - Each slot within a subclass indicates a possible dimension.
        - Only dimensions that are present in all the dimension specifiers in the
          original schema are required.
        - Shape requirements are indicated using max/min cardinalities on the slot.
        - The arraylike object should be stored in the ``array`` slot on the
          containing class (since there are already properties named ``data``).

        If ``dims`` or ``shape`` is undefined, return the build result unchanged.

        Args:
            dataset (:class:`nwb_schema_language.Dataset`): The dataset defining the arraylike
            name (str): If present, override the name of the class before appending
                ``__Array``. (We don't use ``_get_full_name`` here because we want to
                eventually decouple these functions from this adapter class, which is
                sort of a development crutch. Ideally all these methods would just
                work on base nwb schema language types.)
        """
        if not any((dataset.dims, dataset.shape)):
            # none of the required properties are defined, that's fine.
            return res
        elif not all((dataset.dims, dataset.shape)):
            # need to have both if one is present!
            warnings.warn(
                "A dataset needs both dims and shape to define an arraylike object. "
                "This is allowed for compatibility with some badly formatted NWB "
                "files, but should in general be avoided. Treating as if we don't "
                "have an array."
            )
            return res

        # Special cases
        if dataset.neurodata_type_inc == 'VectorData':
            # Handled in `handle_1d_vector` instead
            return res

        # The schema language doesn't have a way of specifying that a dataset/group
        # is "abstract", and yet hdmf-common says you don't need a dtype if the
        # dataset is "abstract", so...
        dtype = self.handle_dtype(dataset.dtype)

        # dims and shape are lists of lists. First we couple them
        # (so each dim has its corresponding shape), and then we take the
        # unique pairs (dicts are ordered by default in recent Pythons,
        # while set() doesn't preserve order)
        dims_shape = []
        for inner_dim, inner_shape in zip(dataset.dims, dataset.shape):
            if isinstance(inner_dim, list):
                # list of lists
                dims_shape.extend([(dim, shape) for dim, shape in zip(inner_dim, inner_shape)])
            elif isinstance(inner_shape, list):
                # Some badly formatted schema will have the shape be a LoL but the dims won't be...
                dims_shape.extend([(inner_dim, shape) for shape in inner_shape])
            else:
                # single-layer list
                dims_shape.append((inner_dim, inner_shape))

        dims_shape = tuple(dict.fromkeys(dims_shape).keys())

        # if we only have one possible dimension, it's equivalent to a list,
        # so we just return a slot
        if len(dims_shape) == 1 and self.parent:
            quantity = QUANTITY_MAP[dataset.quantity]
            slot = SlotDefinition(
                name=dataset.name,
                range=dtype,
                description=dataset.doc,
                required=quantity['required'],
                multivalued=True
            )
            res.classes[0].attributes.update({dataset.name: slot})
            self._handlers.append('arraylike-1d')
            return res

        # now make slots for each of them
        slots = []
        for dims, shape in dims_shape:
            # if a dim is present in all possible combinations of dims, make it required
            if all([dims in inner_dim for inner_dim in dataset.dims]):
                required = True
            # or if there is just a single list of possible dimensions
            elif not any([isinstance(inner_dim, list) for inner_dim in dataset.dims]):
                required = True
            else:
                required = False

            # use cardinality to do shape
            if shape == 'null':
                cardinality = None
            else:
                cardinality = shape

            slots.append(SlotDefinition(
                name=dims,
                required=required,
                maximum_cardinality=cardinality,
                minimum_cardinality=cardinality,
                range=dtype
            ))

        # and then the class is just a subclass of `Arraylike`
        # (which is imported by default from `nwb.language.yaml`)
        if name:
            pass
        elif dataset.neurodata_type_def:
            name = dataset.neurodata_type_def
        elif dataset.name:
            name = dataset.name
        else:
            raise ValueError("Dataset has no name or type definition, what do we call it?")

        name = '__'.join([name, 'Array'])

        array_class = ClassDefinition(
            name=name,
            is_a="Arraylike",
            attributes=slots
        )
        # make a slot for the arraylike class
        array_slot = SlotDefinition(
            name='array',
            range=array_class.name
        )

        res.classes.append(array_class)
        res.classes[0].attributes.update({'array': array_slot})
        self._handlers.append('arraylike')

        return res

    def drop_dynamic_table(self, res: BuildResult) -> BuildResult:
        """
        DynamicTables in hdmf are so special-cased that we have to just
        special-case them ourselves.

        Typically they include an unnamed ``VectorData`` object with a ``'*'``
        quantity to contain arbitrary columns. That would normally get converted
        to its own container class, but since it is unnamed it conflicts with
        names in the containing scope. We just convert these into multivalued
        slots rather than generating classes for them.
        """
        if (
            self.cls.name is None
            and self.cls.neurodata_type_def is None
            and self.cls.neurodata_type_inc in ('VectorIndex', 'VectorData')
            and self.cls.quantity == '*'
        ):
            self._handlers.append('dynamic_table')

            this_slot = SlotDefinition(
                name=camel_to_snake(self.cls.neurodata_type_inc),
                description=self.cls.doc,
                range=self.cls.neurodata_type_inc,
                required=False,
                multivalued=True
            )
            # No need to make a class for us, so we replace the existing build results
            res = BuildResult(slots=[this_slot])
            return res

        elif (
            self.cls.name is None
            and self.cls.neurodata_type_def is None
            and self.cls.neurodata_type_inc
            and self.cls.quantity in ('*', '+')
        ):
            self._handlers.append('generic_container')

            this_slot = SlotDefinition(
                name=camel_to_snake(self.cls.neurodata_type_inc),
                description=self.cls.doc,
                range=self.cls.neurodata_type_inc,
                **QUANTITY_MAP[self.cls.quantity]
            )
            # No need to make a class for us, so we replace the existing build results
            res = BuildResult(slots=[this_slot])
            return res

        else:
            return res