nwb-linkml/nwb_linkml/adapters/dataset.py

"""
Adapter for NWB datasets to linkml Classes
"""
from typing import Optional, List

from linkml_runtime.linkml_model import ClassDefinition, SlotDefinition
from pydantic import PrivateAttr

from nwb_schema_language import Dataset, ReferenceDtype, CompoundDtype, DTypeType
from nwb_linkml.adapters.classes import ClassAdapter
from nwb_linkml.adapters.adapter import BuildResult
from nwb_linkml.maps import QUANTITY_MAP

class DatasetAdapter(ClassAdapter):
    """
    Adapter for NWB datasets to linkml classes and slots.

    :meth:`build` starts from the result of ``build_base`` and then passes it
    through a series of ``handle_*`` methods, each of which may modify or
    replace the result depending on the shape of the dataset definition.
    """
    cls: Dataset

    _handlers: List[str] = PrivateAttr(default_factory=list)
    """Keep track of which handlers have been called"""


    def build(self) -> BuildResult:
        """
        Build the linkml representation of this dataset.

        Returns:
            BuildResult: the base build result after it has been passed through
            every handler.

        Raises:
            RuntimeError: if more than one handler recorded itself in
                ``_handlers`` during this call.
        """
        # Forget handlers recorded by any previous call, otherwise building the
        # same adapter twice would trip the "only one handler" check below.
        self._handlers.clear()

        res = self.build_base()

        res = self.handle_arraylike(res, self.cls, self._get_full_name())
        res = self.handle_1d_vector(res)
        res = self.handle_listlike(res)
        res = self.handle_scalar(res)

        if len(self._handlers) > 1:
            raise RuntimeError(f"Only one handler should have been triggered, instead triggered {self._handlers}")

        return res

    def handle_scalar(self, res: BuildResult) -> BuildResult:
        """
        Handle named datasets that hold a single value: no included type,
        no dims and no shape.

        - Without attributes, the dataset is reduced to a single slot and the
          incoming result is replaced (handler name ``'scalar'``).
        - With attributes, a required ``value`` slot is added to the first
          class of the incoming result (handler name ``'scalar_class'``).

        Args:
            res (BuildResult): the result built so far

        Returns:
            BuildResult: the replaced, modified, or untouched result
        """
        # `not neurodata_type_inc` already excludes the 'VectorData' case
        if self.cls.neurodata_type_inc or \
                self.cls.dims or \
                self.cls.shape or \
                not self.cls.name:
            return res

        if not self.cls.attributes:
            # Simplify datasets that are just a single value
            self._handlers.append('scalar')
            # throw out the class that would have been made for us
            # we just need a slot
            this_slot = SlotDefinition(
                name=self.cls.name,
                description=self.cls.doc,
                range=self.handle_dtype(self.cls.dtype),
                **QUANTITY_MAP[self.cls.quantity]
            )
            return BuildResult(slots=[this_slot])

        # if the scalar-valued class has attributes, append a
        # 'value' slot that holds the (scalar) value of the dataset
        self._handlers.append('scalar_class')

        # quantity (including requirement) is handled by the
        # parent slot - the value is required if the value class is
        # supplied.
        # ie.
        # Optional[ScalarClass] = None
        # class ScalarClass:
        #     value: dtype
        value_slot = SlotDefinition(
            name='value',
            range=self.handle_dtype(self.cls.dtype),
            required=True
        )
        res.classes[0].attributes['value'] = value_slot

        return res


    def handle_1d_vector(self, res: BuildResult) -> BuildResult:
        """
        Handle the special case where `VectorData` is subclassed without any
        dims, shape or attributes, which just gets instantiated as a 1-d array
        in HDF5.

        Args:
            res (BuildResult): the result built so far

        Returns:
            BuildResult: a result holding a single multivalued slot when the
            case applies (handler name ``'1d_vector'``), otherwise ``res``
            unchanged
        """
        if self.cls.neurodata_type_inc != 'VectorData' or \
                self.cls.dims or \
                self.cls.shape or \
                self.cls.attributes or \
                not self.cls.name:
            return res

        self._handlers.append('1d_vector')
        this_slot = SlotDefinition(
            name=self.cls.name,
            description=self.cls.doc,
            range=self.handle_dtype(self.cls.dtype),
            multivalued=True
        )
        # No need to make a class for us, so we replace the existing build results
        return BuildResult(slots=[this_slot])

    def handle_listlike(self, res: BuildResult) -> BuildResult:
        """
        Handle cases where the dataset is just a list of a specific type.

        Note that this handler does not record itself in ``_handlers``.

        Examples:

              datasets:
              - name: file_create_date
                dtype: isodatetime
                dims:
                - num_modifications
                shape:
                - null

        Args:
            res (BuildResult): the result built so far

        Returns:
            BuildResult: a result holding a single multivalued slot when the
            dataset is named and has exactly one dimension, otherwise ``res``
            unchanged
        """
        # tolerate an unset `dims` rather than failing while iterating over None
        dims = self.cls.dims or []

        if not self.cls.name or len(dims) != 1:
            return res

        only_dim = dims[0]
        # a flat, single-entry list always qualifies; a nested list qualifies
        # only when its one inner list also has a single entry
        if isinstance(only_dim, list) and len(only_dim) != 1:
            return res

        return BuildResult(
            slots=[
                SlotDefinition(
                    name=self.cls.name,
                    multivalued=True,
                    range=self.handle_dtype(self.cls.dtype),
                    description=self.cls.doc,
                    required=self.cls.quantity not in ('*', '?')
                )
            ]
        )


    def handle_arraylike(self, res: BuildResult, dataset: Dataset, name: Optional[str] = None) -> BuildResult:
        """
        Handling the

        - dims
        - shape
        - dtype

        fields as they are used in datasets. We'll use the :class:`.Arraylike` class to imitate them.

        Specifically:

        - Each slot within a subclass indicates a possible dimension.
        - Only dimensions that are present in all the dimension specifiers in the
          original schema are required.
        - Shape requirements are indicated using max/min cardinalities on the slot.
        - The arraylike object should be stored in the `array` slot on the containing class
          (since there are already properties named `data`)

        If neither `dims` nor `shape` is defined, or the dataset includes
        `VectorData`, ``res`` is returned unchanged.

        If there is only one unique (dim, shape) pair and this adapter has a
        parent, a multivalued slot named after the dataset is added to the first
        class in ``res`` instead of creating an array class
        (handler name ``'arraylike-1d'``).

        Args:
            res (BuildResult): the result built so far; its first class receives the new slot
            dataset (:class:`nwb_schema_language.Dataset`): The dataset defining the arraylike
            name (str): If present, override the name of the class before appending __Array
                (we don't use _get_full_name here because we want to eventually decouple these functions from this adapter
                class, which is sort of a development crutch. Ideally all these methods would just work on base nwb schema language types)

        Returns:
            BuildResult: ``res``, possibly with an additional ``<name>__Array``
            class and an ``array`` slot on its first class
            (handler name ``'arraylike'``)

        Raises:
            ValueError: if only one of `dims` / `shape` is defined, or if no
                name can be determined for the array class
        """
        if not any((dataset.dims, dataset.shape)):
            # none of the required properties are defined, that's fine.
            return res
        elif not all((dataset.dims, dataset.shape)):
            # need to have both if one is present!
            raise ValueError("A dataset needs both dims and shape to define an arraylike object")

        # Special cases
        if dataset.neurodata_type_inc == 'VectorData':
            # Handle this in `handle_1d_vector` instead
            return res

        # The schema language doesn't have a way of specifying a dataset/group is "abstract"
        # and yet hdmf-common says you don't need a dtype if the dataset is "abstract"
        # so....
        dtype = self.handle_dtype(dataset.dtype)

        # dims and shape are lists of lists. First we couple them
        # (so each dim has its corresponding shape)..
        # and then we take unique
        # (dicts are ordered by default in recent pythons,
        # while set() doesn't preserve order)
        dims_shape = []
        for inner_dim, inner_shape in zip(dataset.dims, dataset.shape):
            if isinstance(inner_dim, list):
                # list of lists
                dims_shape.extend(zip(inner_dim, inner_shape))
            else:
                # single-layer list
                dims_shape.append((inner_dim, inner_shape))

        dims_shape = tuple(dict.fromkeys(dims_shape))

        # if we only have one possible dimension, it's equivalent to a list, so we just return the slot
        if len(dims_shape) == 1 and self.parent:
            quantity = QUANTITY_MAP[dataset.quantity]
            slot = SlotDefinition(
                name=dataset.name,
                range=dtype,
                description=dataset.doc,
                required=quantity['required'],
                multivalued=True
            )
            res.classes[0].attributes.update({dataset.name: slot})
            self._handlers.append('arraylike-1d')
            return res

        # if there is just a single (flat) list of possible dimensions, every
        # dimension is required. Decide this once, up front, so dimension names
        # are never compared against plain strings with `in` (a substring test)
        flat_dims = not any(isinstance(inner_dim, list) for inner_dim in dataset.dims)

        # now make slots for each of them
        slots = []
        for dim, shape in dims_shape:
            # otherwise a dim is required only if it is present in all
            # possible combinations of dims
            required = flat_dims or all(dim in inner_dim for inner_dim in dataset.dims)

            # use cardinality to do shape
            cardinality = None if shape == 'null' else shape

            slots.append(SlotDefinition(
                name=dim,
                required=required,
                maximum_cardinality=cardinality,
                minimum_cardinality=cardinality,
                range=dtype
            ))

        # and then the class is just a subclass of `Arraylike` (which is imported by default from `nwb.language.yaml`)
        # an explicitly passed name wins, then the type definition, then the dataset name
        if not name:
            if dataset.neurodata_type_def:
                name = dataset.neurodata_type_def
            elif dataset.name:
                name = dataset.name
            else:
                raise ValueError("Dataset has no name or type definition, what do call it?")

        name = '__'.join([name, 'Array'])

        array_class = ClassDefinition(
            name=name,
            is_a="Arraylike",
            attributes=slots
        )
        # make a slot for the arraylike class
        array_slot = SlotDefinition(
            name='array',
            range=array_class.name
        )

        res.classes.append(array_class)
        res.classes[0].attributes.update({'array': array_slot})
        self._handlers.append('arraylike')

        return res
Need to go home, in the middle of refactoring group and dataset as well as implementing the name property correctly 2023-08-31 07:01:43 +00:00			`"""`
			`Adapter for NWB datasets to linkml Classes`
			`"""`
			`from typing import Optional, List`

			`from linkml_runtime.linkml_model import ClassDefinition, SlotDefinition`
			`from pydantic import PrivateAttr`

			`from nwb_schema_language import Dataset, ReferenceDtype, CompoundDtype, DTypeType`
			`from nwb_linkml.adapters.classes import ClassAdapter`
			`from nwb_linkml.adapters.adapter import BuildResult`
			`from nwb_linkml.maps import QUANTITY_MAP`

			`class DatasetAdapter(ClassAdapter):`
			`cls: Dataset`

			`_handlers: List[str] = PrivateAttr(default_factory=list)`
			`"""Keep track of which handlers have been called"""`


			`def build(self) -> BuildResult:`
			`res = self.build_base()`

			`res = self.handle_arraylike(res, self.cls, self._get_full_name())`
			`res = self.handle_1d_vector(res)`
Working recursion! and also handling multiple array shapes, and properly handling unnamed classes 2023-09-01 03:56:21 +00:00			`res = self.handle_listlike(res)`
Need to go home, in the middle of refactoring group and dataset as well as implementing the name property correctly 2023-08-31 07:01:43 +00:00			`res = self.handle_scalar(res)`

Working recursion! and also handling multiple array shapes, and properly handling unnamed classes 2023-09-01 03:56:21 +00:00			`if len(self._handlers) > 1:`
			`raise RuntimeError(f"Only one handler should have been triggered, instead triggered {self._handlers}")`

Need to go home, in the middle of refactoring group and dataset as well as implementing the name property correctly 2023-08-31 07:01:43 +00:00			`return res`

			`def handle_scalar(self, res:BuildResult) -> BuildResult:`

			`# Simplify datasets that are just a single value`
			`if self.cls.neurodata_type_inc != 'VectorData' and \`
			`not self.cls.neurodata_type_inc and \`
			`not self.cls.attributes and \`
			`not self.cls.dims and \`
			`not self.cls.shape and \`
			`self.cls.name:`
			`self._handlers.append('scalar')`
			`# throw out the class that would have been made for us`
			`# we just need a slot`
			`this_slot = SlotDefinition(`
			`name=self.cls.name,`
			`description=self.cls.doc,`
			`range=self.handle_dtype(self.cls.dtype),`
			`**QUANTITY_MAP[self.cls.quantity]`
			`)`
			`res = BuildResult(slots = [this_slot])`

- Get imports for parent class slots - Handle scalar valued datasets with subattributes - start on hdf5 2023-09-04 23:16:29 +00:00			`# if the scalar-valued class has attributes, append a`
			`# 'value' slot that holds the (scalar) value of the dataset`
			`elif self.cls.neurodata_type_inc != 'VectorData' and \`
			`not self.cls.neurodata_type_inc and \`
			`self.cls.attributes and \`
			`not self.cls.dims and \`
			`not self.cls.shape and \`
			`self.cls.name:`
			`self._handlers.append('scalar_class')`

			`# quantity (including requirement) is handled by the`
			`# parent slot - the value is required if the value class is`
			`# supplied.`
			`# ie.`
			`# Optional[ScalarClass] = None`
			`# class ScalarClass:`
			`# value: dtype`
			`value_slot = SlotDefinition(`
			`name='value',`
			`range=self.handle_dtype(self.cls.dtype),`
			`required=True`
			`)`
			`res.classes[0].attributes['value'] = value_slot`

Need to go home, in the middle of refactoring group and dataset as well as implementing the name property correctly 2023-08-31 07:01:43 +00:00			`return res`


			`def handle_1d_vector(self, res: BuildResult) -> BuildResult:`
			# handle the special case where `VectorData` is subclassed without any dims or attributes
			`# which just gets instantiated as a 1-d array in HDF5`
			`if self.cls.neurodata_type_inc == 'VectorData' and \`
			`not self.cls.dims and \`
			`not self.cls.shape and \`
			`not self.cls.attributes \`
			`and self.cls.name:`
			`self._handlers.append('1d_vector')`
			`this_slot = SlotDefinition(`
			`name=self.cls.name,`
			`description=self.cls.doc,`
			`range=self.handle_dtype(self.cls.dtype),`
			`multivalued=True`
			`)`
			`# No need to make a class for us, so we replace the existing build results`
			`res = BuildResult(slots=[this_slot])`

			`return res`

Working recursion! and also handling multiple array shapes, and properly handling unnamed classes 2023-09-01 03:56:21 +00:00			`def handle_listlike(self, res:BuildResult) -> BuildResult:`
			`"""`
			`Handle cases where the dataset is just a list of a specific type.`

			`Examples:`

			`datasets:`
			`- name: file_create_date`
			`dtype: isodatetime`
			`dims:`
			`- num_modifications`
			`shape:`
			`- null`

			`"""`
			`if self.cls.name and ((`
			`# single-layer list`
			`not any([isinstance(dim, list) for dim in self.cls.dims]) and`
			`len(self.cls.dims) == 1`
			`) or (`
			`# nested list`
			`all([isinstance(dim, list) for dim in self.cls.dims]) and`
			`len(self.cls.dims) == 1 and`
			`len(self.cls.dims[0]) == 1`
			`)):`
			`res = BuildResult(`
			`slots = [`
			`SlotDefinition(`
			`name = self.cls.name,`
			`multivalued=True,`
			`range=self.handle_dtype(self.cls.dtype),`
			`description=self.cls.doc,`
			`required=False if self.cls.quantity in ('*', '?') else True`
			`)`
			`]`
			`)`
			`return res`
			`else:`
			`return res`


Need to go home, in the middle of refactoring group and dataset as well as implementing the name property correctly 2023-08-31 07:01:43 +00:00			`def handle_arraylike(self, res: BuildResult, dataset: Dataset, name: Optional[str] = None) -> BuildResult:`
			`"""`
			`Handling the`

			`- dims`
			`- shape`
			`- dtype`

			fields as they are used in datasets. We'll use the :class:`.Arraylike` class to imitate them.

			`Specifically:`

			`- Each slot within a subclass indicates a possible dimension.`
			`- Only dimensions that are present in all the dimension specifiers in the`
			`original schema are required.`
			`- Shape requirements are indicated using max/min cardinalities on the slot.`
			- The arraylike object should be stored in the `array` slot on the containing class
			(since there are already properties named `data`)

			If any of `dims`, `shape`, or `dtype` are undefined, return `None`

			`Args:`
			dataset (:class:`nwb_schema_language.Dataset`): The dataset defining the arraylike
			`name (str): If present, override the name of the class before appending _Array`
			`(we don't use _get_full_name here because we want to eventually decouple these functions from this adapter`
			`class, which is sort of a development crutch. Ideally all these methods would just work on base nwb schema language types)`
			`"""`
			`if not any((dataset.dims, dataset.shape)):`
			`# none of the required properties are defined, that's fine.`
			`return res`
			`elif not all((dataset.dims, dataset.shape)):`
			`# need to have both if one is present!`
			`raise ValueError(f"A dataset needs both dims and shape to define an arraylike object")`

			`# Special cases`
			`if dataset.neurodata_type_inc == 'VectorData':`
			# Handle this in `handle_vectorlike` instead
			`return res`

			`# The schema language doesn't have a way of specifying a dataset/group is "abstract"`
			`# and yet hdmf-common says you don't need a dtype if the dataset is "abstract"`
			`# so....`
			`dtype = self.handle_dtype(dataset.dtype)`

			`# dims and shape are lists of lists. First we couple them`
			`# (so each dim has its corresponding shape)..`
			`# and then we take unique`
			`# (dicts are ordered by default in recent pythons,`
			`# while set() doesn't preserve order)`
			`dims_shape = []`
			`for inner_dim, inner_shape in zip(dataset.dims, dataset.shape):`
			`if isinstance(inner_dim, list):`
			`# list of lists`
			`dims_shape.extend([(dim, shape) for dim, shape in zip(inner_dim, inner_shape)])`
			`else:`
			`# single-layer list`
			`dims_shape.append((inner_dim, inner_shape))`

			`dims_shape = tuple(dict.fromkeys(dims_shape).keys())`

			`# if we only have one possible dimension, it's equivalent to a list, so we just return the slot`
			`if len(dims_shape) == 1 and self.parent:`
			`quantity = QUANTITY_MAP[dataset.quantity]`
			`slot = SlotDefinition(`
			`name=dataset.name,`
			`range=dtype,`
			`description=dataset.doc,`
			`required=quantity['required'],`
			`multivalued=True`
			`)`
			`res.classes[0].attributes.update({dataset.name: slot})`
			`self._handlers.append('arraylike-1d')`
			`return res`

			`# now make slots for each of them`
			`slots = []`
			`for dims, shape in dims_shape:`
			`# if a dim is present in all possible combinations of dims, make it required`
			`if all([dims in inner_dim for inner_dim in dataset.dims]):`
			`required = True`
Working recursion! and also handling multiple array shapes, and properly handling unnamed classes 2023-09-01 03:56:21 +00:00			`# or if there is just a single list of possible dimensions`
			`elif not any([isinstance(inner_dim, list) for inner_dim in dataset.dims]):`
			`required = True`
Need to go home, in the middle of refactoring group and dataset as well as implementing the name property correctly 2023-08-31 07:01:43 +00:00			`else:`
			`required = False`

			`# use cardinality to do shape`
			`if shape == 'null':`
			`cardinality = None`
			`else:`
			`cardinality = shape`

			`slots.append(SlotDefinition(`
			`name=dims,`
			`required=required,`
			`maximum_cardinality=cardinality,`
			`minimum_cardinality=cardinality,`
			`range=dtype`
			`))`

			# and then the class is just a subclass of `Arraylike` (which is imported by default from `nwb.language.yaml`)
			`if name:`
			`pass`
			`elif dataset.neurodata_type_def:`
			`name = dataset.neurodata_type_def`
			`elif dataset.name:`
			`name = dataset.name`
			`else:`
			`raise ValueError(f"Dataset has no name or type definition, what do call it?")`

			`name = '__'.join([name, 'Array'])`

			`array_class = ClassDefinition(`
			`name=name,`
			`is_a="Arraylike",`
			`attributes=slots`
			`)`
			`# make a slot for the arraylike class`
			`array_slot = SlotDefinition(`
			`name='array',`
			`range=array_class.name`
			`)`

			`res.classes.append(array_class)`
			`res.classes[0].attributes.update({'array': array_slot})`
			`#res.slots.append(array_slot)`
			`self._handlers.append('arraylike')`

			`return res`