"""
Adapter for NWB datasets to linkml Classes
"""
from typing import Optional, List
import warnings

from linkml_runtime.linkml_model import ClassDefinition, SlotDefinition
from pydantic import PrivateAttr

from nwb_schema_language import Dataset, ReferenceDtype, CompoundDtype, DTypeType
from nwb_linkml.adapters.classes import ClassAdapter, camel_to_snake
from nwb_linkml.adapters.adapter import BuildResult
from nwb_linkml.maps import QUANTITY_MAP


class DatasetAdapter(ClassAdapter):
    cls: Dataset

    _handlers: List[str] = PrivateAttr(default_factory=list)
    """Keep track of which handlers have been called"""

    def build(self) -> BuildResult:
        """
        Build the LinkML representation of this dataset, trying each handler
        in turn; at most one should apply.
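
        Examples:
            A minimal sketch, assuming ``dataset`` is a
            :class:`~nwb_schema_language.Dataset` parsed from a schema file:

                adapter = DatasetAdapter(cls=dataset)
                result = adapter.build()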
        """
        res = self.build_base()

        res = self.drop_dynamic_table(res)
        res = self.handle_arraylike(res, self.cls, self._get_full_name())
        res = self.handle_1d_vector(res)
        res = self.handle_listlike(res)
        res = self.handle_scalar(res)

        if len(self._handlers) > 1:
            raise RuntimeError(
                f"Only one handler should have been triggered, instead triggered {self._handlers}"
            )

        return res

    def handle_scalar(self, res: BuildResult) -> BuildResult:
        """
        Simplify datasets that are just a single value into a single slot.
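
        Examples:
            A named dataset with no dims, shape, or attributes, e.g. this
            illustrative sketch (not an excerpt from the NWB schema):

                datasets:
                - name: session_description
                  doc: A description of the session
                  dtype: text

            builds to a single ``session_description`` slot rather than a class.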
        """
        if (
            self.cls.neurodata_type_inc != 'VectorData'
            and not self.cls.neurodata_type_inc
            and not self.cls.attributes
            and not self.cls.dims
            and not self.cls.shape
            and self.cls.name
        ):
            self._handlers.append('scalar')

            # throw out the class that would have been made for us,
            # we just need a slot
            this_slot = SlotDefinition(
                name=self.cls.name,
                description=self.cls.doc,
                range=self.handle_dtype(self.cls.dtype),
                **QUANTITY_MAP[self.cls.quantity]
            )
            res = BuildResult(slots=[this_slot])

        # if the scalar-valued class has attributes, append a
        # 'value' slot that holds the (scalar) value of the dataset
        elif (
            self.cls.neurodata_type_inc != 'VectorData'
            and not self.cls.neurodata_type_inc
            and self.cls.attributes
            and not self.cls.dims
            and not self.cls.shape
            and self.cls.name
        ):
            self._handlers.append('scalar_class')

            # quantity (including requirement) is handled by the
            # parent slot - the value is required if the value class is
            # supplied, i.e.:
            #   Optional[ScalarClass] = None
            #   class ScalarClass:
            #       value: dtype
            value_slot = SlotDefinition(
                name='value',
                range=self.handle_dtype(self.cls.dtype),
                required=True
            )
            res.classes[0].attributes['value'] = value_slot

        return res

    def handle_1d_vector(self, res: BuildResult) -> BuildResult:
        """
        Handle the special case where ``VectorData`` is subclassed without any
        dims or attributes, which just gets instantiated as a 1-D array in HDF5.
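
        Examples:
            An illustrative sketch (not an excerpt from the NWB schema) of a
            named column that only narrows the dtype:

                datasets:
                - name: spike_times
                  neurodata_type_inc: VectorData
                  doc: Spike times for each unit
                  dtype: float64

            builds to a single multivalued ``spike_times`` slot.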
        """
        if (
            self.cls.neurodata_type_inc == 'VectorData'
            and not self.cls.dims
            and not self.cls.shape
            and not self.cls.attributes
            and self.cls.name
        ):
            self._handlers.append('1d_vector')

            this_slot = SlotDefinition(
                name=self.cls.name,
                description=self.cls.doc,
                range=self.handle_dtype(self.cls.dtype),
                multivalued=True
            )
            # No need to make a class for us, so we replace the existing build results
            res = BuildResult(slots=[this_slot])

        return res

    def handle_listlike(self, res: BuildResult) -> BuildResult:
        """
        Handle cases where the dataset is just a list of a specific type.

        Examples:

            datasets:
            - name: file_create_date
              dtype: isodatetime
              dims:
              - num_modifications
              shape:
              - null
        """
        if self.cls.name and self.cls.dims and (
            (
                # single-layer list
                not any([isinstance(dim, list) for dim in self.cls.dims])
                and len(self.cls.dims) == 1
            ) or (
                # nested list
                all([isinstance(dim, list) for dim in self.cls.dims])
                and len(self.cls.dims) == 1
                and len(self.cls.dims[0]) == 1
            )
        ):
            res = BuildResult(
                slots=[
                    SlotDefinition(
                        name=self.cls.name,
                        multivalued=True,
                        range=self.handle_dtype(self.cls.dtype),
                        description=self.cls.doc,
                        required=self.cls.quantity not in ('*', '?')
                    )
                ]
            )
            return res
        else:
            return res

    def handle_arraylike(self, res: BuildResult, dataset: Dataset, name: Optional[str] = None) -> BuildResult:
        """
        Handle the ``dims``, ``shape``, and ``dtype`` fields as they are used in
        datasets. We'll use the :class:`.Arraylike` class to imitate them.

        Specifically:

        - Each slot within a subclass indicates a possible dimension.
        - Only dimensions that are present in all the dimension specifiers in the
          original schema are required.
        - Shape requirements are indicated using max/min cardinalities on the slot.
        - The arraylike object should be stored in the ``array`` slot on the
          containing class (since there are already properties named ``data``).

        If ``dims`` or ``shape`` is undefined, return the build result unchanged.

        Args:
            dataset (:class:`nwb_schema_language.Dataset`): The dataset defining the arraylike
            name (str): If present, override the name of the class before appending
                ``__Array``. (We don't use ``_get_full_name`` here because we want to
                eventually decouple these functions from this adapter class, which is
                sort of a development crutch. Ideally all these methods would just
                work on base nwb schema language types.)
        """
        if not any((dataset.dims, dataset.shape)):
            # none of the required properties are defined, that's fine.
            return res
        elif not all((dataset.dims, dataset.shape)):
            # need to have both if one is present!
            warnings.warn(
                "A dataset needs both dims and shape to define an arraylike object. "
                "This is allowed for compatibility with some badly formatted NWB "
                "files, but should in general be avoided. Treating as if we don't "
                "have an array."
            )
            return res

        # Special cases
        if dataset.neurodata_type_inc == 'VectorData':
            # Handled in `handle_1d_vector` instead
            return res

        # The schema language doesn't have a way of specifying that a dataset/group
        # is "abstract", and yet hdmf-common says you don't need a dtype if the
        # dataset is "abstract", so...
        dtype = self.handle_dtype(dataset.dtype)

        # dims and shape are lists of lists. First we couple them
        # (so each dim has its corresponding shape), and then we take the
        # unique pairs (dicts are ordered by default in recent Pythons,
        # while set() doesn't preserve order)
        dims_shape = []
        for inner_dim, inner_shape in zip(dataset.dims, dataset.shape):
            if isinstance(inner_dim, list):
                # list of lists
                dims_shape.extend([(dim, shape) for dim, shape in zip(inner_dim, inner_shape)])
            elif isinstance(inner_shape, list):
                # Some badly formatted schema will have the shape be a LoL but the dims won't be...
                dims_shape.extend([(inner_dim, shape) for shape in inner_shape])
            else:
                # single-layer list
                dims_shape.append((inner_dim, inner_shape))

        dims_shape = tuple(dict.fromkeys(dims_shape).keys())

        # if we only have one possible dimension, it's equivalent to a list,
        # so we just return a slot
        if len(dims_shape) == 1 and self.parent:
            quantity = QUANTITY_MAP[dataset.quantity]
            slot = SlotDefinition(
                name=dataset.name,
                range=dtype,
                description=dataset.doc,
                required=quantity['required'],
                multivalued=True
            )
            res.classes[0].attributes.update({dataset.name: slot})
            self._handlers.append('arraylike-1d')
            return res

        # now make slots for each of them
        slots = []
        for dims, shape in dims_shape:
            # if a dim is present in all possible combinations of dims, make it required
            if all([dims in inner_dim for inner_dim in dataset.dims]):
                required = True
            # or if there is just a single list of possible dimensions
            elif not any([isinstance(inner_dim, list) for inner_dim in dataset.dims]):
                required = True
            else:
                required = False

            # use cardinality to do shape
            if shape == 'null':
                cardinality = None
            else:
                cardinality = shape

            slots.append(SlotDefinition(
                name=dims,
                required=required,
                maximum_cardinality=cardinality,
                minimum_cardinality=cardinality,
                range=dtype
            ))

        # and then the class is just a subclass of `Arraylike`
        # (which is imported by default from `nwb.language.yaml`)
        if name:
            pass
        elif dataset.neurodata_type_def:
            name = dataset.neurodata_type_def
        elif dataset.name:
            name = dataset.name
        else:
            raise ValueError("Dataset has no name or type definition, what do we call it?")

        name = '__'.join([name, 'Array'])

        array_class = ClassDefinition(
            name=name,
            is_a="Arraylike",
            attributes=slots
        )
        # make a slot for the arraylike class
        array_slot = SlotDefinition(
            name='array',
            range=array_class.name
        )

        res.classes.append(array_class)
        res.classes[0].attributes.update({'array': array_slot})
        self._handlers.append('arraylike')

        return res

    def drop_dynamic_table(self, res: BuildResult) -> BuildResult:
        """
        DynamicTables in hdmf are so special-cased that we have to just
        special-case them ourselves.

        Typically they include an unnamed ``VectorData`` object with a ``'*'``
        quantity to contain arbitrary columns. That would normally get converted
        to its own container class, but since it is unnamed it conflicts with
        names in the containing scope. We just convert these into multivalued
        slots rather than generating classes for them.
        """
        if (
            self.cls.name is None
            and self.cls.neurodata_type_def is None
            and self.cls.neurodata_type_inc in ('VectorIndex', 'VectorData')
            and self.cls.quantity == '*'
        ):
            self._handlers.append('dynamic_table')

            this_slot = SlotDefinition(
                name=camel_to_snake(self.cls.neurodata_type_inc),
                description=self.cls.doc,
                range=self.cls.neurodata_type_inc,
                required=False,
                multivalued=True
            )
            # No need to make a class for us, so we replace the existing build results
            res = BuildResult(slots=[this_slot])
            return res

        elif (
            self.cls.name is None
            and self.cls.neurodata_type_def is None
            and self.cls.neurodata_type_inc
            and self.cls.quantity in ('*', '+')
        ):
            self._handlers.append('generic_container')

            this_slot = SlotDefinition(
                name=camel_to_snake(self.cls.neurodata_type_inc),
                description=self.cls.doc,
                range=self.cls.neurodata_type_inc,
                **QUANTITY_MAP[self.cls.quantity]
            )
            # No need to make a class for us, so we replace the existing build results
            res = BuildResult(slots=[this_slot])
            return res

        else:
            return res