From 3568037a1e1326d46f95f644453aaacdd5c093ce Mon Sep 17 00:00:00 2001
From: sneakers-the-rat
Date: Thu, 31 Aug 2023 00:01:43 -0700
Subject: [PATCH] Need to go home, in the middle of refactoring group and
 dataset as well as implementing the name property correctly

---
 nwb_linkml/adapters/classes.py        | 369 +++++++++-----------------
 nwb_linkml/adapters/dataset.py        | 198 ++++++++++++++
 nwb_linkml/adapters/group.py          |  89 +++++++
 nwb_linkml/adapters/schema.py         |  27 +-
 nwb_schema_language/docs/CHANGELOG.md |   6 +
 5 files changed, 432 insertions(+), 257 deletions(-)
 create mode 100644 nwb_linkml/adapters/dataset.py
 create mode 100644 nwb_linkml/adapters/group.py
 create mode 100644 nwb_schema_language/docs/CHANGELOG.md

diff --git a/nwb_linkml/adapters/classes.py b/nwb_linkml/adapters/classes.py
index 130d0eb..1dd0f8a 100644
--- a/nwb_linkml/adapters/classes.py
+++ b/nwb_linkml/adapters/classes.py
@@ -2,6 +2,8 @@
 Adapters to linkML classes
 """
 import pdb
+import re
+from abc import abstractmethod
 from typing import List, Optional
 from nwb_schema_language import Dataset, Group, ReferenceDtype, CompoundDtype, DTypeType
 from nwb_linkml.adapters.adapter import Adapter, BuildResult
@@ -9,17 +11,95 @@ from linkml_runtime.linkml_model import ClassDefinition, SlotDefinition
 from nwb_linkml.maps import QUANTITY_MAP
 from nwb_linkml.lang_elements import Arraylike
 
+CAMEL_TO_SNAKE = re.compile('((?<=[a-z0-9])[A-Z]|(?!^)[A-Z](?=[a-z]))')
+"""
+Convert camel case to snake case
+
+courtesy of: https://stackoverflow.com/a/12867228
+"""
+
+def camel_to_snake(name:str) -> str:
+    """
+    Convert camel case to snake case
+
+    courtesy of: https://stackoverflow.com/a/12867228
+    """
+    return CAMEL_TO_SNAKE.sub(r'_\1', name).lower()
+
 class ClassAdapter(Adapter):
     """
-    Adapter to class-like things in linkml, including datasets and groups
+    Abstract adapter to class-like things in linkml; holds methods common to
+    both DatasetAdapter and GroupAdapter
     """
     cls: Dataset | Group
     parent: Optional['ClassAdapter'] = None
 
+    @abstractmethod
+    def build(self) -> BuildResult:
+        """
+        Make this abstract so it can't be instantiated directly.
+
+        Subclasses call :meth:`.build_base` to build the parts common to both groups and datasets
+        """
+
+
+    def build_base(self, extra_attrs: Optional[List[SlotDefinition]]=None) -> BuildResult:
+        """
+        Build the basic class and attributes before adding any specific
+        modifications for groups or datasets.
+ """ + + # Build this class + #name = self._get_full_name() + if self.parent is not None: + name = self._get_full_name() + else: + name = self._get_attr_name() + + # Get vanilla top-level attributes + attrs = self.build_attrs(self.cls) + name_slot = self.build_name_slot() + attrs.append(name_slot) + if extra_attrs is not None: + if isinstance(extra_attrs, SlotDefinition): + extra_attrs = [extra_attrs] + attrs.extend(extra_attrs) + + cls = ClassDefinition( + name = name, + is_a = self.cls.neurodata_type_inc, + description=self.cls.doc, + attributes=attrs, + ) + + slots = [] + if self.parent is not None: + slots.append(self.build_self_slot()) + + res = BuildResult( + classes = [cls], + slots = slots + ) + + return res + + def build_attrs(self, cls: Dataset | Group) -> List[SlotDefinition]: + attrs = [ + SlotDefinition( + name=attr.name, + description=attr.doc, + range=self.handle_dtype(attr.dtype), + ) for attr in cls.attributes + ] + + return attrs + def _get_full_name(self) -> str: """The full name of the object in the generated linkml - Distinct from 'name' which is the thing that's often used in """ + Distinct from 'name' which is the thing that's used to define position in + a hierarchical data setting + """ if self.cls.neurodata_type_def: name = self.cls.neurodata_type_def elif self.cls.name is not None: @@ -39,22 +119,21 @@ class ClassAdapter(Adapter): return name - def _get_name(self) -> str: + def _get_attr_name(self) -> str: """ - Get the "regular" name, which is used as the name of the attr - - Returns: - + Get the name to use as the attribute name, + again distinct from the actual name of the instantiated object """ # return self._get_full_name() name = None if self.cls.neurodata_type_def: + #name = camel_to_snake(self.cls.neurodata_type_def) name = self.cls.neurodata_type_def elif self.cls.name is not None: # we do have a unique name name = self.cls.name elif self.cls.neurodata_type_inc: - # group members can be anonymous? this violates the schema but is common + #name = camel_to_snake(self.cls.neurodata_type_inc) name = self.cls.neurodata_type_inc if name is None: @@ -62,125 +141,6 @@ class ClassAdapter(Adapter): return name - - - def handle_arraylike(self, dataset: Dataset, name:Optional[str]=None) -> Optional[ClassDefinition | SlotDefinition]: - """ - Handling the - - - dims - - shape - - dtype - - fields as they are used in datasets. We'll use the :class:`.Arraylike` class to imitate them. - - Specifically: - - - Each slot within a subclass indicates a possible dimension. - - Only dimensions that are present in all the dimension specifiers in the - original schema are required. - - Shape requirements are indicated using max/min cardinalities on the slot. - - The arraylike object should be stored in the `array` slot on the containing class - (since there are already properties named `data`) - - If any of `dims`, `shape`, or `dtype` are undefined, return `None` - - Args: - dataset (:class:`nwb_schema_language.Dataset`): The dataset defining the arraylike - name (str): If present, override the name of the class before appending _Array - (we don't use _get_full_name here because we want to eventually decouple these functions from this adapter - class, which is sort of a development crutch. Ideally all these methods would just work on base nwb schema language types) - """ - if not any((dataset.dims, dataset.shape)): - # none of the required properties are defined, that's fine. - return - elif not all((dataset.dims, dataset.shape)): - # need to have both if one is present! 
- raise ValueError(f"A dataset needs both dims and shape to define an arraylike object") - - # Special cases - if dataset.neurodata_type_inc == 'VectorData': - # Handle this in `handle_vectorlike` instead - return None - - # The schema language doesn't have a way of specifying a dataset/group is "abstract" - # and yet hdmf-common says you don't need a dtype if the dataset is "abstract" - # so.... - dtype = self.handle_dtype(dataset.dtype) - - # dims and shape are lists of lists. First we couple them - # (so each dim has its corresponding shape).. - # and then we take unique - # (dicts are ordered by default in recent pythons, - # while set() doesn't preserve order) - dims_shape = [] - for inner_dim, inner_shape in zip(dataset.dims, dataset.shape): - if isinstance(inner_dim, list): - # list of lists - dims_shape.extend([(dim, shape) for dim, shape in zip(inner_dim, inner_shape)]) - else: - # single-layer list - dims_shape.append((inner_dim, inner_shape)) - - dims_shape = tuple(dict.fromkeys(dims_shape).keys()) - - # if we only have one possible dimension, it's equivalent to a list, so we just return the slot - if len(dims_shape) == 1 and self.parent: - quantity = QUANTITY_MAP[dataset.quantity] - slot = SlotDefinition( - name=dataset.name, - range = dtype, - description=dataset.doc, - required=quantity['required'], - multivalued=True - ) - return slot - - # now make slots for each of them - slots = [] - for dims, shape in dims_shape: - # if a dim is present in all possible combinations of dims, make it required - if all([dims in inner_dim for inner_dim in dataset.dims]): - required = True - else: - required = False - - # use cardinality to do shape - if shape == 'null': - cardinality = None - else: - cardinality = shape - - slots.append(SlotDefinition( - name=dims, - required=required, - maximum_cardinality=cardinality, - minimum_cardinality=cardinality, - range=dtype - )) - - - - # and then the class is just a subclass of `Arraylike` (which is imported by default from `nwb.language.yaml`) - if name: - pass - elif dataset.neurodata_type_def: - name = dataset.neurodata_type_def - elif dataset.name: - name = dataset.name - else: - raise ValueError(f"Dataset has no name or type definition, what do call it?") - - name = '__'.join([name, 'Array']) - - array_class = ClassDefinition( - name=name, - is_a="Arraylike", - attributes=slots - ) - return array_class - - def handle_dtype(self, dtype: DTypeType | None) -> str: if isinstance(dtype, ReferenceDtype): return dtype.target_type @@ -201,128 +161,49 @@ class ClassAdapter(Adapter): # flat dtype return dtype - def build_attrs(self, cls: Dataset | Group) -> List[SlotDefinition]: - attrs = [ - SlotDefinition( - name=attr.name, - description=attr.doc, - range=self.handle_dtype(attr.dtype), - ) for attr in cls.attributes - ] - - return attrs - - def build_subclasses(self, cls: Dataset | Group) -> BuildResult: + def build_name_slot(self) -> SlotDefinition: """ - Build nested groups and datasets + If a class has a name, then that name should be a slot with a + fixed value. 
+
+        If a class does not have a name, then name should be a required attribute.
+
+        References:
+            https://github.com/NeurodataWithoutBorders/nwb-schema/issues/552#issuecomment-1700319001
+
+        Returns: the `name` SlotDefinition
 
-        Create ClassDefinitions for each, but then also create SlotDefinitions that
-        will be used as attributes linking the main class to the subclasses
         """
-        # build and flatten nested classes
-        nested_classes = [ClassAdapter(cls=dset, parent=self) for dset in cls.datasets]
-        nested_classes.extend([ClassAdapter(cls=grp, parent=self) for grp in cls.groups])
-        nested_res = BuildResult()
-        for subclass in nested_classes:
-            # handle the special case where `VectorData` is subclasssed without any dims or attributes
-            # which just gets instantiated as a 1-d array in HDF5
-            if subclass.cls.neurodata_type_inc == 'VectorData' and \
-                not subclass.cls.dims and \
-                not subclass.cls.shape and \
-                not subclass.cls.attributes \
-                and subclass.cls.name:
-                this_slot = SlotDefinition(
-                    name=subclass.cls.name,
-                    description=subclass.cls.doc,
-                    range=self.handle_dtype(subclass.cls.dtype),
-                    multivalued=True
-                )
-                nested_res.slots.append(this_slot)
-                continue
-
-            # Simplify datasets that are just a single value
-            elif isinstance(subclass.cls, Dataset) and \
-                not subclass.cls.neurodata_type_inc and \
-                not subclass.cls.attributes and \
-                not subclass.cls.dims and \
-                not subclass.cls.shape and \
-                subclass.cls.name:
-                this_slot = SlotDefinition(
-                    name=subclass.cls.name,
-                    description=subclass.cls.doc,
-                    range=self.handle_dtype(subclass.cls.dtype),
-                    **QUANTITY_MAP[subclass.cls.quantity]
-                )
-                nested_res.slots.append(this_slot)
-                continue
-
-            else:
-                this_slot = SlotDefinition(
-                    name=subclass._get_name(),
-                    description=subclass.cls.doc,
-                    range=subclass._get_full_name(),
-                    **QUANTITY_MAP[subclass.cls.quantity]
-                )
-                nested_res.slots.append(this_slot)
-
-            if subclass.cls.name is None and subclass.cls.neurodata_type_def is None:
-                # anonymous group that's just an inc, we only need the slot since the class is defined elsewhere
-                continue
-
-            this_build = subclass.build()
-            nested_res += this_build
-        return nested_res
-
-
-    def build(self) -> BuildResult:
-
-        # Build this class
-        if self.parent is not None:
-            name = self._get_full_name()
+        if self.cls.name:
+            name_slot = SlotDefinition(
+                name='name',
+                required=True,
+                ifabsent=self.cls.name,
+                equals_string=self.cls.name,
+                range='string'
+            )
         else:
-            name = self._get_name()
+            name_slot = SlotDefinition(
+                name='name',
+                required=True,
+                range='string'
+            )
+        return name_slot
 
-        # Get vanilla top-level attributes
-        attrs = self.build_attrs(self.cls)
-
-        # unnest and build subclasses in datasets and groups
-        if isinstance(self.cls, Group):
-            # only groups have sub-datasets and sub-groups
-            # split out the recursion step rather than making purely recursive because
-            # top-level datasets and groups are handled differently - they have names,
-            # and so we need to split out which things we unnest and which things
-            # can just be slots because they are already defined without knowing about
-            # the global state of the schema build.
-            nested_res = self.build_subclasses(self.cls)
-            attrs.extend(nested_res.slots)
-        else:
-            # must be a dataset
-            nested_res = BuildResult()
-            arraylike = self.handle_arraylike(self.cls, self._get_full_name())
-            if arraylike:
-                # if the arraylike thing can only have one dimension, it's equivalent to a list, so
-                # we just add a multivalued slot
-                if isinstance(arraylike, SlotDefinition):
-                    attrs.append(arraylike)
-                else:
-                    # make a slot for the arraylike class
-                    attrs.append(
-                        SlotDefinition(
-                            name='array',
-                            range=arraylike.name
-                        )
-                    )
-                    nested_res.classes.append(arraylike)
-
-
-        cls = ClassDefinition(
-            name = name,
-            is_a = self.cls.neurodata_type_inc,
+    def build_self_slot(self) -> SlotDefinition:
+        """
+        If we are a child class, we make a slot so our parent can refer to us
+        """
+        return SlotDefinition(
+            name=self._get_attr_name(),
             description=self.cls.doc,
-            attributes=attrs,
-        )
-        res = BuildResult(
-            classes = [cls, *nested_res.classes]
+            range=self._get_full_name(),
+            **QUANTITY_MAP[self.cls.quantity]
         )
-        return res
\ No newline at end of file
+
+
+
+
+
+
diff --git a/nwb_linkml/adapters/dataset.py b/nwb_linkml/adapters/dataset.py
new file mode 100644
index 0000000..793eba0
--- /dev/null
+++ b/nwb_linkml/adapters/dataset.py
@@ -0,0 +1,198 @@
+"""
+Adapter for NWB datasets to linkml Classes
+"""
+from typing import Optional, List
+
+from linkml_runtime.linkml_model import ClassDefinition, SlotDefinition
+from pydantic import PrivateAttr
+
+from nwb_schema_language import Dataset, ReferenceDtype, CompoundDtype, DTypeType
+from nwb_linkml.adapters.classes import ClassAdapter
+from nwb_linkml.adapters.adapter import BuildResult
+from nwb_linkml.maps import QUANTITY_MAP
+
+class DatasetAdapter(ClassAdapter):
+    cls: Dataset
+
+    _handlers: List[str] = PrivateAttr(default_factory=list)
+    """Keep track of which handlers have been called"""
+
+
+    def build(self) -> BuildResult:
+        res = self.build_base()
+
+        res = self.handle_arraylike(res, self.cls, self._get_full_name())
+        res = self.handle_1d_vector(res)
+        res = self.handle_scalar(res)
+
+        return res
+
+    def handle_scalar(self, res:BuildResult) -> BuildResult:
+
+        # Simplify datasets that are just a single value
+        if self.cls.neurodata_type_inc != 'VectorData' and \
+            not self.cls.neurodata_type_inc and \
+            not self.cls.attributes and \
+            not self.cls.dims and \
+            not self.cls.shape and \
+            self.cls.name:
+            self._handlers.append('scalar')
+            # throw out the class that would have been made for us
+            # we just need a slot
+            this_slot = SlotDefinition(
+                name=self.cls.name,
+                description=self.cls.doc,
+                range=self.handle_dtype(self.cls.dtype),
+                **QUANTITY_MAP[self.cls.quantity]
+            )
+            res = BuildResult(slots = [this_slot])
+
+        return res
+
+
+    def handle_1d_vector(self, res: BuildResult) -> BuildResult:
+        # handle the special case where `VectorData` is subclassed without any dims or attributes
+        # which just gets instantiated as a 1-d array in HDF5
+        if self.cls.neurodata_type_inc == 'VectorData' and \
+            not self.cls.dims and \
+            not self.cls.shape and \
+            not self.cls.attributes \
+            and self.cls.name:
+            self._handlers.append('1d_vector')
+            this_slot = SlotDefinition(
+                name=self.cls.name,
+                description=self.cls.doc,
+                range=self.handle_dtype(self.cls.dtype),
+                multivalued=True
+            )
+            # No need to make a class for us, so we replace the existing build results
+            res = BuildResult(slots=[this_slot])
+
+        return res
+
+    def handle_arraylike(self, res: BuildResult, dataset: Dataset, name: Optional[str] = None) -> BuildResult:
+        """
+        Handling the
+
+        - dims
+        - shape
+        - dtype
+
+        fields as they are used in datasets. We'll use the :class:`.Arraylike` class to imitate them.
+
+        Specifically:
+
+        - Each slot within a subclass indicates a possible dimension.
+        - Only dimensions that are present in all the dimension specifiers in the
+          original schema are required.
+        - Shape requirements are indicated using max/min cardinalities on the slot.
+        - The arraylike object should be stored in the `array` slot on the containing class
+          (since there are already properties named `data`)
+
+        If `dims` and `shape` are both undefined, return the `BuildResult` unchanged
+
+        Args:
+            dataset (:class:`nwb_schema_language.Dataset`): The dataset defining the arraylike
+            name (str): If present, override the name of the class before appending _Array
+                (we don't use _get_full_name here because we want to eventually decouple these functions from this adapter
+                class, which is sort of a development crutch. Ideally all these methods would just work on base nwb schema language types)
+        """
+        if not any((dataset.dims, dataset.shape)):
+            # none of the required properties are defined, that's fine.
+            return res
+        elif not all((dataset.dims, dataset.shape)):
+            # need to have both if one is present!
+            raise ValueError("A dataset needs both dims and shape to define an arraylike object")
+
+        # Special cases
+        if dataset.neurodata_type_inc == 'VectorData':
+            # Handle this in `handle_1d_vector` instead
+            return res
+
+        # The schema language doesn't have a way of specifying a dataset/group is "abstract"
+        # and yet hdmf-common says you don't need a dtype if the dataset is "abstract"
+        # so....
+        dtype = self.handle_dtype(dataset.dtype)
+
+        # dims and shape are lists of lists. First we couple them
+        # (so each dim has its corresponding shape)..
+        # and then we take unique
+        # (dicts are ordered by default in recent pythons,
+        # while set() doesn't preserve order)
+        dims_shape = []
+        for inner_dim, inner_shape in zip(dataset.dims, dataset.shape):
+            if isinstance(inner_dim, list):
+                # list of lists
+                dims_shape.extend([(dim, shape) for dim, shape in zip(inner_dim, inner_shape)])
+            else:
+                # single-layer list
+                dims_shape.append((inner_dim, inner_shape))
+
+        dims_shape = tuple(dict.fromkeys(dims_shape).keys())
+
+        # if we only have one possible dimension, it's equivalent to a list, so we just return the slot
+        if len(dims_shape) == 1 and self.parent:
+            quantity = QUANTITY_MAP[dataset.quantity]
+            slot = SlotDefinition(
+                name=dataset.name,
+                range=dtype,
+                description=dataset.doc,
+                required=quantity['required'],
+                multivalued=True
+            )
+            res.classes[0].attributes.update({dataset.name: slot})
+            self._handlers.append('arraylike-1d')
+            return res
+
+        # now make slots for each of them
+        slots = []
+        for dims, shape in dims_shape:
+            # if a dim is present in all possible combinations of dims, make it required
+            if all([dims in inner_dim for inner_dim in dataset.dims]):
+                required = True
+            else:
+                required = False
+
+            # use cardinality to do shape
+            if shape == 'null':
+                cardinality = None
+            else:
+                cardinality = shape
+
+            slots.append(SlotDefinition(
+                name=dims,
+                required=required,
+                maximum_cardinality=cardinality,
+                minimum_cardinality=cardinality,
+                range=dtype
+            ))
+
+        # and then the class is just a subclass of `Arraylike` (which is imported by default from `nwb.language.yaml`)
+        if name:
+            pass
+        elif dataset.neurodata_type_def:
+            name = dataset.neurodata_type_def
+        elif dataset.name:
+            name = dataset.name
+        else:
+            raise ValueError("Dataset has no name or type definition, what do we call it?")
+
+        name = '__'.join([name, 'Array'])
+
+        array_class = ClassDefinition(
+            name=name,
+            is_a="Arraylike",
+            attributes=slots
+        )
+        # make a slot for the arraylike class
+        array_slot = SlotDefinition(
+            name='array',
+            range=array_class.name
+        )
+
+        res.classes.append(array_class)
+        res.classes[0].attributes.update({'array': array_slot})
+        #res.slots.append(array_slot)
+        self._handlers.append('arraylike')
+
+        return res
diff --git a/nwb_linkml/adapters/group.py b/nwb_linkml/adapters/group.py
new file mode 100644
index 0000000..a7d5bff
--- /dev/null
+++ b/nwb_linkml/adapters/group.py
@@ -0,0 +1,89 @@
+"""
+Adapter for NWB groups to linkml Classes
+"""
+import pdb
+from typing import List
+from linkml_runtime.linkml_model import ClassDefinition, SlotDefinition
+
+from nwb_schema_language import Dataset, Group, ReferenceDtype, CompoundDtype, DTypeType
+from nwb_linkml.adapters.classes import ClassAdapter
+from nwb_linkml.adapters.dataset import DatasetAdapter
+from nwb_linkml.adapters.adapter import BuildResult
+from nwb_linkml.maps import QUANTITY_MAP
+
+class GroupAdapter(ClassAdapter):
+    cls: Group
+
+    def build(self) -> BuildResult:
+
+
+        nested_res = self.build_subclasses()
+        # we don't propagate slots up to the next level since they are meant for this
+        # level (i.e. a way to refer to our children)
+        res = self.build_base(extra_attrs=nested_res.slots)
+        # we do propagate classes though
+        res.classes.extend(nested_res.classes)
+
+        return res
+
+    def handle_children(self, children: List[Group]) -> BuildResult:
+        """
+        Make a special LinkML `children` slot that can
+        hold any number of objects of the classes given by `neurodata_type_inc`
+
+        Args:
+            children (List[:class:`.Group`]): Child groups
+
+        """
+        child_slot = SlotDefinition(
+            name='children',
+            multivalued=True,
+            any_of=[{'range': cls.neurodata_type_inc} for cls in children]
+        )
+        return BuildResult(slots=[child_slot])
+
+    def build_subclasses(self) -> BuildResult:
+        """
+        Build nested groups and datasets
+
+        Create ClassDefinitions for each, but then also create SlotDefinitions that
+        will be used as attributes linking the main class to the subclasses
+        """
+        # Datasets are simple, they are terminal classes, and all logic
+        # for creating slots vs. classes is handled by the adapter class
+        dataset_res = BuildResult()
+        for dset in self.cls.datasets:
+            # if dset.name == 'timestamps':
+            #     pdb.set_trace()
+            dset_adapter = DatasetAdapter(cls=dset, parent=self)
+            dataset_res += dset_adapter.build()
+
+        # Actually I'm not sure we have to special-case this; we could handle it in
+        # i/o instead
+
+        # Groups are a bit more complicated because they can also behave like
+        # range declarations:
+        # e.g. a group can have multiple groups with `neurodata_type_inc`, no name, and quantity of *,
+        # the group can then contain any number of groups of those included types as direct children
+
+        # group_res = BuildResult()
+        # children = []
+        # for group in self.cls.groups:
+        #     if not group.name and \
+        #         group.quantity == '*' and \
+        #         group.neurodata_type_inc:
+        #         children.append(group)
+        #     else:
+        #         group_adapter = GroupAdapter(cls=group, parent=self)
+        #         group_res += group_adapter.build()
+        #
+        # group_res += self.handle_children(children)
+
+        group_res = BuildResult()
+        for group in self.cls.groups:
+            group_adapter = GroupAdapter(cls=group, parent=self)
+            group_res += group_adapter.build()
+
+        res = dataset_res + group_res
+
+        return res
\ No newline at end of file
diff --git a/nwb_linkml/adapters/schema.py b/nwb_linkml/adapters/schema.py
index 1111beb..1c4a878 100644
--- a/nwb_linkml/adapters/schema.py
+++ b/nwb_linkml/adapters/schema.py
@@ -8,7 +8,8 @@ from pathlib import Path
 from pydantic import Field
 
 from nwb_linkml.adapters.adapter import Adapter, BuildResult
-from nwb_linkml.adapters.classes import ClassAdapter
+from nwb_linkml.adapters.dataset import DatasetAdapter
+from nwb_linkml.adapters.group import GroupAdapter
 
 if TYPE_CHECKING:
     from nwb_linkml.adapters.namespaces import NamespacesAdapter
@@ -68,17 +69,17 @@ class SchemaAdapter(Adapter):
 
         """
-        classes = [ClassAdapter(cls=dset) for dset in self.datasets]
-        classes.extend(ClassAdapter(cls=group) for group in self.groups)
-        built_results = None
-        for cls in classes:
-            if built_results is None:
-                built_results = cls.build()
-            else:
-                built_results += cls.build()
+        res = BuildResult()
+        for dset in self.datasets:
+            res += DatasetAdapter(cls=dset).build()
+        for group in self.groups:
+            res += GroupAdapter(cls=group).build()
+
+        if len(res.slots) > 0:
+            raise RuntimeError('Generated schema in this translation can only have classes; all slots should be attributes within a class')
 
         if self.split:
-            sch_split = self.split_subclasses(built_results)
+            sch_split = self.split_subclasses(res)
             return sch_split
 
         else:
@@ -86,9 +87,9 @@
                 name = self.name,
                 id = self.name,
                 imports = [i.name for i in self.imports],
-                classes=built_results.classes,
-                slots=built_results.slots,
-                types=built_results.types
+                classes=res.classes,
+                slots=res.slots,
+                types=res.types
             )
         # every schema needs the language elements
         sch.imports.append('nwb.language')
diff --git a/nwb_schema_language/docs/CHANGELOG.md b/nwb_schema_language/docs/CHANGELOG.md
new file mode 100644
index 0000000..06796a7
--- /dev/null
+++ b/nwb_schema_language/docs/CHANGELOG.md
@@ -0,0 +1,6 @@
+# 0.1.1
+
+Revised models to make `name` an optional slot regardless of presence/absence
+of `neurodata_type_def`; the naming of individual classes within the schema will be
+handled by `nwb_linkml` - see:
+https://github.com/NeurodataWithoutBorders/nwb-schema/issues/552
\ No newline at end of file
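
For a sense of how the refactored pieces compose: SchemaAdapter now delegates to
DatasetAdapter and GroupAdapter, each of which calls the shared ClassAdapter.build_base()
and then applies its own special-case handlers. A minimal sketch of driving the new
DatasetAdapter directly (the example dataset is hypothetical, and it assumes the
nwb_schema_language Dataset model and QUANTITY_MAP accept the values shown):

    from nwb_schema_language import Dataset
    from nwb_linkml.adapters.dataset import DatasetAdapter

    # A named scalar dataset: no neurodata_type_inc, attributes, dims, or shape,
    # so handle_scalar() should throw away the class made by build_base() and
    # collapse the whole thing into a single slot
    dset = Dataset(
        name='reference_frame',  # hypothetical example values
        doc='Description of what this frame of reference is',
        dtype='text',
        quantity='?',
    )
    res = DatasetAdapter(cls=dset).build()

    assert len(res.classes) == 0
    assert res.slots[0].name == 'reference_frame'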
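
The name-slot behavior from build_name_slot() can be sketched the same way: a fixed
name becomes a required `name` attribute pinned with equals_string/ifabsent, while an
anonymous class gets a free-form required `name`. The Group values are again
hypothetical, and this assumes linkml_runtime normalizes the attribute list into a
name-keyed dict as it does for other inlined slots:

    from nwb_schema_language import Group
    from nwb_linkml.adapters.group import GroupAdapter

    # A group with a fixed name and no child groups or datasets (hypothetical)
    grp = Group(name='acquisition', doc='Data streams recorded during the session')
    res = GroupAdapter(cls=grp).build()

    name_attr = res.classes[0].attributes['name']
    assert name_attr.equals_string == 'acquisition'
    assert name_attr.required is True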