Need to go home, in the middle of refactoring group and dataset as well as implementing the name property correctly

This commit is contained in:
sneakers-the-rat 2023-08-31 00:01:43 -07:00
parent fd9aef9531
commit 3568037a1e
5 changed files with 432 additions and 257 deletions

View file

@ -2,6 +2,8 @@
Adapters to linkML classes Adapters to linkML classes
""" """
import pdb import pdb
import re
from abc import abstractmethod
from typing import List, Optional from typing import List, Optional
from nwb_schema_language import Dataset, Group, ReferenceDtype, CompoundDtype, DTypeType from nwb_schema_language import Dataset, Group, ReferenceDtype, CompoundDtype, DTypeType
from nwb_linkml.adapters.adapter import Adapter, BuildResult from nwb_linkml.adapters.adapter import Adapter, BuildResult
@ -9,17 +11,95 @@ from linkml_runtime.linkml_model import ClassDefinition, SlotDefinition
from nwb_linkml.maps import QUANTITY_MAP from nwb_linkml.maps import QUANTITY_MAP
from nwb_linkml.lang_elements import Arraylike from nwb_linkml.lang_elements import Arraylike
CAMEL_TO_SNAKE = re.compile(r'((?<=[a-z0-9])[A-Z]|(?!^)[A-Z](?=[a-z]))')
"""
Pattern matching every capital letter that begins a new word in a camel case name:
either a capital preceded by a lowercase letter or digit, or a capital (not at the
start of the string) followed by a lowercase letter.

courtesy of: https://stackoverflow.com/a/12867228
"""


def camel_to_snake(name: str) -> str:
    """
    Convert camel case to snake case

    An underscore is inserted before each capital matched by
    :data:`CAMEL_TO_SNAKE` and the whole string is then lowercased, so runs of
    capitals stay together (``HTTPResponse`` -> ``http_response``).

    courtesy of: https://stackoverflow.com/a/12867228

    Args:
        name (str): The camel case (or already snake case) name

    Returns:
        str: The snake case version of ``name``
    """
    return CAMEL_TO_SNAKE.sub(r'_\1', name).lower()
class ClassAdapter(Adapter): class ClassAdapter(Adapter):
""" """
Adapter to class-like things in linkml, including datasets and groups Abstract adapter to class-like things in linkml, holds methods common to
both DatasetAdapter and GroupAdapter
""" """
cls: Dataset | Group cls: Dataset | Group
parent: Optional['ClassAdapter'] = None parent: Optional['ClassAdapter'] = None
@abstractmethod
def build(self) -> BuildResult:
    """
    Abstract entry point, declared here so this class can't be instantiated directly.

    Concrete adapters implement this and call :meth:`.build_base` for the parts
    that are common to groups and datasets.
    """
def build_base(self, extra_attrs: Optional[List[SlotDefinition]]=None) -> BuildResult:
    """
    Create the class definition shared by groups and datasets, before any
    group- or dataset-specific changes are applied.

    Args:
        extra_attrs: Additional slots to attach to the class as attributes.
            A single bare :class:`SlotDefinition` is also accepted.

    Returns:
        BuildResult: holds the generated class and, when this adapter has a
        parent, the slot made by :meth:`build_self_slot`.
    """
    # adapters with a parent use the full name, the rest use the attribute name
    name = self._get_attr_name() if self.parent is None else self._get_full_name()

    # plain attributes declared on the schema object, plus the `name` slot
    attrs = self.build_attrs(self.cls)
    attrs.append(self.build_name_slot())

    if extra_attrs is not None:
        # wrap a lone slot so it can be extended like a list
        extras = [extra_attrs] if isinstance(extra_attrs, SlotDefinition) else extra_attrs
        attrs.extend(extras)

    definition = ClassDefinition(
        name=name,
        is_a=self.cls.neurodata_type_inc,
        description=self.cls.doc,
        attributes=attrs,
    )

    # only child classes need a slot for their parent to refer to them
    self_slots = []
    if self.parent is not None:
        self_slots.append(self.build_self_slot())

    return BuildResult(classes=[definition], slots=self_slots)
def build_attrs(self, cls: Dataset | Group) -> List[SlotDefinition]:
    """
    Make one slot per attribute declared on ``cls``.

    Each slot takes the attribute's name, uses its ``doc`` as the description and
    the result of :meth:`handle_dtype` on its ``dtype`` as the range.
    """
    slots = []
    for attribute in cls.attributes:
        slot_range = self.handle_dtype(attribute.dtype)
        slots.append(
            SlotDefinition(
                name=attribute.name,
                description=attribute.doc,
                range=slot_range,
            )
        )
    return slots
def _get_full_name(self) -> str: def _get_full_name(self) -> str:
"""The full name of the object in the generated linkml """The full name of the object in the generated linkml
Distinct from 'name' which is the thing that's often used in """ Distinct from 'name' which is the thing that's used to define position in
a hierarchical data setting
"""
if self.cls.neurodata_type_def: if self.cls.neurodata_type_def:
name = self.cls.neurodata_type_def name = self.cls.neurodata_type_def
elif self.cls.name is not None: elif self.cls.name is not None:
@ -39,22 +119,21 @@ class ClassAdapter(Adapter):
return name return name
def _get_name(self) -> str: def _get_attr_name(self) -> str:
""" """
Get the "regular" name, which is used as the name of the attr Get the name to use as the attribute name,
again distinct from the actual name of the instantiated object
Returns:
""" """
# return self._get_full_name() # return self._get_full_name()
name = None name = None
if self.cls.neurodata_type_def: if self.cls.neurodata_type_def:
#name = camel_to_snake(self.cls.neurodata_type_def)
name = self.cls.neurodata_type_def name = self.cls.neurodata_type_def
elif self.cls.name is not None: elif self.cls.name is not None:
# we do have a unique name # we do have a unique name
name = self.cls.name name = self.cls.name
elif self.cls.neurodata_type_inc: elif self.cls.neurodata_type_inc:
# group members can be anonymous? this violates the schema but is common #name = camel_to_snake(self.cls.neurodata_type_inc)
name = self.cls.neurodata_type_inc name = self.cls.neurodata_type_inc
if name is None: if name is None:
@ -62,125 +141,6 @@ class ClassAdapter(Adapter):
return name return name
def handle_arraylike(self, dataset: Dataset, name:Optional[str]=None) -> Optional[ClassDefinition | SlotDefinition]:
"""
Handling the
- dims
- shape
- dtype
fields as they are used in datasets. We'll use the :class:`.Arraylike` class to imitate them.
Specifically:
- Each slot within a subclass indicates a possible dimension.
- Only dimensions that are present in all the dimension specifiers in the
original schema are required.
- Shape requirements are indicated using max/min cardinalities on the slot.
- The arraylike object should be stored in the `array` slot on the containing class
(since there are already properties named `data`)
If any of `dims`, `shape`, or `dtype` are undefined, return `None`
Args:
dataset (:class:`nwb_schema_language.Dataset`): The dataset defining the arraylike
name (str): If present, override the name of the class before appending _Array
(we don't use _get_full_name here because we want to eventually decouple these functions from this adapter
class, which is sort of a development crutch. Ideally all these methods would just work on base nwb schema language types)
"""
if not any((dataset.dims, dataset.shape)):
# none of the required properties are defined, that's fine.
return
elif not all((dataset.dims, dataset.shape)):
# need to have both if one is present!
raise ValueError(f"A dataset needs both dims and shape to define an arraylike object")
# Special cases
if dataset.neurodata_type_inc == 'VectorData':
# Handle this in `handle_vectorlike` instead
return None
# The schema language doesn't have a way of specifying a dataset/group is "abstract"
# and yet hdmf-common says you don't need a dtype if the dataset is "abstract"
# so....
dtype = self.handle_dtype(dataset.dtype)
# dims and shape are lists of lists. First we couple them
# (so each dim has its corresponding shape)..
# and then we take unique
# (dicts are ordered by default in recent pythons,
# while set() doesn't preserve order)
dims_shape = []
for inner_dim, inner_shape in zip(dataset.dims, dataset.shape):
if isinstance(inner_dim, list):
# list of lists
dims_shape.extend([(dim, shape) for dim, shape in zip(inner_dim, inner_shape)])
else:
# single-layer list
dims_shape.append((inner_dim, inner_shape))
dims_shape = tuple(dict.fromkeys(dims_shape).keys())
# if we only have one possible dimension, it's equivalent to a list, so we just return the slot
if len(dims_shape) == 1 and self.parent:
quantity = QUANTITY_MAP[dataset.quantity]
slot = SlotDefinition(
name=dataset.name,
range = dtype,
description=dataset.doc,
required=quantity['required'],
multivalued=True
)
return slot
# now make slots for each of them
slots = []
for dims, shape in dims_shape:
# if a dim is present in all possible combinations of dims, make it required
if all([dims in inner_dim for inner_dim in dataset.dims]):
required = True
else:
required = False
# use cardinality to do shape
if shape == 'null':
cardinality = None
else:
cardinality = shape
slots.append(SlotDefinition(
name=dims,
required=required,
maximum_cardinality=cardinality,
minimum_cardinality=cardinality,
range=dtype
))
# and then the class is just a subclass of `Arraylike` (which is imported by default from `nwb.language.yaml`)
if name:
pass
elif dataset.neurodata_type_def:
name = dataset.neurodata_type_def
elif dataset.name:
name = dataset.name
else:
raise ValueError(f"Dataset has no name or type definition, what do call it?")
name = '__'.join([name, 'Array'])
array_class = ClassDefinition(
name=name,
is_a="Arraylike",
attributes=slots
)
return array_class
def handle_dtype(self, dtype: DTypeType | None) -> str: def handle_dtype(self, dtype: DTypeType | None) -> str:
if isinstance(dtype, ReferenceDtype): if isinstance(dtype, ReferenceDtype):
return dtype.target_type return dtype.target_type
@ -201,128 +161,49 @@ class ClassAdapter(Adapter):
# flat dtype # flat dtype
return dtype return dtype
def build_attrs(self, cls: Dataset | Group) -> List[SlotDefinition]: def build_name_slot(self) -> SlotDefinition:
attrs = [
SlotDefinition(
name=attr.name,
description=attr.doc,
range=self.handle_dtype(attr.dtype),
) for attr in cls.attributes
]
return attrs
def build_subclasses(self, cls: Dataset | Group) -> BuildResult:
""" """
Build nested groups and datasets If a class has a name, then that name should be a slot with a
fixed value.
If a class does not have a name, then name should be a required attribute
References:
https://github.com/NeurodataWithoutBorders/nwb-schema/issues/552#issuecomment-1700319001
Returns:
Create ClassDefinitions for each, but then also create SlotDefinitions that
will be used as attributes linking the main class to the subclasses
""" """
# build and flatten nested classes if self.cls.name:
nested_classes = [ClassAdapter(cls=dset, parent=self) for dset in cls.datasets] name_slot = SlotDefinition(
nested_classes.extend([ClassAdapter(cls=grp, parent=self) for grp in cls.groups]) name='name',
nested_res = BuildResult() required=True,
for subclass in nested_classes: ifabsent=self.cls.name,
# handle the special case where `VectorData` is subclasssed without any dims or attributes equals_string=self.cls.name,
# which just gets instantiated as a 1-d array in HDF5 range='string'
if subclass.cls.neurodata_type_inc == 'VectorData' and \ )
not subclass.cls.dims and \
not subclass.cls.shape and \
not subclass.cls.attributes \
and subclass.cls.name:
this_slot = SlotDefinition(
name=subclass.cls.name,
description=subclass.cls.doc,
range=self.handle_dtype(subclass.cls.dtype),
multivalued=True
)
nested_res.slots.append(this_slot)
continue
# Simplify datasets that are just a single value
elif isinstance(subclass.cls, Dataset) and \
not subclass.cls.neurodata_type_inc and \
not subclass.cls.attributes and \
not subclass.cls.dims and \
not subclass.cls.shape and \
subclass.cls.name:
this_slot = SlotDefinition(
name=subclass.cls.name,
description=subclass.cls.doc,
range=self.handle_dtype(subclass.cls.dtype),
**QUANTITY_MAP[subclass.cls.quantity]
)
nested_res.slots.append(this_slot)
continue
else:
this_slot = SlotDefinition(
name=subclass._get_name(),
description=subclass.cls.doc,
range=subclass._get_full_name(),
**QUANTITY_MAP[subclass.cls.quantity]
)
nested_res.slots.append(this_slot)
if subclass.cls.name is None and subclass.cls.neurodata_type_def is None:
# anonymous group that's just an inc, we only need the slot since the class is defined elsewhere
continue
this_build = subclass.build()
nested_res += this_build
return nested_res
def build(self) -> BuildResult:
# Build this class
if self.parent is not None:
name = self._get_full_name()
else: else:
name = self._get_name() name_slot = SlotDefinition(
name='name',
required=True,
range='string'
)
return name_slot
# Get vanilla top-level attributes def build_self_slot(self) -> SlotDefinition:
attrs = self.build_attrs(self.cls) """
If we are a child class, we make a slot so our parent can refer to us
# unnest and build subclasses in datasets and groups """
if isinstance(self.cls, Group): return SlotDefinition(
# only groups have sub-datasets and sub-groups name=self._get_attr_name(),
# split out the recursion step rather than making purely recursive because
# top-level datasets and groups are handled differently - they have names,
# and so we need to split out which things we unnest and which things
# can just be slots because they are already defined without knowing about
# the global state of the schema build.
nested_res = self.build_subclasses(self.cls)
attrs.extend(nested_res.slots)
else:
# must be a dataset
nested_res = BuildResult()
arraylike = self.handle_arraylike(self.cls, self._get_full_name())
if arraylike:
# if the arraylike thing can only have one dimension, it's equivalent to a list, so
# we just add a multivalued slot
if isinstance(arraylike, SlotDefinition):
attrs.append(arraylike)
else:
# make a slot for the arraylike class
attrs.append(
SlotDefinition(
name='array',
range=arraylike.name
)
)
nested_res.classes.append(arraylike)
cls = ClassDefinition(
name = name,
is_a = self.cls.neurodata_type_inc,
description=self.cls.doc, description=self.cls.doc,
attributes=attrs, range=self._get_full_name(),
) **QUANTITY_MAP[self.cls.quantity]
res = BuildResult(
classes = [cls, *nested_res.classes]
) )
return res

View file

@ -0,0 +1,198 @@
"""
Adapter for NWB datasets to linkml Classes
"""
from typing import Optional, List
from linkml_runtime.linkml_model import ClassDefinition, SlotDefinition
from pydantic import PrivateAttr
from nwb_schema_language import Dataset, ReferenceDtype, CompoundDtype, DTypeType
from nwb_linkml.adapters.classes import ClassAdapter
from nwb_linkml.adapters.adapter import BuildResult
from nwb_linkml.maps import QUANTITY_MAP
class DatasetAdapter(ClassAdapter):
    """
    Adapter that turns an NWB :class:`Dataset` into LinkML classes and slots.

    :meth:`build` starts from the generic class made by ``build_base`` and then
    runs each ``handle_*`` method in turn. A handler either returns the result
    untouched, adds to it, or replaces it outright.
    """
    cls: Dataset
    _handlers: List[str] = PrivateAttr(default_factory=list)
    """Keep track of which handlers have been called"""

    def build(self) -> BuildResult:
        """
        Build the base class, then apply the dataset handlers in order:
        arraylike, 1-d vector, scalar.

        Returns:
            BuildResult: the result returned by the last handler
        """
        res = self.build_base()
        res = self.handle_arraylike(res, self.cls, self._get_full_name())
        res = self.handle_1d_vector(res)
        res = self.handle_scalar(res)
        return res

    def handle_scalar(self, res: BuildResult) -> BuildResult:
        """
        Simplify datasets that are just a single value.

        Applies when the dataset has a name but no ``neurodata_type_inc``,
        attributes, dims or shape. In that case the class that would have been
        made is thrown out and replaced by a single slot.

        Args:
            res: The build result so far

        Returns:
            BuildResult: ``res`` unchanged, or a new result holding only the slot
        """
        # a falsy neurodata_type_inc can never be 'VectorData', so no separate test is needed
        is_scalar = (
            not self.cls.neurodata_type_inc
            and not self.cls.attributes
            and not self.cls.dims
            and not self.cls.shape
            and self.cls.name
        )
        if not is_scalar:
            return res

        self._handlers.append('scalar')
        # throw out the class that would have been made for us
        # we just need a slot
        this_slot = SlotDefinition(
            name=self.cls.name,
            description=self.cls.doc,
            range=self.handle_dtype(self.cls.dtype),
            **QUANTITY_MAP[self.cls.quantity]
        )
        return BuildResult(slots=[this_slot])

    def handle_1d_vector(self, res: BuildResult) -> BuildResult:
        """
        Handle the special case where `VectorData` is subclassed without any dims
        or attributes, which just gets instantiated as a 1-d array in HDF5.

        Applies when the dataset is named, includes ``VectorData`` and has no
        dims, shape or attributes. The existing build result is then replaced by
        a single multivalued slot.

        Args:
            res: The build result so far

        Returns:
            BuildResult: ``res`` unchanged, or a new result holding only the slot
        """
        is_1d_vector = (
            self.cls.neurodata_type_inc == 'VectorData'
            and not self.cls.dims
            and not self.cls.shape
            and not self.cls.attributes
            and self.cls.name
        )
        if not is_1d_vector:
            return res

        self._handlers.append('1d_vector')
        this_slot = SlotDefinition(
            name=self.cls.name,
            description=self.cls.doc,
            range=self.handle_dtype(self.cls.dtype),
            multivalued=True
        )
        # No need to make a class for us, so we replace the existing build results
        return BuildResult(slots=[this_slot])

    def handle_arraylike(self, res: BuildResult, dataset: Dataset, name: Optional[str] = None) -> BuildResult:
        """
        Handling the

        - dims
        - shape
        - dtype

        fields as they are used in datasets. We'll use the :class:`.Arraylike` class to imitate them.

        Specifically:

        - Each slot within a subclass indicates a possible dimension.
        - Only dimensions that are present in all the dimension specifiers in the
          original schema are required.
        - Shape requirements are indicated using max/min cardinalities on the slot.
        - The arraylike object should be stored in the `array` slot on the containing class
          (since there are already properties named `data`)

        If neither `dims` nor `shape` is defined, or the dataset includes
        `VectorData`, ``res`` is returned unchanged.

        Args:
            res (:class:`.BuildResult`): The build result so far. Its first class is
                modified in place and any generated array class is appended.
            dataset (:class:`nwb_schema_language.Dataset`): The dataset defining the arraylike
            name (str): If present, override the name of the class before appending __Array
                (we don't use _get_full_name here because we want to eventually decouple these functions from this adapter
                class, which is sort of a development crutch. Ideally all these methods would just work on base nwb schema language types)

        Returns:
            BuildResult: ``res``, possibly with an added attribute and array class

        Raises:
            ValueError: if only one of `dims` and `shape` is given, or if no name
                can be found for the array class
        """
        if not any((dataset.dims, dataset.shape)):
            # none of the required properties are defined, that's fine.
            return res
        elif not all((dataset.dims, dataset.shape)):
            # need to have both if one is present!
            raise ValueError("A dataset needs both dims and shape to define an arraylike object")

        # Special cases
        if dataset.neurodata_type_inc == 'VectorData':
            # Handled in `handle_1d_vector` instead
            return res

        # The schema language doesn't have a way of specifying a dataset/group is "abstract"
        # and yet hdmf-common says you don't need a dtype if the dataset is "abstract"
        # so....
        dtype = self.handle_dtype(dataset.dtype)

        # dims and shape are lists of lists. First we couple them
        # (so each dim has its corresponding shape)..
        # and then we take unique
        # (dicts are ordered by default in recent pythons,
        # while set() doesn't preserve order)
        dims_shape = []
        for inner_dim, inner_shape in zip(dataset.dims, dataset.shape):
            if isinstance(inner_dim, list):
                # list of lists
                dims_shape.extend(zip(inner_dim, inner_shape))
            else:
                # single-layer list
                dims_shape.append((inner_dim, inner_shape))
        dims_shape = tuple(dict.fromkeys(dims_shape))

        # if we only have one possible dimension, it's equivalent to a list, so we just add the slot
        if len(dims_shape) == 1 and self.parent:
            quantity = QUANTITY_MAP[dataset.quantity]
            slot = SlotDefinition(
                name=dataset.name,
                range=dtype,
                description=dataset.doc,
                required=quantity['required'],
                multivalued=True
            )
            res.classes[0].attributes.update({dataset.name: slot})
            self._handlers.append('arraylike-1d')
            return res

        # Normalize dims to a list of options so membership is tested against lists.
        # A flat list of names is one option; testing `dim in other_name` on it would
        # be a substring check and would wrongly mark its dimensions as optional.
        if any(isinstance(inner_dim, list) for inner_dim in dataset.dims):
            dim_options = [
                inner_dim if isinstance(inner_dim, list) else [inner_dim]
                for inner_dim in dataset.dims
            ]
        else:
            dim_options = [list(dataset.dims)]

        # now make slots for each of them
        slots = []
        for dim, shape in dims_shape:
            # if a dim is present in all possible combinations of dims, make it required
            required = all(dim in option for option in dim_options)

            # use cardinality to do shape, 'null' means unconstrained
            cardinality = None if shape == 'null' else shape

            slots.append(SlotDefinition(
                name=dim,
                required=required,
                maximum_cardinality=cardinality,
                minimum_cardinality=cardinality,
                range=dtype
            ))

        # and then the class is just a subclass of `Arraylike` (which is imported by default from `nwb.language.yaml`)
        name = name or dataset.neurodata_type_def or dataset.name
        if not name:
            raise ValueError("Dataset has no name or type definition, what do call it?")

        name = '__'.join([name, 'Array'])

        array_class = ClassDefinition(
            name=name,
            is_a="Arraylike",
            attributes=slots
        )
        # make a slot for the arraylike class
        array_slot = SlotDefinition(
            name='array',
            range=array_class.name
        )

        res.classes.append(array_class)
        res.classes[0].attributes.update({'array': array_slot})
        self._handlers.append('arraylike')

        return res

View file

@ -0,0 +1,89 @@
"""
Adapter for NWB groups to linkml Classes
"""
import pdb
from typing import List
from linkml_runtime.linkml_model import ClassDefinition, SlotDefinition
from nwb_schema_language import Dataset, Group, ReferenceDtype, CompoundDtype, DTypeType
from nwb_linkml.adapters.classes import ClassAdapter
from nwb_linkml.adapters.dataset import DatasetAdapter
from nwb_linkml.adapters.adapter import BuildResult
from nwb_linkml.maps import QUANTITY_MAP
class GroupAdapter(ClassAdapter):
    """
    Adapter that turns an NWB :class:`Group` into LinkML classes, building its
    nested datasets and groups along the way.
    """
    cls: Group

    def build(self) -> BuildResult:
        """
        Build this group's class together with the classes of everything nested in it.

        Slots made by the children become attributes of this group's class and are
        not passed further up, since they only exist so this level can refer to
        its children. Classes made by the children are passed up.
        """
        children = self.build_subclasses()
        built = self.build_base(extra_attrs=children.slots)
        built.classes.extend(children.classes)
        return built

    def handle_children(self, children: List[Group]) -> BuildResult:
        """
        Make a special LinkML `children` slot that can
        have any number of the objects that are of `neurodata_type_inc` class

        Args:
            children (List[:class:`.Group`]): Child groups

        Returns:
            BuildResult: a result holding only the `children` slot
        """
        allowed_ranges = [{'range': child.neurodata_type_inc} for child in children]
        child_slot = SlotDefinition(
            name='children',
            multivalued=True,
            any_of=allowed_ranges
        )
        return BuildResult(slots=[child_slot])

    def build_subclasses(self) -> BuildResult:
        """
        Build nested groups and datasets

        Create ClassDefinitions for each, but then also create SlotDefinitions that
        will be used as attributes linking the main class to the subclasses

        Returns:
            BuildResult: the dataset results followed by the group results
        """
        # Datasets are terminal: the dataset adapter decides by itself whether a
        # given dataset becomes a slot or a class
        from_datasets = BuildResult()
        for dataset in self.cls.datasets:
            from_datasets += DatasetAdapter(cls=dataset, parent=self).build()

        # NOTE: groups can also behave like range declarations - a group may hold
        # several unnamed groups that have a `neurodata_type_inc` and a quantity
        # of `*`, meaning any number of those types can be direct children.
        # Those could be collected and given to `handle_children` rather than
        # built one by one; that isn't done here, and it may be something to
        # handle in i/o instead.
        from_groups = BuildResult()
        for subgroup in self.cls.groups:
            from_groups += GroupAdapter(cls=subgroup, parent=self).build()

        return from_datasets + from_groups

View file

@ -8,7 +8,8 @@ from pathlib import Path
from pydantic import Field from pydantic import Field
from nwb_linkml.adapters.adapter import Adapter, BuildResult from nwb_linkml.adapters.adapter import Adapter, BuildResult
from nwb_linkml.adapters.classes import ClassAdapter from nwb_linkml.adapters.dataset import DatasetAdapter
from nwb_linkml.adapters.group import GroupAdapter
if TYPE_CHECKING: if TYPE_CHECKING:
from nwb_linkml.adapters.namespaces import NamespacesAdapter from nwb_linkml.adapters.namespaces import NamespacesAdapter
@ -68,17 +69,17 @@ class SchemaAdapter(Adapter):
""" """
classes = [ClassAdapter(cls=dset) for dset in self.datasets] res = BuildResult()
classes.extend(ClassAdapter(cls=group) for group in self.groups) for dset in self.datasets:
built_results = None res += DatasetAdapter(cls=dset).build()
for cls in classes: for group in self.groups:
if built_results is None: res += GroupAdapter(cls=group).build()
built_results = cls.build()
else: if len(res.slots) > 0:
built_results += cls.build() raise RuntimeError('Generated schema in this translation can only have classes, all slots should be attributes within a class')
if self.split: if self.split:
sch_split = self.split_subclasses(built_results) sch_split = self.split_subclasses(res)
return sch_split return sch_split
else: else:
@ -86,9 +87,9 @@ class SchemaAdapter(Adapter):
name = self.name, name = self.name,
id = self.name, id = self.name,
imports = [i.name for i in self.imports], imports = [i.name for i in self.imports],
classes=built_results.classes, classes=res.classes,
slots=built_results.slots, slots=res.slots,
types=built_results.types types=res.types
) )
# every schema needs the language elements # every schema needs the language elements
sch.imports.append('nwb.language') sch.imports.append('nwb.language')

View file

@ -0,0 +1,6 @@
# 0.1.1
Revised models to make `name` an optional slot regardless of the presence or absence
of `neurodata_type_def`. The naming of individual classes within the schema will be
handled by `nwb_linkml`; see:
https://github.com/NeurodataWithoutBorders/nwb-schema/issues/552