more docs for dataset adapters.

also:
- fix search for initial - in dataset doctest
- don't clear git repos on repeated test runs
- fix string handling in load_yaml
sneakers-the-rat 2024-07-09 03:26:45 -07:00
parent ce902476d1
commit f4d397cde1
5 changed files with 283 additions and 10 deletions

@@ -19,7 +19,7 @@ NWB_KEYS = re.compile(r"(^\s*datasets:\s*\n)|^groups:")
def _strip_nwb(nwb: str) -> str:
# strip 'datasets:' keys and decoration left in for readability/context
nwb = re.sub(NWB_KEYS, "", nwb)
- nwb = re.sub(r"-", " ", nwb)
+ nwb = re.sub(r"^-", " ", nwb)
nwb = textwrap.dedent(nwb)
return nwb
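The regex fix in this hunk is easy to miss: the unanchored pattern blanked every hyphen in the doctest source, including hyphens inside doc text, while `^-` matches only a dash at the very start of the string (no `re.MULTILINE` is passed, so `^` anchors at position 0). A standalone sketch of the difference:

```python
import re

snippet = "- name: starting_time\n  doc: a non-uniform case\n"

# unanchored: every hyphen goes, including the one inside "non-uniform"
re.sub(r"-", " ", snippet)
# '  name: starting_time\n  doc: a non uniform case\n'

# anchored: only the initial list dash is blanked, hyphens in prose survive
re.sub(r"^-", " ", snippet)
# '  name: starting_time\n  doc: a non-uniform case\n'
```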

@@ -107,7 +107,7 @@ class ArrayAdapter:
Dict[Literal["any_of"], Dict[Literal["array"], List[ArrayExpression]]],
]:
"""
- Make the array expressions in a dict form that can be **kwarg'd into a SlotDefinition,
+ Make the array expressions in a dict form that can be ``**kwarg``'d into a SlotDefinition,
taking into account needing to use ``any_of`` for multiple array range specifications.
"""
expressions = self.make()
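To make the docstring concrete: the returned dict is shaped so it can be splatted straight into a ``SlotDefinition`` constructor. A hedged sketch of that call pattern, with a hand-written expression dict standing in for the adapter's real output (assumes a linkml-runtime recent enough to support array expressions):

```python
from linkml_runtime.linkml_model import SlotDefinition

# Hand-written stand-in for the adapter's output; the real dict comes from
# the ArrayAdapter's make() call shown above.
expressions = {
    "any_of": [
        {"array": {"dimensions": [{"alias": "x"}, {"alias": "y"}]}},
        {"array": {"dimensions": [{"alias": "x"}, {"alias": "y"}, {"alias": "z"}]}},
    ]
}

# The point of the dict form: it kwargs cleanly into a SlotDefinition.
slot = SlotDefinition(name="data", range="numeric", **expressions)
```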

@@ -1,7 +1,6 @@
"""
Adapter for NWB datasets to linkml Classes
"""
from abc import abstractmethod
from typing import ClassVar, Optional, Type
@@ -119,6 +118,54 @@ class MapScalarAttributes(DatasetMap):
"""
A scalar with attributes gets an additional slot "value" that contains the actual scalar
value of this field.

Examples:
.. adapter:: DatasetAdapter
:nwb:
datasets:
- name: starting_time
dtype: float64
doc: Timestamp of the first sample in seconds. When timestamps are uniformly
spaced, the timestamp of the first sample can be specified and all subsequent
ones calculated from the sampling rate attribute.
quantity: '?'
attributes:
- name: rate
dtype: float32
doc: Sampling rate, in Hz.
- name: unit
dtype: text
value: seconds
doc: Unit of measurement for time, which is fixed to 'seconds'.
:linkml:
classes:
- name: starting_time
description: Timestamp of the first sample in seconds. When timestamps are uniformly
spaced, the timestamp of the first sample can be specified and all subsequent
ones calculated from the sampling rate attribute.
attributes:
name:
name: name
ifabsent: string(starting_time)
identifier: true
range: string
required: true
equals_string: starting_time
rate:
name: rate
description: Sampling rate, in Hz.
range: float32
unit:
name: unit
description: Unit of measurement for time, which is fixed to 'seconds'.
range: text
value:
name: value
range: float64
required: true
tree_root: true
"""
@classmethod
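As a rough guide to where this lands downstream, the ``starting_time`` example above eventually becomes a pydantic model along these lines (an illustrative sketch only; the generated module's actual naming and layout may differ):

```python
from typing import Literal, Optional
from pydantic import BaseModel

class StartingTime(BaseModel):
    # identifier slot, pinned by equals_string/ifabsent above
    name: Literal["starting_time"] = "starting_time"
    rate: Optional[float] = None  # Sampling rate, in Hz
    unit: Optional[str] = None    # fixed to 'seconds' in the schema
    value: float                  # the scalar itself lands in `value`
```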
@@ -133,7 +180,7 @@ class MapScalarAttributes(DatasetMap):
* - ``neurodata_type_inc``
- ``None``
* - ``attributes``
- - Truthy
+ - ``True``
* - ``dims``
- ``None``
* - ``shape``
@@ -167,13 +214,65 @@
class MapListlike(DatasetMap):
"""
- Datasets that refer to other datasets (that handle their own arrays)
+ Datasets that refer to a list of other datasets.
Used exactly once in the core schema, in ``ImageReferences`` -
an array of references to other ``Image`` datasets. We ignore the
usual array structure and unnest the implicit array into a slot named after the
target type rather than the oddly-named ``num_images`` dimension so that
ultimately in the pydantic model we get a nicely behaved single-level list.

Examples:
.. adapter:: DatasetAdapter
:nwb:
datasets:
- neurodata_type_def: ImageReferences
neurodata_type_inc: NWBData
dtype:
target_type: Image
reftype: object
dims:
- num_images
shape:
- null
doc: Ordered dataset of references to Image objects.
:linkml:
classes:
- name: ImageReferences
description: Ordered dataset of references to Image objects.
is_a: NWBData
attributes:
name:
name: name
identifier: true
range: string
required: true
image:
name: image
description: Ordered dataset of references to Image objects.
multivalued: true
range: Image
required: true
tree_root: true
"""
@classmethod
def check(c, cls: Dataset) -> bool:
"""
Check if we are a 1D dataset whose dtype is a reference to another class rather than a plain datatype.

.. list-table::
:header-rows: 1
:align: left
* - Attr
- Value
* - :func:`.is_1d`
- ``True``
* - ``dtype``
- ``Class``
"""
dtype = ClassAdapter.handle_dtype(cls.dtype)
return is_1d(cls) and dtype != "AnyType" and dtype not in flat_to_linkml
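Paraphrased as standalone code (a readable stand-in, not the module's implementation; ``flat_scalars`` abridges the real ``flat_to_linkml`` mapping of NWB dtypes to linkml ranges):

```python
flat_scalars = {"float32", "float64", "int32", "text", "bool"}  # abridged stand-in

def looks_listlike(is_one_d: bool, dtype: str) -> bool:
    # keep 1-D datasets whose dtype names another class, not a scalar
    return is_one_d and dtype != "AnyType" and dtype not in flat_scalars

assert looks_listlike(True, "Image")        # reference range: handled by this map
assert not looks_listlike(True, "float32")  # scalar array: another map's job
```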
@@ -202,13 +301,83 @@
Datasets without any additional attributes don't create their own subclass,
they're just an array :).
- Replace the base class with the array class, and make a slot that refers to it.
+ Replace the base class with a slot that defines the array.

Examples:
e.g. from ``image.ImageSeries``:

.. adapter:: DatasetAdapter
:nwb:
datasets:
- name: data
dtype: numeric
dims:
- - frame
- x
- y
- - frame
- x
- y
- z
shape:
- - null
- null
- null
- - null
- null
- null
- null
doc: Binary data representing images across frames. If data are stored in an external
file, this should be an empty 3D array.
:linkml:
slots:
- name: data
description: Binary data representing images across frames. If data are stored in
an external file, this should be an empty 3D array.
multivalued: false
range: numeric
required: true
any_of:
- array:
dimensions:
- alias: frame
- alias: x
- alias: y
- array:
dimensions:
- alias: frame
- alias: x
- alias: y
- alias: z
"""
@classmethod
def check(c, cls: Dataset) -> bool:
"""
Check if we're a plain array.

.. list-table::
:header-rows: 1
:align: left
* - Attr
- Value
* - ``name``
- ``True``
* - ``dims``
- ``True``
* - ``shape``
- ``True``
* - :func:`.has_attrs`
- ``False``
* - :func:`.is_compound`
- ``False``
"""
return (
cls.name and all([cls.dims, cls.shape]) and not has_attrs(cls) and not is_compound(cls)
@@ -243,6 +412,88 @@ class MapArrayLikeAttributes(DatasetMap):
"""
The most general case - treat everything that isn't handled by one of the special cases
as an array!

Examples:
.. adapter:: DatasetAdapter
:nwb:
datasets:
- neurodata_type_def: Image
neurodata_type_inc: NWBData
dtype: numeric
dims:
- - x
- y
- - x
- y
- r, g, b
- - x
- y
- r, g, b, a
shape:
- - null
- null
- - null
- null
- 3
- - null
- null
- 4
doc: An abstract data type for an image. Shape can be 2-D (x, y), or 3-D where the
third dimension can have three or four elements, e.g. (x, y, (r, g, b)) or
(x, y, (r, g, b, a)).
attributes:
- name: resolution
dtype: float32
doc: Pixel resolution of the image, in pixels per centimeter.
required: false
- name: description
dtype: text
doc: Description of the image.
required: false
:linkml:
classes:
- name: Image
description: An abstract data type for an image. Shape can be 2-D (x, y), or 3-D
where the third dimension can have three or four elements, e.g. (x, y, (r, g,
b)) or (x, y, (r, g, b, a)).
is_a: NWBData
attributes:
name:
name: name
identifier: true
range: string
required: true
resolution:
name: resolution
description: Pixel resolution of the image, in pixels per centimeter.
range: float32
description:
name: description
description: Description of the image.
range: text
array:
name: array
range: numeric
any_of:
- array:
dimensions:
- alias: x
- alias: y
- array:
dimensions:
- alias: x
- alias: y
- alias: r_g_b
exact_cardinality: 3
- array:
dimensions:
- alias: x
- alias: y
- alias: r_g_b_a
exact_cardinality: 4
tree_root: true
"""
NEEDS_NAME = True
@@ -487,7 +738,11 @@ class DatasetAdapter(ClassAdapter):
def is_1d(cls: Dataset) -> bool:
"""
- Check if the values of a dataset are 1-dimensional
+ Check if the values of a dataset are 1-dimensional.
+
+ Specifically:
+
+ * a single-layer dim/shape list of length 1, or
+ * a nested dim/shape list where every nested spec is of length 1
"""
return (
not any([isinstance(dim, list) for dim in cls.dims]) and len(cls.dims) == 1
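The hunk cuts the expression off mid-way; as a self-contained sketch (a paraphrase, not the module's exact code), the two cases from the docstring look like this, with ``dims`` mirroring ``Dataset.dims``:

```python
from typing import List, Union

def is_1d_sketch(dims: List[Union[str, List[str]]]) -> bool:
    if not any(isinstance(dim, list) for dim in dims):
        # single-layer dim list: 1-D iff it has exactly one entry
        return len(dims) == 1
    # nested dim list: 1-D iff every alternative spec is itself length 1
    return all(isinstance(dim, list) and len(dim) == 1 for dim in dims)

assert is_1d_sketch(["num_times"])
assert is_1d_sketch([["num_times"]])
assert not is_1d_sketch([["x", "y"]])
```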

@@ -20,7 +20,16 @@ def load_yaml(path: Path | str) -> dict:
    """
    Load yaml file from file, applying postload modifications
    """
-   if isinstance(path, str) and not Path(path).exists():
+   is_file = False
+   try:
+       a_path = Path(path)
+       if a_path.exists():
+           is_file = True
+   except OSError:
+       # long strings can't be made into paths!
+       pass
+   if not is_file:
        ns_dict = yaml.safe_load(path)
    else:
        with open(path) as file:
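In practice the change means ``load_yaml`` now accepts either a filesystem path or a raw YAML string, and strings too long to be a valid path no longer crash in ``Path()``. A usage sketch (the file name here is hypothetical):

```python
from pathlib import Path

ns = load_yaml(Path("nwb.namespace.yaml"))  # hypothetical file: read from disk
ns = load_yaml("datasets:\n- name: data\n  dtype: float32\n")  # parsed as YAML directly
```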

@@ -35,8 +35,17 @@ __all__ = [
def tmp_output_dir() -> Path:
    path = Path(__file__).parent.resolve() / "__tmp__"
    if path.exists():
-       shutil.rmtree(str(path))
-   path.mkdir()
+       for subdir in path.iterdir():
+           if subdir.name == 'git':
+               # don't wipe out git repos every time, they don't rly change
+               continue
+           elif subdir.is_file() and subdir.parent != path:
+               continue
+           elif subdir.is_file():
+               subdir.unlink(missing_ok=True)
+           else:
+               shutil.rmtree(str(subdir))
+   path.mkdir(exist_ok=True)
    return path