From a4806543ef63152fc40bafc883a61146806646f9 Mon Sep 17 00:00:00 2001
From: sneakers-the-rat
Date: Wed, 23 Aug 2023 19:56:09 -0700
Subject: [PATCH] recursion works, more formal build method in adapters

---
 docs/notes/translation.md                     |  18 ++-
 nwb_linkml/adapters/adapter.py                |  53 +++++++-
 nwb_linkml/adapters/classes.py                | 122 ++++++++++++++++--
 nwb_linkml/adapters/namespaces.py             |  43 +++++-
 nwb_linkml/adapters/schema.py                 |  19 ++-
 .../src/data/tests/nwb.base.yaml              |   2 +-
 .../src/nwb_schema_language/__init__.py       |   6 +-
 .../nwb_schema_language/datamodel/__init__.py |   2 +
 .../datamodel/nwb_schema_pydantic.py          |   1 +
 .../schema/nwb_schema_language.yaml           |   1 +
 poetry.lock                                   |  51 +++++++-
 pyproject.toml                                |   1 +
 tests/test_generate.py                        |  30 ++++-
 13 files changed, 321 insertions(+), 28 deletions(-)

diff --git a/docs/notes/translation.md b/docs/notes/translation.md
index baddf3d..664f9d3 100644
--- a/docs/notes/translation.md
+++ b/docs/notes/translation.md
@@ -24,4 +24,20 @@ Steps:
 - new files: files not in the source domain
 - enum classes
-- Rename items
\ No newline at end of file
+- Rename items
+
+
+## Translation choices
+
+We aren't doing a 1:1 translation of NWB! The goal is to make something that is *import*
+backwards-compatible - i.e. we can read traditional NWB files - but not necessarily
+*export*-compatible, for now. We will get to that eventually. NWB as it stands is tightly
+tied to HDF5 in multiple places, from the hdmf-common namespace to the NWB file classes.
+We want instead to abstract the structure of NWB so the schema can be used
+as a programming element (i.e. labs can write their own schema extensions in YAML,
+generate pydantic modules for them, and they should Just Work™) with a variety of
+storage backends.
+
+- Don't try to emulate the nwb.file schema - it is basically a file layout that indicates
+  what should go where. We are moving I/O out of the schema: storage layout is at a different level than the schema.
+- Don't worry about most of hdmf-common: instead, create sensible generics that can be implemented in different ways by different storage media.
\ No newline at end of file
diff --git a/nwb_linkml/adapters/adapter.py b/nwb_linkml/adapters/adapter.py
index 5f656b0..eedfa5e 100644
--- a/nwb_linkml/adapters/adapter.py
+++ b/nwb_linkml/adapters/adapter.py
@@ -1,11 +1,45 @@
 """
 Base class for adapters
 """
-from typing import List, Dict, Type, Generator, Any, Tuple
-from pydantic import BaseModel
+from abc import abstractmethod
+import warnings
+from dataclasses import dataclass, field
+from typing import List, Dict, Type, Generator, Any, Tuple, Optional
+from pydantic import BaseModel, Field, validator
+from linkml_runtime.linkml_model import Element, SchemaDefinition, ClassDefinition, SlotDefinition, TypeDefinition
+
+
+@dataclass
+class BuildResult:
+    schemas: List[SchemaDefinition] = field(default_factory=list)
+    classes: List[ClassDefinition] = field(default_factory=list)
+    slots: List[SlotDefinition] = field(default_factory=list)
+    types: List[TypeDefinition] = field(default_factory=list)
+
+    def __post_init__(self):
+        # coerce single elements to lists so results can always be merged
+        # (field_name, not field, to avoid shadowing dataclasses.field)
+        for field_name in ('schemas', 'classes', 'slots', 'types'):
+            attr = getattr(self, field_name)
+            if not isinstance(attr, list):
+                setattr(self, field_name, [attr])
+
+    def _dedupe(self, ours, others):
+        # keep only the incoming elements whose names we don't already have
+        existing_names = [c.name for c in ours]
+        return [o for o in others if o.name not in existing_names]
+
+    def __add__(self, other: 'BuildResult') -> 'BuildResult':
+        self.schemas.extend(self._dedupe(self.schemas, other.schemas))
+        self.classes.extend(self._dedupe(self.classes, other.classes))
+        self.slots.extend(other.slots)
+        self.types.extend(other.types)
+        return self
+
 
 class Adapter(BaseModel):
-    pass
+    @abstractmethod
+    def build(self) -> 'BuildResult':
+        """
+        Generate the corresponding linkML element for this adapter
+        """
 
     def walk(self, input: BaseModel | list | dict):
         yield input
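A quick sketch of the merge semantics this gives us (hypothetical usage, not part of the
patch itself): `+=` concatenates two results while deduplicating schemas and classes by
name, so a class built twice along different paths only lands in the output once.

```python
from linkml_runtime.linkml_model import ClassDefinition
from nwb_linkml.adapters.adapter import BuildResult

a = BuildResult(classes=[ClassDefinition(name='TimeSeries')])
b = BuildResult(classes=[
    ClassDefinition(name='TimeSeries'),  # duplicate name, dropped on merge
    ClassDefinition(name='Image'),       # new name, kept
])
a += b
assert [c.name for c in a.classes] == ['TimeSeries', 'Image']
```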
diff --git a/nwb_linkml/adapters/classes.py b/nwb_linkml/adapters/classes.py
index 45fa5ca..6ed4ff5 100644
--- a/nwb_linkml/adapters/classes.py
+++ b/nwb_linkml/adapters/classes.py
@@ -1,9 +1,9 @@
 """
 Adapters to linkML classes
 """
-
-from nwb_schema_language import Dataset, Group
-from nwb_linkml.adapters.adapter import Adapter
+from typing import List, Optional
+from nwb_schema_language import Dataset, Group, ReferenceDtype, DTypeType
+from nwb_linkml.adapters.adapter import Adapter, BuildResult
 from linkml_runtime.linkml_model import ClassDefinition, SlotDefinition
 
 
@@ -12,25 +12,122 @@ class ClassAdapter(Adapter):
     Adapter to class-like things in linkml, including datasets and groups
     """
     cls: Dataset | Group
+    parent: Optional['ClassAdapter'] = None
 
-    def build(self) -> ClassDefinition:
+    def _get_full_name(self) -> str:
+        """
+        The full name of the object in the generated linkml
+
+        Distinct from the short name from :meth:`._get_name`, which is what
+        a parent class uses to refer to this one
+        """
         if self.cls.neurodata_type_def:
             name = self.cls.neurodata_type_def
-        else:
-            name = self.cls.name
+        elif self.cls.name is not None:
+            # not necessarily a unique name, so we prefix it with our parents' names
+            name_parts = []
+            if self.parent is not None:
+                name_parts.append(self.parent._get_full_name())
+            name_parts.append(self.cls.name)
+            name = '_'.join(name_parts)
+        elif self.cls.neurodata_type_inc is not None:
+            # an anonymous inc is against the schema, but is common
+            name = self.cls.neurodata_type_inc
+        else:
+            raise ValueError('Not sure what our name is!')
+
+        return name
+
+    def _get_name(self) -> str:
+        """
+        Get the "regular" name, which is used as the name of the slot
+        that links a parent class to this one
+        """
+        name = None
+        if self.cls.neurodata_type_def:
+            name = self.cls.neurodata_type_def
+        elif self.cls.name is not None:
+            # we do have a unique name
+            name = self.cls.name
+        elif self.cls.neurodata_type_inc:
+            # group members can be anonymous - this violates the schema but is common
+            name = self.cls.neurodata_type_inc
+
+        if name is None:
+            raise ValueError(f'Class has no name!: {self.cls}')
+
+        return name
+
+    def handle_dtype(self, dtype: DTypeType):
+        # reference dtypes are ranged by the class they point to
+        if isinstance(dtype, ReferenceDtype):
+            return dtype.target_type
+        else:
+            return dtype
+
+    def build_attrs(self, cls: Dataset | Group) -> List[SlotDefinition]:
         attrs = [
             SlotDefinition(
                 name=attr.name,
                 description=attr.doc,
-
-            ) for attr in self.cls.attributes
+                range=self.handle_dtype(attr.dtype)
+            ) for attr in cls.attributes
         ]
+        return attrs
+
+    def build_subclasses(self, cls: Dataset | Group) -> BuildResult:
+        """
+        Build nested groups and datasets
+
+        Create ClassDefinitions for each, but then also create SlotDefinitions that
+        will be used as attributes linking the main class to the subclasses
+        """
+        # build and flatten nested classes
+        nested_classes = [ClassAdapter(cls=dset, parent=self) for dset in cls.datasets]
+        nested_classes.extend([ClassAdapter(cls=grp, parent=self) for grp in cls.groups])
+        nested_res = BuildResult()
+        for subclass in nested_classes:
+            this_slot = SlotDefinition(
+                name=subclass._get_name(),
+                description=subclass.cls.doc,
+                range=subclass._get_full_name()
+            )
+            nested_res.slots.append(this_slot)
+
+            if subclass.cls.name is None and subclass.cls.neurodata_type_def is None:
+                # anonymous group that's just an inc - we only need the slot, the class is defined elsewhere
+                continue
+
+            this_build = subclass.build()
+            nested_res += this_build
+        return nested_res
+
+    def build(self) -> BuildResult:
+        # Build this class
+        if self.parent is not None:
+            name = self._get_full_name()
+        else:
+            name = self._get_name()
+
+        # Get vanilla top-level attributes
+        attrs = self.build_attrs(self.cls)
+
+        # unnest and build subclasses in datasets and groups
+        if isinstance(self.cls, Group):
+            # only groups have sub-datasets and sub-groups
+            nested_res = self.build_subclasses(self.cls)
+            attrs.extend(nested_res.slots)
+        else:
+            nested_res = BuildResult()
+
         cls = ClassDefinition(
             name = name,
             is_a = self.cls.neurodata_type_inc,
             description=self.cls.doc,
             attributes=attrs
         )
-        return cls
\ No newline at end of file
+        res = BuildResult(
+            classes = [cls, *nested_res.classes]
+        )
+
+        return res
\ No newline at end of file
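To make the recursive naming scheme concrete, here is a hypothetical example (the
`Group`/`Dataset` field names follow the nwb-schema-language models; exact constructor
arguments are an assumption, not guaranteed API):

```python
from nwb_schema_language import Dataset, Group
from nwb_linkml.adapters.classes import ClassAdapter

# a typed group containing a named (but untyped) dataset
ts = Group(
    neurodata_type_def='TimeSeries',
    doc='A generic timeseries',
    datasets=[Dataset(name='data', doc='The data values')],
)
res = ClassAdapter(cls=ts).build()

# the nested dataset becomes its own class, namespaced by its parent's name,
# and the parent links to it through a slot with the short name 'data'
print([c.name for c in res.classes])  # ['TimeSeries', 'TimeSeries_data']
```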
diff --git a/nwb_linkml/adapters/namespaces.py b/nwb_linkml/adapters/namespaces.py
index 77ac8b8..7f3c67b 100644
--- a/nwb_linkml/adapters/namespaces.py
+++ b/nwb_linkml/adapters/namespaces.py
@@ -6,24 +6,56 @@ for extracting information and generating translated schema
 """
 import pdb
 from typing import List, Optional
-from pydantic import BaseModel, Field, validator
+from pydantic import BaseModel, Field, validator, PrivateAttr
 from pprint import pformat
+from linkml_runtime.linkml_model import SchemaDefinition
 
 from nwb_schema_language import Namespaces
-from nwb_linkml.adapters.adapter import Adapter
+from nwb_linkml.adapters.adapter import Adapter, BuildResult
 from nwb_linkml.adapters.schema import SchemaAdapter
+from nwb_linkml.lang_elements import NwbLangSchema
 
 
 class NamespacesAdapter(Adapter):
     namespaces: Namespaces
     schemas: List[SchemaAdapter]
     imported: List['NamespacesAdapter'] = Field(default_factory=list)
+    _imports_populated: bool = PrivateAttr(False)
 
     def __init__(self, **kwargs):
         super(NamespacesAdapter, self).__init__(**kwargs)
        self._populate_schema_namespaces()
 
+    def build(self) -> BuildResult:
+        if not self._imports_populated:
+            self.populate_imports()
+
+        sch_result = BuildResult()
+        for sch in self.schemas:
+            sch_result += sch.build()
+        # recursive step: our imports are themselves namespaces with imports
+        for imported in self.imported:
+            imported_build = imported.build()
+            sch_result += imported_build
+
+        # add in the monkeypatched nwb language types
+        sch_result.schemas.append(NwbLangSchema)
+
+        # now generate the top-level namespaces that import everything
+        for ns in self.namespaces.namespaces:
+            ns_schemas = [sch for sch in self.schemas if sch.namespace == ns.name]
+            ns_schema = SchemaDefinition(
+                name = ns.name,
+                id = ns.name,
+                description = ns.doc,
+                version = ns.version,
+                imports=[sch.name for sch in ns_schemas]
+            )
+            sch_result.schemas.append(ns_schema)
+
+        return sch_result
+
     def _populate_schema_namespaces(self):
         # annotate for each schema which namespace imports it
         for sch in self.schemas:
@@ -78,4 +110,10 @@ class NamespacesAdapter(Adapter):
                 if depends_on not in sch.imports:
                     sch.imports.append(depends_on)
 
+        # populate the imports of our imports, recursively
+        for imported in self.imported:
+            imported.populate_imports()
+
+        self._imports_populated = True
+
 
diff --git a/nwb_linkml/adapters/schema.py b/nwb_linkml/adapters/schema.py
index 26980ec..0556a04 100644
--- a/nwb_linkml/adapters/schema.py
+++ b/nwb_linkml/adapters/schema.py
@@ -7,7 +7,7 @@ from typing import Optional, List, TYPE_CHECKING
 from pathlib import Path
 from pydantic import Field
 
-from nwb_linkml.adapters.adapter import Adapter
+from nwb_linkml.adapters.adapter import Adapter, BuildResult
 from nwb_linkml.adapters.classes import ClassAdapter
 if TYPE_CHECKING:
     from nwb_linkml.adapters.namespaces import NamespacesAdapter
@@ -47,7 +47,7 @@ class SchemaAdapter(Adapter):
 
         return out_str
 
-    def build(self) -> SchemaDefinition:
+    def build(self) -> BuildResult:
         """
         Make the LinkML representation for this schema file
 
@@ -59,16 +59,22 @@ class SchemaAdapter(Adapter):
         """
         classes = [ClassAdapter(cls=dset) for dset in self.datasets]
         classes.extend(ClassAdapter(cls=group) for group in self.groups)
-        built_classes = [c.build() for c in classes]
+        # merge the classes, slots, and types from all the child adapters;
+        # starting from an empty BuildResult also handles schemas with no classes
+        built_results = BuildResult()
+        for cls in classes:
+            built_results += cls.build()
 
         sch = SchemaDefinition(
             name = self.name,
             id = self.name,
             imports = [i.name for i in self.imports],
-            classes=built_classes
+            classes=built_results.classes,
+            slots=built_results.slots,
+            types=built_results.types
         )
-        return sch
+        # every schema needs the language elements
+        sch.imports.append('nwb.language')
+        return BuildResult(schemas=[sch])
 
 
 @property
diff --git a/nwb_schema_language/src/data/tests/nwb.base.yaml b/nwb_schema_language/src/data/tests/nwb.base.yaml
index 859e904..52fb3b5 100644
--- a/nwb_schema_language/src/data/tests/nwb.base.yaml
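A hypothetical sketch of the layering this produces, using the `nwb_core_fixture`
NamespacesAdapter from the test suite (the names 'core' and 'nwb.language' are
assumptions inferred from this patch, not guaranteed API):

```python
result = nwb_core_fixture.build()
names = [s.name for s in result.schemas]

# one SchemaDefinition per schema file, each importing 'nwb.language',
# plus the monkeypatched language schema itself, plus one top-level
# schema per namespace that just imports its member schemas
assert 'nwb.language' in names

core = next(s for s in result.schemas if s.name == 'core')
print(core.imports)  # e.g. ['nwb.base', 'nwb.file', ...]
```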
+++ b/nwb_schema_language/src/data/tests/nwb.base.yaml
@@ -253,7 +253,7 @@ groups:
       dtype: text
       doc: Description of this collection of images.
   datasets:
-  - neurodata_type_inc: Image
+#  - neurodata_type_inc: Image
     doc: Images stored in this collection.
     quantity: '+'
   - name: order_of_images
diff --git a/nwb_schema_language/src/nwb_schema_language/__init__.py b/nwb_schema_language/src/nwb_schema_language/__init__.py
index 10da423..2ff42b3 100644
--- a/nwb_schema_language/src/nwb_schema_language/__init__.py
+++ b/nwb_schema_language/src/nwb_schema_language/__init__.py
@@ -1,4 +1,5 @@
 import warnings
+from typing import List, Union
 try:
     from .datamodel.nwb_schema_pydantic import Namespace, \
         Namespaces, \
@@ -8,6 +9,9 @@ try:
         Link, \
         Dataset, \
         ReferenceDtype, \
-        CompoundDtype
+        CompoundDtype, \
+        FlatDtype
+
+    DTypeType = Union[List[CompoundDtype], FlatDtype, ReferenceDtype]
 except NameError:
     warnings.warn('Error importing pydantic classes, passing because we might be in the process of patching them, but it is likely they are broken and you will be unable to use them!')
\ No newline at end of file
diff --git a/nwb_schema_language/src/nwb_schema_language/datamodel/__init__.py b/nwb_schema_language/src/nwb_schema_language/datamodel/__init__.py
index 6d64f5c..2536e1a 100644
--- a/nwb_schema_language/src/nwb_schema_language/datamodel/__init__.py
+++ b/nwb_schema_language/src/nwb_schema_language/datamodel/__init__.py
@@ -1 +1,3 @@
 from .nwb_schema_language import *
+
+# create additional derived
\ No newline at end of file
diff --git a/nwb_schema_language/src/nwb_schema_language/datamodel/nwb_schema_pydantic.py b/nwb_schema_language/src/nwb_schema_language/datamodel/nwb_schema_pydantic.py
index c986b47..48e808f 100644
--- a/nwb_schema_language/src/nwb_schema_language/datamodel/nwb_schema_pydantic.py
+++ b/nwb_schema_language/src/nwb_schema_language/datamodel/nwb_schema_pydantic.py
@@ -219,6 +219,7 @@ class Dataset(NamingMixin, DtypeMixin):
     quantity: Optional[Union[QuantityEnum, int]] = Field(1)
     linkable: Optional[bool] = Field(None)
     attributes: Optional[List[Attribute]] = Field(default_factory=list)
+    datasets: Optional[List[Dataset]] = Field(default_factory=list)
     dtype: Optional[Union[List[CompoundDtype], FlatDtype, ReferenceDtype]] = Field(default_factory=list)
 
 
diff --git a/nwb_schema_language/src/nwb_schema_language/schema/nwb_schema_language.yaml b/nwb_schema_language/src/nwb_schema_language/schema/nwb_schema_language.yaml
index a92bff9..a8446aa 100644
--- a/nwb_schema_language/src/nwb_schema_language/schema/nwb_schema_language.yaml
+++ b/nwb_schema_language/src/nwb_schema_language/schema/nwb_schema_language.yaml
@@ -124,6 +124,7 @@ classes:
       - quantity
       - linkable
       - attributes
+      - groups
 
   Datasets:
     slots:
diff --git a/poetry.lock b/poetry.lock
index 9ee4c88..b355a4e 100644
--- a/poetry.lock
+++ b/poetry.lock
@@ -260,6 +260,20 @@ files = [
     {file = "fqdn-1.5.1.tar.gz", hash = "sha256:105ed3677e767fb5ca086a0c1f4bb66ebc3c100be518f0e0d755d9eae164d89f"},
 ]
 
+[[package]]
+name = "future-fstrings"
+version = "1.2.0"
+description = "A backport of fstrings to python<3.6"
+optional = false
+python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*"
+files = [
+    {file = "future_fstrings-1.2.0-py2.py3-none-any.whl", hash = "sha256:90e49598b553d8746c4dc7d9442e0359d038c3039d802c91c0a55505da318c63"},
+    {file = "future_fstrings-1.2.0.tar.gz", hash = "sha256:6cf41cbe97c398ab5a81168ce0dbb8ad95862d3caf23c21e4430627b90844089"},
+]
+
+[package.extras]
+rewrite = ["tokenize-rt (>=3)"]
+
 [[package]]
 name = "graphviz"
version = "0.20.1" @@ -746,6 +760,24 @@ files = [ {file = "mdurl-0.1.2.tar.gz", hash = "sha256:bb413d29f5eea38f31dd4754dd7377d4465116fb207585f97bf925588687c1ba"}, ] +[[package]] +name = "networkx" +version = "3.1" +description = "Python package for creating and manipulating graphs and networks" +optional = false +python-versions = ">=3.8" +files = [ + {file = "networkx-3.1-py3-none-any.whl", hash = "sha256:4f33f68cb2afcf86f28a45f43efc27a9386b535d567d2127f8f61d51dec58d36"}, + {file = "networkx-3.1.tar.gz", hash = "sha256:de346335408f84de0eada6ff9fafafff9bcda11f0a0dfaa931133debb146ab61"}, +] + +[package.extras] +default = ["matplotlib (>=3.4)", "numpy (>=1.20)", "pandas (>=1.3)", "scipy (>=1.8)"] +developer = ["mypy (>=1.1)", "pre-commit (>=3.2)"] +doc = ["nb2plots (>=0.6)", "numpydoc (>=1.5)", "pillow (>=9.4)", "pydata-sphinx-theme (>=0.13)", "sphinx (>=6.1)", "sphinx-gallery (>=0.12)", "texext (>=0.6.7)"] +extra = ["lxml (>=4.6)", "pydot (>=1.4.2)", "pygraphviz (>=1.10)", "sympy (>=1.10)"] +test = ["codecov (>=2.1)", "pytest (>=7.2)", "pytest-cov (>=4.0)"] + [[package]] name = "nwb-schema-language" version = "0.1.0" @@ -1023,6 +1055,23 @@ pluggy = ">=0.12,<2.0" [package.extras] testing = ["argcomplete", "attrs (>=19.2.0)", "hypothesis (>=3.56)", "mock", "nose", "pygments (>=2.7.2)", "requests", "setuptools", "xmlschema"] +[[package]] +name = "pytest-depends" +version = "1.0.1" +description = "Tests that depend on other tests" +optional = false +python-versions = "*" +files = [ + {file = "pytest-depends-1.0.1.tar.gz", hash = "sha256:90a28e2b87b75b18abd128c94015248544acac20e4392e9921e5a86f93319dfe"}, + {file = "pytest_depends-1.0.1-py3-none-any.whl", hash = "sha256:a1df072bcc93d77aca3f0946903f5fed8af2d9b0056db1dfc9ed5ac164ab0642"}, +] + +[package.dependencies] +colorama = "*" +future-fstrings = "*" +networkx = "*" +pytest = ">=3" + [[package]] name = "pytest-logging" version = "2015.11.4" @@ -1759,4 +1808,4 @@ testing = ["big-O", "jaraco.functools", "jaraco.itertools", "more-itertools", "p [metadata] lock-version = "2.0" python-versions = "^3.11" -content-hash = "90f5eaedd0572c26dfac8a9f2fed0c8ec50f70a54b7bff03a43816f96cb60bb1" +content-hash = "debbeeaba69d6afc3da329ccc76e0c2ae3124773b85a577a10cb5e673845a9e5" diff --git a/pyproject.toml b/pyproject.toml index b4d79aa..8a304da 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -22,6 +22,7 @@ linkml = "^1.5.7" [tool.poetry.group.dev.dependencies] pytest = "^7.4.0" +pytest-depends = "^1.0.1" [build-system] requires = ["poetry-core"] diff --git a/tests/test_generate.py b/tests/test_generate.py index f4579f0..f035674 100644 --- a/tests/test_generate.py +++ b/tests/test_generate.py @@ -1,8 +1,11 @@ +import pdb + import pytest import warnings from .fixtures import nwb_core_fixture, tmp_output_dir from linkml_runtime.dumpers import yaml_dumper +from linkml.generators import PydanticGenerator from nwb_linkml.lang_elements import NwbLangSchema @@ -10,8 +13,25 @@ def test_generate_nwblang(tmp_output_dir): output_file = (tmp_output_dir / NwbLangSchema.name).with_suffix('.yml') yaml_dumper.dump(NwbLangSchema, output_file) -def test_generate_base(nwb_core_fixture, tmp_output_dir): - schema = nwb_core_fixture.schemas[0].build() - output_file = (tmp_output_dir / schema.name).with_suffix('.yml') - warnings.warn(output_file) - yaml_dumper.dump(schema, output_file) +def test_generate_core(nwb_core_fixture, tmp_output_dir): + schemas = nwb_core_fixture.build().schemas + for schema in schemas: + output_file = tmp_output_dir / (schema.name + '.yaml') + 
+        yaml_dumper.dump(schema, output_file)
+
+@pytest.mark.depends(on=['test_generate_core'])
+def test_generate_pydantic(tmp_output_dir):
+    core_file = tmp_output_dir / 'core.yaml'
+    pydantic_file = tmp_output_dir / 'core.py'
+
+    generator = PydanticGenerator(
+        str(core_file),
+        pydantic_version='1',
+        emit_metadata=True,
+        gen_classvars=True,
+        gen_slots=True
+    )
+    gen_pydantic = generator.serialize()
+    with open(pydantic_file, 'w') as pfile:
+        pfile.write(gen_pydantic)
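As a follow-on sanity check (hypothetical, not part of this patch), the generated
module can be imported straight from the output directory with only the stdlib:

```python
import importlib.util

def load_generated(path: str):
    """Import a generated pydantic module from an arbitrary file path."""
    spec = importlib.util.spec_from_file_location('core', path)
    module = importlib.util.module_from_spec(spec)
    spec.loader.exec_module(module)
    return module

core = load_generated('core.py')
# peek at the first few generated model classes
print([name for name in dir(core) if not name.startswith('_')][:10])
```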