mirror of https://github.com/p2p-ld/nwb-linkml.git
synced 2024-11-12 17:54:29 +00:00
recursion works, more formal build method in adapters
This commit is contained in:
parent 170a424fb1
commit a4806543ef
13 changed files with 321 additions and 28 deletions
@@ -24,4 +24,20 @@ Steps:
- new files: files not in the source domain
- enum classes
- Rename items

## Translation choices

We aren't doing a 1:1 translation of NWB! The goal is to make something that is *import*
backwards-compatible - ie. we can read traditional NWB files - but not necessarily
*export* for now. We will get to that eventually. NWB as it is now is highly tied to hdf5
in multiple places - from the hdmf-common namespace to the nwb file classes.
Instead, we want to abstract the structure of NWB so the schema can be used
as a programming element (ie. labs can write their own schema extensions in yaml,
generate pydantic modules for them, and they should Just Work TM) with various different
storage backends.

- Don't try and emulate the nwb.file schema - it is basically a file layout that indicates
  what should go where. We are moving I/O out of the schema: storage layout is at a different level than the schema.
- Don't worry about most of hdmf-common: instead create sensible generics that can be implemented in different ways by different storage media.
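As a rough sketch of that intended workflow (the schema path and output name are hypothetical; the generator options mirror the test added in this commit):

```python
# Hypothetical end-to-end sketch: a lab writes a LinkML schema extension in yaml,
# then generates a pydantic module from it. Paths are invented for illustration;
# the PydanticGenerator options follow the test added in this commit.
from pathlib import Path
from linkml.generators import PydanticGenerator

schema_file = Path('my_lab_extension.yaml')  # hand-written LinkML schema (assumed to exist)

generator = PydanticGenerator(
    str(schema_file),
    pydantic_version='1',
    emit_metadata=True,
    gen_classvars=True,
    gen_slots=True,
)

# Write out the generated pydantic models; they should work independently of
# whichever storage backend is eventually used.
Path('my_lab_extension.py').write_text(generator.serialize())
```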
@@ -1,11 +1,58 @@
"""
Base class for adapters
"""
from typing import List, Dict, Type, Generator, Any, Tuple
from pydantic import BaseModel
from abc import abstractmethod
import warnings
from dataclasses import dataclass, field
from typing import List, Dict, Type, Generator, Any, Tuple, Optional
from pydantic import BaseModel, Field, validator
from linkml_runtime.linkml_model import Element, SchemaDefinition, ClassDefinition, SlotDefinition, TypeDefinition

# SchemaDefClass = dataclass(SchemaDefinition).__pydantic_model__


@dataclass
class BuildResult:
    # pass
    schemas: List[SchemaDefinition] = field(default_factory=list)
    classes: List[ClassDefinition] = field(default_factory=list)
    slots: List[SlotDefinition] = field(default_factory=list)
    types: List[TypeDefinition] = field(default_factory=list)

    def __post_init__(self):
        for field in ('schemas', 'classes', 'slots', 'types'):
            attr = getattr(self, field)
            if not isinstance(attr, list):
                setattr(self, field, [attr])

    def _dedupe(self, ours, others):
        existing_names = [c.name for c in ours]
        others_dedupe = [o for o in others if o.name not in existing_names]
        return others_dedupe

    def __add__(self, other:'BuildResult') -> 'BuildResult':
        # if not isinstance(other, 'BuildResult'):
        #     raise TypeError('Can only add two build results together')

        self.schemas.extend(self._dedupe(self.schemas, other.schemas))
        self.classes.extend(self._dedupe(self.classes, other.classes))
        # existing_names = [c.name for c in self.classes]
        # for newc in other.classes:
        #     if newc.name in existing_names:
        #         warnings.warn(f'Not creating duplicate class for {newc.name}')
        #         continue
        #     self.classes.append(newc)
        # self.classes.extend(other.classes)
        self.slots.extend(other.slots)
        self.types.extend(other.types)
        return self


class Adapter(BaseModel):
    pass
    @abstractmethod
    def build(self) -> 'BuildResult':
        """
        Generate the corresponding linkML element for this adapter
        """

    def walk(self, input: BaseModel | list | dict):
        yield input
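A minimal sketch of how the `__add__` merge above behaves, assuming the `BuildResult` definition from this file (example names invented):

```python
# Hypothetical usage of BuildResult addition: classes (and schemas) are deduplicated
# by name, while slots and types are concatenated as-is.
from linkml_runtime.linkml_model import ClassDefinition, SlotDefinition

a = BuildResult(classes=[ClassDefinition(name='TimeSeries', description='first')])
b = BuildResult(
    classes=[ClassDefinition(name='TimeSeries', description='duplicate, dropped')],
    slots=[SlotDefinition(name='rate')],
)

merged = a + b
assert [c.name for c in merged.classes] == ['TimeSeries']
assert [s.name for s in merged.slots] == ['rate']
```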
@@ -1,9 +1,10 @@
"""
Adapters to linkML classes
"""

from nwb_schema_language import Dataset, Group
from nwb_linkml.adapters.adapter import Adapter
import pdb
from typing import List, Optional
from nwb_schema_language import Dataset, Group, ReferenceDtype, DTypeType
from nwb_linkml.adapters.adapter import Adapter, BuildResult
from linkml_runtime.linkml_model import ClassDefinition, SlotDefinition
@@ -12,25 +13,128 @@ class ClassAdapter(Adapter):
    Adapter to class-like things in linkml, including datasets and groups
    """
    cls: Dataset | Group
    parent: Optional['ClassAdapter'] = None

    def build(self) -> ClassDefinition:
    def _get_full_name(self) -> str:
        """The full name of the object in the generated linkml

        Distinct from 'name' which is the thing that's often used in """
        if self.cls.neurodata_type_def:
            name = self.cls.neurodata_type_def
        else:
            name = self.cls.name
        elif self.cls.name is not None:
            # not necessarily a unique name, so we combine parent names
            name_parts = []
            if self.parent is not None:
                name_parts.append(self.parent._get_full_name())

            name_parts.append(self.cls.name)
            name = '_'.join(name_parts)
        elif self.cls.neurodata_type_inc is not None:
            # again, this is against the schema, but is common
            name = self.cls.neurodata_type_inc
        else:
            raise ValueError('Not sure what our name is!')

        return name

    def _get_name(self) -> str:
        """
        Get the "regular" name, which is used as the name of the attr

        Returns:

        """
        # return self._get_full_name()
        name = None
        if self.cls.neurodata_type_def:
            name = self.cls.neurodata_type_def
        elif self.cls.name is not None:
            # we do have a unique name
            name = self.cls.name
        elif self.cls.neurodata_type_inc:
            # group members can be anonymous? this violates the schema but is common
            name = self.cls.neurodata_type_inc

        if name is None:
            raise ValueError(f'Class has no name!: {self.cls}')

        return name

    def handle_dtype(self, dtype: DTypeType):
        if isinstance(dtype, ReferenceDtype):
            return dtype.target_type
        else:
            return dtype

    def build_attrs(self, cls: Dataset | Group) -> List[SlotDefinition]:
        attrs = [
            SlotDefinition(
                name=attr.name,
                description=attr.doc,

            ) for attr in self.cls.attributes
                range=self.handle_dtype(attr.dtype)
            ) for attr in cls.attributes
        ]

        return attrs

    def build_subclasses(self, cls: Dataset | Group) -> BuildResult:
        """
        Build nested groups and datasets

        Create ClassDefinitions for each, but then also create SlotDefinitions that
        will be used as attributes linking the main class to the subclasses
        """
        # build and flatten nested classes
        nested_classes = [ClassAdapter(cls=dset, parent=self) for dset in cls.datasets]
        nested_classes.extend([ClassAdapter(cls=grp, parent=self) for grp in cls.groups])
        nested_res = BuildResult()
        for subclass in nested_classes:
            this_slot = SlotDefinition(
                name=subclass._get_name(),
                description=subclass.cls.doc,
                range=subclass._get_full_name()
            )
            nested_res.slots.append(this_slot)

            if subclass.cls.name is None and subclass.cls.neurodata_type_def is None:
                # anonymous group that's just an inc, we only need the slot since the class is defined elsewhere
                continue

            this_build = subclass.build()
            nested_res += this_build
        return nested_res


    def build(self) -> BuildResult:

        # Build this class
        if self.parent is not None:
            name = self._get_full_name()
        else:
            name = self._get_name()
        # if name == 'TimeSeries':
        #     pdb.set_trace()

        # Get vanilla top-level attributes
        attrs = self.build_attrs(self.cls)

        # unnest and build subclasses in datasets and groups
        if isinstance(self.cls, Group):
            # only groups have sub-datasets and sub-groups
            nested_res = self.build_subclasses(self.cls)
            attrs.extend(nested_res.slots)
        else:
            nested_res = BuildResult()

        cls = ClassDefinition(
            name = name,
            is_a = self.cls.neurodata_type_inc,
            description=self.cls.doc,
            attributes=attrs
        )
        return cls
        res = BuildResult(
            classes = [cls, *nested_res.classes]
        )

        return res
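A small sketch of the naming scheme implemented by `_get_full_name()` above. The `Images`/`order_of_images` pair mirrors the core schema touched later in this diff, but the exact constructor fields for `Group`/`Dataset` are assumptions:

```python
# Hypothetical illustration: a named-but-untyped dataset nested inside a group gets
# its parent's full name prepended with '_', so the generated class name is unique.
from nwb_schema_language import Group, Dataset
from nwb_linkml.adapters.classes import ClassAdapter

parent = ClassAdapter(cls=Group(neurodata_type_def='Images', doc='A collection of images.'))
child = ClassAdapter(
    cls=Dataset(name='order_of_images', doc='Ordered indices into the image collection.'),
    parent=parent,
)

assert parent._get_full_name() == 'Images'
assert child._get_full_name() == 'Images_order_of_images'  # used as the slot's range
assert child._get_name() == 'order_of_images'              # used as the slot's name
```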
@@ -6,24 +6,57 @@ for extracting information and generating translated schema
"""
import pdb
from typing import List, Optional
from pydantic import BaseModel, Field, validator
from pydantic import BaseModel, Field, validator, PrivateAttr
from pprint import pformat
from linkml_runtime.linkml_model import SchemaDefinition

from nwb_schema_language import Namespaces

from nwb_linkml.adapters.adapter import Adapter
from nwb_linkml.adapters.adapter import Adapter, BuildResult
from nwb_linkml.adapters.schema import SchemaAdapter
from nwb_linkml.lang_elements import NwbLangSchema

class NamespacesAdapter(Adapter):
    namespaces: Namespaces
    schemas: List[SchemaAdapter]
    imported: List['NamespacesAdapter'] = Field(default_factory=list)

    _imports_populated = PrivateAttr(False)

    def __init__(self, **kwargs):
        super(NamespacesAdapter, self).__init__(**kwargs)
        self._populate_schema_namespaces()

    def build(self) -> BuildResult:
        if not self._imports_populated:
            self.populate_imports()

        sch_result = BuildResult()
        for sch in self.schemas:
            sch_result += sch.build()
        # recursive step
        for imported in self.imported:
            imported_build = imported.build()
            sch_result += imported_build

        # add in monkeypatch nwb types
        sch_result.schemas.append(NwbLangSchema)

        # now generate the top-level namespaces that import everything
        for ns in self.namespaces.namespaces:
            ns_schemas = [sch for sch in self.schemas if sch.namespace == ns.name]
            ns_schema = SchemaDefinition(
                name = ns.name,
                id = ns.name,
                description = ns.doc,
                version = ns.version,
                imports=[sch.name for sch in ns_schemas]
            )
            sch_result.schemas.append(ns_schema)

        return sch_result

    def _populate_schema_namespaces(self):
        # annotate for each schema which namespace imports it
        for sch in self.schemas:

@@ -78,4 +111,10 @@ class NamespacesAdapter(Adapter):
                if depends_on not in sch.imports:
                    sch.imports.append(depends_on)

        # do so recursively
        for imported in self.imported:
            imported.populate_imports()

        self._imports_populated = True
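The namespace roll-up step above just emits an importer schema per namespace. A hedged, stand-alone sketch of what that produces for a core-like namespace (the name, version, and import list here are illustrative, not read from the real namespace file):

```python
# Hypothetical illustration of the per-namespace SchemaDefinition built in
# NamespacesAdapter.build(): it carries no classes itself, only imports.
from linkml_runtime.linkml_model import SchemaDefinition
from linkml_runtime.dumpers import yaml_dumper

ns_schema = SchemaDefinition(
    name='core',
    id='core',
    description='NWB namespace (illustrative)',
    version='0.0.0',                                   # placeholder; would come from ns.version
    imports=['nwb.base', 'nwb.file', 'nwb.language'],  # would come from the namespace's schemas
)

print(yaml_dumper.dumps(ns_schema))
```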
@@ -7,7 +7,7 @@ from typing import Optional, List, TYPE_CHECKING
from pathlib import Path
from pydantic import Field

from nwb_linkml.adapters.adapter import Adapter
from nwb_linkml.adapters.adapter import Adapter, BuildResult
from nwb_linkml.adapters.classes import ClassAdapter
if TYPE_CHECKING:
    from nwb_linkml.adapters.namespaces import NamespacesAdapter

@@ -47,7 +47,7 @@ class SchemaAdapter(Adapter):

        return out_str

    def build(self) -> SchemaDefinition:
    def build(self) -> BuildResult:
        """
        Make the LinkML representation for this schema file

@@ -59,16 +59,25 @@ class SchemaAdapter(Adapter):
        """
        classes = [ClassAdapter(cls=dset) for dset in self.datasets]
        classes.extend(ClassAdapter(cls=group) for group in self.groups)
        built_classes = [c.build() for c in classes]
        built_results = None
        for cls in classes:
            if built_results is None:
                built_results = cls.build()
            else:
                built_results += cls.build()

        sch = SchemaDefinition(
            name = self.name,
            id = self.name,
            imports = [i.name for i in self.imports],
            classes=built_classes
            classes=built_results.classes,
            slots=built_results.slots,
            types=built_results.types
        )
        return sch
        # every schema needs the language elements
        sch.imports.append('nwb.language')
        return BuildResult(schemas=[sch])


    @property
@@ -253,7 +253,7 @@ groups:
    dtype: text
    doc: Description of this collection of images.
  datasets:
  - neurodata_type_inc: Image
#  - neurodata_type_inc: Image
    doc: Images stored in this collection.
    quantity: '+'
  - name: order_of_images
@@ -1,4 +1,5 @@
import warnings
from typing import List, Union
try:
    from .datamodel.nwb_schema_pydantic import Namespace, \
        Namespaces, \

@@ -8,6 +9,9 @@ try:
        Link, \
        Dataset, \
        ReferenceDtype, \
        CompoundDtype
        CompoundDtype, \
        FlatDtype

    DTypeType = Union[List[CompoundDtype], FlatDtype, ReferenceDtype]
except NameError:
    warnings.warn('Error importing pydantic classes, passing because we might be in the process of patching them, but it is likely they are broken and you will be unable to use them!')
@@ -1 +1,3 @@
from .nwb_schema_language import *

# create additional derived
@@ -219,6 +219,7 @@ class Dataset(NamingMixin, DtypeMixin):
    quantity: Optional[Union[QuantityEnum, int]] = Field(1)
    linkable: Optional[bool] = Field(None)
    attributes: Optional[List[Attribute]] = Field(default_factory=list)
    datasets: Optional[List[Dataset]] = Field(default_factory=list)
    dtype: Optional[Union[List[CompoundDtype], FlatDtype, ReferenceDtype]] = Field(default_factory=list)
@@ -124,6 +124,7 @@ classes:
    - quantity
    - linkable
    - attributes
    - groups

  Datasets:
    slots:
51 poetry.lock generated
|
@ -260,6 +260,20 @@ files = [
|
|||
{file = "fqdn-1.5.1.tar.gz", hash = "sha256:105ed3677e767fb5ca086a0c1f4bb66ebc3c100be518f0e0d755d9eae164d89f"},
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "future-fstrings"
|
||||
version = "1.2.0"
|
||||
description = "A backport of fstrings to python<3.6"
|
||||
optional = false
|
||||
python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*"
|
||||
files = [
|
||||
{file = "future_fstrings-1.2.0-py2.py3-none-any.whl", hash = "sha256:90e49598b553d8746c4dc7d9442e0359d038c3039d802c91c0a55505da318c63"},
|
||||
{file = "future_fstrings-1.2.0.tar.gz", hash = "sha256:6cf41cbe97c398ab5a81168ce0dbb8ad95862d3caf23c21e4430627b90844089"},
|
||||
]
|
||||
|
||||
[package.extras]
|
||||
rewrite = ["tokenize-rt (>=3)"]
|
||||
|
||||
[[package]]
|
||||
name = "graphviz"
|
||||
version = "0.20.1"
|
||||
|
@@ -746,6 +760,24 @@ files = [
    {file = "mdurl-0.1.2.tar.gz", hash = "sha256:bb413d29f5eea38f31dd4754dd7377d4465116fb207585f97bf925588687c1ba"},
]

[[package]]
name = "networkx"
version = "3.1"
description = "Python package for creating and manipulating graphs and networks"
optional = false
python-versions = ">=3.8"
files = [
    {file = "networkx-3.1-py3-none-any.whl", hash = "sha256:4f33f68cb2afcf86f28a45f43efc27a9386b535d567d2127f8f61d51dec58d36"},
    {file = "networkx-3.1.tar.gz", hash = "sha256:de346335408f84de0eada6ff9fafafff9bcda11f0a0dfaa931133debb146ab61"},
]

[package.extras]
default = ["matplotlib (>=3.4)", "numpy (>=1.20)", "pandas (>=1.3)", "scipy (>=1.8)"]
developer = ["mypy (>=1.1)", "pre-commit (>=3.2)"]
doc = ["nb2plots (>=0.6)", "numpydoc (>=1.5)", "pillow (>=9.4)", "pydata-sphinx-theme (>=0.13)", "sphinx (>=6.1)", "sphinx-gallery (>=0.12)", "texext (>=0.6.7)"]
extra = ["lxml (>=4.6)", "pydot (>=1.4.2)", "pygraphviz (>=1.10)", "sympy (>=1.10)"]
test = ["codecov (>=2.1)", "pytest (>=7.2)", "pytest-cov (>=4.0)"]

[[package]]
name = "nwb-schema-language"
version = "0.1.0"
@@ -1023,6 +1055,23 @@ pluggy = ">=0.12,<2.0"
[package.extras]
testing = ["argcomplete", "attrs (>=19.2.0)", "hypothesis (>=3.56)", "mock", "nose", "pygments (>=2.7.2)", "requests", "setuptools", "xmlschema"]

[[package]]
name = "pytest-depends"
version = "1.0.1"
description = "Tests that depend on other tests"
optional = false
python-versions = "*"
files = [
    {file = "pytest-depends-1.0.1.tar.gz", hash = "sha256:90a28e2b87b75b18abd128c94015248544acac20e4392e9921e5a86f93319dfe"},
    {file = "pytest_depends-1.0.1-py3-none-any.whl", hash = "sha256:a1df072bcc93d77aca3f0946903f5fed8af2d9b0056db1dfc9ed5ac164ab0642"},
]

[package.dependencies]
colorama = "*"
future-fstrings = "*"
networkx = "*"
pytest = ">=3"

[[package]]
name = "pytest-logging"
version = "2015.11.4"
@@ -1759,4 +1808,4 @@ testing = ["big-O", "jaraco.functools", "jaraco.itertools", "more-itertools", "p
[metadata]
lock-version = "2.0"
python-versions = "^3.11"
content-hash = "90f5eaedd0572c26dfac8a9f2fed0c8ec50f70a54b7bff03a43816f96cb60bb1"
content-hash = "debbeeaba69d6afc3da329ccc76e0c2ae3124773b85a577a10cb5e673845a9e5"
@@ -22,6 +22,7 @@ linkml = "^1.5.7"

[tool.poetry.group.dev.dependencies]
pytest = "^7.4.0"
pytest-depends = "^1.0.1"

[build-system]
requires = ["poetry-core"]
@@ -1,8 +1,11 @@
import pdb

import pytest
import warnings

from .fixtures import nwb_core_fixture, tmp_output_dir
from linkml_runtime.dumpers import yaml_dumper
from linkml.generators import PydanticGenerator

from nwb_linkml.lang_elements import NwbLangSchema


@@ -10,8 +13,25 @@ def test_generate_nwblang(tmp_output_dir):
    output_file = (tmp_output_dir / NwbLangSchema.name).with_suffix('.yml')
    yaml_dumper.dump(NwbLangSchema, output_file)

def test_generate_base(nwb_core_fixture, tmp_output_dir):
    schema = nwb_core_fixture.schemas[0].build()
    output_file = (tmp_output_dir / schema.name).with_suffix('.yml')
    warnings.warn(output_file)
    yaml_dumper.dump(schema, output_file)
def test_generate_core(nwb_core_fixture, tmp_output_dir):
    schemas = nwb_core_fixture.build().schemas
    for schema in schemas:
        output_file = tmp_output_dir / (schema.name + '.yaml')
        yaml_dumper.dump(schema, output_file)

@pytest.mark.depends(on=['test_generate_core'])
def test_generate_pydantic(tmp_output_dir):
    core_file = tmp_output_dir / 'core.yaml'
    pydantic_file = tmp_output_dir / 'core.py'

    generator = PydanticGenerator(
        str(core_file),
        pydantic_version='1',
        emit_metadata=True,
        gen_classvars=True,
        gen_slots=True

    )
    gen_pydantic = generator.serialize()
    with open(pydantic_file, 'w') as pfile:
        pfile.write(gen_pydantic)