working on providers to cache generated models

This commit is contained in:
sneakers-the-rat 2023-09-07 18:50:50 -07:00
parent 0ec09a035a
commit a01bb49b1e
12 changed files with 560 additions and 29 deletions

33
nwb_linkml/poetry.lock generated
View file

@ -1385,6 +1385,21 @@ files = [
[package.dependencies]
typing-extensions = ">=4.6.0,<4.7.0 || >4.7.0"
[[package]]
name = "pydantic-settings"
version = "2.0.3"
description = "Settings management using Pydantic"
optional = false
python-versions = ">=3.7"
files = [
{file = "pydantic_settings-2.0.3-py3-none-any.whl", hash = "sha256:ddd907b066622bd67603b75e2ff791875540dc485b7307c4fffc015719da8625"},
{file = "pydantic_settings-2.0.3.tar.gz", hash = "sha256:962dc3672495aad6ae96a4390fac7e593591e144625e5112d359f8f67fb75945"},
]
[package.dependencies]
pydantic = ">=2.0.1"
python-dotenv = ">=0.21.0"
[[package]]
name = "pygments"
version = "2.16.1"
@ -1598,6 +1613,20 @@ files = [
[package.dependencies]
six = ">=1.5"
[[package]]
name = "python-dotenv"
version = "1.0.0"
description = "Read key-value pairs from a .env file and set them as environment variables"
optional = false
python-versions = ">=3.8"
files = [
{file = "python-dotenv-1.0.0.tar.gz", hash = "sha256:a8df96034aae6d2d50a4ebe8216326c61c3eb64836776504fcca410e5937a3ba"},
{file = "python_dotenv-1.0.0-py3-none-any.whl", hash = "sha256:f5971a9226b701070a4bf2c38c89e5a3f0d64de8debda981d1db98583009122a"},
]
[package.extras]
cli = ["click (>=5.0)"]
[[package]]
name = "pytrie"
version = "0.4.0"
@ -2367,9 +2396,9 @@ testing = ["big-O", "jaraco.functools", "jaraco.itertools", "more-itertools", "p
[extras]
plot = ["dash", "dash-cytoscape"]
tests = ["coverage", "coveralls", "pytest", "pytest-cov", "pytest-depends", "pytest-emoji", "pytest-md"]
tests = ["coverage", "coveralls", "pytest", "pytest-cov", "pytest-depends", "pytest-emoji", "pytest-md", "pytest-profiling"]
[metadata]
lock-version = "2.0"
python-versions = "^3.11"
content-hash = "7ae9160a401b3bfa2f4535696ecf15e33815e356f7757ee611893c701485d24f"
content-hash = "9ef89b731746d07d428c6cff4a8c8b4771fbfcfcc8f17120ce3c6089e5161eb6"

View file

@ -30,6 +30,7 @@ pytest-emoji = {version="^0.2.0", optional = true}
pytest-cov = {version = "^4.1.0", optional = true}
coveralls = {version = "^3.3.1", optional = true}
pytest-profiling = {version = "^1.7.0", optional = true}
pydantic-settings = "^2.0.3"
[tool.poetry.extras]
tests = [

View file

@ -1,28 +1,16 @@
"""
Adapters to linkML classes
"""
import re
from abc import abstractmethod
from typing import List, Optional
from nwb_schema_language import Dataset, Group, ReferenceDtype, CompoundDtype, DTypeType
from nwb_linkml.adapters.adapter import Adapter, BuildResult
from linkml_runtime.linkml_model import ClassDefinition, SlotDefinition
from nwb_linkml.maps import QUANTITY_MAP
from nwb_linkml.maps.naming import camel_to_snake
CAMEL_TO_SNAKE = re.compile(r'((?<=[a-z0-9])[A-Z]|(?!^)[A-Z](?=[a-z]))')
"""
Matches a capital letter that either follows a lowercase letter or digit,
or is not at the start of the string and is followed by a lowercase letter.

courtesy of: https://stackoverflow.com/a/12867228
"""


def camel_to_snake(name:str) -> str:
    """
    Translate a ``CamelCase`` name to ``snake_case``.

    An underscore is inserted before every capital matched by
    :data:`CAMEL_TO_SNAKE`, then the whole string is lowercased.

    courtesy of: https://stackoverflow.com/a/12867228
    """
    with_underscores = CAMEL_TO_SNAKE.sub(r'_\1', name)
    return with_underscores.lower()
class ClassAdapter(Adapter):
"""

View file

@ -9,7 +9,8 @@ from linkml_runtime.linkml_model import ClassDefinition, SlotDefinition
from pydantic import PrivateAttr
from nwb_schema_language import Dataset, ReferenceDtype, CompoundDtype, DTypeType
from nwb_linkml.adapters.classes import ClassAdapter, camel_to_snake
from nwb_linkml.adapters.classes import ClassAdapter
from nwb_linkml.maps.naming import camel_to_snake
from nwb_linkml.adapters.adapter import BuildResult
from nwb_linkml.maps import QUANTITY_MAP

View file

@ -6,7 +6,8 @@ from typing import List
from linkml_runtime.linkml_model import ClassDefinition, SlotDefinition
from nwb_schema_language import Dataset, Group, ReferenceDtype, CompoundDtype, DTypeType
from nwb_linkml.adapters.classes import ClassAdapter, camel_to_snake
from nwb_linkml.adapters.classes import ClassAdapter
from nwb_linkml.maps.naming import camel_to_snake
from nwb_linkml.adapters.dataset import DatasetAdapter
from nwb_linkml.adapters.adapter import BuildResult
from nwb_linkml.maps import QUANTITY_MAP

View file

@ -6,7 +6,7 @@ for extracting information and generating translated schema
"""
import pdb
from typing import List, Optional
from typing import List, Optional, Dict
from pathlib import Path
from pydantic import BaseModel, Field, validator, PrivateAttr
from pprint import pformat
@ -33,8 +33,8 @@ class NamespacesAdapter(Adapter):
self._populate_schema_namespaces()
self.split = self._split
def build(self) -> BuildResult:
if not self._imports_populated:
def build(self, skip_imports:bool=False) -> BuildResult:
if not self._imports_populated and not skip_imports:
self.populate_imports()
@ -42,6 +42,7 @@ class NamespacesAdapter(Adapter):
for sch in self.schemas:
sch_result += sch.build()
# recursive step
if not skip_imports:
for imported in self.imported:
imported_build = imported.build()
sch_result += imported_build
@ -53,6 +54,7 @@ class NamespacesAdapter(Adapter):
for ns in self.namespaces.namespaces:
ns_schemas = [sch.name for sch in self.schemas if sch.namespace == ns.name]
# also add imports bc, well, we need them
if not skip_imports:
ns_schemas.extend([ns.name for imported in self.imported for ns in imported.namespaces.namespaces])
ns_schema = SchemaDefinition(
name = ns.name,
@ -164,4 +166,41 @@ class NamespacesAdapter(Adapter):
output_file = base_dir / (schema.name + '.yaml')
yaml_dumper.dump(schema, output_file)
@property
def needed_imports(self) -> Dict[str, List[str]]:
    """
    External namespaces that each of our namespaces needs to import.

    A schema entry that names a ``namespace`` but has no ``source`` is
    treated as a reference to another namespace.

    Returns:
        {'namespace_name': ['needed_import_0', ...]}
    """
    return {
        a_ns.name: [
            entry.namespace
            for entry in a_ns.schema_
            if entry.namespace and not entry.source
        ]
        for a_ns in self.namespaces.namespaces
    }
@property
def versions(self) -> Dict[str, str]:
    """
    Version of each namespace held by this adapter.

    Returns:
        {'namespace_name': 'version'}
    """
    # Namespace entries are read by attribute everywhere else in this class
    # (``ns.name``); item access (``ns['name']``) is not supported by them.
    return {ns.name: ns.version for ns in self.namespaces.namespaces}
def namespace_schemas(self, name:str) -> List[str]:
    """
    Sources of the schemas that a given namespace defines itself.

    Schema entries without a ``source`` are left out.

    Arguments:
        name (str): Name of the namespace to look up

    Raises:
        IndexError: if no namespace is called ``name``
    """
    matching = [ns for ns in self.namespaces.namespaces if ns.name == name]
    ns = matching[0]
    return [sch.source for sch in ns.schema_ if sch.source is not None]

View file

@ -29,7 +29,7 @@ class SchemaAdapter(Adapter):
path: Path
groups: List[Group] = Field(default_factory=list)
datasets: List[Dataset] = Field(default_factory=list)
imports: List['SchemaAdapter'] = Field(default_factory=list)
# The whole union is one string forward reference: ``'SchemaAdapter' | str``
# would apply ``|`` to a str instance and a type, which raises TypeError
# when the class body is evaluated.
imports: List['SchemaAdapter | str'] = Field(default_factory=list)
namespace: Optional[str] = Field(
None,
description="""String of containing namespace. Populated by NamespacesAdapter""")
@ -48,7 +48,7 @@ class SchemaAdapter(Adapter):
out_str += '-'*len(self.name) + '\n'
if len(self.imports) > 0:
out_str += "Imports:\n"
out_str += " " + ', '.join([i.name for i in self.imports]) + '\n'
out_str += " " + ', '.join([i.name if isinstance(i, SchemaAdapter) else i for i in self.imports ]) + '\n'
out_str += 'Groups:\n'
out_str += ' ' + ', '.join([g.neurodata_type_def for g in self.groups])
@ -83,10 +83,11 @@ class SchemaAdapter(Adapter):
return sch_split
else:
sch = SchemaDefinition(
name = self.name,
id = self.name,
imports = [i.name for i in self.imports],
imports = [i.name if isinstance(i, SchemaAdapter) else i for i in self.imports ],
classes=res.classes,
slots=res.slots,
types=res.types
@ -113,7 +114,7 @@ class SchemaAdapter(Adapter):
split_sch_name = '.'.join([self.name, 'include'])
imports = [i.name for i in self.imports]
imports = [i.name if isinstance(i, SchemaAdapter) else i for i in self.imports ]
imports.append('nwb.language')
# need to mutually import the two schemas because the subclasses
# could refer to the main classes

View file

@ -0,0 +1,41 @@
"""
Manage the operation of nwb_linkml from environmental variables
"""
import tempfile
from pathlib import Path
from pydantic import Field, DirectoryPath, computed_field
from pydantic_settings import BaseSettings, SettingsConfigDict
class Config(BaseSettings):
    """
    Configuration for nwb_linkml, populated by default but can be overridden
    by environment variables.

    Examples:

        export NWB_LINKML_CACHE_DIR="/home/mycache/dir"

    """
    model_config = SettingsConfigDict(env_prefix="nwb_linkml_")

    # NOTE(review): ``DirectoryPath`` presumably rejects an explicitly given
    # path that does not exist yet, before ``model_post_init`` can create it
    # -- confirm whether a "before" validator that creates it is wanted.
    cache_dir: DirectoryPath = Field(
        default_factory= lambda: Path(tempfile.gettempdir()) / 'nwb_linkml__cache',
        description="Location to cache generated schema and models")

    @computed_field
    @property
    def linkml_dir(self) -> Path:
        """Directory to store generated linkml models"""
        return self.cache_dir / 'linkml'

    @computed_field
    @property
    def pydantic_dir(self) -> Path:
        """Directory to store generated pydantic models"""
        return self.cache_dir / 'pydantic'

    def model_post_init(self, __context) -> None:
        """
        Create the cache directory and its subdirectories after the settings are built.

        This is pydantic's post-initialisation hook. The dataclass-style
        ``__post_init__`` is never called on pydantic models, so the
        directories were never created.
        """
        self.cache_dir.mkdir(parents=True, exist_ok=True)
        self.linkml_dir.mkdir(exist_ok=True)
        self.pydantic_dir.mkdir(exist_ok=True)

View file

@ -45,6 +45,7 @@ from linkml_runtime.utils.schemaview import SchemaView
from linkml_runtime.utils.compile_python import file_text
from linkml.utils.ifabsent_functions import ifabsent_value_declaration
from nwb_linkml.maps.naming import module_case, version_module_case
from jinja2 import Template
@ -193,6 +194,8 @@ class NWBPydanticGenerator(PydanticGenerator):
# SKIP_CLASSES=('VectorData','VectorIndex')
split:bool=True
schema_map:Dict[str, SchemaDefinition]=None
versions:List[dict] = None
"""See :meth:`.LinkMLProvider.build` for usage - a list of specific versions to import from"""
def _locate_imports(
@ -217,7 +220,12 @@ class NWBPydanticGenerator(PydanticGenerator):
if module_name == self.schema.name:
continue
local_mod_name = '.' + module_name.replace('.', '_').replace('-', '_')
if self.versions and module_name in [v['name'] for v in self.versions]:
version = version_module_case([v['version'] for v in self.versions if v['name'] == module_name][0])
local_mod_name = '....' + module_case(module_name) + '.' + version + '.' + 'namespace'
else:
local_mod_name = '.' + module_case(module_name)
if local_mod_name not in imports:
imports[local_mod_name] = [camelcase(cls)]
else:

View file

@ -0,0 +1,52 @@
import re
from pathlib import Path
CAMEL_TO_SNAKE = re.compile(r'((?<=[a-z0-9])[A-Z]|(?!^)[A-Z](?=[a-z]))')
"""
Matches a capital letter that either follows a lowercase letter or digit,
or is not at the start of the string and is followed by a lowercase letter.

courtesy of: https://stackoverflow.com/a/12867228
"""


def camel_to_snake(name:str) -> str:
    """
    Translate a ``CamelCase`` name to ``snake_case``.

    An underscore is inserted before every capital matched by
    :data:`CAMEL_TO_SNAKE`, then the whole string is lowercased.

    courtesy of: https://stackoverflow.com/a/12867228
    """
    with_underscores = CAMEL_TO_SNAKE.sub(r'_\1', name)
    return with_underscores.lower()
def module_case(name:str) -> str:
    """
    Turn a schema/namespace name into something usable as a python module
    name, for referring to generated pydantic and linkml models.

    The result is lowercased, and these characters become underscores:

    - ``-``
    - ``.``
    """
    for separator in ('-', '.'):
        name = name.replace(separator, '_')
    return name.lower()
def version_module_case(name:str) -> str:
    """
    :func:`.module_case` except ensure that it starts with "v"

    Arguments:
        name (str): Version string, eg. ``2.6.0``

    Returns:
        str: module-safe version, eg. ``v2_6_0``
    """
    name = module_case(name)
    if not name.startswith('v'):
        # Prefix with the literal "v". The original referenced an undefined
        # variable ``v`` here, which raised NameError.
        name = 'v' + name
    return name
def relative_path(target: Path, origin: Path) -> Path:
    """
    return path of target relative to origin, even if they're
    not in the same subpath

    Both paths are resolved to absolute paths first. The result climbs one
    ``..`` for every ancestor of ``origin`` that does not contain ``target``,
    then descends to ``target``.

    Arguments:
        target (:class:`pathlib.Path`): Path to express relatively
        origin (:class:`pathlib.Path`): Path that the result is relative to

    Raises:
        ValueError: if no ancestor of ``origin`` contains ``target``

    References:
        - https://stackoverflow.com/a/71874881
    """
    # Resolve once up front. Taking ``.parent`` of an unresolved relative path
    # bottoms out at ``Path('.')``, which is its own parent, so walking up
    # that way never reaches the filesystem root.
    target = Path(target).resolve()
    origin = Path(origin).resolve()

    steps_up = 0
    for ancestor in (origin, *origin.parents):
        try:
            remainder = target.relative_to(ancestor)
        except ValueError:
            # target is not underneath this ancestor; go up one more level
            steps_up += 1
            continue
        return Path(*(['..'] * steps_up)) / remainder

    raise ValueError(f'{target} and {origin} do not share a common ancestor')

View file

@ -0,0 +1,370 @@
"""
Class for managing, building, and caching built schemas.
The nwb.core and hdmf-common schema are statically built and stored in this repository,
but to make it feasible to use arbitrary schema, eg. those stored inside of
an NWB file, we need a bit of infrastructure for generating and caching
pydantic models on the fly.
Relationship to other modules:
- :mod:`.adapters` manage the conversion from NWB schema language to linkML.
- :mod:`.generators` create models like pydantic models from the linkML schema
- :mod:`.providers` then use ``adapters`` and ``generators`` to provide models
from generated schema!
"""
from typing import Dict, TypedDict, List, Optional, Literal, TypeVar, Any, Dict
from pathlib import Path
import os
from abc import abstractmethod
from linkml_runtime.linkml_model import SchemaDefinition
from linkml_runtime.dumpers import yaml_dumper
from linkml_runtime import SchemaView
from nwb_linkml.config import Config
from nwb_linkml import io
from nwb_linkml import adapters
from nwb_linkml.adapters.adapter import BuildResult
from nwb_linkml.maps.naming import module_case, version_module_case, relative_path
from nwb_schema_language import Namespaces
from nwb_linkml.generators.pydantic import NWBPydanticGenerator
class NamespaceVersion(TypedDict):
    """
    A namespace name paired with one of its versions.
    """
    # name of the namespace
    namespace: str
    # version string of that namespace
    version: str
P = TypeVar('P')


class Provider:
    """
    Base class for the different kinds of providers!

    Subclasses set :attr:`.PROVIDES` and implement :attr:`.path` and :meth:`.build`.
    """
    # short name of what is provided; compared against 'linkml' and 'pydantic' below
    PROVIDES: str
    PROVIDES_CLASS: P = None

    def __init__(self,
                 path: Optional[Path] = None,
                 verbose: bool = True):
        """
        Arguments:
            path (:class:`pathlib.Path`): If given, used as the ``cache_dir``
                of the :class:`.Config`; otherwise the default config is used
            verbose (bool): stored as ``self.verbose``
        """
        if path is not None:
            config = Config(cache_dir=path)
        else:
            config = Config()
        self.config = config
        self.cache_dir = config.cache_dir
        # previously accepted but dropped
        self.verbose = verbose

    # ``property`` must be the outer decorator: ``abstractmethod`` cannot mark
    # an already-built property object and raises AttributeError when the
    # class body is evaluated.
    @property
    @abstractmethod
    def path(self) -> Path:
        """
        Base path for this kind of provider
        """

    @abstractmethod
    def build(self, *args: Any):
        """
        Whatever needs to be done to build this thing, if applicable
        """

    def namespace_path(
            self,
            namespace: str,
            version: Optional[str] = None) -> Path:
        """
        Get the location for a given namespace of this type.

        Note that we don't check for existence, because this method should
        also be used when generating schema --- this is the canonical location

        Arguments:
            namespace (str): Namespace to get!
            version (str): Optional, version of namespace. If ``None``,
                either get the most recent version built, or if
                ``namespace`` is ``core`` or ``hdmf-common``, use the
                modules provided with the package. We do not use the most
                recent *version*, but the most recently *generated* version
                because it's assumed that's the one you want if you're just
                gesturally reaching for one.

        Raises:
            FileNotFoundError: if no version is given and none has been built
        """
        namespace_module = module_case(namespace)
        namespace_path = self.path / namespace_module
        if not namespace_path.exists() and namespace in ('core', 'hdmf-common'):
            # return builtins
            # NOTE(review): ``__file__`` is a file, not a directory, so the
            # ``/`` and ``iterdir()`` below presumably want ``.parent`` --
            # confirm against the packaged layout.
            if self.PROVIDES == 'linkml':
                from nwb_linkml import schema
                namespace_path = Path(schema.__file__)
            elif self.PROVIDES == 'pydantic':
                from nwb_linkml import models
                namespace_path = Path(models.__file__)

        if version is not None:
            version_path = namespace_path / version_module_case(version)
        else:
            # or find the most recently built one
            versions = sorted(namespace_path.iterdir(), key=os.path.getmtime)
            if not versions:
                raise FileNotFoundError('No version provided, and no existing schema found')
            version_path = versions[-1]

        return version_path
class LinkMLProvider(Provider):
    """
    Provider that builds linkml schema from NWB schema and stores them as
    YAML underneath the linkml cache directory.
    """
    PROVIDES = 'linkml'
    PROVIDES_CLASS = SchemaDefinition

    @property
    def path(self) -> Path:
        """Base directory for generated linkml schema"""
        return self.config.linkml_dir

    def build_from_yaml(self, path: Path, **kwargs):
        """
        Build a namespace's schema

        Arguments:
            path (:class:`pathlib.Path`): Path to the namespace .yaml
            kwargs: passed to :meth:`.build`
        """
        sch = {}
        ns_dict = io.schema.load_yaml(path)
        sch['namespace'] = ns_dict
        namespace = Namespaces(**ns_dict)

        for ns in namespace.namespaces:
            for schema in ns.schema_:
                if schema.source is None:
                    # this is normal, we'll resolve later
                    continue
                # schema sources are given relative to the namespace file
                yml_file = path.parent / schema.source
                sch[yml_file.stem] = (io.schema.load_yaml(yml_file))

        return self.build(schemas=sch, **kwargs)

    def build(
            self,
            schemas:Dict[str, dict],
            versions: Optional[List[NamespaceVersion]] = None,
            dump: bool = True,
    ) -> BuildResult:
        """
        Arguments:
            schemas (dict): A dictionary of ``{'schema_name': {:schema_definition}}``.
                The "namespace" schema should have the key ``namespace``, which is used
                to infer version and schema name. Post-load maps should have already
                been applied
            versions (List[NamespaceVersion]): List of specific versions to use
                for cross-namespace imports. If none is provided, use the most recent version
                available.
            dump (bool): If ``True`` (default), dump generated schema to YAML. otherwise just return

        Returns:
            :class:`.BuildResult`: the result of building the namespaces adapter
        """
        ns = Namespaces(**schemas['namespace'])
        typed_schemas = [
            io.schema.load_schema_file(
                path=Path(key + ".yaml"),
                yaml=val)
            for key,val in schemas.items()
            if key != 'namespace'
        ]
        ns_adapter = adapters.NamespacesAdapter(
            namespaces=ns,
            schemas=typed_schemas
        )
        self._find_imports(ns_adapter, versions, populate=True)
        built = ns_adapter.build()

        # honour the documented ``dump`` flag, which was previously ignored
        if not dump:
            return built

        # write schemas to yaml files
        namespace_sch = [sch for sch in built.schemas if 'namespace' in sch.annotations.keys()]
        for ns_linkml in namespace_sch:
            version = ns_adapter.versions[ns_linkml.name]
            version_path = self.namespace_path(ns_linkml.name, version)
            # namespace_path only computes the location, so create it before writing
            version_path.mkdir(parents=True, exist_ok=True)
            # dump to the namespace file itself, not to the containing directory
            yaml_dumper.dump(ns_linkml, version_path / 'namespace.yaml')

            # write the schemas for this namespace
            ns_schema_names = ns_adapter.namespace_schemas(ns_linkml.name)
            other_schema = [sch for sch in built.schemas if sch.name in ns_schema_names]
            for sch in other_schema:
                output_file = version_path / (sch.name + '.yaml')
                yaml_dumper.dump(sch, output_file)

        return built

    def get(self, namespace: str, version: Optional[str] = None) -> SchemaView:
        """
        Get a schema view over the namespace

        Arguments:
            namespace (str): Name of the namespace
            version (str): Optional version, resolved by :meth:`.namespace_path`
        """
        path = self.namespace_path(namespace, version) / 'namespace.yaml'
        return SchemaView(path)

    def _find_imports(self,
                      ns: adapters.NamespacesAdapter,
                      versions: Optional[List[NamespaceVersion]] = None,
                      populate: bool=True) -> Dict[str, List[str]]:
        """
        Find relative paths to other linkml schema that need to be
        imported, but lack an explicit source

        Arguments:
            ns (:class:`.NamespacesAdapter`): Namespaces to find imports to
            versions (List[:class:`.NamespaceVersion`]): Specific versions to import
            populate (bool): If ``True`` (default), modify the namespace adapter to include the imports,
                otherwise just return

        Returns:
            dict of lists for relative paths to other schema namespaces
        """
        import_paths = {}
        for ns_name, needed_imports in ns.needed_imports.items():
            our_path = self.namespace_path(ns_name, ns.versions[ns_name]) / 'namespace.yaml'
            import_paths[ns_name] = []
            for needed_import in needed_imports:
                needed_version = None
                if versions:
                    needed_versions = [v['version'] for v in versions if v['namespace'] == needed_import]
                    if needed_versions:
                        needed_version = needed_versions[0]

                version_path = self.namespace_path(needed_import, needed_version) / 'namespace.yaml'
                import_paths[ns_name].append(str(relative_path(version_path, our_path)))

        if populate:
            for sch in ns.schemas:
                # Extend with the paths found for this schema's own namespace.
                # Extending with the dict itself would add its keys (our own
                # namespace names) instead of the import paths.
                sch.imports.extend(import_paths.get(sch.namespace, []))

        return import_paths
class PydanticProvider(Provider):
    """
    Provider that generates pydantic models from linkml schema and stores
    them underneath the pydantic cache directory.
    """
    PROVIDES = 'pydantic'

    @property
    def path(self) -> Path:
        """Base directory for generated pydantic models"""
        return self.config.pydantic_dir

    def build(
            self,
            namespace: str | Path,
            version: Optional[str] = None,
            versions: Optional[List[NamespaceVersion]] = None,
            dump: bool = True
    ) -> str:
        """
        Generate the pydantic module for a namespace.

        Arguments:
            namespace (str, :class:`pathlib.Path`): Either the name of a namespace
                whose linkml schema is looked up with :class:`.LinkMLProvider`,
                or a path to a linkml namespace ``.yaml``/``.yml`` file
            version (str): Optional version, used only when ``namespace`` is a name
            versions (List[NamespaceVersion]): passed to the generator
            dump (bool): If ``True`` (default), also write the module to
                ``<pydantic dir>/<namespace dir>/<version dir>/namespace.py``

        Returns:
            str: the serialized module source
        """
        if isinstance(namespace, str) and not namespace.endswith(('.yaml', '.yml')):
            # we're given a name of a namespace to build
            path = LinkMLProvider(path=self.config.cache_dir).namespace_path(namespace, version) / 'namespace.yaml'
        else:
            # given a path to a namespace linkml yaml file
            path = Path(namespace)

        # NOTE(review): NWBPydanticGenerator._locate_imports reads ``v['name']``
        # from these entries, while NamespaceVersion defines the key
        # ``namespace`` -- confirm which key is intended.
        generator = NWBPydanticGenerator(
            str(path),
            split=False,
            versions=versions,
            emit_metadata=True,
            gen_slots=True,
            pydantic_version='2'
        )
        serialized = generator.serialize()

        if dump:
            # mirror the last two directories of the schema path (namespace/version)
            out_file = self.path / path.parts[-3] / path.parts[-2] / 'namespace.py'
            # the namespace/version directories may not exist on a fresh cache
            out_file.parent.mkdir(parents=True, exist_ok=True)
            with open(out_file, 'w') as ofile:
                ofile.write(serialized)

        return serialized
class SchemaProvider:
"""
Class to manage building and caching linkml and pydantic models generated
from nwb schema language
Behaves like a singleton without needing to be one - since we're working off
caches on disk that are indexed by hash in most "normal" conditions you should
be able to use this anywhere, though no file-level locks are present to ensure
consistency.
Store each generated schema in a directory structure indexed by
schema namespace name and a truncated hash of the loaded schema dictionaries
(not the hash of the .yaml file, since we are also provided schema in nwbfiles)
eg:
cache_dir
- linkml
- nwb_core
- hash_532gn90f
- nwb.core.namespace.yaml
- nwb.core.file.yaml
- ...
- hash_fuia082f
- nwb.core.namespace.yaml
- ...
- my_schema
- hash_t3tn908h
- ...
- pydantic
- nwb_core
- hash_532gn90f
- core.py
- ...
- hash_fuia082f
- core.py
- ...
"""
def __init__(self,
             path: Optional[Path] = None,
             verbose: bool = True):
    """
    Arguments:
        path (:class:`pathlib.Path`): If provided, output to an explicit base directory.
            Otherwise use that provided in ``NWB_LINKML_CACHE_DIR``
        verbose (bool): If ``True`` (default), show progress bars and other messages
            useful for interactive use
    """
    # an explicit path overrides the cache directory from the default config
    config = Config() if path is None else Config(cache_dir=path)

    self.verbose = verbose
    self.cache_dir = config.cache_dir
    self.pydantic_dir = config.pydantic_dir
    self.linkml_dir = config.linkml_dir
def generate_linkml(
self,
schemas:Dict[str, dict],
versions: Optional[List[NamespaceVersion]] = None
):
"""
Generate linkml from loaded nwb schemas, either from yaml or from an
nwb file's ``/specifications`` group.
Arguments:
schemas (dict): A dictionary of ``{'schema_name': {:schema_definition}}``.
The "namespace" schema should have the key ``namespace``, which is used
to infer version and schema name. Post-load maps should have already
been applied
versions (List[NamespaceVersion]): List of specific versions to use
for cross-namespace imports. If none is provided, use the most recent version
available.
"""