yaml peek to quickly find the value of a key

This commit is contained in:
sneakers-the-rat 2024-07-19 20:50:18 -07:00
parent 27d18b69d8
commit 2e7670a2bd
Signed by untrusted user who does not match committer: jonny
GPG key ID: 6DCB96EF1E4D232D
3 changed files with 131 additions and 21 deletions

View file

@ -0,0 +1,63 @@
"""
Utility functions for dealing with yaml files.
No, we are not going to implement a yaml parser here.
"""
import re
from pathlib import Path
from typing import Literal, List, Union, overload
@overload
def yaml_peek(
    key: str, path: Union[str, Path], root: bool = True, first: Literal[True] = True
) -> str: ...


@overload
def yaml_peek(
    key: str, path: Union[str, Path], root: bool = True, first: Literal[False] = False
) -> List[str]: ...


@overload
def yaml_peek(
    key: str, path: Union[str, Path], root: bool = True, first: bool = True
) -> Union[str, List[str]]: ...


def yaml_peek(
    key: str, path: Union[str, Path], root: bool = True, first: bool = True
) -> Union[str, List[str]]:
    """
    Peek into a yaml file without parsing the whole file to retrieve the value of a single key.

    This function is _not_ designed for robustness to the yaml spec, it is for simple
    ``key: value`` pairs on a single line, not multiline strings, tagged values, etc.

    Returns the raw text after ``key:`` as a string no matter what the yaml type is
    (quotes, trailing comments and trailing whitespace are kept verbatim), so callers
    have to do their own casting.

    Args:
        key (str): The key to peek for. It is matched literally
            (regex metacharacters in the key are escaped).
        path (:class:`pathlib.Path` , str): The yaml file to peek into
        root (bool): Only find keys at the root of the document, i.e. starting in the
            first column (default ``True`` ), otherwise find keys at any level of
            indentation.
        first (bool): Only return the first appearance of the key (default ``True`` ).
            Otherwise return a list with the value of every matching line, in file order.

    Returns:
        str: if ``first`` is ``True``
        List[str]: if ``first`` is ``False``

    Raises:
        KeyError: if no line in the file matches the key
    """
    # Leading indentation is only tolerated when nested keys are wanted.
    indent = "" if root else r"\s*"
    # Escape the key so that e.g. ``a.b`` does not match ``aXb``.
    pattern = re.compile(rf"^{indent}(?P<key>{re.escape(key)}):\s*(?P<value>\S.*)")

    values: List[str] = []
    # Match line by line in both modes: ``^`` then anchors at the start of every line,
    # and ``\s*`` after the colon can never run onto the following line
    # (so a mapping key like ``key3:`` with a nested body has no "value").
    with open(path, "r") as yfile:
        for line in yfile:
            match = pattern.match(line)
            if match is None:
                continue
            if first:
                # stop reading as soon as we have what we need
                return match.group("value")
            values.append(match.group("value"))

    if values:
        return values
    raise KeyError(f'Key {key} not found in {path}')

View file

@ -13,6 +13,7 @@ from typing import List, Optional, Type
from pydantic import BaseModel
from nwb_linkml import io
from nwb_linkml.io.yaml import yaml_peek
from nwb_linkml.generators.pydantic import NWBPydanticGenerator
from nwb_linkml.maps.naming import module_case, version_module_case
from nwb_linkml.providers import LinkMLProvider, Provider
@ -36,9 +37,6 @@ class PydanticProvider(Provider):
def __init__(self, path: Optional[Path] = None, verbose: bool = True):
super().__init__(path, verbose)
# create a metapathfinder to find module we might create
pathfinder = EctopicModelFinder(self.path)
sys.meta_path.append(pathfinder)
@property
def path(self) -> Path:
@ -50,7 +48,6 @@ class PydanticProvider(Provider):
namespace: str | Path,
out_file: Optional[Path] = None,
version: Optional[str] = None,
versions: Optional[dict] = None,
split: bool = True,
dump: bool = True,
force: bool = False,
@ -75,13 +72,6 @@ class PydanticProvider(Provider):
version (Optional[str]): The version of the schema to build, if present.
Works similarly to ``version`` in :class:`.LinkMLProvider`.
Ignored if ``namespace`` is a Path.
versions (Optional[dict]): An explicit mapping of namespaces and versions to use when
building the combined pydantic `namespace.py` file.
Since NWB doesn't have an explicit version dependency system between schema,
there is intrinsic ambiguity between which version
of which schema should be used when imported from another.
This mapping allows those ambiguities to be resolved.
See :class:`.NWBPydanticGenerator` 's ``versions`` argument for more information.
split (bool): If ``False``, generate a single ``namespace.py`` file,
otherwise (default ``True``) generate a python file for each schema in the namespace
in addition to a ``namespace.py`` that imports from them
@ -107,19 +97,15 @@ class PydanticProvider(Provider):
if version is None:
# Get the most recently built version
version = LinkMLProvider(path=self.config.cache_dir).available_versions[name][-1]
fn = path.parts[-1]
fn = path.name
else:
# given a path to a namespace linkml yaml file
path = Path(namespace)
# FIXME: this is extremely fragile, but get the details from the path.
# this is faster than reading yaml for now
name = path.parts[-3]
version = path.parts[-2]
fn = path.parts[-1]
name = yaml_peek('name', path)
version = yaml_peek('version', path)
fn = path.name
version = version_module_case(version)
# this is extremely fragile, we should not be inferring version number from paths...
# TODO: we need an efficient peek for specific keys within a yaml file
if out_file is None:
fn = fn.removesuffix(".yaml")
fn = module_case(fn) + ".py"
@ -137,10 +123,14 @@ class PydanticProvider(Provider):
if versions is None:
versions = self._get_dependent_versions(path)
if split:
return self._build_split(path, versions, default_kwargs, dump, out_file, force)
result = self._build_split(path, versions, default_kwargs, dump, out_file, force)
else:
return self._build_unsplit(path, versions, default_kwargs, dump, out_file, force)
result = self._build_unsplit(path, versions, default_kwargs, dump, out_file, force)
self.install_pathfinder()
return result
def _build_unsplit(
self,
@ -406,6 +396,19 @@ class PydanticProvider(Provider):
mod = self.get(namespace, version)
return getattr(mod, class_)
def install_pathfinder(self):
    """
    Register an :class:`.EctopicModelFinder` for ``self.path`` on ``sys.meta_path``,
    so that modules can be imported from the directory we are generating models into.

    Does nothing if ``sys.meta_path`` already holds an :class:`.EctopicModelFinder`
    whose ``path`` equals ``self.path``, so repeated calls do not stack up finders.
    """
    already_installed = any(
        isinstance(finder, EctopicModelFinder) and finder.path == self.path
        for finder in sys.meta_path
    )
    if not already_installed:
        sys.meta_path.append(EctopicModelFinder(self.path))
class EctopicModelFinder(MetaPathFinder):
"""

View file

@ -0,0 +1,44 @@
import pytest
import yaml
from nwb_linkml.io.yaml import yaml_peek
@pytest.fixture()
def yaml_file(tmp_path):
    """
    Write a small yaml document to ``tmp_path / 'test.yaml'`` and yield its path.

    The document has two scalar root keys (``key1``, ``key2``) and one mapping
    (``key3``) that repeats ``key1`` and adds ``key4`` one level down.
    The file is removed again after the test has run.
    """
    nested = {'key1': 'val3', 'key4': 'val4'}
    contents = {'key1': 'val1', 'key2': 'val2', 'key3': nested}

    target = tmp_path / 'test.yaml'
    # yaml.dump without a stream returns the serialized document as a string
    target.write_text(yaml.dump(contents))

    yield target

    target.unlink()
@pytest.mark.parametrize(
    'key,expected,root,first',
    [
        ('key1', 'val1', True, True),
        ('key1', 'val1', False, True),
        ('key1', ['val1'], True, False),
        ('key1', ['val1', 'val3'], False, False),
        ('key2', 'val2', True, True),
        ('key3', False, True, True),
        ('key4', False, True, True),
        ('key4', 'val4', False, True)
    ]
)
def test_peek_yaml(key, expected, root, first, yaml_file):
    """
    ``yaml_peek`` returns the expected value(s) for each combination of ``root`` and ``first``.

    An ``expected`` of ``False`` marks a case where the key has no peekable value
    (a mapping key, or a nested key when only the root is searched) and a
    ``KeyError`` must be raised instead.
    """
    if not expected:
        with pytest.raises(KeyError):
            _ = yaml_peek(key, yaml_file, root=root, first=first)
    else:
        # compare against the expected value, not just truthiness of the result
        assert yaml_peek(key, yaml_file, root=root, first=first) == expected