remove nwb_linkml remnants, add to_json method for json serialization

This commit is contained in:
sneakers-the-rat 2024-04-22 19:31:56 -07:00
parent 46060c1154
commit 5b722bb6da
Signed by untrusted user who does not match committer: jonny
GPG key ID: 6DCB96EF1E4D232D
11 changed files with 212 additions and 204 deletions

View file

@ -8,7 +8,6 @@ authors = [
dependencies = [ dependencies = [
"pydantic>=2.3.0", "pydantic>=2.3.0",
"nptyping>=2.5.0", "nptyping>=2.5.0",
"blosc2<3.0.0,>=2.5.1",
"numpy>=1.24.0", "numpy>=1.24.0",
] ]
requires-python = "<4.0,>=3.9" requires-python = "<4.0,>=3.9"

View file

@ -1,4 +1,5 @@
from typing import Any from typing import Any
import numpy as np
from numpydantic.interface.interface import Interface from numpydantic.interface.interface import Interface
try: try:
@ -28,3 +29,19 @@ class DaskInterface(Interface):
def enabled(cls) -> bool: def enabled(cls) -> bool:
"""check if we successfully imported dask""" """check if we successfully imported dask"""
return DaskArray is not None return DaskArray is not None
@classmethod
def to_json(cls, array: DaskArray) -> list:
    """
    Serialize a dask array as (nested) python lists.

    The array is first materialized as a numpy array, which is then converted
    to a list.

    .. note::

        Expect this to be very memory intensive for the large arrays dask is
        typically used for. It can't be avoided: Pydantic builds the json string
        in memory, so for large data you probably want a different serialization
        method that works from the python object itself rather than from its
        JSON representation.
    """
    materialized = np.array(array)
    return materialized.tolist()

View file

@ -1,3 +1,4 @@
import pdb
from pathlib import Path from pathlib import Path
from typing import Any, NamedTuple, Tuple, Union, TypeAlias from typing import Any, NamedTuple, Tuple, Union, TypeAlias
@ -14,7 +15,7 @@ except ImportError:
H5Arraylike: TypeAlias = Tuple[Union[Path, str], str] H5Arraylike: TypeAlias = Tuple[Union[Path, str], str]
class H5Array(NamedTuple): class H5ArrayPath(NamedTuple):
"""Location specifier for arrays within an HDF5 file""" """Location specifier for arrays within an HDF5 file"""
file: Union[Path, str] file: Union[Path, str]
@ -43,6 +44,7 @@ class H5Proxy:
""" """
def __init__(self, file: Union[Path, str], path: str): def __init__(self, file: Union[Path, str], path: str):
self._h5f = None
self.file = Path(file) self.file = Path(file)
self.path = path self.path = path
@ -53,8 +55,8 @@ class H5Proxy:
return obj is not None return obj is not None
@classmethod @classmethod
def from_h5array(cls, h5array: H5Array) -> "H5Proxy": def from_h5array(cls, h5array: H5ArrayPath) -> "H5Proxy":
"""Instantiate using :class:`.H5Array`""" """Instantiate using :class:`.H5ArrayPath`"""
return H5Proxy(file=h5array.file, path=h5array.path) return H5Proxy(file=h5array.file, path=h5array.path)
def __getattr__(self, item: str): def __getattr__(self, item: str):
@ -72,18 +74,37 @@ class H5Proxy:
obj = h5f.get(self.path) obj = h5f.get(self.path)
obj[key] = value obj[key] = value
def open(self, mode: str = "r"):
    """
    Open the backing file (if needed) and return the object stored at ``self.path``.

    The file handle is kept on the proxy, so a handle that is already open is
    reused and ``mode`` is only applied when a new handle has to be created.

    You must remember to close the associated file with :meth:`.close`

    Args:
        mode (str): Mode passed to :class:`h5py.File` when the file is opened
    """
    handle = self._h5f
    if handle is None:
        handle = h5py.File(self.file, mode)
        self._h5f = handle
    return handle.get(self.path)
def close(self):
    """
    Close the :class:`h5py.File` handle kept open by :meth:`.open` and forget it.

    Does nothing when no handle is currently held.
    """
    if self._h5f is None:
        return
    self._h5f.close()
    self._h5f = None
class H5Interface(Interface): class H5Interface(Interface):
""" """
Interface for Arrays stored as datasets within an HDF5 file. Interface for Arrays stored as datasets within an HDF5 file.
Takes a :class:`.H5Array` specifier to select a :class:`h5py.Dataset` from a Takes a :class:`.H5ArrayPath` specifier to select a :class:`h5py.Dataset` from a
:class:`h5py.File` and returns a :class:`.H5Proxy` class that acts like a :class:`h5py.File` and returns a :class:`.H5Proxy` class that acts like a
passthrough numpy-like interface to the dataset. passthrough numpy-like interface to the dataset.
""" """
input_types = ( input_types = (
H5Array, H5ArrayPath,
H5Arraylike, H5Arraylike,
) )
return_type = H5Proxy return_type = H5Proxy
@ -94,9 +115,9 @@ class H5Interface(Interface):
return h5py is not None return h5py is not None
@classmethod @classmethod
def check(cls, array: Union[H5Array, Tuple[Union[Path, str], str]]) -> bool: def check(cls, array: Union[H5ArrayPath, Tuple[Union[Path, str], str]]) -> bool:
"""Check that the given array is a :class:`.H5Array` or something that resembles one.""" """Check that the given array is a :class:`.H5ArrayPath` or something that resembles one."""
if isinstance(array, H5Array): if isinstance(array, H5ArrayPath):
return True return True
if isinstance(array, (tuple, list)) and len(array) == 2: if isinstance(array, (tuple, list)) and len(array) == 2:
@ -125,7 +146,7 @@ class H5Interface(Interface):
def before_validation(self, array: Any) -> NDArrayType: def before_validation(self, array: Any) -> NDArrayType:
"""Create an :class:`.H5Proxy` to use throughout validation""" """Create an :class:`.H5Proxy` to use throughout validation"""
if isinstance(array, H5Array): if isinstance(array, H5ArrayPath):
array = H5Proxy.from_h5array(h5array=array) array = H5Proxy.from_h5array(h5array=array)
elif isinstance(array, (tuple, list)) and len(array) == 2: elif isinstance(array, (tuple, list)) and len(array) == 2:
array = H5Proxy(file=array[0], path=array[1]) array = H5Proxy(file=array[0], path=array[1])
@ -141,3 +162,17 @@ class H5Interface(Interface):
) )
return array return array
@classmethod
def to_json(cls, array: "H5Proxy") -> dict:
    """
    Dump an HDF5-backed array, with its location and attributes, to base python types.

    Args:
        array (:class:`.H5Proxy`): Proxy for the dataset to serialize

    Returns:
        dict: with the keys

        * ``file`` - path of the HDF5 file, as a string
        * ``path`` - location of the dataset within the file
        * ``attrs`` - the dataset's attributes, numpy values converted with ``tolist``
        * ``array`` - the full contents of the dataset as (nested) lists
    """
    try:
        dset = array.open()
        # ``[()]`` reads the whole dataset and, unlike ``[:]``, also works for
        # scalar (zero-dimensional) datasets
        data = dset[()]
        return {
            # a ``Path`` is not a base python type, so hand back a plain string
            "file": str(array.file),
            "path": array.path,
            "attrs": {
                key: value.tolist() if hasattr(value, "tolist") else value
                for key, value in dict(dset.attrs).items()
            },
            "array": data.tolist() if hasattr(data, "tolist") else data,
        }
    finally:
        # always release the file handle, also when reading fails
        array.close()

View file

@ -1,7 +1,8 @@
from abc import ABC, abstractmethod from abc import ABC, abstractmethod
from operator import attrgetter from operator import attrgetter
from typing import Any, Generic, Tuple, Type, TypeVar from typing import Any, Generic, Tuple, Type, TypeVar, Union
import numpy as np
from nptyping.shape_expression import check_shape from nptyping.shape_expression import check_shape
from numpydantic.exceptions import DtypeError, ShapeError from numpydantic.exceptions import DtypeError, ShapeError
@ -92,6 +93,15 @@ class Interface(ABC, Generic[T]):
Check whether this array interface can be used (eg. its dependent packages are installed, etc.) Check whether this array interface can be used (eg. its dependent packages are installed, etc.)
""" """
@classmethod
def to_json(cls, array: T) -> Union[list, dict]:
    """
    Convert an array of :attr:`.return_type` to a JSON-compatible format using base python types

    This default implementation coerces anything that is not already a
    :class:`numpy.ndarray` to one and returns the result of its ``tolist`` method.

    Args:
        array: An array instance (not a type) to convert

    Returns:
        The array contents as (nested) python lists
    """
    # ndarrays (and subclasses) are used as they are, anything else is coerced first
    as_ndarray = array if isinstance(array, np.ndarray) else np.array(array)
    return as_ndarray.tolist()
@classmethod @classmethod
def interfaces(cls) -> Tuple[Type["Interface"], ...]: def interfaces(cls) -> Tuple[Type["Interface"], ...]:
""" """
@ -106,7 +116,7 @@ class Interface(ABC, Generic[T]):
) )
@classmethod @classmethod
def array_types(cls) -> Tuple[NDArrayType, ...]: def return_types(cls) -> Tuple[NDArrayType, ...]:
"""Return types for all enabled interfaces""" """Return types for all enabled interfaces"""
return tuple([i.return_type for i in cls.interfaces()]) return tuple([i.return_type for i in cls.interfaces()])
@ -125,7 +135,7 @@ class Interface(ABC, Generic[T]):
@classmethod @classmethod
def match(cls, array: Any) -> Type["Interface"]: def match(cls, array: Any) -> Type["Interface"]:
""" """
Find the interface that should be used for this array Find the interface that should be used for this array based on its input type
""" """
matches = [i for i in cls.interfaces() if i.check(array)] matches = [i for i in cls.interfaces() if i.check(array)]
if len(matches) > 1: if len(matches) > 1:
@ -136,3 +146,21 @@ class Interface(ABC, Generic[T]):
raise ValueError(f"No matching interfaces found for input {array}") raise ValueError(f"No matching interfaces found for input {array}")
else: else:
return matches[0] return matches[0]
@classmethod
def match_output(cls, array: Any) -> Type["Interface"]:
    """
    Select the interface whose :attr:`.return_type` the given array is an instance of.

    Needed when an interface's output type differs from its input type (eg. the
    HDF5 interface): an already-instantiated array is matched by its output type,
    for purposes of serialization to json, etc.

    Raises:
        ValueError: if no interface, or more than one interface, matches
    """
    candidates = [
        iface for iface in cls.interfaces() if isinstance(array, iface.return_type)
    ]
    if not candidates:
        raise ValueError(f"No matching interfaces found for output {array}")
    if len(candidates) == 1:
        return candidates[0]

    # ambiguous: list every matching interface in the error
    listing = "\n".join([f" - {iface}" for iface in candidates])
    raise ValueError(f"More than one interface matches output {array}:\n" + listing)

View file

@ -2,6 +2,7 @@ from datetime import datetime
from typing import Any from typing import Any
import numpy as np import numpy as np
from nptyping import Float, Int, String, Bool
np_to_python = { np_to_python = {
Any: Any, Any: Any,
@ -74,3 +75,5 @@ flat_to_nptyping = {
"AnyType": "Any", "AnyType": "Any",
"object": "Object", "object": "Object",
} }
# Map builtin python scalar types to the nptyping dtype of the same kind
python_to_nptyping = {float: Float, str: String, int: Int, bool: Bool}

View file

@ -4,19 +4,14 @@ Extension of nptyping NDArray for pydantic that allows for JSON-Schema serializa
* Order to store data in (row first) * Order to store data in (row first)
""" """
import base64
import sys
from collections.abc import Callable from collections.abc import Callable
from copy import copy from typing import Any, Tuple, Union
from typing import Any, Tuple, TypeVar, cast, Union
import blosc2
import nptyping.structure import nptyping.structure
import numpy as np import numpy as np
from nptyping import Shape from nptyping import Shape
from nptyping.ndarray import NDArrayMeta as _NDArrayMeta from nptyping.ndarray import NDArrayMeta as _NDArrayMeta
from nptyping.nptyping_type import NPTypingType from nptyping.nptyping_type import NPTypingType
from nptyping.shape_expression import check_shape
from pydantic_core import core_schema from pydantic_core import core_schema
from pydantic_core.core_schema import ListSchema from pydantic_core.core_schema import ListSchema
@ -65,57 +60,7 @@ def list_of_lists_schema(shape: Shape, array_type_handler: dict) -> ListSchema:
return list_schema return list_schema
def jsonize_array(array: NDArrayType) -> list | dict: def _get_validate_interface(shape: ShapeType, dtype: DtypeType) -> Callable:
"""
Render an array to base python types that can be serialized to JSON
For small arrays, returns a list of lists.
If the array is over :class:`.COMPRESSION_THRESHOLD` bytes, use :func:`.compress_array`
to return a compressed b64 encoded string.
Args:
array (:class:`np.ndarray`, :class:`dask.DaskArray`): Array to render as a list!
"""
# if isinstance(array, DaskArray):
# arr = array.__array__()
# elif isinstance(array, NDArrayProxy):
# arr = array[:]
# else:
# arr = array
arr = array
# If we're larger than 16kB then compress array!
if sys.getsizeof(arr) > COMPRESSION_THRESHOLD:
packed = blosc2.pack_array2(arr)
packed = base64.b64encode(packed)
ret = {
"array": packed,
"shape": copy(arr.shape),
"dtype": copy(arr.dtype.name),
"unpack_fns": ["base64.b64decode", "blosc2.unpack_array2"],
}
return ret
else:
return arr.tolist()
def get_validate_shape(shape: Shape) -> Callable:
"""
Get a closure around a shape validation function that includes the shape definition
"""
def validate_shape(value: Any) -> np.ndarray:
assert shape is Any or check_shape(
value.shape, shape
), f"Invalid shape! expected shape {shape.prepared_args}, got shape {value.shape}"
return value
return validate_shape
def get_validate_interface(shape: ShapeType, dtype: DtypeType) -> Callable:
""" """
Validate using a matching :class:`.Interface` class using its :meth:`.Interface.validate` method Validate using a matching :class:`.Interface` class using its :meth:`.Interface.validate` method
""" """
@ -129,6 +74,11 @@ def get_validate_interface(shape: ShapeType, dtype: DtypeType) -> Callable:
return validate_interface return validate_interface
def _jsonize_array(value: Any) -> Union[list, dict]:
    """
    Serialize an array with the ``to_json`` method of the interface that matches
    its output type (see :meth:`.Interface.match_output`)
    """
    return Interface.match_output(value).to_json(value)
def coerce_list(value: Any) -> np.ndarray: def coerce_list(value: Any) -> np.ndarray:
""" """
If a value is passed as a list or list of lists, try and coerce it into an array If a value is passed as a list or list of lists, try and coerce it into an array
@ -147,9 +97,6 @@ class NDArrayMeta(_NDArrayMeta, implementation="NDArray"):
""" """
T = TypeVar("T")
class NDArray(NPTypingType, metaclass=NDArrayMeta): class NDArray(NPTypingType, metaclass=NDArrayMeta):
""" """
Constrained array type allowing npytyping syntax for dtype and shape validation and serialization. Constrained array type allowing npytyping syntax for dtype and shape validation and serialization.
@ -196,15 +143,11 @@ class NDArray(NPTypingType, metaclass=NDArrayMeta):
[ [
core_schema.no_info_plain_validator_function(coerce_list), core_schema.no_info_plain_validator_function(coerce_list),
core_schema.with_info_plain_validator_function( core_schema.with_info_plain_validator_function(
get_validate_interface(shape, dtype) _get_validate_interface(shape, dtype)
), ),
] ]
), ),
serialization=core_schema.plain_serializer_function_ser_schema( serialization=core_schema.plain_serializer_function_ser_schema(
jsonize_array, when_used="json" _jsonize_array, when_used="json"
), ),
) )
NDArray = cast(Union[np.ndarray, list[int]], NDArray)
# NDArray = cast(Union[Interface.array_types()], NDArray)

View file

@ -1,48 +0,0 @@
from collections.abc import Callable
from pathlib import Path
from typing import Any
import h5py
import numpy as np
from nptyping import NDArray as _NDArray
from pydantic_core import core_schema
class NDArrayProxy:
"""
Thin proxy to numpy arrays stored within hdf5 files,
only read into memory when accessed, but otherwise
passthrough all attempts to access attributes.
"""
def __init__(self, h5f_file: Path | str, path: str):
"""
Args:
h5f_file (:class:`pathlib.Path`): Path to source HDF5 file
path (str): Location within HDF5 file where this array is located
"""
self.h5f_file = Path(h5f_file)
self.path = path
def __getattr__(self, item) -> Any:
with h5py.File(self.h5f_file, "r") as h5f:
obj = h5f.get(self.path)
return getattr(obj, item)
def __getitem__(self, slice: slice) -> np.ndarray:
with h5py.File(self.h5f_file, "r") as h5f:
obj = h5f.get(self.path)
return obj[slice]
def __setitem__(self, slice, value) -> None:
raise NotImplementedError("Cant write into an arrayproxy yet!")
@classmethod
def __get_pydantic_core_schema__(
cls,
_source_type: _NDArray,
_handler: Callable[[Any], core_schema.CoreSchema],
) -> core_schema.CoreSchema:
from numpydantic import NDArray
return NDArray.__get_pydantic_core_schema__(cls, _source_type, _handler)

View file

@ -1,40 +1,11 @@
import pytest import pytest
from pathlib import Path
from typing import Optional, Union, Type
import h5py from tests.fixtures import *
import numpy as np
from pydantic import BaseModel, Field
from numpydantic.interface.hdf5 import H5Array
from numpydantic import NDArray, Shape
from nptyping import Number
@pytest.fixture(scope="session") def pytest_addoption(parser):
def model_rgb() -> Type[BaseModel]: parser.addoption(
class RGB(BaseModel): "--with-output",
array: Optional[ action="store_true",
Union[ help="Keep test outputs in the __tmp__ directory",
NDArray[Shape["* x, * y"], Number], )
NDArray[Shape["* x, * y, 3 r_g_b"], Number],
NDArray[Shape["* x, * y, 3 r_g_b, 4 r_g_b_a"], Number],
]
] = Field(None)
return RGB
@pytest.fixture(scope="function")
def h5file(tmp_path) -> h5py.File:
h5f = h5py.File(tmp_path / "file.h5", "w")
yield h5f
h5f.close()
@pytest.fixture(scope="function")
def h5_array(h5file) -> H5Array:
"""trivial hdf5 array used for testing array existence"""
path = "/data"
h5file.create_dataset(path, data=np.zeros((3, 4)))
return H5Array(file=Path(h5file.filename), path=path)

View file

@ -1,27 +1,38 @@
import shutil import shutil
from pathlib import Path from pathlib import Path
from typing import Callable, Optional, Tuple, Type, Union
import h5py
import numpy as np
import pytest import pytest
from linkml_runtime.linkml_model import ClassDefinition, SlotDefinition from nptyping import Number
from pydantic import BaseModel, Field
from numpydantic.interface.hdf5 import H5ArrayPath
from numpydantic import NDArray, Shape
from numpydantic.maps import python_to_nptyping
@pytest.fixture(scope="session")
def tmp_output_dir(request: pytest.FixtureRequest) -> Path:
    """
    Session-wide ``__tmp__`` directory next to this file.

    Created fresh at the start of the session and removed again at the end,
    unless the ``--with-output`` option was given.
    """
    path = Path(__file__).parent.resolve() / "__tmp__"
    if path.exists():
        shutil.rmtree(str(path))
    path.mkdir()

    yield path

    # ``Config.getvalue`` is deprecated in favor of ``getoption``
    if not request.config.getoption("--with-output"):
        shutil.rmtree(str(path))
@pytest.fixture(scope="function") @pytest.fixture(scope="function")
def tmp_output_dir_func(tmp_output_dir) -> Path: def tmp_output_dir_func(tmp_output_dir, request: pytest.FixtureRequest) -> Path:
""" """
tmp output dir that gets cleared between every function tmp output dir that gets cleared between every function
cleans at the start rather than at cleanup in case the output is to be inspected cleans at the start rather than at cleanup in case the output is to be inspected
""" """
subpath = tmp_output_dir / "__tmpfunc__" subpath = tmp_output_dir / f"__tmpfunc_{request.node.name}__"
if subpath.exists(): if subpath.exists():
shutil.rmtree(str(subpath)) shutil.rmtree(str(subpath))
subpath.mkdir() subpath.mkdir()
@ -29,46 +40,68 @@ def tmp_output_dir_func(tmp_output_dir) -> Path:
@pytest.fixture(scope="module")
def tmp_output_dir_mod(tmp_output_dir, request: pytest.FixtureRequest) -> Path:
    """
    tmp output dir that gets cleared between every module

    cleans at the start rather than at cleanup in case the output is to be inspected
    """
    # Use the module's dotted name: interpolating the module object itself gives its
    # repr (``<module '...' from '/path/...'>``), whose path separators turn the
    # directory name into a nested path that ``mkdir`` cannot create
    subpath = tmp_output_dir / f"__tmpmod_{request.module.__name__}__"
    if subpath.exists():
        shutil.rmtree(str(subpath))
    subpath.mkdir()
    return subpath
@pytest.fixture() @pytest.fixture(scope="function")
def nwb_linkml_array() -> tuple[ClassDefinition, str]: def array_model() -> (
classdef = ClassDefinition( Callable[[Tuple[int, ...], Union[Type, np.dtype]], Type[BaseModel]]
name="NWB_Linkml Array", ):
description="Main class's array", def _model(
is_a="Arraylike", shape: Tuple[int, ...] = (10, 10), dtype: Union[Type, np.dtype] = float
attributes=[ ) -> Type[BaseModel]:
SlotDefinition(name="x", range="numeric", required=True), shape_str = ", ".join([str(s) for s in shape])
SlotDefinition(name="y", range="numeric", required=True),
SlotDefinition( class MyModel(BaseModel):
name="z", array: NDArray[Shape[shape_str], python_to_nptyping[dtype]]
range="numeric",
required=False, return MyModel
maximum_cardinality=3,
minimum_cardinality=3, return _model
),
SlotDefinition(
name="a", @pytest.fixture(scope="session")
range="numeric", def model_rgb() -> Type[BaseModel]:
required=False, class RGB(BaseModel):
minimum_cardinality=4, array: Optional[
maximum_cardinality=4, Union[
), NDArray[Shape["* x, * y"], Number],
], NDArray[Shape["* x, * y, 3 r_g_b"], Number],
) NDArray[Shape["* x, * y, 3 r_g_b, 4 r_g_b_a"], Number],
generated = """Union[ ]
NDArray[Shape["* x, * y"], Number], ] = Field(None)
NDArray[Shape["* x, * y, 3 z"], Number],
NDArray[Shape["* x, * y, 3 z, 4 a"], Number] return RGB
]"""
return classdef, generated
@pytest.fixture(scope="function")
def hdf5_file(tmp_output_dir_func) -> h5py.File:
    """
    Yield a new HDF5 file, opened for writing as ``h5f.h5`` in the per-test
    output directory, and close it once the test is done
    """
    handle = h5py.File(tmp_output_dir_func / "h5f.h5", "w")
    yield handle
    handle.close()
@pytest.fixture(scope="function")
def hdf5_array(
    hdf5_file, request
) -> Callable[[Tuple[int, ...], Union[np.dtype, type]], H5ArrayPath]:
    """
    Factory fixture: returns a function that writes a random array of the given
    shape and dtype into the test HDF5 file and returns its :class:`.H5ArrayPath`
    """

    def _hdf5_array(
        shape: Tuple[int, ...] = (10, 10), dtype: Union[np.dtype, type] = float
    ) -> H5ArrayPath:
        # python types have a ``__name__``, ``np.dtype`` instances do not - fall
        # back to the dtype's ``name`` so both forms allowed by the signature work
        dtype_name = getattr(dtype, "__name__", None) or np.dtype(dtype).name
        # dataset is named after its shape and dtype, eg. ``/10_10__int``
        array_path = "/" + "_".join([str(s) for s in shape]) + "__" + dtype_name
        data = np.random.random(shape).astype(dtype)
        _ = hdf5_file.create_dataset(array_path, data=data)
        return H5ArrayPath(Path(hdf5_file.filename), array_path)

    return _hdf5_array

View file

@ -4,7 +4,7 @@ import numpy as np
import dask.array as da import dask.array as da
from numpydantic import interface from numpydantic import interface
from tests.conftest import h5_array, h5file from tests.fixtures import hdf5_array
@pytest.fixture( @pytest.fixture(
@ -12,10 +12,10 @@ from tests.conftest import h5_array, h5file
params=[ params=[
([[1, 2], [3, 4]], interface.NumpyInterface), ([[1, 2], [3, 4]], interface.NumpyInterface),
(np.zeros((3, 4)), interface.NumpyInterface), (np.zeros((3, 4)), interface.NumpyInterface),
(h5_array, interface.H5Interface), (hdf5_array, interface.H5Interface),
(da.random.random((10, 10)), interface.DaskInterface), (da.random.random((10, 10)), interface.DaskInterface),
], ],
ids=["numpy_list", "numpy", "H5Array", "dask"], ids=["numpy_list", "numpy", "H5ArrayPath", "dask"],
) )
def interface_type(request): def interface_type(request):
return request.param return request.param

View file

@ -0,0 +1,27 @@
import pdb
import json
from pydantic import BaseModel
def test_to_json(hdf5_array, array_model):
    """
    Test serialization of HDF5 arrays to JSON

    Args:
        hdf5_array: fixture, called to create a dataset and get its location
        array_model: fixture, called to create a model with a matching ``array`` field
    """
    h5_path = hdf5_array((10, 10), int)
    model_cls = array_model((10, 10), int)
    instance = model_cls(array=h5_path)  # type: BaseModel

    dumped = json.loads(instance.model_dump_json())["array"]

    assert dumped["file"] == str(h5_path.file)
    assert dumped["path"] == str(h5_path.path)
    assert dumped["attrs"] == {}
    assert dumped["array"] == instance.array[:].tolist()