From 46060c11543af1221ff521ac33a9dfd559840e1d Mon Sep 17 00:00:00 2001 From: sneakers-the-rat Date: Mon, 8 Apr 2024 18:36:47 -0700 Subject: [PATCH] dask and hdf5 array interfaces --- docs/conf.py | 2 + pdm.lock | 26 +---- pyproject.toml | 7 +- src/numpydantic/__init__.py | 5 +- src/numpydantic/interface/__init__.py | 4 +- src/numpydantic/interface/dask.py | 30 ++++++ src/numpydantic/interface/hdf5.py | 143 +++++++++++++++++++++++++ src/numpydantic/interface/interface.py | 15 ++- src/numpydantic/interface/numpy.py | 1 + src/numpydantic/meta.py | 23 ++-- src/numpydantic/ndarray.py | 3 - src/numpydantic/types.py | 3 +- tests/conftest.py | 40 +++++++ tests/test_interface/__init__.py | 0 tests/test_interface/conftest.py | 21 ++++ tests/test_interface/test_dask.py | 44 ++++++++ tests/test_interface/test_hdf5.py | 0 tests/test_interface/test_numpy.py | 0 18 files changed, 330 insertions(+), 37 deletions(-) create mode 100644 tests/test_interface/__init__.py create mode 100644 tests/test_interface/conftest.py create mode 100644 tests/test_interface/test_dask.py create mode 100644 tests/test_interface/test_hdf5.py create mode 100644 tests/test_interface/test_numpy.py diff --git a/docs/conf.py b/docs/conf.py index 2eb7a6e..ab96dd4 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -34,6 +34,8 @@ intersphinx_mapping = { "linkml": ("https://linkml.io/linkml/", None), "linkml_runtime": ("https://linkml.io/linkml/", None), "linkml-runtime": ("https://linkml.io/linkml/", None), + "dask": ("https://docs.dask.org/en/stable/", None), + "h5py": ("https://docs.h5py.org/en/stable/", None), } # -- Options for HTML output ------------------------------------------------- diff --git a/pdm.lock b/pdm.lock index 69484ac..b601942 100644 --- a/pdm.lock +++ b/pdm.lock @@ -5,7 +5,7 @@ groups = ["default", "arrays", "dask", "dev", "docs", "hdf5", "tests"] strategy = ["cross_platform", "inherit_metadata"] lock_version = "4.4.1" -content_hash = "sha256:761d4dccd4e594b9e441dddefdb5677d22a4a94c129183e0bb8c88d9acbea1b9" +content_hash = "sha256:37b2b742a3addd598fce4747623d941ce0b7b2f18b0c33e2a9a2015196239902" [[package]] name = "alabaster" @@ -371,7 +371,7 @@ files = [ [[package]] name = "dask" -version = "2024.4.0" +version = "2024.4.1" requires_python = ">=3.9" summary = "Parallel PyData with Task Scheduling" groups = ["arrays", "dask", "dev", "tests"] @@ -386,24 +386,8 @@ dependencies = [ "toolz>=0.10.0", ] files = [ - {file = "dask-2024.4.0-py3-none-any.whl", hash = "sha256:f8332781ffde3d3e49df31fe4066e1eab571a87b94a11661a8ecf06e2892ee6d"}, - {file = "dask-2024.4.0.tar.gz", hash = "sha256:d5be22660b332865e7e868df2f1322a75f6cacaf8dd9ec08057e6fa8a96a19ac"}, -] - -[[package]] -name = "dask" -version = "2024.4.0" -extras = ["array"] -requires_python = ">=3.9" -summary = "Parallel PyData with Task Scheduling" -groups = ["arrays", "dask", "dev", "tests"] -dependencies = [ - "dask==2024.4.0", - "numpy>=1.21", -] -files = [ - {file = "dask-2024.4.0-py3-none-any.whl", hash = "sha256:f8332781ffde3d3e49df31fe4066e1eab571a87b94a11661a8ecf06e2892ee6d"}, - {file = "dask-2024.4.0.tar.gz", hash = "sha256:d5be22660b332865e7e868df2f1322a75f6cacaf8dd9ec08057e6fa8a96a19ac"}, + {file = "dask-2024.4.1-py3-none-any.whl", hash = "sha256:cac5d28b9de7a7cfde46d6fbd8fa81f5654980d010b44d1dbe04dd13b5b63126"}, + {file = "dask-2024.4.1.tar.gz", hash = "sha256:6cd8eb03ddc8dc08d6ca5b167b8de559872bc51cc2b6587d0e9dc754ab19cdf0"}, ] [[package]] @@ -811,7 +795,7 @@ name = "numpy" version = "1.26.4" requires_python = ">=3.9" summary = "Fundamental package for array computing in Python" -groups = ["arrays", "dask", "default", "dev", "hdf5", "tests"] +groups = ["arrays", "default", "dev", "hdf5", "tests"] files = [ {file = "numpy-1.26.4-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:9ff0f4f29c51e2803569d7a51c2304de5554655a60c5d776e35b4a41413830d0"}, {file = "numpy-1.26.4-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:2e4ee3380d6de9c9ec04745830fd9e2eccb3e6cf790d39d7b98ffd19b0dd754a"}, diff --git a/pyproject.toml b/pyproject.toml index 1c12a86..ce724dd 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -9,6 +9,7 @@ dependencies = [ "pydantic>=2.3.0", "nptyping>=2.5.0", "blosc2<3.0.0,>=2.5.1", + "numpy>=1.24.0", ] requires-python = "<4.0,>=3.9" readme = "README.md" @@ -17,7 +18,7 @@ license = {text = "MIT"} [project.optional-dependencies] dask = [ - "dask[array]>=2024.1.1" + "dask>=2024.4.0", ] hdf5 = [ "h5py>=3.10.0" @@ -99,9 +100,11 @@ select = [ ] ignore = [ - "ANN101", "ANN102", "ANN401", + "ANN101", "ANN102", "ANN401", "ANN204", # builtin type annotations "UP006", "UP035", + # | for Union types (only supported >=3.10 + "UP007", "UP038", # docstrings for __init__ "D107", ] diff --git a/src/numpydantic/__init__.py b/src/numpydantic/__init__.py index f8fd847..fc4d17d 100644 --- a/src/numpydantic/__init__.py +++ b/src/numpydantic/__init__.py @@ -6,7 +6,10 @@ from numpydantic.monkeypatch import apply_patches apply_patches() from numpydantic.ndarray import NDArray - from numpydantic.meta import update_ndarray_stub +from nptyping import Shape + update_ndarray_stub() + +__all__ = ["NDArray", "Shape"] diff --git a/src/numpydantic/interface/__init__.py b/src/numpydantic/interface/__init__.py index fce5b1f..2ad3682 100644 --- a/src/numpydantic/interface/__init__.py +++ b/src/numpydantic/interface/__init__.py @@ -1,4 +1,6 @@ +from numpydantic.interface.dask import DaskInterface +from numpydantic.interface.hdf5 import H5Interface from numpydantic.interface.interface import Interface from numpydantic.interface.numpy import NumpyInterface -__all__ = ["Interface", "NumpyInterface"] +__all__ = ["Interface", "DaskInterface", "H5Interface", "NumpyInterface"] diff --git a/src/numpydantic/interface/dask.py b/src/numpydantic/interface/dask.py index e69de29..dd4b5e9 100644 --- a/src/numpydantic/interface/dask.py +++ b/src/numpydantic/interface/dask.py @@ -0,0 +1,30 @@ +from typing import Any +from numpydantic.interface.interface import Interface + +try: + from dask.array.core import Array as DaskArray +except ImportError: + DaskArray = None + + +class DaskInterface(Interface): + """ + Interface for Dask :class:`~dask.array.core.Array` + """ + + input_types = (DaskArray,) + return_type = DaskArray + + @classmethod + def check(cls, array: Any) -> bool: + """ + check if array is a dask array + """ + if DaskArray is not None and isinstance(array, DaskArray): + return True + return False + + @classmethod + def enabled(cls) -> bool: + """check if we successfully imported dask""" + return DaskArray is not None diff --git a/src/numpydantic/interface/hdf5.py b/src/numpydantic/interface/hdf5.py index e69de29..61f0557 100644 --- a/src/numpydantic/interface/hdf5.py +++ b/src/numpydantic/interface/hdf5.py @@ -0,0 +1,143 @@ +from pathlib import Path +from typing import Any, NamedTuple, Tuple, Union, TypeAlias + +import numpy as np + +from numpydantic.interface.interface import Interface +from numpydantic.types import NDArrayType + +try: + import h5py +except ImportError: + h5py = None + +H5Arraylike: TypeAlias = Tuple[Union[Path, str], str] + + +class H5Array(NamedTuple): + """Location specifier for arrays within an HDF5 file""" + + file: Union[Path, str] + """Location of HDF5 file""" + path: str + """Path within the HDF5 file""" + + +class H5Proxy: + """ + Proxy class to mimic numpy-like array behavior with an HDF5 array + + The attribute and item access methods only open the file for the duration of the method, + making it less perilous to share this object between threads and processes. + + This class attempts to be a passthrough class to a :class:`h5py.Dataset` object, + including its attributes and item getters/setters. + + When using read-only methods, no locking is attempted (beyond the HDF5 defaults), + but when using the write methods (setting an array value), try and use the ``locking`` + methods of :class:`h5py.File` . + + Args: + file (pathlib.Path | str): Location of hdf5 file on filesystem + path (str): Path to array within hdf5 file + """ + + def __init__(self, file: Union[Path, str], path: str): + self.file = Path(file) + self.path = path + + def array_exists(self) -> bool: + """Check that there is in fact an array at :attr:`.path` within :attr:`.file`""" + with h5py.File(self.file, "r") as h5f: + obj = h5f.get(self.path) + return obj is not None + + @classmethod + def from_h5array(cls, h5array: H5Array) -> "H5Proxy": + """Instantiate using :class:`.H5Array`""" + return H5Proxy(file=h5array.file, path=h5array.path) + + def __getattr__(self, item: str): + with h5py.File(self.file, "r") as h5f: + obj = h5f.get(self.path) + return getattr(obj, item) + + def __getitem__(self, item: Union[int, slice]) -> np.ndarray: + with h5py.File(self.file, "r") as h5f: + obj = h5f.get(self.path) + return obj[item] + + def __setitem__(self, key: Union[int, slice], value: Union[int, float, np.ndarray]): + with h5py.File(self.file, "r+", locking=True) as h5f: + obj = h5f.get(self.path) + obj[key] = value + + +class H5Interface(Interface): + """ + Interface for Arrays stored as datasets within an HDF5 file. + + Takes a :class:`.H5Array` specifier to select a :class:`h5py.Dataset` from a + :class:`h5py.File` and returns a :class:`.H5Proxy` class that acts like a + passthrough numpy-like interface to the dataset. + """ + + input_types = ( + H5Array, + H5Arraylike, + ) + return_type = H5Proxy + + @classmethod + def enabled(cls) -> bool: + """Check whether h5py can be imported""" + return h5py is not None + + @classmethod + def check(cls, array: Union[H5Array, Tuple[Union[Path, str], str]]) -> bool: + """Check that the given array is a :class:`.H5Array` or something that resembles one.""" + if isinstance(array, H5Array): + return True + + if isinstance(array, (tuple, list)) and len(array) == 2: + # check that the first arg is an hdf5 file + try: + file = Path(array[0]) + except TypeError: + # not a path, we don't apply. + return False + + if not file.exists(): + return False + + # hdf5 files are commonly given odd suffixes, + # so we just try and open it and see what happens + try: + with h5py.File(file, "r"): + # don't check that the array exists and raise here, + # this check is just for whether the validator applies or not. + pass + return True + except (FileNotFoundError, OSError): + return False + + return False + + def before_validation(self, array: Any) -> NDArrayType: + """Create an :class:`.H5Proxy` to use throughout validation""" + if isinstance(array, H5Array): + array = H5Proxy.from_h5array(h5array=array) + elif isinstance(array, (tuple, list)) and len(array) == 2: + array = H5Proxy(file=array[0], path=array[1]) + else: + raise ValueError( + "Need to specify a file and a path within an HDF5 file to use the HDF5 Interface" + ) + + if not array.array_exists(): + raise ValueError( + f"HDF5 file located at {array.file}, " + f"but no array found at {array.path}" + ) + + return array diff --git a/src/numpydantic/interface/interface.py b/src/numpydantic/interface/interface.py index d7b496d..f4c9b84 100644 --- a/src/numpydantic/interface/interface.py +++ b/src/numpydantic/interface/interface.py @@ -1,6 +1,6 @@ from abc import ABC, abstractmethod from operator import attrgetter -from typing import Any, Generic, List, Type, TypeVar, Tuple +from typing import Any, Generic, Tuple, Type, TypeVar from nptyping.shape_expression import check_shape @@ -15,6 +15,7 @@ class Interface(ABC, Generic[T]): Abstract parent class for interfaces to different array formats """ + input_types: Tuple[Any, ...] return_type: Type[T] priority: int = 0 @@ -109,6 +110,18 @@ class Interface(ABC, Generic[T]): """Return types for all enabled interfaces""" return tuple([i.return_type for i in cls.interfaces()]) + @classmethod + def input_types(cls) -> Tuple[Any, ...]: + """Input types for all enabled interfaces""" + in_types = [] + for iface in cls.interfaces(): + if isinstance(iface.input_types, tuple | list): + in_types.extend(iface.input_types) + else: + in_types.append(iface.input_types) + + return tuple(in_types) + @classmethod def match(cls, array: Any) -> Type["Interface"]: """ diff --git a/src/numpydantic/interface/numpy.py b/src/numpydantic/interface/numpy.py index 5bea417..fae7baf 100644 --- a/src/numpydantic/interface/numpy.py +++ b/src/numpydantic/interface/numpy.py @@ -17,6 +17,7 @@ class NumpyInterface(Interface): Numpy :class:`~numpy.ndarray` s! """ + input_types = (ndarray, list) return_type = ndarray @classmethod diff --git a/src/numpydantic/meta.py b/src/numpydantic/meta.py index 27e242d..d42b9b3 100644 --- a/src/numpydantic/meta.py +++ b/src/numpydantic/meta.py @@ -3,9 +3,12 @@ Metaprogramming functions for numpydantic to modify itself :) """ from pathlib import Path +from warnings import warn from numpydantic.interface import Interface +_BUILTIN_IMPORTS = ("import typing", "import pathlib") + def generate_ndarray_stub() -> str: """ @@ -14,11 +17,16 @@ def generate_ndarray_stub() -> str: import_strings = [ f"from {arr.__module__} import {arr.__name__}" - for arr in Interface.array_types() + for arr in Interface.input_types() + if arr.__module__ != "builtins" ] + import_strings.extend(_BUILTIN_IMPORTS) import_string = "\n".join(import_strings) - class_names = [arr.__name__ for arr in Interface.array_types()] + class_names = [ + arr.__name__ if arr.__module__ != "typing" else str(arr) + for arr in Interface.input_types() + ] class_union = " | ".join(class_names) ndarray_type = "NDArray = " + class_union @@ -32,8 +40,11 @@ def update_ndarray_stub() -> None: """ from numpydantic import ndarray - stub_string = generate_ndarray_stub() + try: + stub_string = generate_ndarray_stub() - pyi_file = Path(ndarray.__file__).with_suffix(".pyi") - with open(pyi_file, "w") as pyi: - pyi.write(stub_string) + pyi_file = Path(ndarray.__file__).with_suffix(".pyi") + with open(pyi_file, "w") as pyi: + pyi.write(stub_string) + except Exception as e: + warn(f"ndarray.pyi stub file could not be generated: {e}", stacklevel=1) diff --git a/src/numpydantic/ndarray.py b/src/numpydantic/ndarray.py index f4d51ee..aab2a81 100644 --- a/src/numpydantic/ndarray.py +++ b/src/numpydantic/ndarray.py @@ -165,9 +165,6 @@ class NDArray(NPTypingType, metaclass=NDArrayMeta): - https://docs.pydantic.dev/latest/usage/types/custom/#handling-third-party-types """ - def __init__(self: T): - pass - __args__: Tuple[ShapeType, DtypeType] = (Any, Any) @classmethod diff --git a/src/numpydantic/types.py b/src/numpydantic/types.py index a54d814..f54968e 100644 --- a/src/numpydantic/types.py +++ b/src/numpydantic/types.py @@ -4,12 +4,11 @@ Types for numpydantic Note that these are types as in python typing types, not classes. """ -from typing import Any, Protocol, Tuple, TypeVar, Union, runtime_checkable +from typing import Any, Protocol, Tuple, runtime_checkable import numpy as np from nptyping import DType - ShapeType = Tuple[int, ...] | Any DtypeType = np.dtype | str | type | Any | DType diff --git a/tests/conftest.py b/tests/conftest.py index e69de29..e84033c 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -0,0 +1,40 @@ +import pytest +from pathlib import Path +from typing import Optional, Union, Type + +import h5py +import numpy as np +from pydantic import BaseModel, Field + +from numpydantic.interface.hdf5 import H5Array +from numpydantic import NDArray, Shape +from nptyping import Number + + +@pytest.fixture(scope="session") +def model_rgb() -> Type[BaseModel]: + class RGB(BaseModel): + array: Optional[ + Union[ + NDArray[Shape["* x, * y"], Number], + NDArray[Shape["* x, * y, 3 r_g_b"], Number], + NDArray[Shape["* x, * y, 3 r_g_b, 4 r_g_b_a"], Number], + ] + ] = Field(None) + + return RGB + + +@pytest.fixture(scope="function") +def h5file(tmp_path) -> h5py.File: + h5f = h5py.File(tmp_path / "file.h5", "w") + yield h5f + h5f.close() + + +@pytest.fixture(scope="function") +def h5_array(h5file) -> H5Array: + """trivial hdf5 array used for testing array existence""" + path = "/data" + h5file.create_dataset(path, data=np.zeros((3, 4))) + return H5Array(file=Path(h5file.filename), path=path) diff --git a/tests/test_interface/__init__.py b/tests/test_interface/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/tests/test_interface/conftest.py b/tests/test_interface/conftest.py new file mode 100644 index 0000000..24f286e --- /dev/null +++ b/tests/test_interface/conftest.py @@ -0,0 +1,21 @@ +import pytest + +import numpy as np +import dask.array as da + +from numpydantic import interface +from tests.conftest import h5_array, h5file + + +@pytest.fixture( + scope="function", + params=[ + ([[1, 2], [3, 4]], interface.NumpyInterface), + (np.zeros((3, 4)), interface.NumpyInterface), + (h5_array, interface.H5Interface), + (da.random.random((10, 10)), interface.DaskInterface), + ], + ids=["numpy_list", "numpy", "H5Array", "dask"], +) +def interface_type(request): + return request.param diff --git a/tests/test_interface/test_dask.py b/tests/test_interface/test_dask.py new file mode 100644 index 0000000..51122d2 --- /dev/null +++ b/tests/test_interface/test_dask.py @@ -0,0 +1,44 @@ +import pytest + +import dask.array as da +from pydantic import ValidationError + +from numpydantic.interface import DaskInterface + + +def test_dask_enabled(): + """ + We need dask to be available to run these tests :) + """ + assert DaskInterface.enabled() + + +def test_dask_check(interface_type): + if interface_type[1] is DaskInterface: + assert DaskInterface.check(interface_type[0]) + else: + assert not DaskInterface.check(interface_type[0]) + + +@pytest.mark.parametrize( + "array,passes", + [ + (da.random.random((5, 10)), True), + (da.random.random((5, 10, 3)), True), + (da.random.random((5, 10, 3, 4)), True), + (da.random.random((5, 10, 4)), False), + (da.random.random((5, 10, 3, 6)), False), + (da.random.random((5, 10, 4, 6)), False), + ], +) +def test_dask_shape(model_rgb, array, passes): + if passes: + model_rgb(array=array) + else: + with pytest.raises(ValidationError): + model_rgb(array=array) + + +@pytest.mark.skip("TODO") +def test_dask_dtype(): + pass diff --git a/tests/test_interface/test_hdf5.py b/tests/test_interface/test_hdf5.py new file mode 100644 index 0000000..e69de29 diff --git a/tests/test_interface/test_numpy.py b/tests/test_interface/test_numpy.py new file mode 100644 index 0000000..e69de29