From d884055067e3cdce3855e72ef42fe7b649129cfe Mon Sep 17 00:00:00 2001 From: sneakers-the-rat Date: Mon, 29 Apr 2024 19:49:38 -0700 Subject: [PATCH] not quite working zarr implementation --- docs/conf.py | 1 + pdm.lock | 69 ++++++++++++++- pyproject.toml | 5 +- src/numpydantic/interface/__init__.py | 9 +- src/numpydantic/interface/zarr.py | 120 +++++++++++++++++++++++++- tests/fixtures.py | 20 +++++ tests/test_interface/conftest.py | 16 +++- tests/test_interface/test_zarr.py | 64 ++++++++++++++ 8 files changed, 297 insertions(+), 7 deletions(-) create mode 100644 tests/test_interface/test_zarr.py diff --git a/docs/conf.py b/docs/conf.py index ab96dd4..b4406a5 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -36,6 +36,7 @@ intersphinx_mapping = { "linkml-runtime": ("https://linkml.io/linkml/", None), "dask": ("https://docs.dask.org/en/stable/", None), "h5py": ("https://docs.h5py.org/en/stable/", None), + "zarr": ("https://zarr.readthedocs.io/en/stable/", None), } # -- Options for HTML output ------------------------------------------------- diff --git a/pdm.lock b/pdm.lock index 21ed479..ce47f8a 100644 --- a/pdm.lock +++ b/pdm.lock @@ -5,7 +5,7 @@ groups = ["default", "arrays", "dask", "dev", "docs", "hdf5", "tests"] strategy = ["cross_platform", "inherit_metadata"] lock_version = "4.4.1" -content_hash = "sha256:870d3111512c0bccf768ad2c06acb01e0bd9e3091f8544bca2bcf609eea02102" +content_hash = "sha256:4e22ffd83cb1ae3916c6c41c77f74b84db5a77e572c796cc537023bd6c3e3128" [[package]] name = "alabaster" @@ -46,6 +46,15 @@ files = [ {file = "anyio-4.3.0.tar.gz", hash = "sha256:f75253795a87df48568485fd18cdd2a3fa5c4f7c5be8e5e36637733fce06fed6"}, ] +[[package]] +name = "asciitree" +version = "0.3.3" +summary = "Draws ASCII trees." +groups = ["default"] +files = [ + {file = "asciitree-0.3.3.tar.gz", hash = "sha256:4aa4b9b649f85e3fcb343363d97564aa1fb62e249677f2e18a96765145cc0f6e"}, +] + [[package]] name = "autodoc-pydantic" version = "2.1.0" @@ -395,6 +404,18 @@ files = [ {file = "exceptiongroup-1.2.1.tar.gz", hash = "sha256:a4785e48b045528f5bfe627b6ad554ff32def154f42372786903b7abcfe1aa16"}, ] +[[package]] +name = "fasteners" +version = "0.19" +requires_python = ">=3.6" +summary = "A python package that provides useful locks" +groups = ["default"] +marker = "sys_platform != \"emscripten\"" +files = [ + {file = "fasteners-0.19-py3-none-any.whl", hash = "sha256:758819cb5d94cdedf4e836988b74de396ceacb8e2794d21f82d131fd9ee77237"}, + {file = "fasteners-0.19.tar.gz", hash = "sha256:b4f37c3ac52d8a445af3a66bce57b33b5e90b97c696b7b984f530cf8f0ded09c"}, +] + [[package]] name = "fsspec" version = "2024.3.1" @@ -692,6 +713,35 @@ files = [ {file = "nptyping-2.5.0.tar.gz", hash = "sha256:e3d35b53af967e6fb407c3016ff9abae954d3a0568f7cc13a461084224e8e20a"}, ] +[[package]] +name = "numcodecs" +version = "0.12.1" +requires_python = ">=3.8" +summary = "A Python package providing buffer compression and transformation codecs for use in data storage and communication applications." +groups = ["default"] +dependencies = [ + "numpy>=1.7", +] +files = [ + {file = "numcodecs-0.12.1-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:d37f628fe92b3699e65831d5733feca74d2e33b50ef29118ffd41c13c677210e"}, + {file = "numcodecs-0.12.1-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:941b7446b68cf79f089bcfe92edaa3b154533dcbcd82474f994b28f2eedb1c60"}, + {file = "numcodecs-0.12.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:0e79bf9d1d37199ac00a60ff3adb64757523291d19d03116832e600cac391c51"}, + {file = "numcodecs-0.12.1-cp310-cp310-win_amd64.whl", hash = "sha256:82d7107f80f9307235cb7e74719292d101c7ea1e393fe628817f0d635b7384f5"}, + {file = "numcodecs-0.12.1-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:eeaf42768910f1c6eebf6c1bb00160728e62c9343df9e2e315dc9fe12e3f6071"}, + {file = "numcodecs-0.12.1-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:135b2d47563f7b9dc5ee6ce3d1b81b0f1397f69309e909f1a35bb0f7c553d45e"}, + {file = "numcodecs-0.12.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:a191a8e347ecd016e5c357f2bf41fbcb026f6ffe78fff50c77ab12e96701d155"}, + {file = "numcodecs-0.12.1-cp311-cp311-win_amd64.whl", hash = "sha256:21d8267bd4313f4d16f5b6287731d4c8ebdab236038f29ad1b0e93c9b2ca64ee"}, + {file = "numcodecs-0.12.1-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:2f84df6b8693206365a5b37c005bfa9d1be486122bde683a7b6446af4b75d862"}, + {file = "numcodecs-0.12.1-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:760627780a8b6afdb7f942f2a0ddaf4e31d3d7eea1d8498cf0fd3204a33c4618"}, + {file = "numcodecs-0.12.1-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:c258bd1d3dfa75a9b708540d23b2da43d63607f9df76dfa0309a7597d1de3b73"}, + {file = "numcodecs-0.12.1-cp312-cp312-win_amd64.whl", hash = "sha256:e04649ea504aff858dbe294631f098fbfd671baf58bfc04fc48d746554c05d67"}, + {file = "numcodecs-0.12.1-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:2fbb12a6a1abe95926f25c65e283762d63a9bf9e43c0de2c6a1a798347dfcb40"}, + {file = "numcodecs-0.12.1-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:f2207871868b2464dc11c513965fd99b958a9d7cde2629be7b2dc84fdaab013b"}, + {file = "numcodecs-0.12.1-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:abff3554a6892a89aacf7b642a044e4535499edf07aeae2f2e6e8fc08c9ba07f"}, + {file = "numcodecs-0.12.1-cp39-cp39-win_amd64.whl", hash = "sha256:ef964d4860d3e6b38df0633caf3e51dc850a6293fd8e93240473642681d95136"}, + {file = "numcodecs-0.12.1.tar.gz", hash = "sha256:05d91a433733e7eef268d7e80ec226a0232da244289614a8f3826901aec1098e"}, +] + [[package]] name = "numpy" version = "1.26.4" @@ -1459,6 +1509,23 @@ files = [ {file = "websockets-12.0.tar.gz", hash = "sha256:81df9cbcbb6c260de1e007e58c011bfebe2dafc8435107b0537f393dd38c8b1b"}, ] +[[package]] +name = "zarr" +version = "2.17.2" +requires_python = ">=3.9" +summary = "An implementation of chunked, compressed, N-dimensional arrays for Python" +groups = ["default"] +dependencies = [ + "asciitree", + "fasteners; sys_platform != \"emscripten\"", + "numcodecs>=0.10.0", + "numpy>=1.23", +] +files = [ + {file = "zarr-2.17.2-py3-none-any.whl", hash = "sha256:70d7cc07c24280c380ef80644151d136b7503b0d83c9f214e8000ddc0f57f69b"}, + {file = "zarr-2.17.2.tar.gz", hash = "sha256:2cbaa6cb4e342d45152d4a7a4b2013c337fcd3a8e7bc98253560180de60552ce"}, +] + [[package]] name = "zipp" version = "3.18.1" diff --git a/pyproject.toml b/pyproject.toml index 865807f..9584aa1 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -22,8 +22,11 @@ dask = [ hdf5 = [ "h5py>=3.10.0" ] +zarr = [ + "zarr>=2.17.2", +] arrays = [ - "numpydantic[dask,hdf5]" + "numpydantic[dask,hdf5,zarr]" ] tests = [ "numpydantic[arrays]", diff --git a/src/numpydantic/interface/__init__.py b/src/numpydantic/interface/__init__.py index 3139756..2a98d8b 100644 --- a/src/numpydantic/interface/__init__.py +++ b/src/numpydantic/interface/__init__.py @@ -6,5 +6,12 @@ from numpydantic.interface.dask import DaskInterface from numpydantic.interface.hdf5 import H5Interface from numpydantic.interface.interface import Interface from numpydantic.interface.numpy import NumpyInterface +from numpydantic.interface.zarr import ZarrInterface -__all__ = ["Interface", "DaskInterface", "H5Interface", "NumpyInterface"] +__all__ = [ + "Interface", + "DaskInterface", + "H5Interface", + "NumpyInterface", + "ZarrInterface", +] diff --git a/src/numpydantic/interface/zarr.py b/src/numpydantic/interface/zarr.py index 4880d4e..8e8a2ea 100644 --- a/src/numpydantic/interface/zarr.py +++ b/src/numpydantic/interface/zarr.py @@ -1,5 +1,121 @@ """ Interface to zarr arrays - -(Not Implemented) """ + +import contextlib +from dataclasses import dataclass +from pathlib import Path +from typing import Any, Optional, Union, Sequence + +from numpydantic.interface.interface import Interface + +try: + from zarr.core import Array as ZarrArray + from zarr.storage import StoreLike + import zarr +except ImportError: + ZarrArray = None + StoreLike = None + storage = None + + +@dataclass +class ZarrArrayPath: + """ + Map to an array within a zarr store. + + See :func:`zarr.open` + """ + + file: Union[Path, str] + """Location of Zarr store file or directory""" + path: Optional[str] = None + """Path to array within hierarchical zarr store""" + + def open(self, **kwargs) -> ZarrArray: + return zarr.open(str(self.file), path=self.path, **kwargs) + + @classmethod + def from_iterable(cls, spec: Sequence) -> "ZarrArrayPath": + if len(spec) == 1: + return ZarrArrayPath(file=spec[0]) + elif len(spec) == 2: + return ZarrArrayPath(file=spec[0], path=spec[1]) + else: + raise ValueError("Only len 1-2 iterables can be used for a ZarrArrayPath") + + +class ZarrInterface(Interface): + """ + Interface to in-memory or on-disk zarr arrays + """ + + input_types = (Path, ZarrArray, ZarrArrayPath) + return_type = ZarrArray + + @classmethod + def enabled(cls) -> bool: + """True if zarr is installed""" + return ZarrArray is not None + + @staticmethod + def _get_array( + array: Union[ZarrArray, str, Path, ZarrArrayPath, Sequence] + ) -> ZarrArray: + if isinstance(array, ZarrArray): + return array + + if isinstance(array, (str, Path)): + array = ZarrArrayPath(file=array) + elif isinstance(array, (tuple, list)): + array = ZarrArrayPath.from_iterable(array) + + return array.open(mode="a") + + @classmethod + def check(cls, array: Any) -> bool: + """ + Check if array is in-memory zarr array, + a path to a zarr array, or a :class:`.ZarrArrayPath` + """ + if isinstance(array, ZarrArray): + return True + + # See if can be coerced to ZarrArrayPath + if isinstance(array, (Path, str)): + array = ZarrArrayPath(file=array) + + if isinstance(array, (tuple, list)): + # something that can be coerced to ZarrArrayPath + with contextlib.suppress(ValueError): + array = ZarrArrayPath.from_iterable(array) + + if isinstance(array, ZarrArrayPath): + with contextlib.suppress(Exception): + arr = array.open(mode="r") + if isinstance(arr, ZarrArray): + return True + + return False + + def before_validation( + self, array: Union[ZarrArray, str, Path, ZarrArrayPath, Sequence] + ) -> ZarrArray: + """ + Ensure that the zarr array is opened + """ + return self._get_array(array) + + @classmethod + def to_json( + cls, array: Union[ZarrArray, str, Path, ZarrArrayPath, Sequence] + ) -> dict: + """ + Dump just the metadata for an array from :meth:`zarr.core.Array.info_items` + plus the :meth:`zarr.core.Array.hexdigest` + """ + array = cls._get_array(array) + info = array.info_items() + info_dict = {i[0]: i[1] for i in info} + info_dict["hexdigest"] = array.hexdigest() + return info_dict diff --git a/tests/fixtures.py b/tests/fixtures.py index a24ab76..138ccb4 100644 --- a/tests/fixtures.py +++ b/tests/fixtures.py @@ -7,8 +7,10 @@ import numpy as np import pytest from nptyping import Number from pydantic import BaseModel, Field +import zarr from numpydantic.interface.hdf5 import H5ArrayPath +from numpydantic.interface.zarr import ZarrArrayPath from numpydantic import NDArray, Shape from numpydantic.maps import python_to_nptyping @@ -105,3 +107,21 @@ def hdf5_array( return H5ArrayPath(Path(hdf5_file.filename), array_path) return _hdf5_array + + +@pytest.fixture(scope="function") +def zarr_nested_array(tmp_output_dir_func) -> ZarrArrayPath: + """Zarr array within a nested array""" + file = tmp_output_dir_func / "nested.zarr" + path = "a/b/c" + root = zarr.open(str(file), mode="w") + array = root.zeros(path, shape=(100, 100), chunks=(10, 10)) + return ZarrArrayPath(file=file, path=path) + + +@pytest.fixture(scope="function") +def zarr_array(tmp_output_dir_func) -> Path: + file = tmp_output_dir_func / "array.zarr" + array = zarr.open(str(file), mode="w", shape=(100, 100), chunks=(10, 10)) + array[:] = 0 + return file diff --git a/tests/test_interface/conftest.py b/tests/test_interface/conftest.py index c14eda7..85f1c7e 100644 --- a/tests/test_interface/conftest.py +++ b/tests/test_interface/conftest.py @@ -2,9 +2,10 @@ import pytest import numpy as np import dask.array as da +import zarr from numpydantic import interface -from tests.fixtures import hdf5_array +from tests.fixtures import hdf5_array, zarr_nested_array, zarr_array @pytest.fixture( @@ -14,8 +15,19 @@ from tests.fixtures import hdf5_array (np.zeros((3, 4)), interface.NumpyInterface), (hdf5_array, interface.H5Interface), (da.random.random((10, 10)), interface.DaskInterface), + (zarr.ones((10, 10)), interface.ZarrInterface), + (zarr_nested_array, interface.ZarrInterface), + (zarr_array, interface.ZarrInterface), + ], + ids=[ + "numpy_list", + "numpy", + "H5ArrayPath", + "dask", + "zarr_memory", + "zarr_nested", + "zarr_array", ], - ids=["numpy_list", "numpy", "H5ArrayPath", "dask"], ) def interface_type(request): return request.param diff --git a/tests/test_interface/test_zarr.py b/tests/test_interface/test_zarr.py new file mode 100644 index 0000000..f1d2458 --- /dev/null +++ b/tests/test_interface/test_zarr.py @@ -0,0 +1,64 @@ +import pytest +import zarr + +from pydantic import ValidationError + +from numpydantic.interface import ZarrInterface + + +@pytest.fixture() +def dir_array(tmp_output_dir_func) -> zarr.DirectoryStore: + store = zarr.DirectoryStore(tmp_output_dir_func / "array.zarr") + return store + + +@pytest.fixture() +def zip_array(tmp_output_dir_func) -> zarr.ZipStore: + store = zarr.ZipStore(tmp_output_dir_func / "array.zip", mode="w") + return store + + +@pytest.fixture() +def nested_dir_array(tmp_output_dir_func) -> zarr.NestedDirectoryStore: + store = zarr.NestedDirectoryStore(tmp_output_dir_func / "nested") + return store + + +STORES = ( + dir_array, + zip_array, +) +"""stores for single arrays""" + + +def test_zarr_enabled(): + assert ZarrInterface.enabled() + + +def test_zarr_check(interface_type): + """ + We should only use the zarr interface for zarr-like things + """ + if interface_type[1] is ZarrInterface: + assert ZarrInterface.check(interface_type[0]) + else: + assert not ZarrInterface.check(interface_type[0]) + + +@pytest.mark.parametrize( + "array,passes", + [ + (zarr.zeros((5, 10)), True), + (zarr.zeros((5, 10, 3)), True), + (zarr.zeros((5, 10, 3, 4)), True), + (zarr.zeros((5, 10, 4)), False), + (zarr.zeros((5, 10, 3, 6)), False), + (zarr.zeros((5, 10, 4, 6)), False), + ], +) +def test_zarr_shape(model_rgb, array, passes): + if passes: + model_rgb(array=array) + else: + with pytest.raises(ValidationError): + model_rgb(array=array)