diff --git a/docs/changelog.md b/docs/changelog.md index b93adb2..6959982 100644 --- a/docs/changelog.md +++ b/docs/changelog.md @@ -2,6 +2,40 @@ ## 1.* +### 1.5.0 - 24-09-02 - `str` support for HDF5 + +Strings in hdf5 are tricky! HDF5 doesn't have native support for unicode, +but it can be persuaded to store data in ASCII or virtualized utf-8 under somewhat obscure conditions. + +This PR uses h5py's string methods to expose string datasets (compound or not) +via the h5proxy with the `asstr()` view method. +This also allows us to set strings with normal python strings, +although hdf5 datasets can only be created with `bytes` or other non-unicode encodings. + +Since numpydantic isn't necessarily a tool for *creating* hdf5 files +(nobody should be doing that), but rather an interface to them, +tests are included for reading and validating (unskip the existing string tests) +as well as setting/getting. + +```python +import h5py +import numpy as np +from pydantic import BaseModel +from numpydantic import NDArray +from typing import Any + +class MyModel(BaseModel): + array: NDArray[Any, str] + +h5f = h5py.File('my_data.h5', 'w') +data = np.random.random((10,10)).astype(bytes) +_ = h5f.create_dataset('/dataset', data=data) + +instance = MyModel(array=('my_data.h5', '/dataset')) +instance.array[0,0] = 'hey' +assert instance.array[0,0] == 'hey' +``` + ### 1.4.1 - 24-09-02 - `len()` support and dunder method testing It's pretty natural to want to do `len(array)` as a shorthand for `array.shape[0]`, diff --git a/pyproject.toml b/pyproject.toml index 57ae6be..59f725c 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "numpydantic" -version = "1.4.1" +version = "1.5.0" description = "Type and shape validation and serialization for numpy arrays in pydantic models" authors = [ {name = "sneakers-the-rat", email = "sneakers-the-rat@protonmail.com"}, diff --git a/src/numpydantic/interface/hdf5.py b/src/numpydantic/interface/hdf5.py index 656273d..6bb7dda 100644 
--- a/src/numpydantic/interface/hdf5.py +++ b/src/numpydantic/interface/hdf5.py @@ -1,5 +1,27 @@ """ Interfaces for HDF5 Datasets + +.. note:: + + HDF5 arrays are accessed through a proxy class :class:`.H5Proxy` . + Getting/setting values should work as normal, **except** that setting + values on nested views is impossible - + + Specifically this doesn't work: + + .. code-block:: python + + my_model.array[0][0] = 1 + + But this does work: + + .. code-block:: python + + my_model.array[0,0] = 1 + + To have direct access to the hdf5 dataset, use the + :meth:`.H5Proxy.open` method. + """ import sys @@ -10,7 +32,7 @@ import numpy as np from pydantic import SerializationInfo from numpydantic.interface.interface import Interface -from numpydantic.types import NDArrayType +from numpydantic.types import DtypeType, NDArrayType try: import h5py @@ -102,7 +124,25 @@ class H5Proxy: with h5py.File(self.file, "r") as h5f: obj = h5f.get(self.path) if self.field is not None: - obj = obj.fields(self.field) + if encoding := h5py.h5t.check_string_dtype(obj.dtype[self.field]): + if isinstance(item, tuple): + item = (*item, self.field) + else: + item = (item, self.field) + + try: + # single string + return obj[item].decode(encoding.encoding) + except AttributeError: + # numpy array of bytes + return np.char.decode(obj[item], encoding=encoding.encoding) + + else: + obj = obj.fields(self.field) + else: + if h5py.h5t.check_string_dtype(obj.dtype): + obj = obj.asstr() + return obj[item] def __setitem__( @@ -222,6 +262,17 @@ class H5Interface(Interface): return array + def get_dtype(self, array: NDArrayType) -> DtypeType: + """ + Get the dtype from the input array + + Subclasses to correctly handle + """ + if h5py.h5t.check_string_dtype(array.dtype): + return str + else: + return array.dtype + @classmethod def to_json(cls, array: H5Proxy, info: Optional[SerializationInfo] = None) -> dict: """ diff --git a/src/numpydantic/interface/interface.py b/src/numpydantic/interface/interface.py index 
3dc3fdc..1ef307f 100644 --- a/src/numpydantic/interface/interface.py +++ b/src/numpydantic/interface/interface.py @@ -126,7 +126,10 @@ class Interface(ABC, Generic[T]): if isinstance(self.dtype, tuple): valid = dtype in self.dtype elif self.dtype is np.str_: - valid = getattr(dtype, "type", None) is np.str_ or dtype is np.str_ + valid = getattr(dtype, "type", None) in (np.str_, str) or dtype in ( + np.str_, + str, + ) else: # try to match as any subclass, if self.dtype is a class try: diff --git a/tests/fixtures.py b/tests/fixtures.py index 6fbc5ad..9d5bba6 100644 --- a/tests/fixtures.py +++ b/tests/fixtures.py @@ -122,13 +122,22 @@ def hdf5_array( compound: bool = False, ) -> H5ArrayPath: array_path = "/" + "_".join([str(s) for s in shape]) + "__" + dtype.__name__ + if not compound: - data = np.random.random(shape).astype(dtype) + if dtype is str: + data = np.random.random(shape).astype(bytes) + else: + data = np.random.random(shape).astype(dtype) _ = hdf5_file.create_dataset(array_path, data=data) return H5ArrayPath(Path(hdf5_file.filename), array_path) else: - dt = np.dtype([("data", dtype), ("extra", "i8")]) - data = np.zeros(shape, dtype=dt) + + if dtype is str: + dt = np.dtype([("data", np.dtype("S10")), ("extra", "i8")]) + data = np.array([("hey", 0)] * np.prod(shape), dtype=dt).reshape(shape) + else: + dt = np.dtype([("data", dtype), ("extra", "i8")]) + data = np.zeros(shape, dtype=dt) _ = hdf5_file.create_dataset(array_path, data=data) return H5ArrayPath(Path(hdf5_file.filename), array_path, "data") diff --git a/tests/test_interface/test_hdf5.py b/tests/test_interface/test_hdf5.py index 23d26a3..891dd9f 100644 --- a/tests/test_interface/test_hdf5.py +++ b/tests/test_interface/test_hdf5.py @@ -78,8 +78,6 @@ def test_hdf5_shape(shape_cases, hdf5_array, compound): @pytest.mark.parametrize("compound", [True, False]) def test_hdf5_dtype(dtype_cases, hdf5_array, compound): - if dtype_cases.dtype is str: - pytest.skip("hdf5 cant do string arrays") 
_test_hdf5_case(dtype_cases, hdf5_array, compound) @@ -157,3 +155,22 @@ def test_compound_dtype(tmp_path): assert all(instance.array[1, :] == 0) instance.array[1] = 2 assert all(instance.array[1] == 2) + + +@pytest.mark.parametrize("compound", [True, False]) +def test_strings(hdf5_array, compound): + """ + HDF5 proxy can get and set strings just like any other dtype + """ + array = hdf5_array((10, 10), str, compound=compound) + + class MyModel(BaseModel): + array: NDArray[Shape["10, 10"], str] + + instance = MyModel(array=array) + instance.array[0, 0] = "hey" + assert instance.array[0, 0] == "hey" + assert isinstance(instance.array[0, 1], str) + + instance.array[1] = "sup" + assert all(instance.array[1] == "sup")