From 0e6ea07d5e9eb1be5718b45874107a17271911f8 Mon Sep 17 00:00:00 2001 From: sneakers-the-rat Date: Mon, 2 Sep 2024 22:14:47 -0700 Subject: [PATCH 1/6] Add support for strings in hdf5 --- src/numpydantic/interface/hdf5.py | 49 ++++++++++++++++++++++++-- src/numpydantic/interface/interface.py | 5 ++- tests/fixtures.py | 9 +++-- tests/test_interface/test_hdf5.py | 17 +++++++-- 4 files changed, 73 insertions(+), 7 deletions(-) diff --git a/src/numpydantic/interface/hdf5.py b/src/numpydantic/interface/hdf5.py index 656273d..b63e760 100644 --- a/src/numpydantic/interface/hdf5.py +++ b/src/numpydantic/interface/hdf5.py @@ -1,5 +1,27 @@ """ Interfaces for HDF5 Datasets + +.. note:: + + HDF5 arrays are accessed through a proxy class :class:`.H5Proxy` . + Getting/setting values should work as normal, **except** that setting + values on nested views is impossible - + + Specifically this doesn't work: + + .. code-block:: python + + my_model.array[0][0] = 1 + + But this does work: + + .. code-block:: python + + my_model.array[0,0] = 1 + + To have direct access to the hdf5 dataset, use the + :meth:`.H5Proxy.open` method. + """ import sys @@ -10,7 +32,7 @@ import numpy as np from pydantic import SerializationInfo from numpydantic.interface.interface import Interface -from numpydantic.types import NDArrayType +from numpydantic.types import DtypeType, NDArrayType try: import h5py @@ -102,7 +124,14 @@ class H5Proxy: with h5py.File(self.file, "r") as h5f: obj = h5f.get(self.path) if self.field is not None: - obj = obj.fields(self.field) + if h5py.h5t.check_string_dtype(obj.dtype[self.field]): + obj = obj.fields(self.field).asstr() + else: + obj = obj.fields(self.field) + else: + if h5py.h5t.check_string_dtype(obj.dtype): + obj = obj.asstr() + return obj[item] def __setitem__( @@ -222,6 +251,22 @@ class H5Interface(Interface): return array + def get_dtype(self, array: NDArrayType) -> DtypeType: + """ + Get the dtype from the input array + + Subclasses to correctly handle + """ + if hasattr(array.dtype, "type") and array.dtype.type is np.object_: + if h5py.h5t.check_string_dtype(array.dtype): + return str + else: + return self.get_object_dtype(array) + elif h5py.h5t.check_string_dtype(array.dtype): + return str + else: + return array.dtype + @classmethod def to_json(cls, array: H5Proxy, info: Optional[SerializationInfo] = None) -> dict: """ diff --git a/src/numpydantic/interface/interface.py b/src/numpydantic/interface/interface.py index 3dc3fdc..1ef307f 100644 --- a/src/numpydantic/interface/interface.py +++ b/src/numpydantic/interface/interface.py @@ -126,7 +126,10 @@ class Interface(ABC, Generic[T]): if isinstance(self.dtype, tuple): valid = dtype in self.dtype elif self.dtype is np.str_: - valid = getattr(dtype, "type", None) is np.str_ or dtype is np.str_ + valid = getattr(dtype, "type", None) in (np.str_, str) or dtype in ( + np.str_, + str, + ) else: # try to match as any subclass, if self.dtype is a class try: diff --git a/tests/fixtures.py b/tests/fixtures.py index 6fbc5ad..cb5b59b 100644 --- a/tests/fixtures.py +++ b/tests/fixtures.py @@ -127,8 +127,13 @@ def hdf5_array( _ = hdf5_file.create_dataset(array_path, data=data) return H5ArrayPath(Path(hdf5_file.filename), array_path) else: - dt = np.dtype([("data", dtype), ("extra", "i8")]) - data = np.zeros(shape, dtype=dt) + + if dtype is str: + dt = np.dtype([("data", np.dtype("S10")), ("extra", "i8")]) + data = np.array([("hey", 0)] * np.prod(shape), dtype=dt).reshape(shape) + else: + dt = np.dtype([("data", dtype), ("extra", "i8")]) + data = np.zeros(shape, dtype=dt) _ = hdf5_file.create_dataset(array_path, data=data) return H5ArrayPath(Path(hdf5_file.filename), array_path, "data") diff --git a/tests/test_interface/test_hdf5.py b/tests/test_interface/test_hdf5.py index 23d26a3..67dcc44 100644 --- a/tests/test_interface/test_hdf5.py +++ b/tests/test_interface/test_hdf5.py @@ -78,8 +78,6 @@ def test_hdf5_shape(shape_cases, hdf5_array, compound): @pytest.mark.parametrize("compound", [True, False]) def test_hdf5_dtype(dtype_cases, hdf5_array, compound): - if dtype_cases.dtype is str: - pytest.skip("hdf5 cant do string arrays") _test_hdf5_case(dtype_cases, hdf5_array, compound) @@ -157,3 +155,18 @@ def test_compound_dtype(tmp_path): assert all(instance.array[1, :] == 0) instance.array[1] = 2 assert all(instance.array[1] == 2) + + +def test_strings(hdf5_array): + """ + HDF5 proxy can get and set strings just like any other dtype + """ + array = hdf5_array((10, 10), str) + + class MyModel(BaseModel): + array: NDArray[Shape["10, 10"], str] + + instance = MyModel(array=array) + instance.array[0, 0] = "hey" + assert instance.array[0, 0] == "hey" + assert isinstance(instance.array[0, 1], str) From 067ffa0342fce95e99d0c210e52c827c1eaec058 Mon Sep 17 00:00:00 2001 From: sneakers-the-rat Date: Mon, 2 Sep 2024 22:23:41 -0700 Subject: [PATCH 2/6] bump changelog, version --- docs/changelog.md | 33 +++++++++++++++++++++++++++++++++ pyproject.toml | 2 +- 2 files changed, 34 insertions(+), 1 deletion(-) diff --git a/docs/changelog.md b/docs/changelog.md index b93adb2..5dfe547 100644 --- a/docs/changelog.md +++ b/docs/changelog.md @@ -2,6 +2,39 @@ ## 1.* +### 1.5.0 - 24-09-02 - `str` support for HDF5 + +Strings in hdf5 are tricky! HDF5 doesn't have native support for unicode, +but it can be persuaded to store data in ASCII or virtualized utf-8 under somewhat obscure conditions. + +This PR uses h5py's string methods to expose string datasets (compound or not) +via the h5proxy with the `asstr()` view method. +This also allows us to set strings with normal python strings. + +Since numpydantic isn't necessarily a tool for *creating* hdf5 files +(nobody should be doing that), but rather an interface to them, +tests are included for reading and validating (unskip the existing string tests) +as well as setting/getting. + +```python +import h5py +import numpy as np +from pydantic import BaseModel +from numpydantic import NDArray +from typing import Any + +class MyModel(BaseModel): + array: NDArray[Any, str] + +h5f = h5py.File('my_data.h5', 'w') +data = np.random.random((10,10)).astype(str) +_ = h5f.create_dataset('/dataset', data=data) + +instance = MyModel(array=('my_data.h5', '/dataset')) +instance[0,0] = 'hey' +assert instance[0,0] == 'hey' +``` + ### 1.4.1 - 24-09-02 - `len()` support and dunder method testing It's pretty natural to want to do `len(array)` as a shorthand for `array.shape[0]`, diff --git a/pyproject.toml b/pyproject.toml index 57ae6be..59f725c 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "numpydantic" -version = "1.4.1" +version = "1.5.0" description = "Type and shape validation and serialization for numpy arrays in pydantic models" authors = [ {name = "sneakers-the-rat", email = "sneakers-the-rat@protonmail.com"}, From 0f1e0d0caf56f52d9a9bd0bde70a010c00b82454 Mon Sep 17 00:00:00 2001 From: sneakers-the-rat Date: Mon, 2 Sep 2024 22:29:58 -0700 Subject: [PATCH 3/6] oop still have to do bytes conversion --- docs/changelog.md | 5 +++-- tests/fixtures.py | 6 +++++- 2 files changed, 8 insertions(+), 3 deletions(-) diff --git a/docs/changelog.md b/docs/changelog.md index 5dfe547..6959982 100644 --- a/docs/changelog.md +++ b/docs/changelog.md @@ -9,7 +9,8 @@ but it can be persuaded to store data in ASCII or virtualized utf-8 under somewh This PR uses h5py's string methods to expose string datasets (compound or not) via the h5proxy with the `asstr()` view method. -This also allows us to set strings with normal python strings. +This also allows us to set strings with normal python strings, +although hdf5 datasets can only be created with `bytes` or other non-unicode encodings. Since numpydantic isn't necessarily a tool for *creating* hdf5 files (nobody should be doing that), but rather an interface to them, @@ -27,7 +28,7 @@ class MyModel(BaseModel): array: NDArray[Any, str] h5f = h5py.File('my_data.h5', 'w') -data = np.random.random((10,10)).astype(str) +data = np.random.random((10,10)).astype(bytes) _ = h5f.create_dataset('/dataset', data=data) instance = MyModel(array=('my_data.h5', '/dataset')) diff --git a/tests/fixtures.py b/tests/fixtures.py index cb5b59b..9d5bba6 100644 --- a/tests/fixtures.py +++ b/tests/fixtures.py @@ -122,8 +122,12 @@ def hdf5_array( compound: bool = False, ) -> H5ArrayPath: array_path = "/" + "_".join([str(s) for s in shape]) + "__" + dtype.__name__ + if not compound: - data = np.random.random(shape).astype(dtype) + if dtype is str: + data = np.random.random(shape).astype(bytes) + else: + data = np.random.random(shape).astype(dtype) _ = hdf5_file.create_dataset(array_path, data=data) return H5ArrayPath(Path(hdf5_file.filename), array_path) else: From f28c766b96d87f5f3d938588ce16d97878fbd095 Mon Sep 17 00:00:00 2001 From: sneakers-the-rat Date: Mon, 2 Sep 2024 22:40:17 -0700 Subject: [PATCH 4/6] strings in compound dtypes --- src/numpydantic/interface/hdf5.py | 9 +++++++-- tests/test_interface/test_hdf5.py | 5 +++-- 2 files changed, 10 insertions(+), 4 deletions(-) diff --git a/src/numpydantic/interface/hdf5.py b/src/numpydantic/interface/hdf5.py index b63e760..0e72669 100644 --- a/src/numpydantic/interface/hdf5.py +++ b/src/numpydantic/interface/hdf5.py @@ -124,8 +124,13 @@ class H5Proxy: with h5py.File(self.file, "r") as h5f: obj = h5f.get(self.path) if self.field is not None: - if h5py.h5t.check_string_dtype(obj.dtype[self.field]): - obj = obj.fields(self.field).asstr() + if encoding := h5py.h5t.check_string_dtype(obj.dtype[self.field]): + if isinstance(item, tuple): + item = (*item, self.field) + else: + item = (item, self.field) + + return obj[item].decode(encoding.encoding) else: obj = obj.fields(self.field) else: diff --git a/tests/test_interface/test_hdf5.py b/tests/test_interface/test_hdf5.py index 67dcc44..7d00174 100644 --- a/tests/test_interface/test_hdf5.py +++ b/tests/test_interface/test_hdf5.py @@ -157,11 +157,12 @@ def test_compound_dtype(tmp_path): assert all(instance.array[1] == 2) -def test_strings(hdf5_array): +@pytest.mark.parametrize("compound", [True, False]) +def test_strings(hdf5_array, compound): """ HDF5 proxy can get and set strings just like any other dtype """ - array = hdf5_array((10, 10), str) + array = hdf5_array((10, 10), str, compound=compound) class MyModel(BaseModel): array: NDArray[Shape["10, 10"], str] From e78c170a2b6e22aa13d13130edfcbb49dcafb398 Mon Sep 17 00:00:00 2001 From: sneakers-the-rat Date: Mon, 2 Sep 2024 22:49:13 -0700 Subject: [PATCH 5/6] correct decoding of byte arrays --- src/numpydantic/interface/hdf5.py | 12 +++++------- tests/test_interface/test_hdf5.py | 3 +++ 2 files changed, 8 insertions(+), 7 deletions(-) diff --git a/src/numpydantic/interface/hdf5.py b/src/numpydantic/interface/hdf5.py index 0e72669..1f75409 100644 --- a/src/numpydantic/interface/hdf5.py +++ b/src/numpydantic/interface/hdf5.py @@ -130,7 +130,10 @@ class H5Proxy: else: item = (item, self.field) - return obj[item].decode(encoding.encoding) + try: + return obj[item].decode(encoding.encoding) + except AttributeError: + return np.strings.decode(obj[item], encoding=encoding.encoding) else: obj = obj.fields(self.field) else: @@ -262,12 +265,7 @@ class H5Interface(Interface): Subclasses to correctly handle """ - if hasattr(array.dtype, "type") and array.dtype.type is np.object_: - if h5py.h5t.check_string_dtype(array.dtype): - return str - else: - return self.get_object_dtype(array) - elif h5py.h5t.check_string_dtype(array.dtype): + if h5py.h5t.check_string_dtype(array.dtype): return str else: return array.dtype diff --git a/tests/test_interface/test_hdf5.py b/tests/test_interface/test_hdf5.py index 7d00174..891dd9f 100644 --- a/tests/test_interface/test_hdf5.py +++ b/tests/test_interface/test_hdf5.py @@ -171,3 +171,6 @@ def test_strings(hdf5_array, compound): instance.array[0, 0] = "hey" assert instance.array[0, 0] == "hey" assert isinstance(instance.array[0, 1], str) + + instance.array[1] = "sup" + assert all(instance.array[1] == "sup") From 5e154ce21e47bbcf9719663f5e64ca6fa325024d Mon Sep 17 00:00:00 2001 From: sneakers-the-rat Date: Mon, 2 Sep 2024 22:52:29 -0700 Subject: [PATCH 6/6] correct decoding of byte arrays --- src/numpydantic/interface/hdf5.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/src/numpydantic/interface/hdf5.py b/src/numpydantic/interface/hdf5.py index 1f75409..6bb7dda 100644 --- a/src/numpydantic/interface/hdf5.py +++ b/src/numpydantic/interface/hdf5.py @@ -131,9 +131,12 @@ class H5Proxy: item = (item, self.field) try: + # single string return obj[item].decode(encoding.encoding) except AttributeError: - return np.strings.decode(obj[item], encoding=encoding.encoding) + # numpy array of bytes + return np.char.decode(obj[item], encoding=encoding.encoding) + else: obj = obj.fields(self.field) else: