Merge pull request #13 from p2p-ld/hdf5-str
Some checks are pending
Lint / Ruff Linting (push) Waiting to run
Lint / Black Formatting (push) Waiting to run
Tests / test (<2.0.0, macos-latest, 3.12) (push) Waiting to run
Tests / test (<2.0.0, macos-latest, 3.9) (push) Waiting to run
Tests / test (<2.0.0, ubuntu-latest, 3.12) (push) Waiting to run
Tests / test (<2.0.0, ubuntu-latest, 3.9) (push) Waiting to run
Tests / test (<2.0.0, windows-latest, 3.12) (push) Waiting to run
Tests / test (<2.0.0, windows-latest, 3.9) (push) Waiting to run
Tests / test (>=2.0.0, macos-latest, 3.12) (push) Waiting to run
Tests / test (>=2.0.0, macos-latest, 3.9) (push) Waiting to run
Tests / test (>=2.0.0, ubuntu-latest, 3.10) (push) Waiting to run
Tests / test (>=2.0.0, ubuntu-latest, 3.11) (push) Waiting to run
Tests / test (>=2.0.0, ubuntu-latest, 3.12) (push) Waiting to run
Tests / test (>=2.0.0, ubuntu-latest, 3.9) (push) Waiting to run
Tests / test (>=2.0.0, windows-latest, 3.12) (push) Waiting to run
Tests / test (>=2.0.0, windows-latest, 3.9) (push) Waiting to run
Tests / finish-coverage (push) Blocked by required conditions

Add support for strings in hdf5
This commit is contained in:
Jonny Saunders 2024-09-02 22:59:39 -07:00 committed by GitHub
commit 2ed0be8ef3
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
6 changed files with 123 additions and 9 deletions

View file

@ -2,6 +2,40 @@
## 1.*
### 1.5.0 - 24-09-02 - `str` support for HDF5
Strings in hdf5 are tricky! HDF5 doesn't have native support for unicode,
but it can be persuaded to store data in ASCII or virtualized utf-8 under somewhat obscure conditions.
This PR uses h5py's string methods to expose string datasets (compound or not)
via the h5proxy with the `asstr()` view method.
This also allows us to set strings with normal python strings,
although hdf5 datasets can only be created with `bytes` or other non-unicode encodings.
Since numpydantic isn't necessarily a tool for *creating* hdf5 files
(nobody should be doing that), but rather an interface to them,
tests are included for reading and validating (by unskipping the existing string tests)
as well as for setting and getting values.
```python
import h5py
import numpy as np
from pydantic import BaseModel
from numpydantic import NDArray
from typing import Any
class MyModel(BaseModel):
array: NDArray[Any, str]
h5f = h5py.File('my_data.h5', 'w')
data = np.random.random((10,10)).astype(bytes)
_ = h5f.create_dataset('/dataset', data=data)
instance = MyModel(array=('my_data.h5', '/dataset'))
instance.array[0,0] = 'hey'
assert instance.array[0,0] == 'hey'
```
### 1.4.1 - 24-09-02 - `len()` support and dunder method testing
It's pretty natural to want to do `len(array)` as a shorthand for `array.shape[0]`,

View file

@ -1,6 +1,6 @@
[project]
name = "numpydantic"
version = "1.4.1"
version = "1.5.0"
description = "Type and shape validation and serialization for numpy arrays in pydantic models"
authors = [
{name = "sneakers-the-rat", email = "sneakers-the-rat@protonmail.com"},

View file

@ -1,5 +1,27 @@
"""
Interfaces for HDF5 Datasets
.. note::
HDF5 arrays are accessed through a proxy class :class:`.H5Proxy` .
Getting/setting values should work as normal, **except** that setting
values on nested views is impossible -
Specifically this doesn't work:
.. code-block:: python
my_model.array[0][0] = 1
But this does work:
.. code-block:: python
my_model.array[0,0] = 1
To have direct access to the hdf5 dataset, use the
:meth:`.H5Proxy.open` method.
"""
import sys
@ -10,7 +32,7 @@ import numpy as np
from pydantic import SerializationInfo
from numpydantic.interface.interface import Interface
from numpydantic.types import NDArrayType
from numpydantic.types import DtypeType, NDArrayType
try:
import h5py
@ -102,7 +124,25 @@ class H5Proxy:
with h5py.File(self.file, "r") as h5f:
obj = h5f.get(self.path)
if self.field is not None:
obj = obj.fields(self.field)
if encoding := h5py.h5t.check_string_dtype(obj.dtype[self.field]):
if isinstance(item, tuple):
item = (*item, self.field)
else:
item = (item, self.field)
try:
# single string
return obj[item].decode(encoding.encoding)
except AttributeError:
# numpy array of bytes
return np.char.decode(obj[item], encoding=encoding.encoding)
else:
obj = obj.fields(self.field)
else:
if h5py.h5t.check_string_dtype(obj.dtype):
obj = obj.asstr()
return obj[item]
def __setitem__(
@ -222,6 +262,17 @@ class H5Interface(Interface):
return array
def get_dtype(self, array: NDArrayType) -> DtypeType:
    """
    Get the dtype to validate against from the input array.

    If h5py recognises the array's dtype as one of its string dtypes,
    the builtin ``str`` is returned so the array is validated as a string
    array. Otherwise the array's own ``dtype`` is returned unchanged.

    Args:
        array: The array-like object whose ``dtype`` attribute is inspected.

    Returns:
        ``str`` for h5py string dtypes, else ``array.dtype``.
    """
    dtype = array.dtype
    # check_string_dtype is truthy only for h5py string dtypes
    return str if h5py.h5t.check_string_dtype(dtype) else dtype
@classmethod
def to_json(cls, array: H5Proxy, info: Optional[SerializationInfo] = None) -> dict:
"""

View file

@ -126,7 +126,10 @@ class Interface(ABC, Generic[T]):
if isinstance(self.dtype, tuple):
valid = dtype in self.dtype
elif self.dtype is np.str_:
valid = getattr(dtype, "type", None) is np.str_ or dtype is np.str_
valid = getattr(dtype, "type", None) in (np.str_, str) or dtype in (
np.str_,
str,
)
else:
# try to match as any subclass, if self.dtype is a class
try:

View file

@ -122,13 +122,22 @@ def hdf5_array(
compound: bool = False,
) -> H5ArrayPath:
array_path = "/" + "_".join([str(s) for s in shape]) + "__" + dtype.__name__
if not compound:
data = np.random.random(shape).astype(dtype)
if dtype is str:
data = np.random.random(shape).astype(bytes)
else:
data = np.random.random(shape).astype(dtype)
_ = hdf5_file.create_dataset(array_path, data=data)
return H5ArrayPath(Path(hdf5_file.filename), array_path)
else:
dt = np.dtype([("data", dtype), ("extra", "i8")])
data = np.zeros(shape, dtype=dt)
if dtype is str:
dt = np.dtype([("data", np.dtype("S10")), ("extra", "i8")])
data = np.array([("hey", 0)] * np.prod(shape), dtype=dt).reshape(shape)
else:
dt = np.dtype([("data", dtype), ("extra", "i8")])
data = np.zeros(shape, dtype=dt)
_ = hdf5_file.create_dataset(array_path, data=data)
return H5ArrayPath(Path(hdf5_file.filename), array_path, "data")

View file

@ -78,8 +78,6 @@ def test_hdf5_shape(shape_cases, hdf5_array, compound):
@pytest.mark.parametrize("compound", [True, False])
def test_hdf5_dtype(dtype_cases, hdf5_array, compound):
if dtype_cases.dtype is str:
pytest.skip("hdf5 cant do string arrays")
_test_hdf5_case(dtype_cases, hdf5_array, compound)
@ -157,3 +155,22 @@ def test_compound_dtype(tmp_path):
assert all(instance.array[1, :] == 0)
instance.array[1] = 2
assert all(instance.array[1] == 2)
@pytest.mark.parametrize("compound", [True, False])
def test_strings(hdf5_array, compound):
    """
    Strings in an HDF5 dataset can be read and written through the proxy
    like any other dtype, for both plain and compound datasets
    """

    class StringModel(BaseModel):
        array: NDArray[Shape["10, 10"], str]

    h5_path = hdf5_array((10, 10), str, compound=compound)
    model = StringModel(array=h5_path)

    # setting a single element with a python string round-trips as a string
    model.array[0, 0] = "hey"
    assert model.array[0, 0] == "hey"

    # elements we did not set are also returned as python strings
    untouched = model.array[0, 1]
    assert isinstance(untouched, str)

    # assigning a scalar string to a whole row sets every element in it
    model.array[1] = "sup"
    row_matches = model.array[1] == "sup"
    assert all(row_matches)