mirror of
https://github.com/p2p-ld/numpydantic.git
synced 2025-01-09 13:44:26 +00:00
Merge pull request #13 from p2p-ld/hdf5-str
Some checks are pending
Lint / Ruff Linting (push) Waiting to run
Lint / Black Formatting (push) Waiting to run
Tests / test (<2.0.0, macos-latest, 3.12) (push) Waiting to run
Tests / test (<2.0.0, macos-latest, 3.9) (push) Waiting to run
Tests / test (<2.0.0, ubuntu-latest, 3.12) (push) Waiting to run
Tests / test (<2.0.0, ubuntu-latest, 3.9) (push) Waiting to run
Tests / test (<2.0.0, windows-latest, 3.12) (push) Waiting to run
Tests / test (<2.0.0, windows-latest, 3.9) (push) Waiting to run
Tests / test (>=2.0.0, macos-latest, 3.12) (push) Waiting to run
Tests / test (>=2.0.0, macos-latest, 3.9) (push) Waiting to run
Tests / test (>=2.0.0, ubuntu-latest, 3.10) (push) Waiting to run
Tests / test (>=2.0.0, ubuntu-latest, 3.11) (push) Waiting to run
Tests / test (>=2.0.0, ubuntu-latest, 3.12) (push) Waiting to run
Tests / test (>=2.0.0, ubuntu-latest, 3.9) (push) Waiting to run
Tests / test (>=2.0.0, windows-latest, 3.12) (push) Waiting to run
Tests / test (>=2.0.0, windows-latest, 3.9) (push) Waiting to run
Tests / finish-coverage (push) Blocked by required conditions
Some checks are pending
Lint / Ruff Linting (push) Waiting to run
Lint / Black Formatting (push) Waiting to run
Tests / test (<2.0.0, macos-latest, 3.12) (push) Waiting to run
Tests / test (<2.0.0, macos-latest, 3.9) (push) Waiting to run
Tests / test (<2.0.0, ubuntu-latest, 3.12) (push) Waiting to run
Tests / test (<2.0.0, ubuntu-latest, 3.9) (push) Waiting to run
Tests / test (<2.0.0, windows-latest, 3.12) (push) Waiting to run
Tests / test (<2.0.0, windows-latest, 3.9) (push) Waiting to run
Tests / test (>=2.0.0, macos-latest, 3.12) (push) Waiting to run
Tests / test (>=2.0.0, macos-latest, 3.9) (push) Waiting to run
Tests / test (>=2.0.0, ubuntu-latest, 3.10) (push) Waiting to run
Tests / test (>=2.0.0, ubuntu-latest, 3.11) (push) Waiting to run
Tests / test (>=2.0.0, ubuntu-latest, 3.12) (push) Waiting to run
Tests / test (>=2.0.0, ubuntu-latest, 3.9) (push) Waiting to run
Tests / test (>=2.0.0, windows-latest, 3.12) (push) Waiting to run
Tests / test (>=2.0.0, windows-latest, 3.9) (push) Waiting to run
Tests / finish-coverage (push) Blocked by required conditions
Add support for strings in hdf5
This commit is contained in:
commit
2ed0be8ef3
6 changed files with 123 additions and 9 deletions
|
@ -2,6 +2,40 @@
|
|||
|
||||
## 1.*
|
||||
|
||||
### 1.5.0 - 24-09-02 - `str` support for HDF5
|
||||
|
||||
Strings in hdf5 are tricky! HDF5 doesn't have native support for unicode,
|
||||
but it can be persuaded to store data in ASCII or virtualized utf-8 under somewhat obscure conditions.
|
||||
|
||||
This PR uses h5py's string methods to expose string datasets (compound or not)
|
||||
via the h5proxy with the `asstr()` view method.
|
||||
This also allows us to set strings with normal python strings,
|
||||
although hdf5 datasets can only be created with `bytes` or other non-unicode encodings.
|
||||
|
||||
Since numpydantic isn't necessarily a tool for *creating* hdf5 files
|
||||
(nobody should be doing that), but rather an interface to them,
|
||||
tests are included for reading and validating (unskip the existing string tests)
|
||||
as well as setting/getting.
|
||||
|
||||
```python
|
||||
import h5py
|
||||
import numpy as np
|
||||
from pydantic import BaseModel
|
||||
from numpydantic import NDArray
|
||||
from typing import Any
|
||||
|
||||
class MyModel(BaseModel):
|
||||
array: NDArray[Any, str]
|
||||
|
||||
h5f = h5py.File('my_data.h5', 'w')
|
||||
data = np.random.random((10,10)).astype(bytes)
|
||||
_ = h5f.create_dataset('/dataset', data=data)
|
||||
|
||||
instance = MyModel(array=('my_data.h5', '/dataset'))
|
||||
instance.array[0,0] = 'hey'
|
||||
assert instance.array[0,0] == 'hey'
|
||||
```
|
||||
|
||||
### 1.4.1 - 24-09-02 - `len()` support and dunder method testing
|
||||
|
||||
It's pretty natural to want to do `len(array)` as a shorthand for `array.shape[0]`,
|
||||
|
|
|
@ -1,6 +1,6 @@
|
|||
[project]
|
||||
name = "numpydantic"
|
||||
version = "1.4.1"
|
||||
version = "1.5.0"
|
||||
description = "Type and shape validation and serialization for numpy arrays in pydantic models"
|
||||
authors = [
|
||||
{name = "sneakers-the-rat", email = "sneakers-the-rat@protonmail.com"},
|
||||
|
|
|
@ -1,5 +1,27 @@
|
|||
"""
|
||||
Interfaces for HDF5 Datasets
|
||||
|
||||
.. note::
|
||||
|
||||
HDF5 arrays are accessed through a proxy class :class:`.H5Proxy` .
|
||||
Getting/setting values should work as normal, **except** that setting
|
||||
values on nested views is impossible.
|
||||
|
||||
Specifically this doesn't work:
|
||||
|
||||
.. code-block:: python
|
||||
|
||||
my_model.array[0][0] = 1
|
||||
|
||||
But this does work:
|
||||
|
||||
.. code-block:: python
|
||||
|
||||
my_model.array[0,0] = 1
|
||||
|
||||
To have direct access to the hdf5 dataset, use the
|
||||
:meth:`.H5Proxy.open` method.
|
||||
|
||||
"""
|
||||
|
||||
import sys
|
||||
|
@ -10,7 +32,7 @@ import numpy as np
|
|||
from pydantic import SerializationInfo
|
||||
|
||||
from numpydantic.interface.interface import Interface
|
||||
from numpydantic.types import NDArrayType
|
||||
from numpydantic.types import DtypeType, NDArrayType
|
||||
|
||||
try:
|
||||
import h5py
|
||||
|
@ -102,7 +124,25 @@ class H5Proxy:
|
|||
with h5py.File(self.file, "r") as h5f:
|
||||
obj = h5f.get(self.path)
|
||||
if self.field is not None:
|
||||
obj = obj.fields(self.field)
|
||||
if encoding := h5py.h5t.check_string_dtype(obj.dtype[self.field]):
|
||||
if isinstance(item, tuple):
|
||||
item = (*item, self.field)
|
||||
else:
|
||||
item = (item, self.field)
|
||||
|
||||
try:
|
||||
# single string
|
||||
return obj[item].decode(encoding.encoding)
|
||||
except AttributeError:
|
||||
# numpy array of bytes
|
||||
return np.char.decode(obj[item], encoding=encoding.encoding)
|
||||
|
||||
else:
|
||||
obj = obj.fields(self.field)
|
||||
else:
|
||||
if h5py.h5t.check_string_dtype(obj.dtype):
|
||||
obj = obj.asstr()
|
||||
|
||||
return obj[item]
|
||||
|
||||
def __setitem__(
|
||||
|
@ -222,6 +262,17 @@ class H5Interface(Interface):
|
|||
|
||||
return array
|
||||
|
||||
def get_dtype(self, array: NDArrayType) -> DtypeType:
    """
    Report the dtype of the input array.

    When h5py recognises ``array.dtype`` as one of its string dtypes,
    the builtin ``str`` is returned in its place; any other dtype is
    returned unchanged.
    """
    # check_string_dtype gives back a falsy value for non-string dtypes
    return str if h5py.h5t.check_string_dtype(array.dtype) else array.dtype
|
||||
|
||||
@classmethod
|
||||
def to_json(cls, array: H5Proxy, info: Optional[SerializationInfo] = None) -> dict:
|
||||
"""
|
||||
|
|
|
@ -126,7 +126,10 @@ class Interface(ABC, Generic[T]):
|
|||
if isinstance(self.dtype, tuple):
|
||||
valid = dtype in self.dtype
|
||||
elif self.dtype is np.str_:
|
||||
valid = getattr(dtype, "type", None) is np.str_ or dtype is np.str_
|
||||
valid = getattr(dtype, "type", None) in (np.str_, str) or dtype in (
|
||||
np.str_,
|
||||
str,
|
||||
)
|
||||
else:
|
||||
# try to match as any subclass, if self.dtype is a class
|
||||
try:
|
||||
|
|
|
@ -122,13 +122,22 @@ def hdf5_array(
|
|||
compound: bool = False,
|
||||
) -> H5ArrayPath:
|
||||
array_path = "/" + "_".join([str(s) for s in shape]) + "__" + dtype.__name__
|
||||
|
||||
if not compound:
|
||||
data = np.random.random(shape).astype(dtype)
|
||||
if dtype is str:
|
||||
data = np.random.random(shape).astype(bytes)
|
||||
else:
|
||||
data = np.random.random(shape).astype(dtype)
|
||||
_ = hdf5_file.create_dataset(array_path, data=data)
|
||||
return H5ArrayPath(Path(hdf5_file.filename), array_path)
|
||||
else:
|
||||
dt = np.dtype([("data", dtype), ("extra", "i8")])
|
||||
data = np.zeros(shape, dtype=dt)
|
||||
|
||||
if dtype is str:
|
||||
dt = np.dtype([("data", np.dtype("S10")), ("extra", "i8")])
|
||||
data = np.array([("hey", 0)] * np.prod(shape), dtype=dt).reshape(shape)
|
||||
else:
|
||||
dt = np.dtype([("data", dtype), ("extra", "i8")])
|
||||
data = np.zeros(shape, dtype=dt)
|
||||
_ = hdf5_file.create_dataset(array_path, data=data)
|
||||
return H5ArrayPath(Path(hdf5_file.filename), array_path, "data")
|
||||
|
||||
|
|
|
@ -78,8 +78,6 @@ def test_hdf5_shape(shape_cases, hdf5_array, compound):
|
|||
|
||||
@pytest.mark.parametrize("compound", [True, False])
|
||||
def test_hdf5_dtype(dtype_cases, hdf5_array, compound):
|
||||
if dtype_cases.dtype is str:
|
||||
pytest.skip("hdf5 cant do string arrays")
|
||||
_test_hdf5_case(dtype_cases, hdf5_array, compound)
|
||||
|
||||
|
||||
|
@ -157,3 +155,22 @@ def test_compound_dtype(tmp_path):
|
|||
assert all(instance.array[1, :] == 0)
|
||||
instance.array[1] = 2
|
||||
assert all(instance.array[1] == 2)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("compound", [True, False])
def test_strings(hdf5_array, compound):
    """
    Strings can be written to and read back from the HDF5 proxy,
    for both compound and non-compound datasets
    """
    h5_path = hdf5_array((10, 10), str, compound=compound)

    class MyModel(BaseModel):
        array: NDArray[Shape["10, 10"], str]

    model = MyModel(array=h5_path)

    # set and read back a single element
    model.array[0, 0] = "hey"
    assert model.array[0, 0] == "hey"
    # an element that was not assigned above is still returned as a ``str``
    assert isinstance(model.array[0, 1], str)

    # assign one value across a whole row
    model.array[1] = "sup"
    assert all(model.array[1] == "sup")
|
||||
|
|
Loading…
Reference in a new issue