Mirror of https://github.com/p2p-ld/numpydantic.git, synced 2025-01-09 13:44:26 +00:00
Merge pull request #13 from p2p-ld/hdf5-str
Add support for strings in hdf5

Commit 2ed0be8ef3: 6 changed files with 123 additions and 9 deletions
@@ -2,6 +2,40 @@
## 1.*

### 1.5.0 - 24-09-02 - `str` support for HDF5

Strings in hdf5 are tricky! HDF5 doesn't have native support for unicode,
but it can be persuaded to store data in ASCII or virtualized utf-8 under somewhat obscure conditions.

This PR uses h5py's string methods to expose string datasets (compound or not)
via the h5proxy with the `asstr()` view method.
This also allows us to set strings with normal python strings,
although hdf5 datasets can only be created with `bytes` or other non-unicode encodings.

Since numpydantic isn't necessarily a tool for *creating* hdf5 files
(nobody should be doing that), but rather an interface to them,
tests are included for reading and validating (unskip the existing string tests)
as well as setting/getting.

```python
import h5py
import numpy as np
from pydantic import BaseModel
from numpydantic import NDArray
from typing import Any

class MyModel(BaseModel):
    array: NDArray[Any, str]

h5f = h5py.File('my_data.h5', 'w')
data = np.random.random((10,10)).astype(bytes)
_ = h5f.create_dataset('/dataset', data=data)
h5f.close()

instance = MyModel(array=('my_data.h5', '/dataset'))
instance.array[0,0] = 'hey'
assert instance.array[0,0] == 'hey'
```

### 1.4.1 - 24-09-02 - `len()` support and dunder method testing

It's pretty natural to want to do `len(array)` as a shorthand for `array.shape[0]`,
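As background for the changelog entry above, the following is a minimal sketch (not part of the commit; the file and dataset names are invented) of the h5py behavior it relies on: string data is stored as bytes, `check_string_dtype` reports that a dataset is string-typed, and the `asstr()` view decodes reads back to python `str`.

```python
import h5py
import numpy as np

# illustration only: file/dataset names are invented for this sketch
with h5py.File("strings_demo.h5", "w") as h5f:
    # h5py stores unicode only via its special string dtype; the data
    # itself is written as bytes
    dset = h5f.create_dataset(
        "/words",
        data=np.array([b"hey", b"sup"]),
        dtype=h5py.string_dtype(encoding="utf-8"),
    )

    # reads come back as bytes by default...
    assert dset[0] == b"hey"

    # ...check_string_dtype reports that the dataset is string-typed...
    info = h5py.check_string_dtype(dset.dtype)
    assert info is not None and info.encoding == "utf-8"

    # ...and the asstr() view decodes to python str on access
    assert dset.asstr()[0] == "hey"
```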
```diff
@@ -1,6 +1,6 @@
 [project]
 name = "numpydantic"
-version = "1.4.1"
+version = "1.5.0"
 description = "Type and shape validation and serialization for numpy arrays in pydantic models"
 authors = [
     {name = "sneakers-the-rat", email = "sneakers-the-rat@protonmail.com"},
```
```diff
@@ -1,5 +1,27 @@
 """
 Interfaces for HDF5 Datasets
+
+.. note::
+
+    HDF5 arrays are accessed through a proxy class :class:`.H5Proxy` .
+    Getting/setting values should work as normal, **except** that setting
+    values on nested views is impossible -
+
+    Specifically this doesn't work:
+
+    .. code-block:: python
+
+        my_model.array[0][0] = 1
+
+    But this does work:
+
+    .. code-block:: python
+
+        my_model.array[0,0] = 1
+
+    To have direct access to the hdf5 dataset, use the
+    :meth:`.H5Proxy.open` method.
 """
 
 import sys
```
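As background for the note added to the docstring, here is a sketch against plain h5py (file and dataset names invented, not the library's own code) of why chained indexing cannot write through while tuple indexing can: `dset[0]` materializes the row as an in-memory copy, so assignments into it never reach the file.

```python
import h5py
import numpy as np

# illustration only: the file/dataset names are invented
with h5py.File("views_demo.h5", "w") as h5f:
    dset = h5f.create_dataset("/arr", data=np.zeros((3, 3)))

    # dset[0] reads row 0 into an in-memory numpy copy; assigning into
    # that copy never touches the file
    row = dset[0]
    row[0] = 1
    assert dset[0, 0] == 0

    # a single tuple index is handled by the dataset itself and is
    # written through to the file
    dset[0, 0] = 1
    assert dset[0, 0] == 1
```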
```diff
@@ -10,7 +32,7 @@ import numpy as np
 from pydantic import SerializationInfo
 
 from numpydantic.interface.interface import Interface
-from numpydantic.types import NDArrayType
+from numpydantic.types import DtypeType, NDArrayType
 
 try:
     import h5py
```
```diff
@@ -102,7 +124,25 @@ class H5Proxy:
         with h5py.File(self.file, "r") as h5f:
             obj = h5f.get(self.path)
             if self.field is not None:
-                obj = obj.fields(self.field)
+                if encoding := h5py.h5t.check_string_dtype(obj.dtype[self.field]):
+                    if isinstance(item, tuple):
+                        item = (*item, self.field)
+                    else:
+                        item = (item, self.field)
+
+                    try:
+                        # single string
+                        return obj[item].decode(encoding.encoding)
+                    except AttributeError:
+                        # numpy array of bytes
+                        return np.char.decode(obj[item], encoding=encoding.encoding)
+
+                else:
+                    obj = obj.fields(self.field)
+            else:
+                if h5py.h5t.check_string_dtype(obj.dtype):
+                    obj = obj.asstr()
+
             return obj[item]
 
     def __setitem__(
```
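To make the compound-field branch concrete, this sketch (file and dataset names invented; it mirrors the test fixture's `("data", "S10")` layout) shows what raw reads of a string field look like and why both decode paths are needed: a scalar element has `.decode()`, while an array slice does not and goes through `np.char.decode`.

```python
import h5py
import numpy as np

# illustration only: the file/dataset names are invented
dt = np.dtype([("data", "S10"), ("extra", "i8")])
data = np.array([("hey", 0)] * 4, dtype=dt).reshape((2, 2))

with h5py.File("compound_demo.h5", "w") as h5f:
    dset = h5f.create_dataset("/table", data=data)

    # appending the field name to the index tuple selects that field,
    # but the value comes back as raw bytes
    assert dset[0, 0, "data"] == b"hey"

    # a scalar has .decode(); an array slice does not, hence np.char.decode
    assert dset[0, 0, "data"].decode("ascii") == "hey"
    assert (np.char.decode(dset[0, "data"], encoding="ascii") == "hey").all()
```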
```diff
@@ -222,6 +262,17 @@ class H5Interface(Interface):
 
         return array
 
+    def get_dtype(self, array: NDArrayType) -> DtypeType:
+        """
+        Get the dtype from the input array
+
+        Subclasses to correctly handle
+        """
+        if h5py.h5t.check_string_dtype(array.dtype):
+            return str
+        else:
+            return array.dtype
+
     @classmethod
     def to_json(cls, array: H5Proxy, info: Optional[SerializationInfo] = None) -> dict:
         """
```
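The effect of `get_dtype` above is to collapse every HDF5 string flavor to the builtin `str`, so downstream dtype validation has a single value to compare against. A sketch of that mapping, using a hypothetical helper name rather than the library's API:

```python
import h5py
import numpy as np

# hypothetical helper mirroring the logic of get_dtype above
def dtype_for_validation(dt: np.dtype):
    return str if h5py.check_string_dtype(dt) else dt

assert dtype_for_validation(h5py.string_dtype(encoding="utf-8")) is str  # variable-length utf-8
assert dtype_for_validation(np.dtype("S10")) is str                      # fixed-length byte strings
assert dtype_for_validation(np.dtype("i8")) == np.dtype("i8")            # numeric dtypes pass through
```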
```diff
@@ -126,7 +126,10 @@ class Interface(ABC, Generic[T]):
         if isinstance(self.dtype, tuple):
             valid = dtype in self.dtype
         elif self.dtype is np.str_:
-            valid = getattr(dtype, "type", None) is np.str_ or dtype is np.str_
+            valid = getattr(dtype, "type", None) in (np.str_, str) or dtype in (
+                np.str_,
+                str,
+            )
         else:
             # try to match as any subclass, if self.dtype is a class
             try:
```
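The widened check is needed because string-ness is reported two different ways: a numpy unicode array has a dtype whose `.type` is `np.str_`, while the HDF5 interface's `get_dtype` now returns the builtin `str`. A standalone sketch of the predicate (the function name is invented for illustration):

```python
import numpy as np

# hypothetical standalone version of the widened dtype check
def str_dtype_ok(dtype) -> bool:
    return getattr(dtype, "type", None) in (np.str_, str) or dtype in (np.str_, str)

assert str_dtype_ok(np.array(["hey"]).dtype)  # numpy unicode: dtype('<U3').type is np.str_
assert str_dtype_ok(str)                      # what the HDF5 interface reports for string datasets
assert not str_dtype_ok(np.dtype("i8"))       # numeric dtypes are still rejected
```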
```diff
@@ -122,13 +122,22 @@ def hdf5_array(
     compound: bool = False,
 ) -> H5ArrayPath:
     array_path = "/" + "_".join([str(s) for s in shape]) + "__" + dtype.__name__
 
     if not compound:
-        data = np.random.random(shape).astype(dtype)
+        if dtype is str:
+            data = np.random.random(shape).astype(bytes)
+        else:
+            data = np.random.random(shape).astype(dtype)
         _ = hdf5_file.create_dataset(array_path, data=data)
         return H5ArrayPath(Path(hdf5_file.filename), array_path)
     else:
-        dt = np.dtype([("data", dtype), ("extra", "i8")])
-        data = np.zeros(shape, dtype=dt)
+        if dtype is str:
+            dt = np.dtype([("data", np.dtype("S10")), ("extra", "i8")])
+            data = np.array([("hey", 0)] * np.prod(shape), dtype=dt).reshape(shape)
+        else:
+            dt = np.dtype([("data", dtype), ("extra", "i8")])
+            data = np.zeros(shape, dtype=dt)
         _ = hdf5_file.create_dataset(array_path, data=data)
         return H5ArrayPath(Path(hdf5_file.filename), array_path, "data")
```
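The fixture special-cases `str` because h5py cannot write numpy's native unicode (`"U"`) arrays at all; test data has to be cast to byte strings, or given a fixed-length `"S10"` field in the compound case. A quick sketch of that limitation (file and dataset names invented):

```python
import h5py
import numpy as np

# illustration only: the file/dataset names are invented
with h5py.File("fixture_demo.h5", "w") as h5f:
    try:
        # numpy unicode arrays have no HDF5 conversion path
        h5f.create_dataset("/bad", data=np.array([["hey"]]))  # dtype('<U3')
    except TypeError:
        pass

    # byte strings (or an explicit h5py string dtype) are fine
    h5f.create_dataset("/good", data=np.array([["hey"]], dtype=bytes))
    assert h5f["/good"][0, 0] == b"hey"
```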
```diff
@@ -78,8 +78,6 @@ def test_hdf5_shape(shape_cases, hdf5_array, compound):
 
 @pytest.mark.parametrize("compound", [True, False])
 def test_hdf5_dtype(dtype_cases, hdf5_array, compound):
-    if dtype_cases.dtype is str:
-        pytest.skip("hdf5 cant do string arrays")
     _test_hdf5_case(dtype_cases, hdf5_array, compound)
```
```diff
@@ -157,3 +155,22 @@ def test_compound_dtype(tmp_path):
     assert all(instance.array[1, :] == 0)
     instance.array[1] = 2
     assert all(instance.array[1] == 2)
+
+
+@pytest.mark.parametrize("compound", [True, False])
+def test_strings(hdf5_array, compound):
+    """
+    HDF5 proxy can get and set strings just like any other dtype
+    """
+    array = hdf5_array((10, 10), str, compound=compound)
+
+    class MyModel(BaseModel):
+        array: NDArray[Shape["10, 10"], str]
+
+    instance = MyModel(array=array)
+    instance.array[0, 0] = "hey"
+    assert instance.array[0, 0] == "hey"
+    assert isinstance(instance.array[0, 1], str)
+
+    instance.array[1] = "sup"
+    assert all(instance.array[1] == "sup")
```