Merge pull request #11 from p2p-ld/hdf5-compound-dtype

Add ability to index fields within hdf5 compound dtypes
This commit is contained in:
Jonny Saunders 2024-09-02 17:14:30 -07:00 committed by GitHub
commit 9364cacc90
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
5 changed files with 165 additions and 22 deletions

View file

@ -2,6 +2,61 @@
## 1.* ## 1.*
### 1.4.0 - 24-09-02 - HDF5 Compound Dtype Support
HDF5 datasets can have compound dtypes, where each element is a record with several named fields, like:
```python
import numpy as np
import h5py
dtype = np.dtype([("data", "i8"), ("extra", "f8")])
data = np.zeros((10, 20), dtype=dtype)
with h5py.File('mydata.h5', "w") as h5f:
dset = h5f.create_dataset("/dataset", data=data)
```
```python
>>> dset[0:1]
array([[(0, 0.), (0, 0.), (0, 0.), (0, 0.), (0, 0.), (0, 0.), (0, 0.),
(0, 0.), (0, 0.), (0, 0.), (0, 0.), (0, 0.), (0, 0.), (0, 0.),
(0, 0.), (0, 0.), (0, 0.), (0, 0.), (0, 0.), (0, 0.)]],
dtype=[('data', '<i8'), ('extra', '<f8')])
```
Sometimes we want to split those out to separate fields like this:
```python
class MyModel(BaseModel):
data: NDArray[Any, np.int64]
extra: NDArray[Any, np.float64]
```
That's what 1.4.0 allows, using an additional, optional `field` argument in `H5ArrayPath` that selects a single field of the compound dtype:
```python
from numpydantic.interface.hdf5 import H5ArrayPath
my_model = MyModel(
data = H5ArrayPath(file='mydata.h5', path="/dataset", field="data"),
extra = H5ArrayPath(file='mydata.h5', path="/dataset", field="extra"),
)
# or just with tuples
my_model = MyModel(
data = ('mydata.h5', "/dataset", "data"),
extra = ('mydata.h5', "/dataset", "extra"),
)
```
```python
>>> my_model.data[0,0]
0
>>> my_model.data.dtype
dtype('int64')
```
### 1.3.3 - 24-08-13 - Callable type annotations ### 1.3.3 - 24-08-13 - Callable type annotations
Problem, when you use a numpydantic `"wrap"` validator, it gives the annotation as a `handler` function. Problem, when you use a numpydantic `"wrap"` validator, it gives the annotation as a `handler` function.

View file

@ -1,6 +1,6 @@
[project] [project]
name = "numpydantic" name = "numpydantic"
version = "1.3.3" version = "1.4.0"
description = "Type and shape validation and serialization for numpy arrays in pydantic models" description = "Type and shape validation and serialization for numpy arrays in pydantic models"
authors = [ authors = [
{name = "sneakers-the-rat", email = "sneakers-the-rat@protonmail.com"}, {name = "sneakers-the-rat", email = "sneakers-the-rat@protonmail.com"},

View file

@ -4,7 +4,7 @@ Interfaces for HDF5 Datasets
import sys import sys
from pathlib import Path from pathlib import Path
from typing import Any, NamedTuple, Optional, Tuple, Union from typing import Any, List, NamedTuple, Optional, Tuple, Union
import numpy as np import numpy as np
from pydantic import SerializationInfo from pydantic import SerializationInfo
@ -32,6 +32,8 @@ class H5ArrayPath(NamedTuple):
"""Location of HDF5 file""" """Location of HDF5 file"""
path: str path: str
"""Path within the HDF5 file""" """Path within the HDF5 file"""
field: Optional[Union[str, List[str]]] = None
"""Refer to a specific field within a compound dtype"""
class H5Proxy: class H5Proxy:
@ -51,12 +53,20 @@ class H5Proxy:
Args: Args:
file (pathlib.Path | str): Location of hdf5 file on filesystem file (pathlib.Path | str): Location of hdf5 file on filesystem
path (str): Path to array within hdf5 file path (str): Path to array within hdf5 file
field (str, list[str]): Optional - refer to a specific field within
a compound dtype
""" """
def __init__(self, file: Union[Path, str], path: str): def __init__(
self,
file: Union[Path, str],
path: str,
field: Optional[Union[str, List[str]]] = None,
):
self._h5f = None self._h5f = None
self.file = Path(file) self.file = Path(file)
self.path = path self.path = path
self.field = field
def array_exists(self) -> bool: def array_exists(self) -> bool:
"""Check that there is in fact an array at :attr:`.path` within :attr:`.file`""" """Check that there is in fact an array at :attr:`.path` within :attr:`.file`"""
@ -67,22 +77,49 @@ class H5Proxy:
@classmethod @classmethod
def from_h5array(cls, h5array: H5ArrayPath) -> "H5Proxy": def from_h5array(cls, h5array: H5ArrayPath) -> "H5Proxy":
"""Instantiate using :class:`.H5ArrayPath`""" """Instantiate using :class:`.H5ArrayPath`"""
return H5Proxy(file=h5array.file, path=h5array.path) return H5Proxy(file=h5array.file, path=h5array.path, field=h5array.field)
@property
def dtype(self) -> np.dtype:
"""
Get dtype of array, using :attr:`.field` if present
"""
with h5py.File(self.file, "r") as h5f:
obj = h5f.get(self.path)
if self.field is None:
return obj.dtype
else:
return obj.dtype[self.field]
def __getattr__(self, item: str): def __getattr__(self, item: str):
with h5py.File(self.file, "r") as h5f: with h5py.File(self.file, "r") as h5f:
obj = h5f.get(self.path) obj = h5f.get(self.path)
return getattr(obj, item) return getattr(obj, item)
def __getitem__(self, item: Union[int, slice]) -> np.ndarray: def __getitem__(
self, item: Union[int, slice, Tuple[Union[int, slice], ...]]
) -> np.ndarray:
with h5py.File(self.file, "r") as h5f: with h5py.File(self.file, "r") as h5f:
obj = h5f.get(self.path) obj = h5f.get(self.path)
if self.field is not None:
obj = obj.fields(self.field)
return obj[item] return obj[item]
def __setitem__(self, key: Union[int, slice], value: Union[int, float, np.ndarray]): def __setitem__(
self,
key: Union[int, slice, Tuple[Union[int, slice], ...]],
value: Union[int, float, np.ndarray],
):
with h5py.File(self.file, "r+", locking=True) as h5f: with h5py.File(self.file, "r+", locking=True) as h5f:
obj = h5f.get(self.path) obj = h5f.get(self.path)
if self.field is None:
obj[key] = value obj[key] = value
else:
if isinstance(key, tuple):
key = (*key, self.field)
obj[key] = value
else:
obj[key, self.field] = value
def open(self, mode: str = "r") -> "h5py.Dataset": def open(self, mode: str = "r") -> "h5py.Dataset":
""" """
@ -133,7 +170,7 @@ class H5Interface(Interface):
if isinstance(array, H5ArrayPath): if isinstance(array, H5ArrayPath):
return True return True
if isinstance(array, (tuple, list)) and len(array) == 2: if isinstance(array, (tuple, list)) and len(array) in (2, 3):
# check that the first arg is an hdf5 file # check that the first arg is an hdf5 file
try: try:
file = Path(array[0]) file = Path(array[0])
@ -163,6 +200,8 @@ class H5Interface(Interface):
array = H5Proxy.from_h5array(h5array=array) array = H5Proxy.from_h5array(h5array=array)
elif isinstance(array, (tuple, list)) and len(array) == 2: # pragma: no cover elif isinstance(array, (tuple, list)) and len(array) == 2: # pragma: no cover
array = H5Proxy(file=array[0], path=array[1]) array = H5Proxy(file=array[0], path=array[1])
elif isinstance(array, (tuple, list)) and len(array) == 3:
array = H5Proxy(file=array[0], path=array[1], field=array[2])
else: # pragma: no cover else: # pragma: no cover
# this should never happen really since `check` confirms this before # this should never happen really since `check` confirms this before
# we'd reach here, but just to complete the if else... # we'd reach here, but just to complete the if else...

View file

@ -116,12 +116,20 @@ def hdf5_array(
) -> Callable[[Tuple[int, ...], Union[np.dtype, type]], H5ArrayPath]: ) -> Callable[[Tuple[int, ...], Union[np.dtype, type]], H5ArrayPath]:
def _hdf5_array( def _hdf5_array(
shape: Tuple[int, ...] = (10, 10), dtype: Union[np.dtype, type] = float shape: Tuple[int, ...] = (10, 10),
dtype: Union[np.dtype, type] = float,
compound: bool = False,
) -> H5ArrayPath: ) -> H5ArrayPath:
array_path = "/" + "_".join([str(s) for s in shape]) + "__" + dtype.__name__ array_path = "/" + "_".join([str(s) for s in shape]) + "__" + dtype.__name__
if not compound:
data = np.random.random(shape).astype(dtype) data = np.random.random(shape).astype(dtype)
_ = hdf5_file.create_dataset(array_path, data=data) _ = hdf5_file.create_dataset(array_path, data=data)
return H5ArrayPath(Path(hdf5_file.filename), array_path) return H5ArrayPath(Path(hdf5_file.filename), array_path)
else:
dt = np.dtype([("data", dtype), ("extra", "i8")])
data = np.zeros(shape, dtype=dt)
_ = hdf5_file.create_dataset(array_path, data=data)
return H5ArrayPath(Path(hdf5_file.filename), array_path, "data")
return _hdf5_array return _hdf5_array

View file

@ -1,17 +1,22 @@
import pdb
import json import json
import h5py
import pytest import pytest
from pydantic import BaseModel, ValidationError from pydantic import BaseModel, ValidationError
import numpy as np
from numpydantic import NDArray, Shape
from numpydantic.interface import H5Interface from numpydantic.interface import H5Interface
from numpydantic.interface.hdf5 import H5ArrayPath from numpydantic.interface.hdf5 import H5ArrayPath, H5Proxy
from numpydantic.exceptions import DtypeError, ShapeError from numpydantic.exceptions import DtypeError, ShapeError
from tests.conftest import ValidationCase from tests.conftest import ValidationCase
def hdf5_array_case(case: ValidationCase, array_func) -> H5ArrayPath: def hdf5_array_case(
case: ValidationCase, array_func, compound: bool = False
) -> H5ArrayPath:
""" """
Args: Args:
case: case:
@ -22,11 +27,11 @@ def hdf5_array_case(case: ValidationCase, array_func) -> H5ArrayPath:
""" """
if issubclass(case.dtype, BaseModel): if issubclass(case.dtype, BaseModel):
pytest.skip("hdf5 cant support arbitrary python objects") pytest.skip("hdf5 cant support arbitrary python objects")
return array_func(case.shape, case.dtype) return array_func(case.shape, case.dtype, compound)
def _test_hdf5_case(case: ValidationCase, array_func): def _test_hdf5_case(case: ValidationCase, array_func, compound: bool = False) -> None:
array = hdf5_array_case(case, array_func) array = hdf5_array_case(case, array_func, compound)
if case.passes: if case.passes:
case.model(array=array) case.model(array=array)
else: else:
@ -66,14 +71,16 @@ def test_hdf5_check_not_hdf5(tmp_path):
assert not H5Interface.check(spec) assert not H5Interface.check(spec)
def test_hdf5_shape(shape_cases, hdf5_array): @pytest.mark.parametrize("compound", [True, False])
_test_hdf5_case(shape_cases, hdf5_array) def test_hdf5_shape(shape_cases, hdf5_array, compound):
_test_hdf5_case(shape_cases, hdf5_array, compound)
def test_hdf5_dtype(dtype_cases, hdf5_array): @pytest.mark.parametrize("compound", [True, False])
def test_hdf5_dtype(dtype_cases, hdf5_array, compound):
if dtype_cases.dtype is str: if dtype_cases.dtype is str:
pytest.skip("hdf5 cant do string arrays") pytest.skip("hdf5 cant do string arrays")
_test_hdf5_case(dtype_cases, hdf5_array) _test_hdf5_case(dtype_cases, hdf5_array, compound)
def test_hdf5_dataset_not_exists(hdf5_array, model_blank): def test_hdf5_dataset_not_exists(hdf5_array, model_blank):
@ -116,3 +123,37 @@ def test_to_json(hdf5_array, array_model):
assert json_dict["path"] == str(array.path) assert json_dict["path"] == str(array.path)
assert json_dict["attrs"] == {} assert json_dict["attrs"] == {}
assert json_dict["array"] == instance.array[:].tolist() assert json_dict["array"] == instance.array[:].tolist()
def test_compound_dtype(tmp_path):
"""
hdf5 proxy indexes compound dtypes as single fields when field is given
"""
h5f_path = tmp_path / "test.h5"
dataset_path = "/dataset"
field = "data"
dtype = np.dtype([(field, "i8"), ("extra", "f8")])
data = np.zeros((10, 20), dtype=dtype)
with h5py.File(h5f_path, "w") as h5f:
dset = h5f.create_dataset(dataset_path, data=data)
assert dset.dtype == dtype
proxy = H5Proxy(h5f_path, dataset_path, field=field)
assert proxy.dtype == np.dtype("int64")
assert proxy.shape == (10, 20)
assert proxy[0, 0] == 0
class MyModel(BaseModel):
array: NDArray[Shape["10, 20"], np.int64]
instance = MyModel(array=(h5f_path, dataset_path, field))
assert instance.array.dtype == np.dtype("int64")
assert instance.array.shape == (10, 20)
assert instance.array[0, 0] == 0
# set values too
instance.array[0, :] = 1
assert all(instance.array[0, :] == 1)
assert all(instance.array[1, :] == 0)
instance.array[1] = 2
assert all(instance.array[1] == 2)