mirror of
https://github.com/p2p-ld/numpydantic.git
synced 2025-01-10 05:54:26 +00:00
add ability to index hdf5 compound datasets
This commit is contained in:
parent
6a397a9aba
commit
03fe97b7e0
3 changed files with 96 additions and 20 deletions
|
@ -4,7 +4,7 @@ Interfaces for HDF5 Datasets
|
||||||
|
|
||||||
import sys
|
import sys
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
from typing import Any, NamedTuple, Optional, Tuple, Union
|
from typing import Any, List, NamedTuple, Optional, Tuple, Union
|
||||||
|
|
||||||
import numpy as np
|
import numpy as np
|
||||||
from pydantic import SerializationInfo
|
from pydantic import SerializationInfo
|
||||||
|
@ -32,6 +32,8 @@ class H5ArrayPath(NamedTuple):
|
||||||
"""Location of HDF5 file"""
|
"""Location of HDF5 file"""
|
||||||
path: str
|
path: str
|
||||||
"""Path within the HDF5 file"""
|
"""Path within the HDF5 file"""
|
||||||
|
field: Optional[Union[str, List[str]]] = None
|
||||||
|
"""Refer to a specific field within a compound dtype"""
|
||||||
|
|
||||||
|
|
||||||
class H5Proxy:
|
class H5Proxy:
|
||||||
|
@ -51,12 +53,20 @@ class H5Proxy:
|
||||||
Args:
|
Args:
|
||||||
file (pathlib.Path | str): Location of hdf5 file on filesystem
|
file (pathlib.Path | str): Location of hdf5 file on filesystem
|
||||||
path (str): Path to array within hdf5 file
|
path (str): Path to array within hdf5 file
|
||||||
|
field (str, list[str]): Optional - refer to a specific field within
|
||||||
|
a compound dtype
|
||||||
"""
|
"""
|
||||||
|
|
||||||
def __init__(self, file: Union[Path, str], path: str):
|
def __init__(
|
||||||
|
self,
|
||||||
|
file: Union[Path, str],
|
||||||
|
path: str,
|
||||||
|
field: Optional[Union[str, List[str]]] = None,
|
||||||
|
):
|
||||||
self._h5f = None
|
self._h5f = None
|
||||||
self.file = Path(file)
|
self.file = Path(file)
|
||||||
self.path = path
|
self.path = path
|
||||||
|
self.field = field
|
||||||
|
|
||||||
def array_exists(self) -> bool:
|
def array_exists(self) -> bool:
|
||||||
"""Check that there is in fact an array at :attr:`.path` within :attr:`.file`"""
|
"""Check that there is in fact an array at :attr:`.path` within :attr:`.file`"""
|
||||||
|
@ -67,21 +77,43 @@ class H5Proxy:
|
||||||
@classmethod
def from_h5array(cls, h5array: H5ArrayPath) -> "H5Proxy":
    """
    Instantiate using :class:`.H5ArrayPath`

    Args:
        h5array (H5ArrayPath): Source of the ``file``, ``path`` and ``field``
            values passed on to the constructor

    Returns:
        An instance of ``cls``, so subclasses calling this alternate
        constructor get an instance of themselves rather than the base class
    """
    return cls(file=h5array.file, path=h5array.path, field=h5array.field)
|
||||||
|
|
||||||
|
@property
def dtype(self) -> np.dtype:
    """
    Dtype of the proxied array.

    The file is opened read-only for the lookup. When :attr:`.field` is set,
    the dataset's dtype is indexed with it so only the selected field(s) are
    described; otherwise the dataset's own dtype is returned.
    """
    with h5py.File(self.file, "r") as h5f:
        dataset_dtype = h5f.get(self.path).dtype
        return dataset_dtype if self.field is None else dataset_dtype[self.field]
|
||||||
|
|
||||||
def __getattr__(self, item: str):
|
def __getattr__(self, item: str):
|
||||||
with h5py.File(self.file, "r") as h5f:
|
with h5py.File(self.file, "r") as h5f:
|
||||||
obj = h5f.get(self.path)
|
obj = h5f.get(self.path)
|
||||||
return getattr(obj, item)
|
return getattr(obj, item)
|
||||||
|
|
||||||
def __getitem__(
    self, item: Union[int, slice, Tuple[Union[int, slice], ...]]
) -> np.ndarray:
    """
    Index the proxied array, opening the file read-only for the access.

    Args:
        item: An integer, a slice, or a tuple of them

    Returns:
        The result of indexing the dataset, narrowed with ``fields()`` to
        :attr:`.field` first when a field is set
    """
    with h5py.File(self.file, "r") as h5f:
        dataset = h5f.get(self.path)
        source = dataset if self.field is None else dataset.fields(self.field)
        return source[item]
|
||||||
|
|
||||||
def __setitem__(
    self,
    key: Union[int, slice, Tuple[Union[int, slice], ...]],
    value: Union[int, float, np.ndarray],
):
    """
    Write ``value`` into the proxied array, opening the file in ``r+`` mode
    with locking for the duration of the write.

    Args:
        key: An integer, a slice, or a tuple of them
        value: Value(s) to assign. When :attr:`.field` names several fields,
            h5py is expected to need a structured array carrying those fields
            (NOTE(review): confirm against h5py's field-selection write rules)
    """
    with h5py.File(self.file, "r+", locking=True) as h5f:
        obj = h5f.get(self.path)
        if self.field is None:
            obj[key] = value
            return

        # ``Dataset.fields()`` returns a read-only wrapper without
        # ``__setitem__``, so assigning through it raises ``TypeError``.
        # h5py instead takes field names as string entries of the dataset
        # selection itself, so append them to the index.
        names = (self.field,) if isinstance(self.field, str) else tuple(self.field)
        index = key if isinstance(key, tuple) else (key,)
        obj[(*index, *names)] = value
|
||||||
|
|
||||||
def open(self, mode: str = "r") -> "h5py.Dataset":
|
def open(self, mode: str = "r") -> "h5py.Dataset":
|
||||||
|
@ -133,7 +165,7 @@ class H5Interface(Interface):
|
||||||
if isinstance(array, H5ArrayPath):
|
if isinstance(array, H5ArrayPath):
|
||||||
return True
|
return True
|
||||||
|
|
||||||
if isinstance(array, (tuple, list)) and len(array) == 2:
|
if isinstance(array, (tuple, list)) and len(array) in (2, 3):
|
||||||
# check that the first arg is an hdf5 file
|
# check that the first arg is an hdf5 file
|
||||||
try:
|
try:
|
||||||
file = Path(array[0])
|
file = Path(array[0])
|
||||||
|
@ -163,6 +195,8 @@ class H5Interface(Interface):
|
||||||
array = H5Proxy.from_h5array(h5array=array)
|
array = H5Proxy.from_h5array(h5array=array)
|
||||||
elif isinstance(array, (tuple, list)) and len(array) == 2: # pragma: no cover
|
elif isinstance(array, (tuple, list)) and len(array) == 2: # pragma: no cover
|
||||||
array = H5Proxy(file=array[0], path=array[1])
|
array = H5Proxy(file=array[0], path=array[1])
|
||||||
|
elif isinstance(array, (tuple, list)) and len(array) == 3:
|
||||||
|
array = H5Proxy(file=array[0], path=array[1], field=array[2])
|
||||||
else: # pragma: no cover
|
else: # pragma: no cover
|
||||||
# this should never happen really since `check` confirms this before
|
# this should never happen really since `check` confirms this before
|
||||||
# we'd reach here, but just to complete the if else...
|
# we'd reach here, but just to complete the if else...
|
||||||
|
|
|
@ -116,12 +116,20 @@ def hdf5_array(
|
||||||
) -> Callable[[Tuple[int, ...], Union[np.dtype, type]], H5ArrayPath]:
|
) -> Callable[[Tuple[int, ...], Union[np.dtype, type]], H5ArrayPath]:
|
||||||
|
|
||||||
def _hdf5_array(
    shape: Tuple[int, ...] = (10, 10),
    dtype: Union[np.dtype, type] = float,
    compound: bool = False,
) -> H5ArrayPath:
    """
    Create a dataset in the enclosing ``hdf5_file`` and return its location.

    Args:
        shape: Shape of the dataset to create
        dtype: Element dtype, given either as a type or as a ``np.dtype``
            instance
        compound: If ``True``, create a zero-filled compound dataset with
            fields ``data`` (of ``dtype``) and ``extra`` (``i8``) and point the
            returned path at the ``data`` field. Otherwise create a plain
            dataset of random values cast to ``dtype``.

    Returns:
        :class:`.H5ArrayPath` referring to the newly created dataset
    """
    # ``np.dtype`` instances have no ``__name__`` (only classes do), so fall
    # back to the dtype's ``name`` to keep the annotated signature usable.
    dtype_name = getattr(dtype, "__name__", None) or np.dtype(dtype).name
    array_path = "/" + "_".join(str(s) for s in shape) + "__" + dtype_name

    if compound:
        dt = np.dtype([("data", dtype), ("extra", "i8")])
        data = np.zeros(shape, dtype=dt)
        field = "data"
    else:
        data = np.random.random(shape).astype(dtype)
        field = None

    _ = hdf5_file.create_dataset(array_path, data=data)
    return H5ArrayPath(Path(hdf5_file.filename), array_path, field)
|
||||||
|
|
||||||
return _hdf5_array
|
return _hdf5_array
|
||||||
|
|
||||||
|
|
|
@ -1,17 +1,22 @@
|
||||||
import pdb
|
|
||||||
import json
|
import json
|
||||||
|
|
||||||
|
import h5py
|
||||||
import pytest
|
import pytest
|
||||||
|
|
||||||
from pydantic import BaseModel, ValidationError
|
from pydantic import BaseModel, ValidationError
|
||||||
|
|
||||||
|
import numpy as np
|
||||||
|
from numpydantic import NDArray, Shape
|
||||||
from numpydantic.interface import H5Interface
|
from numpydantic.interface import H5Interface
|
||||||
from numpydantic.interface.hdf5 import H5ArrayPath
|
from numpydantic.interface.hdf5 import H5ArrayPath, H5Proxy
|
||||||
from numpydantic.exceptions import DtypeError, ShapeError
|
from numpydantic.exceptions import DtypeError, ShapeError
|
||||||
|
|
||||||
from tests.conftest import ValidationCase
|
from tests.conftest import ValidationCase
|
||||||
|
|
||||||
|
|
||||||
def hdf5_array_case(case: ValidationCase, array_func) -> H5ArrayPath:
|
def hdf5_array_case(
|
||||||
|
case: ValidationCase, array_func, compound: bool = False
|
||||||
|
) -> H5ArrayPath:
|
||||||
"""
|
"""
|
||||||
Args:
|
Args:
|
||||||
case:
|
case:
|
||||||
|
@ -22,11 +27,11 @@ def hdf5_array_case(case: ValidationCase, array_func) -> H5ArrayPath:
|
||||||
"""
|
"""
|
||||||
if issubclass(case.dtype, BaseModel):
|
if issubclass(case.dtype, BaseModel):
|
||||||
pytest.skip("hdf5 cant support arbitrary python objects")
|
pytest.skip("hdf5 cant support arbitrary python objects")
|
||||||
return array_func(case.shape, case.dtype)
|
return array_func(case.shape, case.dtype, compound)
|
||||||
|
|
||||||
|
|
||||||
def _test_hdf5_case(case: ValidationCase, array_func):
|
def _test_hdf5_case(case: ValidationCase, array_func, compound: bool = False) -> None:
|
||||||
array = hdf5_array_case(case, array_func)
|
array = hdf5_array_case(case, array_func, compound)
|
||||||
if case.passes:
|
if case.passes:
|
||||||
case.model(array=array)
|
case.model(array=array)
|
||||||
else:
|
else:
|
||||||
|
@ -66,14 +71,16 @@ def test_hdf5_check_not_hdf5(tmp_path):
|
||||||
assert not H5Interface.check(spec)
|
assert not H5Interface.check(spec)
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.parametrize("compound", [True, False])
def test_hdf5_shape(shape_cases, hdf5_array, compound):
    """
    Run each shape validation case against both compound and plain
    hdf5 arrays
    """
    _test_hdf5_case(case=shape_cases, array_func=hdf5_array, compound=compound)
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.parametrize("compound", [True, False])
def test_hdf5_dtype(dtype_cases, hdf5_array, compound):
    """
    Run each dtype validation case against both compound and plain
    hdf5 arrays, skipping string dtypes
    """
    wants_strings = dtype_cases.dtype is str
    if wants_strings:
        pytest.skip("hdf5 cant do string arrays")
    _test_hdf5_case(case=dtype_cases, array_func=hdf5_array, compound=compound)
|
||||||
|
|
||||||
|
|
||||||
def test_hdf5_dataset_not_exists(hdf5_array, model_blank):
|
def test_hdf5_dataset_not_exists(hdf5_array, model_blank):
|
||||||
|
@ -116,3 +123,30 @@ def test_to_json(hdf5_array, array_model):
|
||||||
assert json_dict["path"] == str(array.path)
|
assert json_dict["path"] == str(array.path)
|
||||||
assert json_dict["attrs"] == {}
|
assert json_dict["attrs"] == {}
|
||||||
assert json_dict["array"] == instance.array[:].tolist()
|
assert json_dict["array"] == instance.array[:].tolist()
|
||||||
|
|
||||||
|
|
||||||
|
def test_compound_dtype(tmp_path):
    """
    hdf5 proxy indexes compound dtypes as single fields when field is given
    """
    field = "data"
    dataset_path = "/dataset"
    h5f_path = tmp_path / "test.h5"
    compound_dtype = np.dtype([(field, "i8"), ("extra", "f8")])

    with h5py.File(h5f_path, "w") as h5f:
        dset = h5f.create_dataset(
            dataset_path, data=np.zeros((10, 20), dtype=compound_dtype)
        )
        assert dset.dtype == compound_dtype

    def _assert_single_int_field(arr) -> None:
        # only the selected ``i8`` field should be visible, not the compound dtype
        assert arr.dtype == np.dtype("int64")
        assert arr.shape == (10, 20)
        assert arr[0, 0] == 0

    # directly through the proxy
    _assert_single_int_field(H5Proxy(h5f_path, dataset_path, field=field))

    class MyModel(BaseModel):
        array: NDArray[Shape["10, 20"], np.int64]

    # and through model validation, using the 3-tuple (file, path, field) form
    instance = MyModel(array=(h5f_path, dataset_path, field))
    _assert_single_int_field(instance.array)
|
||||||
|
|
Loading…
Reference in a new issue