docs! add recursive any-shaped arrays!

sneakers-the-rat 2024-05-20 21:16:16 -07:00
parent ce74a0482a
commit a4d82f0879
Signed by untrusted user who does not match committer: jonny
GPG key ID: 6DCB96EF1E4D232D
9 changed files with 378 additions and 54 deletions

View file

@@ -9,5 +9,6 @@
dask
hdf5
numpy
video
zarr
```

View file

@@ -0,0 +1,6 @@
# Video
```{eval-rst}
.. automodule:: numpydantic.interface.video
:members:
```

View file

@@ -6,10 +6,12 @@
# -- Project information -----------------------------------------------------
# https://www.sphinx-doc.org/en/master/usage/configuration.html#project-information
import importlib.metadata as metadata

project = "numpydantic"
copyright = "2024, Jonny Saunders"
author = "Jonny Saunders"
release = metadata.version("numpydantic")

# -- General configuration ---------------------------------------------------
# https://www.sphinx-doc.org/en/master/usage/configuration.html#general-configuration

View file

@@ -25,12 +25,12 @@ or implement `__get_pydantic_core_schema__` on your type to fully support it.
And setting `arbitrary_types_allowed = True` still prohibits you from
generating JSON Schema or serializing to JSON.
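A minimal sketch of the problem, assuming pydantic v2 (the model and field names here are just for illustration):

```python
from pydantic import BaseModel, ConfigDict
import numpy as np

class BareModel(BaseModel):
    model_config = ConfigDict(arbitrary_types_allowed=True)
    array: np.ndarray

# validation by isinstance check works...
BareModel(array=np.zeros((3, 3)))

# ...but schema generation raises pydantic.errors.PydanticInvalidForJsonSchema
BareModel.model_json_schema()
```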
## Features:
- **Types** - Annotations (based on [nptyping](https://github.com/ramonhagenaars/nptyping))
  for specifying arrays in pydantic models
- **Validation** - Shape, dtype, and other array validations
- **Interfaces** - Works with {mod}`~.interface.numpy`, {mod}`~.interface.dask`, {mod}`~.interface.hdf5`,
  {mod}`~.interface.video`, and {mod}`~.interface.zarr`,
  and a simple extension system to make it work with whatever else you want!
- **Serialization** - Dump an array as a JSON-compatible array-of-arrays with enough metadata to be able to
  recreate the model in the native format
@@ -47,6 +47,26 @@ Coming soon:
minimum and maximum precision ranges, and so on as type maps provided by interface classes :)
- (see [todo](./todo.md))
## Installation
numpydantic tries to keep dependencies minimal, so by default it only comes with
the dependencies needed for the numpy interface. Install the extra for your favorite
array library to be able to use it!
```shell
pip install numpydantic
# dask
pip install 'numpydantic[dask]'
# hdf5
pip install 'numpydantic[hdf5]'
# video
pip install 'numpydantic[video]'
# zarr
pip install 'numpydantic[zarr]'
# all array formats
pip install 'numpydantic[arrays]'
```
## Usage
Specify an array using [nptyping syntax](https://github.com/ramonhagenaars/nptyping/blob/master/USERDOCS.md)
@@ -55,7 +75,10 @@ and use it with your favorite array library :)
Use the {class}`~numpydantic.NDArray` class like you would any other python type,
combine it with {class}`typing.Union`, make it {class}`~typing.Optional`, etc.
For example, to specify a very special type of image that can either be
- a 2D float array where the axes can be any size, or
- a 3D uint8 array where the third axis must be size 3
- a 1080p video
```python
from typing import Union
@@ -65,43 +88,36 @@ import numpy as np
from numpydantic import NDArray, Shape

class Image(BaseModel):
    """
    Images: grayscale, RGB, RGBA, and videos too!
    """
    array: Union[
        NDArray[Shape["* x, * y"], float],
        NDArray[Shape["* x, * y, 3 rgb"], np.uint8],
        NDArray[Shape["* t, 1080 y, 1920 x, 3 rgb"], np.uint8]
    ]
```
And then use that as a transparent interface to your favorite array library!

### Interfaces

#### Numpy
The Coca-Cola of array libraries
```python
import numpy as np

# works
frame_gray = Image(array=np.ones((1280, 720), dtype=float))
frame_rgb = Image(array=np.ones((1280, 720, 3), dtype=np.uint8))

# fails
wrong_n_dimensions = Image(array=np.ones((1280,), dtype=float))
wrong_shape = Image(array=np.ones((1280,720,10), dtype=np.uint8))

# shapes and types are checked together, so this also fails
wrong_shape_dtype_combo = Image(array=np.ones((1280, 720, 3), dtype=float))
```
#### Dask
High performance chunked arrays! The backend for many new array libraries!

@@ -110,14 +126,12 @@ Works exactly the same as numpy arrays
```python
import dask.array as da

# validate a humongous image without having to load it into memory
dask_array = da.zeros(shape=(1e10, 1e20, 3), dtype=np.uint8)
dask_image = Image(array=dask_array)
```
#### HDF5
Array workloads increasingly can't fit in memory, but dealing with arrays on disk
can become a pain in concurrent applications. Numpydantic allows you to
@@ -136,7 +150,7 @@ array_path = "/nested/array"
# make an HDF5 array
h5f = h5py.File(h5f_file, "w")
array = np.random.randint(0, 255, (1920,1080,3), np.uint8)
h5f.create_dataset(array_path, data=array)
h5f.close()
```
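The model should then validate against a reference to the file and the dataset's path
within it, without reading the whole array into memory. A sketch, assuming the tuple
form mirrors the Zarr interface shown below:

```python
# validate from a (file, path) reference rather than an in-memory array
h5f_image = Image(array=(h5f_file, array_path))

# slices are then read lazily through a proxy object
h5f_image.array[0:2, 0:2, :]
```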
@@ -172,17 +186,229 @@ object and leave the file open between calls:
>>> h5f_image.array.close()
```
#### Video
Videos are just arrays with fancy encoding! Numpydantic can validate shape and dtype,
as well as lazily load chunks of frames with array-like syntax!
Say we have some video `data.mp4` ...
```python
video = Image(array='data.mp4')
# get a single frame
video.array[5]
# or a range of frames!
video.array[5:10]
# or whatever slicing you want to do!
video.array[5:50:5, 0:10, 50:70]
```
As elsewhere, the proxy class is a transparent pass-through interface to the underlying
OpenCV `VideoCapture` object, so we can get the rest of the video properties ...
```python
import cv2
# get the total frames from opencv
video.array.get(cv2.CAP_PROP_FRAME_COUNT)
# the proxy class also provides a convenience property
video.array.n_frames
```
#### Zarr
Zarr works similarly!
Use it with any of Zarr's backends: Nested, Zipfile, S3, it's all the same!

E.g., create a nested zarr array on disk and use it...
```python
import zarr
from numpydantic.interface.zarr import ZarrArrayPath
array_file = 'data/array.zarr'
nested_path = 'data/sets/here'
root = zarr.open(array_file, mode='w')
nested_array = root.zeros(
    nested_path,
    shape=(1000, 1080, 1920, 3),
    dtype=np.uint8
)
# validates just fine!
zarr_video = Image(array=ZarrArrayPath(array_file, nested_path))
# or just pass a tuple, the interface can discover it's a zarr array
zarr_video = Image(array=(array_file, nested_path))
```
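The Zipfile backend should work the same way. A sketch, assuming zarr's standard
`ZipStore` API and that the interface resolves `.zip` paths the way `zarr.open` does:

```python
import zarr

# write an array into a zipped zarr store
store = zarr.ZipStore('data/array.zip', mode='w')
root = zarr.group(store=store)
root.zeros('frames', shape=(1000, 1080, 1920, 3), dtype=np.uint8)
store.close()

# validate from the zip path + internal path, same as above
zipped_video = Image(array=ZarrArrayPath('data/array.zip', 'frames'))
```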
### JSON Schema
Numpydantic generates JSON Schema for all its array specifications, so for the above
model, we get a schema for each of the possible array types that properly handles
the shape and dtype constraints and includes the origin numpy type as a `dtype` annotation.
```python
Image.model_json_schema()
```
```json
{
  "properties": {
    "array": {
      "anyOf": [
        {
          "items": {"items": {"type": "number"}, "type": "array"},
          "type": "array"
        },
        {
          "dtype": "numpy.uint8",
          "items": {
            "items": {
              "items": {
                "maximum": 255,
                "minimum": 0,
                "type": "integer"
              },
              "maxItems": 3,
              "minItems": 3,
              "type": "array"
            },
            "type": "array"
          },
          "type": "array"
        },
        {
          "dtype": "numpy.uint8",
          "items": {
            "items": {
              "items": {
                "items": {
                  "maximum": 255,
                  "minimum": 0,
                  "type": "integer"
                },
                "maxItems": 3,
                "minItems": 3,
                "type": "array"
              },
              "maxItems": 1920,
              "minItems": 1920,
              "type": "array"
            },
            "maxItems": 1080,
            "minItems": 1080,
            "type": "array"
          },
          "type": "array"
        }
      ],
      "title": "Array"
    }
  },
  "required": ["array"],
  "title": "Image",
  "type": "object"
}
```
numpydantic can even handle shapes with unbounded numbers of dimensions by using
recursive JSON schema!!!
So the any-shaped array (using nptyping's ellipsis notation):
```python
class AnyShape(BaseModel):
    array: NDArray[Shape["*, ..."], np.uint8]
```
is rendered to JSON-Schema like this:
```json
{
  "$defs": {
    "any-shape-array-9b5d89838a990d79": {
      "anyOf": [
        {
          "items": {
            "$ref": "#/$defs/any-shape-array-9b5d89838a990d79"
          },
          "type": "array"
        },
        {"maximum": 255, "minimum": 0, "type": "integer"}
      ]
    }
  },
  "properties": {
    "array": {
      "dtype": "numpy.uint8",
      "items": {"$ref": "#/$defs/any-shape-array-9b5d89838a990d79"},
      "title": "Array",
      "type": "array"
    }
  },
  "required": ["array"],
  "title": "AnyShape",
  "type": "object"
}
```
where the key `"any-shape-array-9b5d89838a990d79"` uses a (blake2b) hash of the
inner dtype specification, so that multiple any-shaped arrays in a single
model schema are deduplicated without conflicts.
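That also means two any-shaped fields with the same dtype should share a single
definition. A sketch of what that deduplication implies (the model and field names
are illustrative):

```python
class TwoArrays(BaseModel):
    a: NDArray[Shape["*, ..."], np.uint8]
    b: NDArray[Shape["*, ..."], np.uint8]

schema = TwoArrays.model_json_schema()

# both fields point at the same hashed definition...
assert (
    schema["properties"]["a"]["items"]["$ref"]
    == schema["properties"]["b"]["items"]["$ref"]
)
# ...and only one recursive definition should be emitted
assert len(schema["$defs"]) == 1
```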
### Dumping
One of the main reasons to use chunked array libraries like zarr is to avoid
needing to load the entire array into memory. When dumping data to JSON, numpydantic
tries to mirror this behavior, by default only dumping the metadata that is
necessary to identify the array.
For example, with zarr:
```python
array = zarr.array([[1,2,3],[4,5,6],[7,8,9]], dtype=float)
instance = Image(array=array)
dumped = instance.model_dump_json()
```
```json
{
  "array": {
    "Chunk shape": "(3, 3)",
    "Chunks initialized": "1/1",
    "Compressor": "Blosc(cname='lz4', clevel=5, shuffle=SHUFFLE, blocksize=0)",
    "Data type": "float64",
    "No. bytes": "72",
    "No. bytes stored": "421",
    "Order": "C",
    "Read-only": "False",
    "Shape": "(3, 3)",
    "Storage ratio": "0.2",
    "Store type": "zarr.storage.KVStore",
    "Type": "zarr.core.Array",
    "hexdigest": "c51604eace325fe42bbebf39146c0956bd2ed13c"
  }
}
```
To print the whole array, we use pydantic's serialization contexts:
```python
dumped = instance.model_dump_json(context={'zarr_dump_array': True})
```
```json
{
  "array": {
    "same thing,": "except also...",
    "array": [[1.0, 2.0, 3.0], [4.0, 5.0, 6.0], [7.0, 8.0, 9.0]],
    "hexdigest": "c51604eace325fe42bbebf39146c0956bd2ed13c"
  }
}
```
```{toctree}

View file

@@ -22,12 +22,12 @@ dask = [
hdf5 = [
    "h5py>=3.10.0"
]
video = [
    "opencv-python>=4.9.0.80",
]
zarr = [
    "zarr>=2.17.2",
]
arrays = [
    "numpydantic[dask,hdf5,zarr,video]"
]

View file

@@ -125,12 +125,29 @@ class VideoProxy:
            raise ValueError(f"Could not get frame {frame}")
        return frame

    def _complete_slice(self, slice_: slice) -> slice:
        """Get a fully-built slice that can be passed to range"""
        if slice_.step is None:
            slice_ = slice(slice_.start, slice_.stop, 1)
        if slice_.stop is None:
            slice_ = slice(slice_.start, self.n_frames, slice_.step)
        if slice_.start is None:
            slice_ = slice(0, slice_.stop, slice_.step)
        return slice_
    def __getitem__(self, item: Union[int, slice, tuple]) -> np.ndarray:
        if isinstance(item, int):
            # want a single frame
            return self._get_frame(item)
        elif isinstance(item, slice):
            # slice of frames
            item = self._complete_slice(item)
            frames = []
            for i in range(item.start, item.stop, item.step):
                frames.append(self._get_frame(i))
            return np.stack(frames)
        else:
            # slices are passed as tuples
            # first arg needs to be handled specially
            if isinstance(item[0], int):
                # single frame
@@ -142,13 +159,7 @@ class VideoProxy:
            elif isinstance(item[0], slice):
                frames = []
                # make a new slice since range can't take Nones, filling in missing vals
                fslice = self._complete_slice(item[0])
                for i in range(fslice.start, fslice.stop, fslice.step):
                    frames.append(self._get_frame(i))

View file

@@ -3,7 +3,9 @@ Helper functions for use with :class:`~numpydantic.NDArray` - see the note in
:mod:`~numpydantic.ndarray` for why these are separated.
"""

import hashlib
import json
from typing import Any, Callable, Optional, Union

import nptyping.structure
import numpy as np
@@ -124,6 +126,8 @@ def list_of_lists_schema(shape: Shape, array_type: CoreSchema) -> ListSchema:
        # make the current level list schema, accounting for shape
        if arg == "*":
            list_schema = core_schema.list_schema(inner_schema, metadata=metadata)
        elif arg == "...":
            list_schema = _unbounded_shape(inner_schema, metadata=metadata)
        else:
            arg = int(arg)
            list_schema = core_schema.list_schema(
@@ -132,6 +136,50 @@ def list_of_lists_schema(shape: Shape, array_type: CoreSchema) -> ListSchema:
    return list_schema


def _hash_schema(schema: CoreSchema) -> str:
    """
    Make a hex-encoded 8-byte blake2b hash from a pydantic core schema.

    Collisions are really not important or likely here, but we do want the same schema
    to produce the same hash.
    """
    schema_str = json.dumps(
        schema, sort_keys=True, indent=None, separators=(",", ":")
    ).encode("utf-8")
    hasher = hashlib.blake2b(digest_size=8)
    hasher.update(schema_str)
    return hasher.hexdigest()


def _unbounded_shape(
    inner_type: CoreSchema, metadata: Optional[dict] = None
) -> core_schema.DefinitionsSchema:
    """
    Make a recursive schema that refers to itself using a hashed version of the inner
    type
    """
    schema_hash = _hash_schema(inner_type)
    array_ref = f"any-shape-array-{schema_hash}"

    schema = core_schema.definitions_schema(
        core_schema.list_schema(
            core_schema.definition_reference_schema(array_ref), metadata=metadata
        ),
        [
            core_schema.union_schema(
                [
                    core_schema.list_schema(
                        core_schema.definition_reference_schema(array_ref)
                    ),
                    inner_type,
                ],
                ref=array_ref,
            )
        ],
    )
    return schema
def make_json_schema(
    shape: ShapeType, dtype: DtypeType, _handler: _handler_type
) -> ListSchema:
@@ -154,7 +202,8 @@ def make_json_schema(
    # get the names of the shape constraints, if any
    if shape is Any:
        list_schema = _unbounded_shape(dtype_schema)
        # list_schema = core_schema.list_schema(core_schema.any_schema())
    else:
        list_schema = list_of_lists_schema(shape, dtype_schema)

View file

@@ -122,6 +122,12 @@ def test_video_getitem(avi_video):
    assert single_slice.shape == (10, 5, 3)

    # also get a range of frames
    # range without further slices
    range_slice = instance.array[3:5]
    assert range_slice.shape == (2, 100, 50, 3)
    assert range_slice[0, 3, 3, 0] == 3
    assert range_slice[0, 4, 4, 0] == 0

    # full range
    range_slice = instance.array[3:5, 0:10, 0:5]
    assert range_slice.shape == (2, 10, 5, 3)

View file

@@ -14,9 +14,6 @@ from numpydantic.exceptions import ShapeError, DtypeError
from numpydantic import dtype


def test_ndarray_type():
    class Model(BaseModel):
        array: NDArray[Shape["2 x, * y"], Number]

@@ -186,17 +183,43 @@ def test_json_schema_dtype_builtin(dtype, expected, array_model):
    assert inner_type["type"] == expected
def _recursive_array(schema):
    assert "$defs" in schema
    # get the key used for the array
    array_key = list(schema["$defs"].keys())[0]

    # the array property should be a ref to the recursive array
    # get the innermost part of the field schema
    field_schema = schema["properties"]["array"]
    while "items" in field_schema:
        field_schema = field_schema["items"]
    assert field_schema["$ref"] == f"#/$defs/{array_key}"

    # and the recursive array should indeed be recursive...
    # specifically it should be an array whose items can be itself or
    # of the type specified by the dtype
    any_of = schema["$defs"][array_key]["anyOf"]
    assert any_of[0]["items"]["$ref"] == f"#/$defs/{array_key}"
    assert any_of[0]["type"] == "array"
    # here we are just assuming that it's a uint8 array..
    assert any_of[1]["type"] == "integer"
    assert any_of[1]["maximum"] == 255
    assert any_of[1]["minimum"] == 0
def test_json_schema_ellipsis():
    """
    NDArray types should create a recursive JSON schema for any-shaped arrays
    """
    class AnyShape(BaseModel):
        array: NDArray[Shape["*, ..."], np.uint8]

    schema = AnyShape.model_json_schema()
    _recursive_array(schema)

    class ConstrainedAnyShape(BaseModel):
        array: NDArray[Shape["3, 4, ..."], np.uint8]

    schema = ConstrainedAnyShape.model_json_schema()
    _recursive_array(schema)