From a4d82f08790342fc7d20b6ad260f8efeec5c48d0 Mon Sep 17 00:00:00 2001
From: sneakers-the-rat
Date: Mon, 20 May 2024 21:16:16 -0700
Subject: [PATCH] docs! add recursive any shaped arrays!

---
 docs/api/interface/index.md        |   1 +
 docs/api/interface/video.md        |   6 +
 docs/conf.py                       |   4 +-
 docs/index.md                      | 284 ++++++++++++++++++++++++++---
 pyproject.toml                     |   6 +-
 src/numpydantic/interface/video.py |  27 ++-
 src/numpydantic/schema.py          |  53 +++++-
 tests/test_interface/test_video.py |   6 +
 tests/test_ndarray.py              |  45 +++--
 9 files changed, 378 insertions(+), 54 deletions(-)
 create mode 100644 docs/api/interface/video.md

diff --git a/docs/api/interface/index.md b/docs/api/interface/index.md
index 7ade4fc..02e48ba 100644
--- a/docs/api/interface/index.md
+++ b/docs/api/interface/index.md
@@ -9,5 +9,6 @@
 dask
 hdf5
 numpy
+video
 zarr
 ```
\ No newline at end of file
diff --git a/docs/api/interface/video.md b/docs/api/interface/video.md
new file mode 100644
index 0000000..07b7666
--- /dev/null
+++ b/docs/api/interface/video.md
@@ -0,0 +1,6 @@
+# Video
+
+```{eval-rst}
+.. automodule:: numpydantic.interface.video
+   :members:
+```
\ No newline at end of file
diff --git a/docs/conf.py b/docs/conf.py
index c31d0a7..38db560 100644
--- a/docs/conf.py
+++ b/docs/conf.py
@@ -6,10 +6,12 @@
 # -- Project information -----------------------------------------------------
 # https://www.sphinx-doc.org/en/master/usage/configuration.html#project-information
 
+import importlib.metadata as metadata
+
 project = "numpydantic"
 copyright = "2024, Jonny Saunders"
 author = "Jonny Saunders"
-release = "v0.0.0"
+release = metadata.version("numpydantic")
 
 # -- General configuration ---------------------------------------------------
 # https://www.sphinx-doc.org/en/master/usage/configuration.html#general-configuration
diff --git a/docs/index.md b/docs/index.md
index a79eee2..08ed398 100644
--- a/docs/index.md
+++ b/docs/index.md
@@ -25,12 +25,12 @@ or implement `__get_pydantic_core_schema__` on your type to fully support it.
 And setting `arbitrary_types_allowed = True` still prohibits you from
 generating JSON Schema or serializing to JSON.
 
-
 ## Features:
 - **Types** - Annotations (based on [nptyping](https://github.com/ramonhagenaars/nptyping)) for specifying arrays in pydantic models
 - **Validation** - Shape, dtype, and other array validations
-- **Interfaces** - Works with {mod}`~.interface.numpy`, {mod}`~.interface.dask`, {mod}`~.interface.hdf5`, {mod}`~.interface.zarr`,
+- **Interfaces** - Works with {mod}`~.interface.numpy`, {mod}`~.interface.dask`, {mod}`~.interface.hdf5`,
+  {mod}`~.interface.video`, and {mod}`~.interface.zarr`,
   and a simple extension system to make it work with whatever else you want!
 - **Serialization** - Dump an array as a JSON-compatible array-of-arrays
   with enough metadata to be able to recreate the model in the native format
@@ -47,6 +47,26 @@ Coming soon:
 minimum and maximum precision ranges, and so on as type maps provided by
 interface classes :)
 - (see [todo](./todo.md))
 
+## Installation
+
+numpydantic tries to keep dependencies minimal, so by default it only comes with
+dependencies to use the numpy interface. Add the extra relevant to your favorite
+array library to be able to use it!
+
+```shell
+pip install numpydantic
+# dask
+pip install 'numpydantic[dask]'
+# hdf5
+pip install 'numpydantic[hdf5]'
+# video
+pip install 'numpydantic[video]'
+# zarr
+pip install 'numpydantic[zarr]'
+# all array formats
+pip install 'numpydantic[arrays]'
+```
+
 ## Usage
 
 Specify an array using [nptyping syntax](https://github.com/ramonhagenaars/nptyping/blob/master/USERDOCS.md)
 and use it with your favorite array library :)
 
 Use the {class}`~numpydantic.NDArray` class like you would any other python type,
 combine it with {class}`typing.Union`, make it {class}`~typing.Optional`, etc.
 
-For example, to support a
+For example, to specify a very special type of image that can either be
+- a 2D float array where the axes can be any size,
+- a 3D uint8 array where the third axis must be size 3, or
+- a 1080p video
 
 ```python
 from typing import Union
 
 from pydantic import BaseModel
 import numpy as np
 
 from numpydantic import NDArray, Shape
 
 class Image(BaseModel):
-    """
-    Images: grayscale, RGB, RGBA, and videos too!
-    """
     array: Union[
-        NDArray[Shape["* x, * y"], np.uint8],
+        NDArray[Shape["* x, * y"], float],
         NDArray[Shape["* x, * y, 3 rgb"], np.uint8],
-        NDArray[Shape["* t, * x, * y, 4 rgba"], np.float64]
+        NDArray[Shape["* t, 1080 y, 1920 x, 3 rgb"], np.uint8]
     ]
 ```
 
 And then use that as a transparent interface to your favorite array library!
 
-### Numpy
+### Interfaces
+
+#### Numpy
 
 The Coca-Cola of array libraries
 
 ```python
 import numpy as np
 
 # works
-frame_gray = Image(array=np.ones((1280, 720), dtype=np.uint8))
+frame_gray = Image(array=np.ones((1280, 720), dtype=float))
 frame_rgb = Image(array=np.ones((1280, 720, 3), dtype=np.uint8))
-frame_rgba = Image(array=np.ones((1280, 720, 4), dtype=np.uint8))
-video_rgb = Image(array=np.ones((100, 1280, 720, 3), dtype=np.uint8))
 
 # fails
-wrong_n_dimensions = Image(array=np.ones((1280,), dtype=np.uint8))
+wrong_n_dimensions = Image(array=np.ones((1280,), dtype=float))
 wrong_shape = Image(array=np.ones((1280,720,10), dtype=np.uint8))
-wrong_type = Image(array=np.ones((1280,720,3), dtype=np.float64))
-
-# shapes and types are checked together, so..
-# this works
-float_video = Image(array=np.ones((100, 1280, 720, 4), dtype=float))
-# this doesn't
-wrong_shape_float_video = Image(array=np.ones((100, 1280, 720, 3), dtype=float))
+# shapes and types are checked together, so this also fails
+wrong_shape_dtype_combo = Image(array=np.ones((1280, 720, 3), dtype=float))
 ```
 
-### Dask
+#### Dask
 
 High performance chunked arrays! The backend for many new array libraries!
 
 Works exactly the same as numpy arrays
 
 ```python
 import dask.array as da
 
-# validate a huge video
-video_array = da.zeros(shape=(1920,1080,1000000,3), dtype=np.uint8)
-
-# this works
+# validate a humongous video without having to load it into memory
+video_array = da.zeros(shape=(100_000, 1080, 1920, 3), dtype=np.uint8)
 dask_video = Image(array=video_array)
 ```
 
-### HDF5
+#### HDF5
 
 Array work increasingly can't fit in memory, but dealing with arrays on disk
 can become a pain in concurrent applications. Numpydantic allows you to
 
 ```python
 import h5py
 import numpy as np
 
 array_path = "/nested/array"
 
 # make an HDF5 array
 h5f = h5py.File(h5f_file, "w")
-array = np.random.random((1920,1080,3)).astype(np.uint8)
+array = np.random.randint(0, 255, (1920,1080,3), np.uint8)
 h5f.create_dataset(array_path, data=array)
 h5f.close()
 ```
 
@@ -172,17 +186,229 @@ object and leave the file open between calls:
 >>> h5f_image.array.close()
 ```
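+
+Because the proxy reads through to the dataset on disk, slicing the model's array
+loads just the requested region rather than the whole thing. A quick sketch
+(reusing the `h5f_image` model from the example above; the read-only-what-you-slice
+behavior is assumed from the h5py-backed proxy, not spelled out in this patch):
+
+```python
+# reads only a small corner of the array from disk
+corner = h5f_image.array[0:10, 0:10, :]
+```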
 
-### Zarr
+#### Video
+
+Videos are just arrays with fancy encoding! Numpydantic can validate shape and
+dtype, as well as lazily load chunks of frames with array-like syntax!
+
+Say we have some video `data.mp4` ...
+
+```python
+video = Image(array='data.mp4')
+# get a single frame
+video.array[5]
+# or a range of frames!
+video.array[5:10]
+# or whatever slicing you want to do!
+video.array[5:50:5, 0:10, 50:70]
+```
+
+As with the other interfaces, the proxy class is a transparent pass-through to the
+underlying opencv class, so we can get the rest of the video properties ...
+
+```python
+import cv2
+
+# get the total frames from opencv
+video.array.get(cv2.CAP_PROP_FRAME_COUNT)
+# the proxy class also provides a convenience property
+video.array.n_frames
+```
+
+#### Zarr
 
 Zarr works similarly!
 
 Use it with any of Zarr's backends: Nested, Zipfile, S3,
 it's all the same!
 
-```{todo}
-Add the zarr examples!
-```
+E.g., create a nested zarr array on disk and use it...
+
+```python
+import zarr
+from numpydantic.interface.zarr import ZarrArrayPath
+
+array_file = 'data/array.zarr'
+nested_path = 'data/sets/here'
+
+root = zarr.open(array_file, mode='w')
+nested_array = root.zeros(
+    nested_path,
+    shape=(1000, 1080, 1920, 3),
+    dtype=np.uint8
+)
+
+# validates just fine!
+zarr_video = Image(array=ZarrArrayPath(array_file, nested_path))
+# or just pass a tuple, the interface can discover it's a zarr array
+zarr_video = Image(array=(array_file, nested_path))
+```
+
+### JSON Schema
+
+Numpydantic generates JSON Schema for all its array specifications, so for the
+above model we get a schema for each of the possible array types that properly
+handles the shape and dtype constraints and includes the originating numpy type
+as a `dtype` annotation.
+
+```python
+Image.model_json_schema()
+```
+
+```json
+{
+    "properties": {
+        "array": {
+            "anyOf": [
+                {
+                    "items": {"items": {"type": "number"}, "type": "array"},
+                    "type": "array"
+                },
+                {
+                    "dtype": "numpy.uint8",
+                    "items": {
+                        "items": {
+                            "items": {
+                                "maximum": 255,
+                                "minimum": 0,
+                                "type": "integer"
+                            },
+                            "maxItems": 3,
+                            "minItems": 3,
+                            "type": "array"
+                        },
+                        "type": "array"
+                    },
+                    "type": "array"
+                },
+                {
+                    "dtype": "numpy.uint8",
+                    "items": {
+                        "items": {
+                            "items": {
+                                "items": {
+                                    "maximum": 255,
+                                    "minimum": 0,
+                                    "type": "integer"
+                                },
+                                "maxItems": 3,
+                                "minItems": 3,
+                                "type": "array"
+                            },
+                            "maxItems": 1920,
+                            "minItems": 1920,
+                            "type": "array"
+                        },
+                        "maxItems": 1080,
+                        "minItems": 1080,
+                        "type": "array"
+                    },
+                    "type": "array"
+                }
+            ],
+            "title": "Array"
+        }
+    },
+    "required": ["array"],
+    "title": "Image",
+    "type": "object"
+}
+```
+
+numpydantic can even handle shapes with unbounded numbers of dimensions by using
+recursive JSON schema!!!
+
+So the any-shaped array (using nptyping's ellipsis notation):
+
+```python
+class AnyShape(BaseModel):
+    array: NDArray[Shape["*, ..."], np.uint8]
+```
+
+is rendered to JSON Schema like this:
+
+```json
+{
+    "$defs": {
+        "any-shape-array-9b5d89838a990d79": {
+            "anyOf": [
+                {
+                    "items": {
+                        "$ref": "#/$defs/any-shape-array-9b5d89838a990d79"
+                    },
+                    "type": "array"
+                },
+                {"maximum": 255, "minimum": 0, "type": "integer"}
+            ]
+        }
+    },
+    "properties": {
+        "array": {
+            "dtype": "numpy.uint8",
+            "items": {"$ref": "#/$defs/any-shape-array-9b5d89838a990d79"},
+            "title": "Array",
+            "type": "array"
+        }
+    },
+    "required": ["array"],
+    "title": "AnyShape",
+    "type": "object"
+}
+```
+
+where the key `"any-shape-array-9b5d89838a990d79"` uses a (blake2b) hash of the
+inner dtype specification so that multiple any-shaped arrays in a single model
+schema are deduplicated without conflicts.
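+
+The recursive schema is not just documentation sugar: the same model validates
+arrays of any dimensionality. A minimal sketch (assuming numpy and the
+`AnyShape` model above):
+
+```python
+import numpy as np
+
+# one, two, and three dimensions all validate against the recursive spec
+for shape in ((2,), (2, 3), (2, 3, 4)):
+    instance = AnyShape(array=np.zeros(shape, dtype=np.uint8))
+    assert instance.array.shape == shape
+```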
 
+### Dumping
+
+One of the main reasons to use chunked array libraries like zarr is to avoid
+needing to load the entire array into memory. When dumping data to JSON,
+numpydantic tries to mirror this behavior, by default dumping only the metadata
+that is necessary to identify the array.
+
+For example, with zarr:
+
+```python
+array = zarr.array([[1,2,3],[4,5,6],[7,8,9]], dtype=float)
+instance = Image(array=array)
+dumped = instance.model_dump_json()
+```
+
+```json
+{
+    "array":
+    {
+        "Chunk shape": "(3, 3)",
+        "Chunks initialized": "1/1",
+        "Compressor": "Blosc(cname='lz4', clevel=5, shuffle=SHUFFLE, blocksize=0)",
+        "Data type": "float64",
+        "No. bytes": "72",
+        "No. bytes stored": "421",
+        "Order": "C",
+        "Read-only": "False",
+        "Shape": "(3, 3)",
+        "Storage ratio": "0.2",
+        "Store type": "zarr.storage.KVStore",
+        "Type": "zarr.core.Array",
+        "hexdigest": "c51604eace325fe42bbebf39146c0956bd2ed13c"
+    }
+}
+```
+
+To print the whole array, we use pydantic's serialization contexts:
+
+```python
+dumped = instance.model_dump_json(context={'zarr_dump_array': True})
+```
+```json
+{
+    "array":
+    {
+        "same thing,": "except also...",
+        "array": [[1.0, 2.0, 3.0], [4.0, 5.0, 6.0], [7.0, 8.0, 9.0]],
+        "hexdigest": "c51604eace325fe42bbebf39146c0956bd2ed13c"
+    }
+}
+```
 
 ```{toctree}
diff --git a/pyproject.toml b/pyproject.toml
index a0fefe6..52d8140 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -22,12 +22,12 @@ dask = [
 hdf5 = [
     "h5py>=3.10.0"
 ]
-zarr = [
-    "zarr>=2.17.2",
-]
 video = [
     "opencv-python>=4.9.0.80",
 ]
+zarr = [
+    "zarr>=2.17.2",
+]
 arrays = [
     "numpydantic[dask,hdf5,zarr,video]"
 ]
diff --git a/src/numpydantic/interface/video.py b/src/numpydantic/interface/video.py
index 92fe1e0..a320744 100644
--- a/src/numpydantic/interface/video.py
+++ b/src/numpydantic/interface/video.py
@@ -125,12 +125,29 @@ class VideoProxy:
             raise ValueError(f"Could not get frame {frame}")
         return frame
 
+    def _complete_slice(self, slice_: slice) -> slice:
+        """Get a fully-built slice that can be passed to range"""
+        if slice_.step is None:
+            slice_ = slice(slice_.start, slice_.stop, 1)
+        if slice_.stop is None:
+            slice_ = slice(slice_.start, self.n_frames, slice_.step)
+        if slice_.start is None:
+            slice_ = slice(0, slice_.stop, slice_.step)
+        return slice_
+
     def __getitem__(self, item: Union[int, slice, tuple]) -> np.ndarray:
         if isinstance(item, int):
             # want a single frame
             return self._get_frame(item)
+        elif isinstance(item, slice):
+            # slice of frames
+            item = self._complete_slice(item)
+            frames = []
+            for i in range(item.start, item.stop, item.step):
+                frames.append(self._get_frame(i))
+            return np.stack(frames)
         else:
-            # slices are passes as tuples
+            # slices are passed as tuples
             # first arg needs to be handled specially
             if isinstance(item[0], int):
                 # single frame
@@ -142,13 +159,7 @@ class VideoProxy:
             elif isinstance(item[0], slice):
                 frames = []
                 # make a new slice since range can't take Nones, filling in missing vals
-                fslice = item[0]
-                if fslice.step is None:
-                    fslice = slice(fslice.start, fslice.stop, 1)
-                if fslice.stop is None:
-                    fslice = slice(fslice.start, self.n_frames, fslice.step)
-                if fslice.start is None:
-                    fslice = slice(0, fslice.stop, fslice.step)
+                fslice = self._complete_slice(item[0])
                 for i in range(fslice.start, fslice.stop, fslice.step):
                     frames.append(self._get_frame(i))
diff --git a/src/numpydantic/schema.py b/src/numpydantic/schema.py
index 5b22826..cf430a8 100644
--- a/src/numpydantic/schema.py
+++ b/src/numpydantic/schema.py
@@ -3,7 +3,9 @@ Helper functions for use with :class:`~numpydantic.NDArray` - see the note in
 :mod:`~numpydantic.ndarray` for why these are separated.
 """
 
-from typing import Any, Callable, Union
+import hashlib
+import json
+from typing import Any, Callable, Optional, Union
 
 import nptyping.structure
 import numpy as np
@@ -124,6 +126,8 @@ def list_of_lists_schema(shape: Shape, array_type: CoreSchema) -> ListSchema:
     # make the current level list schema, accounting for shape
     if arg == "*":
         list_schema = core_schema.list_schema(inner_schema, metadata=metadata)
+    elif arg == "...":
+        list_schema = _unbounded_shape(inner_schema, metadata=metadata)
     else:
         arg = int(arg)
         list_schema = core_schema.list_schema(
@@ -132,6 +136,50 @@
     return list_schema
 
 
+def _hash_schema(schema: CoreSchema) -> str:
+    """
+    Make a hex-encoded 8-byte blake2b hash from a pydantic core schema.
+    Collisions are really not important or likely here, but we do want the same schema
+    to produce the same hash.
+    """
+    schema_str = json.dumps(
+        schema, sort_keys=True, indent=None, separators=(",", ":")
+    ).encode("utf-8")
+    hasher = hashlib.blake2b(digest_size=8)
+    hasher.update(schema_str)
+    return hasher.hexdigest()
+
+
+def _unbounded_shape(
+    inner_type: CoreSchema, metadata: Optional[dict] = None
+) -> core_schema.DefinitionsSchema:
+    """
+    Make a recursive schema that refers to itself using a hashed version of the inner
+    type
+    """
+
+    schema_hash = _hash_schema(inner_type)
+    array_ref = f"any-shape-array-{schema_hash}"
+
+    schema = core_schema.definitions_schema(
+        core_schema.list_schema(
+            core_schema.definition_reference_schema(array_ref), metadata=metadata
+        ),
+        [
+            core_schema.union_schema(
+                [
+                    core_schema.list_schema(
+                        core_schema.definition_reference_schema(array_ref)
+                    ),
+                    inner_type,
+                ],
+                ref=array_ref,
+            )
+        ],
+    )
+    return schema
+
+
 def make_json_schema(
     shape: ShapeType, dtype: DtypeType, _handler: _handler_type
-) -> ListSchema:
+) -> CoreSchema:
@@ -154,7 +202,7 @@
     # get the names of the shape constraints, if any
     if shape is Any:
-        list_schema = core_schema.list_schema(core_schema.any_schema())
+        list_schema = _unbounded_shape(dtype_schema)
     else:
         list_schema = list_of_lists_schema(shape, dtype_schema)
diff --git a/tests/test_interface/test_video.py b/tests/test_interface/test_video.py
index e6c96b9..cee7cbb 100644
--- a/tests/test_interface/test_video.py
+++ b/tests/test_interface/test_video.py
@@ -122,6 +122,12 @@ def test_video_getitem(avi_video):
     assert single_slice.shape == (10, 5, 3)
 
     # also get a range of frames
+    # range without further slices
+    range_slice = instance.array[3:5]
+    assert range_slice.shape == (2, 100, 50, 3)
+    assert range_slice[0, 3, 3, 0] == 3
+    assert range_slice[0, 4, 4, 0] == 0
+
+    # full range
     range_slice = instance.array[3:5, 0:10, 0:5]
     assert range_slice.shape == (2, 10, 5, 3)
diff --git a/tests/test_ndarray.py b/tests/test_ndarray.py
index f39dc8d..39972f1 100644
--- a/tests/test_ndarray.py
+++ b/tests/test_ndarray.py
@@ -14,9 +14,6 @@
 from numpydantic.exceptions import ShapeError, DtypeError
 from numpydantic import dtype
 
-# from .fixtures import tmp_output_dir_func
-
-
 def test_ndarray_type():
     class Model(BaseModel):
         array: NDArray[Shape["2 x, * y"], Number]
@@ -186,17 +183,43 @@ def test_json_schema_dtype_builtin(dtype, expected, array_model):
     assert inner_type["type"] == expected
 
 
-@pytest.mark.skip("Not implemented yet")
-def test_json_schema_wildcard():
-    """
-    NDarray types should generate a JSON schema without shape constraints
-    """
-    pass
+def _recursive_array(schema):
+    assert "$defs" in schema
+    # get the key used for the array
+    array_key = list(schema["$defs"].keys())[0]
+
+    # the array property should be a ref to the recursive array
+    # get the innermost part of the field schema
+    field_schema = schema["properties"]["array"]
+    while "items" in field_schema:
+        field_schema = field_schema["items"]
+    assert field_schema["$ref"] == f"#/$defs/{array_key}"
+
+    # and the recursive array should indeed be recursive...
+    # specifically it should be an array whose items can be itself or
+    # of the type specified by the dtype
+    any_of = schema["$defs"][array_key]["anyOf"]
+    assert any_of[0]["items"]["$ref"] == f"#/$defs/{array_key}"
+    assert any_of[0]["type"] == "array"
+    # here we are just assuming that it's a uint8 array...
+    assert any_of[1]["type"] == "integer"
+    assert any_of[1]["maximum"] == 255
+    assert any_of[1]["minimum"] == 0
 
 
-@pytest.mark.skip("Not implemented yet")
 def test_json_schema_ellipsis():
     """
     NDArray types should create a recursive JSON schema for any-shaped arrays
     """
-    pass
+
+    class AnyShape(BaseModel):
+        array: NDArray[Shape["*, ..."], np.uint8]
+
+    schema = AnyShape.model_json_schema()
+    _recursive_array(schema)
+
+    class ConstrainedAnyShape(BaseModel):
+        array: NDArray[Shape["3, 4, ..."], np.uint8]
+
+    schema = ConstrainedAnyShape.model_json_schema()
+    _recursive_array(schema)
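+
+
+def test_json_schema_ellipsis_validation():
+    """
+    Sketch of a complementary check (assumed behavior, not part of the
+    original patch): ellipsis-shaped NDArrays should also validate arrays
+    of any dimensionality, not just generate a recursive schema for them.
+    """
+
+    class AnyShape(BaseModel):
+        array: NDArray[Shape["*, ..."], np.uint8]
+
+    # one, two, and three dimensions should all validate
+    for shape in [(2,), (2, 3), (2, 3, 4)]:
+        instance = AnyShape(array=np.zeros(shape, dtype=np.uint8))
+        assert instance.array.shape == shape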