From a4d82f08790342fc7d20b6ad260f8efeec5c48d0 Mon Sep 17 00:00:00 2001
From: sneakers-the-rat
Date: Mon, 20 May 2024 21:16:16 -0700
Subject: [PATCH] docs! add recursive any shaped arrays!

---
 docs/api/interface/index.md        |   1 +
 docs/api/interface/video.md        |   6 +
 docs/conf.py                       |   4 +-
 docs/index.md                      | 284 ++++++++++++++++++++++++++---
 pyproject.toml                     |   6 +-
 src/numpydantic/interface/video.py |  27 ++-
 src/numpydantic/schema.py          |  53 +++++-
 tests/test_interface/test_video.py |   6 +
 tests/test_ndarray.py              |  45 +++--
 9 files changed, 378 insertions(+), 54 deletions(-)
 create mode 100644 docs/api/interface/video.md

diff --git a/docs/api/interface/index.md b/docs/api/interface/index.md
index 7ade4fc..02e48ba 100644
--- a/docs/api/interface/index.md
+++ b/docs/api/interface/index.md
@@ -9,5 +9,6 @@
 dask
 hdf5
 numpy
+video
 zarr
 ```
\ No newline at end of file
diff --git a/docs/api/interface/video.md b/docs/api/interface/video.md
new file mode 100644
index 0000000..07b7666
--- /dev/null
+++ b/docs/api/interface/video.md
@@ -0,0 +1,6 @@
+# Video
+
+```{eval-rst}
+.. automodule:: numpydantic.interface.video
+   :members:
+```
\ No newline at end of file
diff --git a/docs/conf.py b/docs/conf.py
index c31d0a7..38db560 100644
--- a/docs/conf.py
+++ b/docs/conf.py
@@ -6,10 +6,12 @@
 # -- Project information -----------------------------------------------------
 # https://www.sphinx-doc.org/en/master/usage/configuration.html#project-information
 
+import importlib.metadata as metadata
+
 project = "numpydantic"
 copyright = "2024, Jonny Saunders"
 author = "Jonny Saunders"
-release = "v0.0.0"
+release = metadata.version("numpydantic")
 
 # -- General configuration ---------------------------------------------------
 # https://www.sphinx-doc.org/en/master/usage/configuration.html#general-configuration
diff --git a/docs/index.md b/docs/index.md
index a79eee2..08ed398 100644
--- a/docs/index.md
+++ b/docs/index.md
@@ -25,12 +25,12 @@ or implement `__get_pydantic_core_schema__` on your type to fully support it.
 And setting `arbitrary_types_allowed = True` still prohibits you from
 generating JSON Schema or serializing to JSON.
 
-
 ## Features:
 - **Types** - Annotations (based on [nptyping](https://github.com/ramonhagenaars/nptyping)) for specifying arrays in pydantic models
 - **Validation** - Shape, dtype, and other array validations
-- **Interfaces** - Works with {mod}`~.interface.numpy`, {mod}`~.interface.dask`, {mod}`~.interface.hdf5`, {mod}`~.interface.zarr`,
+- **Interfaces** - Works with {mod}`~.interface.numpy`, {mod}`~.interface.dask`, {mod}`~.interface.hdf5`,
+  {mod}`~.interface.video`, and {mod}`~.interface.zarr`,
   and a simple extension system to make it work with whatever else you want!
 - **Serialization** - Dump an array as a JSON-compatible array-of-arrays
   with enough metadata to be able to recreate the model in the native format
@@ -47,6 +47,26 @@ Coming soon:
 minimum and maximum precision ranges, and so on as type maps provided by
 interface classes :)
 - (see [todo](./todo.md))
 
+## Installation
+
+numpydantic tries to keep dependencies minimal, so by default it only comes with
+dependencies to use the numpy interface. Add the extra relevant to your favorite
+array library to be able to use it!
+
+```shell
+pip install numpydantic
+# dask
+pip install 'numpydantic[dask]'
+# hdf5
+pip install 'numpydantic[hdf5]'
+# video
+pip install 'numpydantic[video]'
+# zarr
+pip install 'numpydantic[zarr]'
+# all array formats
+pip install 'numpydantic[arrays]'
+```
+
 ## Usage
 
 Specify an array using [nptyping syntax](https://github.com/ramonhagenaars/nptyping/blob/master/USERDOCS.md)
 and use it with your favorite array library :)
 
 Use the {class}`~numpydantic.NDArray` class like you would any other python type,
 combine it with {class}`typing.Union`, make it {class}`~typing.Optional`, etc.
 
-For example, to support a
+For example, to specify a very special type of image that can either be
+- a 2D float array where the axes can be any size,
+- a 3D uint8 array where the third axis must be size 3, or
+- a 1080p video
 
 ```python
 from typing import Union
 
 from pydantic import BaseModel
 import numpy as np
 
 from numpydantic import NDArray, Shape
 
 class Image(BaseModel):
-    """
-    Images: grayscale, RGB, RGBA, and videos too!
-    """
     array: Union[
-        NDArray[Shape["* x, * y"], np.uint8],
+        NDArray[Shape["* x, * y"], float],
         NDArray[Shape["* x, * y, 3 rgb"], np.uint8],
-        NDArray[Shape["* t, * x, * y, 4 rgba"], np.float64]
+        NDArray[Shape["* t, 1080 y, 1920 x, 3 rgb"], np.uint8]
     ]
 ```
 
 And then use that as a transparent interface to your favorite array library!
 
-### Numpy
+### Interfaces
+
+#### Numpy
 
 The Coca-Cola of array libraries
 
 ```python
 import numpy as np
 
 # works
-frame_gray = Image(array=np.ones((1280, 720), dtype=np.uint8))
+frame_gray = Image(array=np.ones((1280, 720), dtype=float))
 frame_rgb = Image(array=np.ones((1280, 720, 3), dtype=np.uint8))
-frame_rgba = Image(array=np.ones((1280, 720, 4), dtype=np.uint8))
-video_rgb = Image(array=np.ones((100, 1280, 720, 3), dtype=np.uint8))
 
 # fails
-wrong_n_dimensions = Image(array=np.ones((1280,), dtype=np.uint8))
+wrong_n_dimensions = Image(array=np.ones((1280,), dtype=float))
 wrong_shape = Image(array=np.ones((1280,720,10), dtype=np.uint8))
-wrong_type = Image(array=np.ones((1280,720,3), dtype=np.float64))
-
-# shapes and types are checked together, so..
-# this works
-float_video = Image(array=np.ones((100, 1280, 720, 4), dtype=float))
-# this doesn't
-wrong_shape_float_video = Image(array=np.ones((100, 1280, 720, 3), dtype=float))
+# shapes and types are checked together, so this also fails
+wrong_shape_dtype_combo = Image(array=np.ones((1280, 720, 3), dtype=float))
 ```
 
-### Dask
+#### Dask
 
 High performance chunked arrays! The backend for many new array libraries!
 
 Works exactly the same as numpy arrays
 
 ```python
 import dask.array as da
 
-# validate a huge video
-video_array = da.zeros(shape=(1920,1080,1000000,3), dtype=np.uint8)
-
-# this works
+# validate a humongous video without having to load it into memory
+video_array = da.zeros(shape=(100_000, 1080, 1920, 3), dtype=np.uint8)
 dask_video = Image(array=video_array)
 ```
 
-### HDF5
+#### HDF5
 
 Array work increasingly can't fit in memory, but dealing with arrays on disk
 can become a pain in concurrent applications. Numpydantic allows you to
 
 ```python
 import h5py
 import numpy as np
 
 array_path = "/nested/array"
 
 # make an HDF5 array
 h5f = h5py.File(h5f_file, "w")
-array = np.random.random((1920,1080,3)).astype(np.uint8)
+array = np.random.randint(0, 255, (1920,1080,3), np.uint8)
 h5f.create_dataset(array_path, data=array)
 h5f.close()
 ```
 
@@ -172,17 +186,229 @@ object and leave the file open between calls:
 >>> h5f_image.array.close()
 ```
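+
+Because the proxy reads through to the dataset on disk, slicing the model's array
+loads just the requested region rather than the whole thing. A quick sketch
+(reusing the `h5f_image` model from the example above; the read-only-what-you-slice
+behavior is assumed from the h5py-backed proxy, not spelled out in this patch):
+
+```python
+# reads only a small corner of the array from disk
+corner = h5f_image.array[0:10, 0:10, :]
+```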
 
-### Zarr
+#### Video
+
+Videos are just arrays with fancy encoding! Numpydantic can validate shape and
+dtype, as well as lazily load chunks of frames with array-like syntax!
+
+Say we have some video `data.mp4` ...
+
+```python
+video = Image(array='data.mp4')
+# get a single frame
+video.array[5]
+# or a range of frames!
+video.array[5:10]
+# or whatever slicing you want to do!
+video.array[5:50:5, 0:10, 50:70]
+```
+
+As with the other interfaces, the proxy class is a transparent pass-through to the
+underlying opencv class, so we can get the rest of the video properties ...
+
+```python
+import cv2
+
+# get the total frames from opencv
+video.array.get(cv2.CAP_PROP_FRAME_COUNT)
+# the proxy class also provides a convenience property
+video.array.n_frames
+```
+
+#### Zarr
 
 Zarr works similarly!
 
 Use it with any of Zarr's backends: Nested, Zipfile, S3,
 it's all the same!
 
-```{todo}
-Add the zarr examples!
-```
+E.g., create a nested zarr array on disk and use it...
+
+```python
+import zarr
+from numpydantic.interface.zarr import ZarrArrayPath
+
+array_file = 'data/array.zarr'
+nested_path = 'data/sets/here'
+
+root = zarr.open(array_file, mode='w')
+nested_array = root.zeros(
+    nested_path,
+    shape=(1000, 1080, 1920, 3),
+    dtype=np.uint8
+)
+
+# validates just fine!
+zarr_video = Image(array=ZarrArrayPath(array_file, nested_path))
+# or just pass a tuple, the interface can discover it's a zarr array
+zarr_video = Image(array=(array_file, nested_path))
+```
+
+### JSON Schema
+
+Numpydantic generates JSON Schema for all its array specifications, so for the
+above model we get a schema for each of the possible array types that properly
+handles the shape and dtype constraints and includes the originating numpy type
+as a `dtype` annotation.
+
+```python
+Image.model_json_schema()
+```
+
+```json
+{
+    "properties": {
+        "array": {
+            "anyOf": [
+                {
+                    "items": {"items": {"type": "number"}, "type": "array"},
+                    "type": "array"
+                },
+                {
+                    "dtype": "numpy.uint8",
+                    "items": {
+                        "items": {
+                            "items": {
+                                "maximum": 255,
+                                "minimum": 0,
+                                "type": "integer"
+                            },
+                            "maxItems": 3,
+                            "minItems": 3,
+                            "type": "array"
+                        },
+                        "type": "array"
+                    },
+                    "type": "array"
+                },
+                {
+                    "dtype": "numpy.uint8",
+                    "items": {
+                        "items": {
+                            "items": {
+                                "items": {
+                                    "maximum": 255,
+                                    "minimum": 0,
+                                    "type": "integer"
+                                },
+                                "maxItems": 3,
+                                "minItems": 3,
+                                "type": "array"
+                            },
+                            "maxItems": 1920,
+                            "minItems": 1920,
+                            "type": "array"
+                        },
+                        "maxItems": 1080,
+                        "minItems": 1080,
+                        "type": "array"
+                    },
+                    "type": "array"
+                }
+            ],
+            "title": "Array"
+        }
+    },
+    "required": ["array"],
+    "title": "Image",
+    "type": "object"
+}
+```
+
+numpydantic can even handle shapes with unbounded numbers of dimensions by using
+recursive JSON schema!!!
+
+So the any-shaped array (using nptyping's ellipsis notation):
+
+```python
+class AnyShape(BaseModel):
+    array: NDArray[Shape["*, ..."], np.uint8]
+```
+
+is rendered to JSON Schema like this:
+
+```json
+{
+    "$defs": {
+        "any-shape-array-9b5d89838a990d79": {
+            "anyOf": [
+                {
+                    "items": {
+                        "$ref": "#/$defs/any-shape-array-9b5d89838a990d79"
+                    },
+                    "type": "array"
+                },
+                {"maximum": 255, "minimum": 0, "type": "integer"}
+            ]
+        }
+    },
+    "properties": {
+        "array": {
+            "dtype": "numpy.uint8",
+            "items": {"$ref": "#/$defs/any-shape-array-9b5d89838a990d79"},
+            "title": "Array",
+            "type": "array"
+        }
+    },
+    "required": ["array"],
+    "title": "AnyShape",
+    "type": "object"
+}
+```
+
+where the key `"any-shape-array-9b5d89838a990d79"` uses a (blake2b) hash of the
+inner dtype specification so that multiple any-shaped arrays in a single model
+schema are deduplicated without conflicts.
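+
+The recursive schema is not just documentation sugar: the same model validates
+arrays of any dimensionality. A minimal sketch (assuming numpy and the
+`AnyShape` model above):
+
+```python
+import numpy as np
+
+# one, two, and three dimensions all validate against the recursive spec
+for shape in ((2,), (2, 3), (2, 3, 4)):
+    instance = AnyShape(array=np.zeros(shape, dtype=np.uint8))
+    assert instance.array.shape == shape
+```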
 
+### Dumping
+
+One of the main reasons to use chunked array libraries like zarr is to avoid
+needing to load the entire array into memory. When dumping data to JSON,
+numpydantic tries to mirror this behavior, by default dumping only the metadata
+that is necessary to identify the array.
+
+For example, with zarr:
+
+```python
+array = zarr.array([[1,2,3],[4,5,6],[7,8,9]], dtype=float)
+instance = Image(array=array)
+dumped = instance.model_dump_json()
+```
+
+```json
+{
+    "array":
+    {
+        "Chunk shape": "(3, 3)",
+        "Chunks initialized": "1/1",
+        "Compressor": "Blosc(cname='lz4', clevel=5, shuffle=SHUFFLE, blocksize=0)",
+        "Data type": "float64",
+        "No. bytes": "72",
+        "No. bytes stored": "421",
+        "Order": "C",
+        "Read-only": "False",
+        "Shape": "(3, 3)",
+        "Storage ratio": "0.2",
+        "Store type": "zarr.storage.KVStore",
+        "Type": "zarr.core.Array",
+        "hexdigest": "c51604eace325fe42bbebf39146c0956bd2ed13c"
+    }
+}
+```
+
+To print the whole array, we use pydantic's serialization contexts:
+
+```python
+dumped = instance.model_dump_json(context={'zarr_dump_array': True})
+```
+```json
+{
+    "array":
+    {
+        "same thing,": "except also...",
+        "array": [[1.0, 2.0, 3.0], [4.0, 5.0, 6.0], [7.0, 8.0, 9.0]],
+        "hexdigest": "c51604eace325fe42bbebf39146c0956bd2ed13c"
+    }
+}
+```
 
 ```{toctree}
diff --git a/pyproject.toml b/pyproject.toml
index a0fefe6..52d8140 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -22,12 +22,12 @@ dask = [
 hdf5 = [
     "h5py>=3.10.0"
 ]
-zarr = [
-    "zarr>=2.17.2",
-]
 video = [
     "opencv-python>=4.9.0.80",
 ]
+zarr = [
+    "zarr>=2.17.2",
+]
 arrays = [
     "numpydantic[dask,hdf5,zarr,video]"
 ]
diff --git a/src/numpydantic/interface/video.py b/src/numpydantic/interface/video.py
index 92fe1e0..a320744 100644
--- a/src/numpydantic/interface/video.py
+++ b/src/numpydantic/interface/video.py
@@ -125,12 +125,29 @@ class VideoProxy:
             raise ValueError(f"Could not get frame {frame}")
         return frame
 
+    def _complete_slice(self, slice_: slice) -> slice:
+        """Get a fully-built slice that can be passed to range"""
+        if slice_.step is None:
+            slice_ = slice(slice_.start, slice_.stop, 1)
+        if slice_.stop is None:
+            slice_ = slice(slice_.start, self.n_frames, slice_.step)
+        if slice_.start is None:
+            slice_ = slice(0, slice_.stop, slice_.step)
+        return slice_
+
     def __getitem__(self, item: Union[int, slice, tuple]) -> np.ndarray:
         if isinstance(item, int):
             # want a single frame
             return self._get_frame(item)
+        elif isinstance(item, slice):
+            # slice of frames
+            item = self._complete_slice(item)
+            frames = []
+            for i in range(item.start, item.stop, item.step):
+                frames.append(self._get_frame(i))
+            return np.stack(frames)
         else:
-            # slices are passes as tuples
+            # slices are passed as tuples
             # first arg needs to be handled specially
             if isinstance(item[0], int):
                 # single frame
@@ -142,13 +159,7 @@ class VideoProxy:
             elif isinstance(item[0], slice):
                 frames = []
                 # make a new slice since range can't take Nones, filling in missing vals
-                fslice = item[0]
-                if fslice.step is None:
-                    fslice = slice(fslice.start, fslice.stop, 1)
-                if fslice.stop is None:
-                    fslice = slice(fslice.start, self.n_frames, fslice.step)
-                if fslice.start is None:
-                    fslice = slice(0, fslice.stop, fslice.step)
+                fslice = self._complete_slice(item[0])
                 for i in range(fslice.start, fslice.stop, fslice.step):
                     frames.append(self._get_frame(i))
diff --git a/src/numpydantic/schema.py b/src/numpydantic/schema.py
index 5b22826..cf430a8 100644
--- a/src/numpydantic/schema.py
+++ b/src/numpydantic/schema.py
@@ -3,7 +3,9 @@ Helper functions for use with :class:`~numpydantic.NDArray` - see the note in
 :mod:`~numpydantic.ndarray` for why these are separated.
 """
 
-from typing import Any, Callable, Union
+import hashlib
+import json
+from typing import Any, Callable, Optional, Union
 
 import nptyping.structure
 import numpy as np
@@ -124,6 +126,8 @@ def list_of_lists_schema(shape: Shape, array_type: CoreSchema) -> ListSchema:
     # make the current level list schema, accounting for shape
     if arg == "*":
         list_schema = core_schema.list_schema(inner_schema, metadata=metadata)
+    elif arg == "...":
+        list_schema = _unbounded_shape(inner_schema, metadata=metadata)
     else:
         arg = int(arg)
         list_schema = core_schema.list_schema(
@@ -132,6 +136,50 @@
     return list_schema
 
 
+def _hash_schema(schema: CoreSchema) -> str:
+    """
+    Make a hex-encoded 8-byte blake2b hash from a pydantic core schema.
+    Collisions are really not important or likely here, but we do want the same schema
+    to produce the same hash.
+    """
+    schema_str = json.dumps(
+        schema, sort_keys=True, indent=None, separators=(",", ":")
+    ).encode("utf-8")
+    hasher = hashlib.blake2b(digest_size=8)
+    hasher.update(schema_str)
+    return hasher.hexdigest()
+
+
+def _unbounded_shape(
+    inner_type: CoreSchema, metadata: Optional[dict] = None
+) -> core_schema.DefinitionsSchema:
+    """
+    Make a recursive schema that refers to itself using a hashed version of the inner
+    type
+    """
+
+    schema_hash = _hash_schema(inner_type)
+    array_ref = f"any-shape-array-{schema_hash}"
+
+    schema = core_schema.definitions_schema(
+        core_schema.list_schema(
+            core_schema.definition_reference_schema(array_ref), metadata=metadata
+        ),
+        [
+            core_schema.union_schema(
+                [
+                    core_schema.list_schema(
+                        core_schema.definition_reference_schema(array_ref)
+                    ),
+                    inner_type,
+                ],
+                ref=array_ref,
+            )
+        ],
+    )
+    return schema
+
+
 def make_json_schema(
     shape: ShapeType, dtype: DtypeType, _handler: _handler_type
-) -> ListSchema:
+) -> CoreSchema:
@@ -154,7 +202,7 @@
     # get the names of the shape constraints, if any
     if shape is Any:
-        list_schema = core_schema.list_schema(core_schema.any_schema())
+        list_schema = _unbounded_shape(dtype_schema)
     else:
         list_schema = list_of_lists_schema(shape, dtype_schema)
diff --git a/tests/test_interface/test_video.py b/tests/test_interface/test_video.py
index e6c96b9..cee7cbb 100644
--- a/tests/test_interface/test_video.py
+++ b/tests/test_interface/test_video.py
@@ -122,6 +122,12 @@ def test_video_getitem(avi_video):
     assert single_slice.shape == (10, 5, 3)
 
     # also get a range of frames
+    # range without further slices
+    range_slice = instance.array[3:5]
+    assert range_slice.shape == (2, 100, 50, 3)
+    assert range_slice[0, 3, 3, 0] == 3
+    assert range_slice[0, 4, 4, 0] == 0
+
+    # full range
     range_slice = instance.array[3:5, 0:10, 0:5]
     assert range_slice.shape == (2, 10, 5, 3)
diff --git a/tests/test_ndarray.py b/tests/test_ndarray.py
index f39dc8d..39972f1 100644
--- a/tests/test_ndarray.py
+++ b/tests/test_ndarray.py
@@ -14,9 +14,6 @@
 from numpydantic.exceptions import ShapeError, DtypeError
 from numpydantic import dtype
 
-# from .fixtures import tmp_output_dir_func
-
-
 def test_ndarray_type():
     class Model(BaseModel):
         array: NDArray[Shape["2 x, * y"], Number]
@@ -186,17 +183,43 @@ def test_json_schema_dtype_builtin(dtype, expected, array_model):
     assert inner_type["type"] == expected
 
 
-@pytest.mark.skip("Not implemented yet")
-def test_json_schema_wildcard():
-    """
-    NDarray types should generate a JSON schema without shape constraints
-    """
-    pass
+def _recursive_array(schema):
+    assert "$defs" in schema
+    # get the key used for the array
+    array_key = list(schema["$defs"].keys())[0]
+
+    # the array property should be a ref to the recursive array
+    # get the innermost part of the field schema
+    field_schema = schema["properties"]["array"]
+    while "items" in field_schema:
+        field_schema = field_schema["items"]
+    assert field_schema["$ref"] == f"#/$defs/{array_key}"
+
+    # and the recursive array should indeed be recursive...
+    # specifically it should be an array whose items can be itself or
+    # of the type specified by the dtype
+    any_of = schema["$defs"][array_key]["anyOf"]
+    assert any_of[0]["items"]["$ref"] == f"#/$defs/{array_key}"
+    assert any_of[0]["type"] == "array"
+    # here we are just assuming that it's a uint8 array...
+    assert any_of[1]["type"] == "integer"
+    assert any_of[1]["maximum"] == 255
+    assert any_of[1]["minimum"] == 0
 
 
-@pytest.mark.skip("Not implemented yet")
 def test_json_schema_ellipsis():
     """
     NDArray types should create a recursive JSON schema for any-shaped arrays
     """
-    pass
+
+    class AnyShape(BaseModel):
+        array: NDArray[Shape["*, ..."], np.uint8]
+
+    schema = AnyShape.model_json_schema()
+    _recursive_array(schema)
+
+    class ConstrainedAnyShape(BaseModel):
+        array: NDArray[Shape["3, 4, ..."], np.uint8]
+
+    schema = ConstrainedAnyShape.model_json_schema()
+    _recursive_array(schema)
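+
+
+def test_json_schema_ellipsis_validation():
+    """
+    Sketch of a complementary check (assumed behavior, not part of the
+    original patch): ellipsis-shaped NDArrays should also validate arrays
+    of any dimensionality, not just generate a recursive schema for them.
+    """
+
+    class AnyShape(BaseModel):
+        array: NDArray[Shape["*, ..."], np.uint8]
+
+    # one, two, and three dimensions should all validate
+    for shape in [(2,), (2, 3), (2, 3, 4)]:
+        instance = AnyShape(array=np.zeros(shape, dtype=np.uint8))
+        assert instance.array.shape == shape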