docs and docs and docs and docs

This commit is contained in:
sneakers-the-rat 2024-09-21 04:18:22 -07:00
parent 02855852b7
commit d3ad8dac5c
Signed by untrusted user who does not match committer: jonny
GPG key ID: 6DCB96EF1E4D232D
17 changed files with 1525 additions and 54 deletions

View file

@ -25,7 +25,7 @@ extensions = [
"sphinx.ext.doctest",
"sphinx_design",
"sphinxcontrib.mermaid",
"myst_parser",
"myst_nb",
"sphinx.ext.todo",
]
@ -77,3 +77,8 @@ napoleon_attr_annotations = True
# todo
todo_include_todos = True
todo_link_only = True
# myst
# myst-nb
nb_render_markdown_format = "myst"
nb_execution_show_tb = True

BIN
docs/data/test.avi Normal file

Binary file not shown.

BIN
docs/data/test.h5 Normal file

Binary file not shown.

View file

@ -0,0 +1,22 @@
{
"chunks": [
2,
2
],
"compressor": {
"blocksize": 0,
"clevel": 5,
"cname": "lz4",
"id": "blosc",
"shuffle": 1
},
"dtype": "<i8",
"fill_value": 0,
"filters": null,
"order": "C",
"shape": [
2,
2
],
"zarr_format": 2
}

BIN
docs/data/test.zarr/0.0 Normal file

Binary file not shown.

84
docs/development.md Normal file
View file

@ -0,0 +1,84 @@
# Development
## Versioning
This package uses a colloquial form of [semantic versioning 2](https://semver.org/).
Specifically:
- Major version `2.*.*` is reserved for the transition from nptyping to using
  `TypeVarTuple`, `Generic`, and `Protocol`. Until `2.*.*`:
  - breaking changes will be indicated with an advance in `MINOR`
    version, taking the place of `MAJOR` in semver
  - backwards-compatible bugfixes **and** additions in functionality
    will be indicated by a `PATCH` release, taking the place of `MINOR` and
    `PATCH` in semver.
- After `2.*.*`, semver will resume as usual.

You are encouraged to set an upper bound on your version dependencies until
we pass `2.*.*`: the major functionality of numpydantic is stable,
but a decent amount of jostling things around is still to be expected.
### API Stability
- All breaking changes to the **public API** will be signaled by a major
version's worth of deprecation warnings
- All breaking changes to the **development API** will be signaled by a
minor version's worth of deprecation warnings.
- Changes to the remainder of the package, whether marked as private with a
leading underscore or not, including the import structure of the package,
are not considered part of the API and should not be relied on as stable
until explicitly marked otherwise.
#### Public API
**Only the {class}`.NDArray` and {class}`.Shape` classes should be considered
part of the stable public API.**
All associated functionality for validation should also be considered
a stable part of the `NDArray` and `Shape` classes - functionality
will only be added here, and the departure from the string-form of the
shape specifications (and its removal) will take place in `v3.*.*`.

End-users of numpydantic should pin an upper bound for the `MAJOR` version
until after `v2.*.*`, after which pinning is at your discretion -
no breaking changes are planned, but they would be signaled by a major version change.
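
A minimal sketch of what that public surface looks like in practice
(the shape string and dtype here are illustrative, not prescriptive):

```python
import numpy as np
from pydantic import BaseModel

from numpydantic import NDArray, Shape


class Image(BaseModel):
    # only NDArray and Shape are relied on here - the stable public API
    frame: NDArray[Shape["* x, * y, 3 rgb"], np.uint8]
```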
#### Development API
**Only the {class}`.Interface` class and its subclasses,
along with the Public API,
should be considered part of the stable development API.**
The `Interface` class is the primary point of external development expected
for numpydantic. It is still somewhat in flux, but it is prioritized for stability
and deprecation warnings above the rest of the package.
Dependent packages that define their own `Interface`s should pin an upper
bound for the `PATCH` version until `2.*.*`, and afterwards likely pin a `MINOR` version.
Tests are designed such that it should be easy to test major features against
each interface; that work is also ongoing. Once the test suite reaches
maturity, it should be possible for downstream interfaces to simply reuse
those tests to ensure they are compatible with the latest version.
## Release Schedule
There is no release schedule. Versions are released according to need and available labor.
## Contributing
### Dev environment
```{todo}
Document dev environment
Really it's very simple, you just clone a fork and install
the `dev` environment like `pip install '.[dev]'`
```
### Pull Requests
```{todo}
Document pull requests if we ever receive one
```

View file

@ -1,11 +0,0 @@
# Hooks
What hooks do we want to expose to downstream users so they can use this without needing
to override everything?
```{todo}
**NWB Compatibility**
**Precision:** NWB allows for a sort of hierarchy of type specification -
a less precise type also allows the data to be specified in a more precise type
```

View file

@ -86,8 +86,8 @@ isinstance(np.zeros((1,2,3), dtype=float), array_type)
and a simple extension system to make it work with whatever else you want! Provides
a uniform and transparent interface so you can both use common indexing operations
and also access any special features of a given array library.
- **Serialization** - Dump an array as a JSON-compatible array-of-arrays with enough metadata to be able to
recreate the model in the native format
- [**Serialization**](./serialization.md) - Dump an array as a JSON-compatible array-of-arrays with enough metadata to be able to
recreate the model in the native format. Full roundtripping is supported :)
- **Schema Generation** - Correct JSON Schema for arrays, complete with shape and dtype constraints, to
make your models interoperable
@ -496,6 +496,14 @@ api/types
```
```{toctree}
:maxdepth: 2
:caption: Meta
:hidden: true
development
```
## See Also
- [`jaxtyping`](https://docs.kidger.site/jaxtyping/)

View file

@ -1,2 +1,313 @@
---
file_format: mystnb
mystnb:
output_stderr: remove
render_text_lexer: python
render_markdown_format: myst
myst:
enable_extensions: ["colon_fence"]
---
# Serialization
## Python
In most cases, dumping to python should work as expected.
When a given array framework doesn't provide a tidy means of interacting
with it from python, we substitute a proxy class like {class}`.hdf5.H5Proxy`,
but aside from that numpydantic {class}`.NDArray` annotations
should be passthrough when using {func}`~pydantic.BaseModel.model_dump`.
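
For example (a sketch, assuming numpy is installed alongside numpydantic),
the array you put in is the array you get back out of a python-mode dump:

```python
import numpy as np
from pydantic import BaseModel

from numpydantic import NDArray


class MyModel(BaseModel):
    array: NDArray


model = MyModel(array=np.zeros((2, 2)))
dumped = model.model_dump()
# the value is passed through as an array, not converted to lists
assert isinstance(dumped["array"], np.ndarray)
```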
## JSON
JSON is the ~ ♥ fun one ♥ ~
There isn't necessarily a single optimal way to represent all possible
arrays in JSON. The standard way that n-dimensional arrays are rendered
in json is as a list-of-lists (or array of arrays, in JSON parlance),
but that's almost never what is desirable, especially for large arrays.
### Normal Style[^normalstyle]
Lists-of-lists are the standard, however, so it is the default behavior
for all interfaces, and all interfaces must support it.
```{code-cell}
---
tags: [hide-cell]
---
from pathlib import Path
from pydantic import BaseModel
from numpydantic import NDArray, Shape
from numpydantic.interface.dask import DaskJsonDict
from numpydantic.interface.numpy import NumpyJsonDict
import numpy as np
import dask.array as da
import zarr
import json
from rich import print
from rich.console import Console
def print_json(string: str):
data = json.loads(string)
console = Console(width=74)
console.print(data)
```
For our humble model:
```{code-cell}
class MyModel(BaseModel):
array: NDArray
```
We should get the same thing for each interface:
```{code-cell}
model = MyModel(array=[[1,2],[3,4]])
print(model.model_dump_json())
```
```{code-cell}
model = MyModel(array=da.array([[1,2],[3,4]], dtype=int))
print(model.model_dump_json())
```
```{code-cell}
model = MyModel(array=zarr.array([[1,2],[3,4]], dtype=int))
print(model.model_dump_json())
```
```{code-cell}
model = MyModel(array="docs/data/test.avi")
print(model.model_dump_json())
```
(ok maybe not that last one, since the video reader still incorrectly
reads grayscale videos as BGR values for now, but you get the idea)
Since by default arrays are dumped into unadorned JSON arrays,
when they are re-validated, they will always be handled by the
{class}`.NumpyInterface`:
```{code-cell}
dask_array = da.array([[1,2],[3,4]], dtype=int)
model = MyModel(array=dask_array)
type(model.array)
```
```{code-cell}
model_json = model.model_dump_json()
deserialized_model = MyModel.model_validate_json(model_json)
type(deserialized_model.array)
```
All information about `dtype` will be lost, and numbers will be parsed
as either `int` ({class}`numpy.int64`) or `float` ({class}`numpy.float64`).
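
For example (a sketch reusing the `MyModel` and `np` from the cells above),
a non-default integer dtype does not survive a plain dump/validate cycle:

```python
model = MyModel(array=np.array([[1, 2], [3, 4]], dtype=np.int16))
revalidated = MyModel.model_validate_json(model.model_dump_json())
# int16 on the way in, the platform default (usually int64) on the way out
print(model.array.dtype, "->", revalidated.array.dtype)
```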
## Roundtripping
To make arrays round-trippable, use the `round_trip` argument
to {func}`~pydantic.BaseModel.model_dump_json`:
```{code-cell}
print_json(model.model_dump_json(round_trip=True))
```
Each interface should[^notenforced] implement a dataclass that describes a
json-able roundtrip form (see {class}`.interface.JsonDict`).
That dataclass then has a {meth}`JsonDict.is_valid` method that checks
whether an incoming dict matches its schema:
```{code-cell}
roundtrip_json = json.loads(model.model_dump_json(round_trip=True))['array']
DaskJsonDict.is_valid(roundtrip_json)
```
```{code-cell}
NumpyJsonDict.is_valid(roundtrip_json)
```
### Controlling paths
When possible, the full content of the array is omitted in favor
of the path to the file that provided it.
```{code-cell}
model = MyModel(array="docs/data/test.avi")
print_json(model.model_dump_json(round_trip=True))
```
```{code-cell}
model = MyModel(array=("docs/data/test.h5", "/data"))
print_json(model.model_dump_json(round_trip=True))
```
You may notice the relative, rather than absolute, paths.
We expect that when people dump data to json in this roundtripped form,
they are either working locally
(e.g. transmitting an array specification across a socket in multiprocessing
or in a computing cluster),
or exporting to some directory structure of data,
where they are making an index file that refers to datasets in a directory
as part of a data standard or vernacular format.
By default, numpydantic uses the current working directory as the root to find
paths relative to, but this can be controlled by the [`relative_to`](#relative_to)
context parameter:
For example if you're working on data in many subdirectories,
you might want to serialize relative to each of them:
```{code-cell}
print_json(
model.model_dump_json(
round_trip=True,
context={"relative_to": Path('./docs/data')}
))
```
Or in the other direction:
```{code-cell}
print_json(
model.model_dump_json(
round_trip=True,
context={"relative_to": Path('../')}
))
```
Or you might be working in some completely different place;
numpydantic will try to find the way from here to there as long as it exists,
even if it means traversing to the root of the readthedocs filesystem:
```{code-cell}
print_json(
model.model_dump_json(
round_trip=True,
context={"relative_to": Path('/a/long/distance/directory')}
))
```
You can force absolute paths with the `absolute_paths` context parameter:
```{code-cell}
print_json(
model.model_dump_json(
round_trip=True,
context={"absolute_paths": True}
))
```
### Durable Interface Metadata
Numpydantic tries to be [stable](./development.md#api-stability),
but we're not perfect. To preserve the full information about the
interface that's needed to load the data referred to by the value,
use the `mark_interface` context parameter:
```{code-cell}
print_json(
model.model_dump_json(
round_trip=True,
context={"mark_interface": True}
))
```
```{todo}
We will also add a separate `mark_version` parameter for marking
the specific version of the relevant interface package, like `zarr` or `numpy`.
Patience.
```
## Context parameters
A reference listing of all the things that can be passed in the `context`
argument of {func}`~pydantic.BaseModel.model_dump_json`.
### `mark_interface`
Nest an additional layer of metadata for unambiguous serialization that
can be absolutely resolved across numpydantic versions
(for now this is for downstream metadata purposes only;
automatically resolving to a specific numpydantic version is not yet possible).
Supported interfaces:
- (all)
```{code-cell}
model = MyModel(array=[[1,2],[3,4]])
data = model.model_dump_json(
round_trip=True,
context={"mark_interface": True}
)
print_json(data)
```
### `absolute_paths`
Make all paths (that exist) absolute.
Supported interfaces:
- (all)
```{code-cell}
model = MyModel(array=("docs/data/test.h5", "/data"))
data = model.model_dump_json(
round_trip=True,
context={"absolute_paths": True}
)
print_json(data)
```
### `relative_to`
Make all paths (that exist) relative to the given path.
Supported interfaces:
- (all)
```{code-cell}
model = MyModel(array=("docs/data/test.h5", "/data"))
data = model.model_dump_json(
round_trip=True,
context={"relative_to": Path('../')}
)
print_json(data)
```
### `dump_array`
Dump the raw array contents when serializing to json inside an `array` field.
Supported interfaces:
- {class}`.ZarrInterface`
```{code-cell}
model = MyModel(array=("docs/data/test.zarr",))
data = model.model_dump_json(
round_trip=True,
context={"dump_array": True}
)
print_json(data)
```
[^normalstyle]: o ya we're posting JSON [normal style](https://normal.style)
[^notenforced]: This is only *functionally* enforced at the moment, where
a roundtrip test confirms that dtype and type are preserved,
but there is no formal test for each interface having its own serialization class.

993
pdm.lock

File diff suppressed because it is too large.

View file

@ -73,12 +73,14 @@ tests = [
"coveralls<4.0.0,>=3.3.1",
]
docs = [
"numpydantic[arrays]",
"sphinx<8.0.0,>=7.2.6",
"furo>=2024.1.29",
"myst-parser<3.0.0,>=2.0.0",
"autodoc-pydantic<3.0.0,>=2.0.1",
"sphinx-design<1.0.0,>=0.5.0",
"sphinxcontrib-mermaid>=0.9.2",
"myst-nb>=1.1.1",
]
dev = [
"numpydantic[tests,docs]",

View file

@ -63,10 +63,15 @@ class JsonDict:
return TypeAdapter(cls)
@classmethod
def is_valid(cls, val: dict) -> bool:
def is_valid(cls, val: dict, raise_on_error: bool = False) -> bool:
"""
Check whether a given dictionary matches this JsonDict specification
Args:
val (dict): The dictionary to check for validity
raise_on_error (bool): If ``True``, raise the validation error
rather than returning a bool. (default: ``False``)
Returns:
bool - true if valid, false if not
"""
@ -74,7 +79,9 @@ class JsonDict:
try:
_ = adapter.validate_python(val)
return True
except ValidationError:
except ValidationError as e:
if raise_on_error:
raise e
return False
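
# A hedged usage sketch of the new raise_on_error flag, using the
# DaskJsonDict subclass that the serialization docs above already import:
#
#     from numpydantic.interface.dask import DaskJsonDict
#
#     DaskJsonDict.is_valid({"type": "numpy"})                       # -> False
#     DaskJsonDict.is_valid({"type": "numpy"}, raise_on_error=True)  # raises ValidationError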

View file

@ -159,6 +159,9 @@ class VideoProxy:
return self[:]
def __getitem__(self, item: Union[int, slice, tuple]) -> np.ndarray:
if not self.path.exists():
raise FileNotFoundError(f"Video file {self.path} does not exist!")
if isinstance(item, int):
# want a single frame
return self._get_frame(item)

View file

@ -178,12 +178,12 @@ class ZarrInterface(Interface):
:meth:`zarr.core.Array.info_items`
plus the :meth:`zarr.core.Array.hexdigest` as a :class:`.ZarrJsonDict`
If either the ``zarr_dump_array`` value in the context dictionary is ``True``
If either the ``dump_array`` value in the context dictionary is ``True``
or the zarr array is an in-memory array, dump the array as well
(since without a persistent array it would be impossible to roundtrip and
dumping to JSON would be meaningless)
Passing ``'zarr_dump_array': True`` to the serialization ``context``
Passing ``'dump_array': True`` to the serialization ``context``
looks like this::
model.model_dump_json(context={'zarr_dump_array': True})
@ -193,7 +193,7 @@ class ZarrInterface(Interface):
if info.round_trip:
dump_array = False
if info is not None and info.context is not None:
dump_array = info.context.get("zarr_dump_array", False)
dump_array = info.context.get("dump_array", False)
is_file = False
as_json = {"type": cls.name}

View file

@ -24,10 +24,10 @@ from numpydantic.exceptions import InterfaceError
from numpydantic.interface import Interface
from numpydantic.maps import python_to_nptyping
from numpydantic.schema import (
_jsonize_array,
get_validate_interface,
make_json_schema,
)
from numpydantic.serialization import jsonize_array
from numpydantic.types import DtypeType, NDArrayType, ShapeType
from numpydantic.vendor.nptyping.error import InvalidArgumentsError
from numpydantic.vendor.nptyping.ndarray import NDArrayMeta as _NDArrayMeta
@ -181,7 +181,7 @@ class NDArray(NPTypingType, metaclass=NDArrayMeta):
return core_schema.with_info_plain_validator_function(
get_validate_interface(shape, dtype),
serialization=core_schema.plain_serializer_function_ser_schema(
_jsonize_array, when_used="json", info_arg=True
jsonize_array, when_used="json", info_arg=True
),
metadata=json_schema,
)

View file

@ -5,15 +5,15 @@ Helper functions for use with :class:`~numpydantic.NDArray` - see the note in
import hashlib
import json
from typing import TYPE_CHECKING, Any, Callable, Optional, Union
from typing import TYPE_CHECKING, Any, Callable, Optional
import numpy as np
from pydantic import BaseModel, SerializationInfo
from pydantic import BaseModel
from pydantic_core import CoreSchema, core_schema
from pydantic_core.core_schema import ListSchema, ValidationInfo
from numpydantic import dtype as dt
from numpydantic.interface import Interface, JsonDict
from numpydantic.interface import Interface
from numpydantic.maps import np_to_python
from numpydantic.types import DtypeType, NDArrayType, ShapeType
from numpydantic.vendor.nptyping.structure import StructureMeta
@ -278,16 +278,3 @@ def get_validate_interface(shape: ShapeType, dtype: DtypeType) -> Callable:
return value
return validate_interface
def _jsonize_array(value: Any, info: SerializationInfo) -> Union[list, dict]:
"""Use an interface class to render an array as JSON"""
interface_cls = Interface.match_output(value)
array = interface_cls.to_json(value, info)
if isinstance(array, JsonDict):
array = array.to_dict()
if info.context and info.context.get("mark_interface", False):
array = interface_cls.mark_json(array)
return array

View file

@ -0,0 +1,94 @@
from pathlib import Path
from typing import Any, Callable, TypeVar, Union
from pydantic_core.core_schema import SerializationInfo
from numpydantic.interface import Interface, JsonDict
T = TypeVar("T")
U = TypeVar("U")
def jsonize_array(value: Any, info: SerializationInfo) -> Union[list, dict]:
"""Use an interface class to render an array as JSON"""
interface_cls = Interface.match_output(value)
array = interface_cls.to_json(value, info)
if isinstance(array, JsonDict):
array = array.to_dict()
if info.context:
if info.context.get("mark_interface", False):
array = interface_cls.mark_json(array)
if info.context.get("absolute_paths", False):
array = _absolutize_paths(array)
else:
relative_to = info.context.get("relative_to", ".")
array = _relativize_paths(array, relative_to)
return array
def _relativize_paths(value: dict, relative_to: str = ".") -> dict:
"""
Make paths relative to either the current directory or the provided
``relative_to`` directory, if provided in the context
"""
relative_to = Path(relative_to).resolve()
def _r_path(v: Any) -> Any:
try:
path = Path(v)
if not path.exists():
return v
return str(relative_path(path, relative_to))
except:
return v
return _walk_and_apply(value, _r_path)
def _absolutize_paths(value: dict) -> dict:
def _a_path(v: Any) -> Any:
try:
path = Path(v)
if not path.exists():
return v
return str(path.resolve())
except:
return v
return _walk_and_apply(value, _a_path)
def _walk_and_apply(value: T, f: Callable[[U], U]) -> T:
"""
Walk an object, applying a function
"""
if isinstance(value, dict):
for k, v in value.items():
if isinstance(v, dict):
_walk_and_apply(v, f)
elif isinstance(v, list):
value[k] = [_walk_and_apply(sub_v, f) for sub_v in v]
else:
value[k] = f(v)
elif isinstance(value, list):
value = [_walk_and_apply(v, f) for v in value]
else:
value = f(value)
return value
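# A quick sketch of the traversal above with hypothetical values
# (dicts are updated in place, lists and scalars by replacement):
#   _walk_and_apply({"a": {"b": 1}, "c": [2, 3]}, lambda x: x * 10)
#   -> {"a": {"b": 10}, "c": [20, 30]}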
def relative_path(target: Path, origin: Path) -> Path:
"""
return path of target relative to origin, even if they're
not in the same subpath
References:
- https://stackoverflow.com/a/71874881
"""
try:
return Path(target).resolve().relative_to(Path(origin).resolve())
except ValueError: # target does not start with origin
# recursion with origin (eventually origin is root so try will succeed)
return Path("..").joinpath(relative_path(target, Path(origin).parent))