From 5e3ad790d70ecf41e181df47a3e0a01f74041afd Mon Sep 17 00:00:00 2001 From: sneakers-the-rat Date: Mon, 20 May 2024 21:39:54 -0700 Subject: [PATCH] restructuring docs, getting started on design but need 2 go home --- docs/api/dtype.md | 8 +- docs/api/schema.md | 7 ++ docs/api/types.md | 8 +- docs/{overview.md => design.md} | 15 +++- docs/index.md | 8 +- docs/interfaces.md | 0 docs/ndarray.md | 135 -------------------------------- 7 files changed, 37 insertions(+), 144 deletions(-) create mode 100644 docs/api/schema.md rename docs/{overview.md => design.md} (58%) create mode 100644 docs/interfaces.md delete mode 100644 docs/ndarray.md diff --git a/docs/api/dtype.md b/docs/api/dtype.md index 19bf54a..5cba08e 100644 --- a/docs/api/dtype.md +++ b/docs/api/dtype.md @@ -1 +1,7 @@ -# DType \ No newline at end of file +# dtype + +```{eval-rst} +.. automodule:: numpydantic.dtype + :members: + :undoc-members: +``` \ No newline at end of file diff --git a/docs/api/schema.md b/docs/api/schema.md new file mode 100644 index 0000000..fe123d1 --- /dev/null +++ b/docs/api/schema.md @@ -0,0 +1,7 @@ +# schema + +```{eval-rst} +.. automodule:: numpydantic.schema + :members: + :undoc-members: +``` \ No newline at end of file diff --git a/docs/api/types.md b/docs/api/types.md index 3b30843..513efb4 100644 --- a/docs/api/types.md +++ b/docs/api/types.md @@ -1 +1,7 @@ -# Types \ No newline at end of file +# types + +```{eval-rst} +.. automodule:: numpydantic.types + :members: + :undoc-members: +``` \ No newline at end of file diff --git a/docs/overview.md b/docs/design.md similarity index 58% rename from docs/overview.md rename to docs/design.md index d1185dd..84e963f 100644 --- a/docs/overview.md +++ b/docs/design.md @@ -1,9 +1,18 @@ -# Overview +# Design + +## Why do this? 
+ +We want to bring the tidiness of modeling data with pydantic to the universe of +software that uses arrays - particularly formats and packages that need to be very +particular about what *kind* of arrays they are able to handle or match a specific schema. + +## Challenges The Python type annotation system is weird and not like the rest of Python! (at least until [PEP 0649](https://peps.python.org/pep-0649/) gets mainlined). -Similarly, Pydantic 2's core_schema system is wonderful but still relatively poorly -documented for custom types! This package does the work of plugging them in +Similarly, Pydantic 2's core_schema system is wonderful but still has a few mysteries +lurking under the documented surface. +This package does the work of plugging them in together to make some kind of type validation frankenstein. The first problem is that type annotations are evaluated statically by python, mypy, diff --git a/docs/index.md b/docs/index.md index 08ed398..c28f97c 100644 --- a/docs/index.md +++ b/docs/index.md @@ -416,9 +416,8 @@ dumped = instance.model_dump_json(context={'zarr_dump_array': True}) :caption: Contents :hidden: true -overview -ndarray -hooks +design +interfaces todo ``` @@ -427,12 +426,13 @@ todo :caption: API :hidden: true -api/interface/index api/index +api/interface/index api/dtype api/ndarray api/maps api/monkeypatch +api/schema api/types ``` diff --git a/docs/interfaces.md b/docs/interfaces.md new file mode 100644 index 0000000..e69de29 diff --git a/docs/ndarray.md b/docs/ndarray.md deleted file mode 100644 index 7ebba29..0000000 --- a/docs/ndarray.md +++ /dev/null @@ -1,135 +0,0 @@ -# Constrained Arrays - -## Implementation details - -```{todo} -**Docs:** - -Describe implementation details! -``` - -## Examples - -### Declaration - -Type with a single {class}`~numpydantic.NDArray` class, or use a {class}`~typing.Union` -to express more complex array constraints. 
- -This package is effectively a Pydantic interface to [nptyping](https://github.com/ramonhagenaars/nptyping), -so any array syntax is valid there. (see [TODO](todo) for caveats) - -```python -from typing import Union -from pydantic import BaseModel -from src.numpydantic import NDArray, Shape, UInt8, Float, Int - - -class Image(BaseModel): - """ - Data values. Data can be in 1-D, 2-D, 3-D, or 4-D. The first dimension should always represent time. This can also be used to store binary data (e.g., image frames). This can also be a link to data stored in an external file. - """ - array: Union[ - NDArray[Shape["* x, * y"], UInt8], - NDArray[Shape["* x, * y, 3 rgb"], UInt8], - NDArray[Shape["* x, * y, 4 rgba"], UInt8], - NDArray[Shape["* t, * x, * y, 3 rgb"], UInt8], - NDArray[Shape["* t, * x, * y, 4 rgba"], Float] - ] -``` - -### Validation: - -```python -import numpy as np -# works -frame_gray = Image(array=np.ones((1280, 720), dtype=np.uint8)) -frame_rgb = Image(array=np.ones((1280, 720, 3), dtype=np.uint8)) -frame_rgba = Image(array=np.ones((1280, 720, 4), dtype=np.uint8)) -video_rgb = Image(array=np.ones((100, 1280, 720, 3), dtype=np.uint8)) - -# fails -wrong_n_dimensions = Image(array=np.ones((1280,), dtype=np.uint8)) -wrong_shape = Image(array=np.ones((1280,720,10), dtype=np.uint8)) -wrong_type = Image(array=np.ones((1280,720,3), dtype=np.float64)) - -# shapes and types are checked together -float_video = Image(array=np.ones((100, 1280, 720, 4),dtype=float)) -wrong_shape_float_video = Image(array=np.ones((100, 1280, 720, 3),dtype=float)) -``` - -### JSON schema generation: - -```python -class MyArray(BaseModel): - array: NDArray[Shape["2 x, * y, 4 z"], Float] -``` - -```python ->>> print(json.dumps(MyArray.model_json_schema(), indent=2)) -``` - -```json -{ - "properties": { - "array": { - "items": { - "items": { - "items": { - "type": "number" - }, - "maxItems": 4, - "minItems": 4, - "type": "array" - }, - "type": "array" - }, - "maxItems": 2, - "minItems": 2, - 
"title": "Array", - "type": "array" - } - }, - "required": [ - "array" - ], - "title": "MyArray", - "type": "object" -} -``` - -### Serialization - -```python -class SmolArray(BaseModel): - array: NDArray[Shape["2 x, 2 y"], Int] - -class BigArray(BaseModel): - array: NDArray[Shape["1000 x, 1000 y"], Int] -``` - -Serialize small arrays as lists of lists, and big arrays as a b64-encoded blosc compressed string - -```python ->>> smol = SmolArray(array=np.array([[1,2],[3,4]], dtype=int)) ->>> big = BigArray(array=np.random.randint(0,255,(1000,1000),int)) - ->>> print(smol.model_dump_json()) -{"array":[[1,2],[3,4]]} ->>> print(big.model_dump_json()) -{ - "array": "( long b64 encoded string )", - "shape": [1000, 1000], - "dtype": "int64", - "unpack_fns": ["base64.b64decode", "blosc2.unpack_array2"], -} -``` - -## TODO - -```{todo} -Implement structured arrays -``` - -```{todo} -Implement pandas dataframe validation? -``` \ No newline at end of file