diff --git a/docs/conf.py b/docs/conf.py index 38db560..bae1f75 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -24,6 +24,7 @@ extensions = [ "sphinx.ext.viewcode", "sphinx.ext.doctest", "sphinx_design", + "sphinxcontrib.mermaid", "myst_parser", "sphinx.ext.todo", ] diff --git a/docs/design.md b/docs/design.md index 84e963f..9a92e74 100644 --- a/docs/design.md +++ b/docs/design.md @@ -6,6 +6,10 @@ We want to bring the tidyness of modeling data with pydantic to the universe of software that uses arrays - particularly formats and packages that need to be very particular about what *kind* of arrays they are able to handle or match a specific schema. +To support a new generation of data formats and data analysis libraries that can +model the *structure* of data independently from its *implementation*, we made +numpydantic as a bridge between abstract schemas and programmatic use. + ## Challenges The Python type annotation system is weird and not like the rest of Python! @@ -17,10 +21,26 @@ together to make some kind of type validation frankenstein. The first problem is that type annotations are evaluated statically by python, mypy, etc. This means you can't use typical python syntax for declaring types - it has to -be present at the time `__new__` is called, rather than `__init__`. +be present at the time `__new__` is called, rather than `__init__`. -- pydantic schema -- validation -- serialization -- lazy loading -- compression \ No newline at end of file +Different implementations of arrays behave differently! HDF5 files need to be carefully +opened and closed to avoid corruption, video files don't typically allow normal array +slicing operations, and only some array libraries support lazy loading of arrays on disk. + +We can't anticipate all the possible array libraries that exist now or in the future, +so it has to be possible to extend support to them without needing to go through +a potentially lengthy contribution process. 
+ +## Strategy + +Numpydantic uses {class}`~numpydantic.NDArray` as an abstract specification of +an array that uses one of several [interface](interfaces.md) classes to validate +and interact with an array. These interface classes will set the instance attribute +either as the passed array itself, or a transparent proxy class (e.g. +{class}`~numpydantic.interface.hdf5.H5Proxy`) in the case that the native array format +doesn't support numpy-like array operations out of the box. + +- type hinting +- nptyping syntax +- not trying to be an array library +- dtyping, mapping & schematization \ No newline at end of file diff --git a/docs/index.md b/docs/index.md index c28f97c..3e03ac5 100644 --- a/docs/index.md +++ b/docs/index.md @@ -8,7 +8,9 @@ A python package for specifying, validating, and serializing arrays with arbitra but ... -3) if you try and specify an array in pydantic, this happens: +3) Typical type annotations would only work for a single array library implementation, +4) They wouldn't allow you to specify array shapes and dtypes, and +5) If you try and specify an array in pydantic, this happens: ```python >>> from pydantic import BaseModel @@ -22,8 +24,39 @@ Set `arbitrary_types_allowed=True` in the model_config to ignore this error or implement `__get_pydantic_core_schema__` on your type to fully support it. 
``` -And setting `arbitrary_types_allowed = True` still prohibits you from -generating JSON Schema, serialization to JSON +**Solution:** + +Numpydantic allows you to do this: + +```python +from pydantic import BaseModel +from numpydantic import NDArray, Shape + +class MyModel(BaseModel): + array: NDArray[Shape["3 x, 4 y, * z"], int] +``` + +And use it with your favorite array library: + +```python +import numpy as np +import dask.array as da +import zarr + +# numpy +model = MyModel(array=np.zeros((3, 4, 5), dtype=int)) +# dask +model = MyModel(array=da.zeros((3, 4, 5), dtype=int)) +# hdf5 datasets +model = MyModel(array=('data.h5', '/nested/dataset')) +# zarr arrays +model = MyModel(array=zarr.zeros((3,4,5), dtype=int)) +model = MyModel(array='data.zarr') +model = MyModel(array=('data.zarr', '/nested/dataset')) +# video files +model = MyModel(array="data.mp4") +``` + ## Features: - **Types** - Annotations (based on [npytyping](https://github.com/ramonhagenaars/nptyping)) @@ -31,7 +64,9 @@ generating JSON Schema, serialization to JSON - **Validation** - Shape, dtype, and other array validations - **Interfaces** - Works with {mod}`~.interface.numpy`, {mod}`~.interface.dask`, {mod}`~.interface.hdf5`, {mod}`~.interface.video`, and {mod}`~.interface.zarr`, - and a simple extension system to make it work with whatever else you want! + and a simple extension system to make it work with whatever else you want! Provides + a uniform and transparent interface so you can both use common indexing operations + and also access any special features of a given array library. 
- **Serialization** - Dump an array as a JSON-compatible array-of-arrays with enough metadata to be able to recreate the model in the native format - **Schema Generation** - Correct JSON Schema for arrays, complete with shape and dtype constraints, to diff --git a/docs/interfaces.md b/docs/interfaces.md index e69de29..f94f964 100644 --- a/docs/interfaces.md +++ b/docs/interfaces.md @@ -0,0 +1,60 @@ +# Interfaces + + +```{mermaid} +flowchart LR + classDef data fill:#2b8cee,color:#ffffff; + classDef X fill:transparent,border:none,color:#ff0000; + + input + + subgraph Interface + match + end + + subgraph Numpy + numpy_check["check"] + end + + subgraph Dask + direction TB + + dask_check["check"] + + subgraph Validation + direction TB + + before_validation --> validate_dtype + validate_dtype --> validate_shape + validate_shape --> after_validation + end + + dask_check --> Validation + + end + + subgraph Zarr + zarr_check["check"] + end + + subgraph Model + output + end + + zarr_x["X"] + numpy_x["X"] + + input --> match + match --> numpy_check + match --> zarr_check + match --> Dask + zarr_check --> zarr_x + numpy_check --> numpy_x + + Validation --> Model + + class input data + class output data + class zarr_x X + class numpy_x X +``` \ No newline at end of file diff --git a/pdm.lock b/pdm.lock index 1678d2b..6e66162 100644 --- a/pdm.lock +++ b/pdm.lock @@ -5,7 +5,7 @@ groups = ["default", "arrays", "dask", "dev", "docs", "hdf5", "tests", "video"] strategy = ["cross_platform", "inherit_metadata"] lock_version = "4.4.1" -content_hash = "sha256:893fe47e35966aa6ed1564645326f6f67d1c64b984b5ea6f6b45f58b4fd732c2" +content_hash = "sha256:1f5280f8be86c071c2692fa04f9d885eba84895275404db27f04f317ac5e6f2b" [[package]] name = "alabaster" @@ -792,7 +792,7 @@ name = "opencv-python" version = "4.9.0.80" requires_python = ">=3.6" summary = "Wrapper package for OpenCV python bindings." 
-groups = ["video"] +groups = ["arrays", "dev", "tests", "video"] dependencies = [ "numpy>=1.17.0; python_version >= \"3.7\"", "numpy>=1.17.3; python_version >= \"3.8\"", @@ -1293,6 +1293,17 @@ files = [ {file = "sphinxcontrib_jsmath-1.0.1-py2.py3-none-any.whl", hash = "sha256:2ec2eaebfb78f3f2078e73666b1415417a116cc848b72e5172e596c871103178"}, ] +[[package]] +name = "sphinxcontrib-mermaid" +version = "0.9.2" +requires_python = ">=3.7" +summary = "Mermaid diagrams in yours Sphinx powered docs" +groups = ["dev", "docs"] +files = [ + {file = "sphinxcontrib-mermaid-0.9.2.tar.gz", hash = "sha256:252ef13dd23164b28f16d8b0205cf184b9d8e2b714a302274d9f59eb708e77af"}, + {file = "sphinxcontrib_mermaid-0.9.2-py3-none-any.whl", hash = "sha256:6795a72037ca55e65663d2a2c1a043d636dc3d30d418e56dd6087d1459d98a5d"}, +] + [[package]] name = "sphinxcontrib-qthelp" version = "1.0.7" diff --git a/pyproject.toml b/pyproject.toml index 52d8140..cce549e 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -45,7 +45,9 @@ docs = [ "furo>=2024.1.29", "myst-parser<3.0.0,>=2.0.0", "autodoc-pydantic<3.0.0,>=2.0.1", - "sphinx-design<1.0.0,>=0.5.0"] + "sphinx-design<1.0.0,>=0.5.0", + "sphinxcontrib-mermaid>=0.9.2", +] dev = [ "numpydantic[tests,docs]", "sphinx-autobuild>=2021.3.14",