From 2803c752b935479327424d80767c697df2574eb3 Mon Sep 17 00:00:00 2001 From: sneakers-the-rat Date: Thu, 23 May 2024 00:27:00 -0700 Subject: [PATCH] first draft of docs for 1.0 --- docs/api/dtype.md | 3 +- docs/api/meta.md | 6 ++ docs/design.md | 61 +++++++++++++++++-- docs/index.md | 6 ++ docs/interfaces.md | 61 +++++++++++++++++++ docs/syntax.md | 118 +++++++++++++++++++++++++++++++++++++ docs/todo.md | 6 +- pyproject.toml | 2 + src/numpydantic/dtype.py | 3 +- src/numpydantic/ndarray.py | 18 ++---- src/numpydantic/schema.py | 11 +++- 11 files changed, 271 insertions(+), 24 deletions(-) create mode 100644 docs/api/meta.md create mode 100644 docs/syntax.md diff --git a/docs/api/dtype.md b/docs/api/dtype.md index 5cba08e..dd37634 100644 --- a/docs/api/dtype.md +++ b/docs/api/dtype.md @@ -4,4 +4,5 @@ .. automodule:: numpydantic.dtype :members: :undoc-members: -``` \ No newline at end of file + :imported-members: +``` \ No newline at end of file diff --git a/docs/api/meta.md b/docs/api/meta.md new file mode 100644 index 0000000..6b282cb --- /dev/null +++ b/docs/api/meta.md @@ -0,0 +1,6 @@ +# meta + +```{eval-rst} +.. automodule:: numpydantic.meta + :members: +``` \ No newline at end of file diff --git a/docs/design.md b/docs/design.md index 9a92e74..b06c4af 100644 --- a/docs/design.md +++ b/docs/design.md @@ -10,6 +10,13 @@ To support a new generation of data formats and data analysis libraries that can model the *structure* of data independently from its *implementation,* we made numpydantic as a bridge between abstract schemas and programmatic use. +The closest prior work is likely [`jaxtyping`](https://github.com/patrick-kidger/jaxtyping), +but its support for multiple array libraries was backed into from its initial +design as a `jax` specification package, and so its extensibility and readability are +relatively low. 
Its `Dtype[ArrayClass, "{shape_expression}"]` syntax is not well +suited for modeling arrays intended to be general across implementations, and +makes it challenging to adapt to pydantic's schema generation system. + ## Challenges The Python type annotation system is weird and not like the rest of Python! @@ -40,7 +47,53 @@ either as the passed array itself, or a transparent proxy class (eg. {class}`~numpydantic.interface.hdf5.H5Proxy`) in the case that the native array format doesn't support numpy-like array operations out of the box. -- type hinting -- nptyping syntax -- not trying to be an array library -- dtyping, mapping & schematization \ No newline at end of file +The `interface` validation process thus often transforms the type of the passed array - +eg. when specifying an array in an HDF5 file, one will pass some reference to +a `Path` and the location of a dataset within that file, but the returned value from the +interface validator will be an {class}`~numpydantic.interface.hdf5.H5Proxy` +to the dataset. This confuses python's static type checker and IDE integrations like +pylance/pyright/mypy, which naively expect the type to literally be an +{class}`~numpydantic.NDArray` instance. To address this, numpydantic generates a `.pyi` +stub file on import (see {mod}`numpydantic.meta` ) that declares the type of `NDArray` +as the union of all {attr}`.Interface.return_types` . + +```{todo} +To better support static type hinting and inspection (ie. so the type checker +is not only aware of the union of all `return_types`, but the specific array +type that was passed on model instantiation, as well as potentially +do shape and dtype checks during type checking (eg. so a wrongly shaped or dtyped +array assignment will be highlighted as wrong)), we will be exploring adding +mypy/pylance/pyright hooks for dynamic type evaluation. +``` + +Since type annotations are static, each `NDArray[]` usage effectively creates a new +class. 
The `shape` and `dtype` specifications are thus not available at the time +that the validation is performed (see how [pydantic handles Annotated types](https://github.com/pydantic/pydantic/blob/87adc65888ce54ef4314ef874f7ecba52f129f84/pydantic/_internal/_generate_schema.py#L1788) +at the time that the class definition is evaluated by generating pydantic "core schemas", +which are passed to the rust `pydantic_core` for fast validation, which can't be +done with python-based validation functions). The validation function for each +`NDArray` pseudo-subclass is a {func}`closure ` +that uses the *class declaration*-timed `shape` and `dtype` annotations with the +*instantiation*-timed array object to find the matching validator interface and apply it. + +We are initially adopting `nptyping`'s syntax for array specification. It is a longstanding +answer to the desire for more granular array type annotations, but it also was +developed before some key developments in python and its typing system, and is +no longer actively maintained. We make some minor modifications to its +{mod}`~numpydantic.dtype` specification (eg. to allow builtin python types like `int` +and `float`), but any existing `nptyping` annotations can be used as-is with +`numpydantic`. In [v2.*](todo.md#v2) we will be reimplementing it, as well as +making an extended syntax for shape and dtype specifications, so that the +only required dependencies are {mod}`numpy` and {mod}`pydantic`. This will also +let us better hook into pydantic 2's use of `Annotated`, eliminating some +of the complexity in how specification information is passed to the validators. + +Numpydantic is *not* an array library, but a tool that allows you to use existing +array libraries with pydantic. 
It tries to be a transparent passthrough to +whatever library you are using, adding only minimal convenience classes to +make array usage roughly uniform across array libraries, but otherwise exposing +as much of the functionality of the library as possible. + +It is designed to be something that you don't have +to think too carefully about before adding it as a dependency - it is simple, +clean, unsurprising, well tested, and has three required dependencies. \ No newline at end of file diff --git a/docs/index.md b/docs/index.md index 3e03ac5..3d9da31 100644 --- a/docs/index.md +++ b/docs/index.md @@ -80,6 +80,7 @@ Coming soon: constraints like chunk sizes, as well as make array specifications more introspectable and friendly to runtime usage. - **Advanced dtype handling** - handling dtypes that only exist in some array backends, allowing minimum and maximum precision ranges, and so on as type maps provided by interface classes :) +- **More Elaborate Arrays** - structured dtypes, recarrays, xarray-style labeled arrays... - (see [todo](./todo.md)) ## Installation @@ -452,6 +453,7 @@ dumped = instance.model_dump_json(context={'zarr_dump_array': True}) :hidden: true design +syntax interfaces todo ``` @@ -466,9 +468,13 @@ api/interface/index api/dtype api/ndarray api/maps +api/meta api/monkeypatch api/schema api/types ``` +## See Also + +- [`jaxtyping`](https://docs.kidger.site/jaxtyping/) diff --git a/docs/interfaces.md b/docs/interfaces.md index f94f964..7ed4154 100644 --- a/docs/interfaces.md +++ b/docs/interfaces.md @@ -1,5 +1,66 @@ # Interfaces +Interfaces are the bridge between the abstract {class}`~numpydantic.NDArray` specification +and concrete array libraries. They are subclasses of the abstract {class}`.Interface` +class. + +They contain methods for coercion, validation, serialization, and any other +implementation-specific functionality. 
+ +## Discovery + +Interfaces are discovered through the {meth}`.Interface.interfaces` method - +returning all subclasses of `Interface`. To use a custom interface, it just +needs to be defined/imported by the time you intend to use it when instantiating +a pydantic model. + +Each interface implements a {meth}`.Interface.enabled` method that determines +whether that interface can be used. Typically that means checking if its dependencies +are present in the environment, but can also control conditional use. + +## Matching + +When a pydantic model is instantiated and an `NDArray` is to be validated, +{meth}`.Interface.match` first finds the matching interface. + +Each interface must define a {meth}`.Interface.check` method that accepts the +array to be validated and returns whether it can be used. Interfaces can +have any `check`ing logic they want, and so can eg. determine if a path +is a particular type of file, but should return quickly and do little work +since they are called frequently. + +Validation fails if an argument doesn't match any interface. + +```{note} +The {class}`.NumpyInterface` is special cased and is only checked if +no other interface matches. It attempts to cast the input argument to a +{class}`numpy.ndarray` to see if it is arraylike, and since many +lazy-loaded array libraries will attempt to load the whole array into memory +when cast to an `ndarray`, we only try as a last resort. +``` + +## Validation + +Validation is a chain of lifecycle methods, with a single argument passed and returned +to and from each: + +{meth}`.Interface.validate` calls in order: + +- {meth}`.Interface.before_validation` +- {meth}`.Interface.validate_dtype` +- {meth}`.Interface.validate_shape` +- {meth}`.Interface.after_validation` + +The `before` and `after` methods provide hooks for coercion, loading, etc. such that +`validate` can accept one of the types in the interface's +{attr}`~.Interface.input_types` and return the {attr}`~.Interface.return_type` . 
+ +## Diagram + +```{todo} +Sorry this is unreadable, need to recall how to change the theme for +generated mermaid diagrams but it is very late and i want to push this. +``` ```{mermaid} flowchart LR diff --git a/docs/syntax.md b/docs/syntax.md new file mode 100644 index 0000000..074fa24 --- /dev/null +++ b/docs/syntax.md @@ -0,0 +1,118 @@ +# Syntax + +General form: + +```python +field: NDArray[Shape["{shape_expression}"], dtype] +``` + +## Dtype + +Dtype checking is for the most part as simple as an `isinstance` check - +the `dtype` attribute of the array is checked against the `dtype` provided in the +`NDArray` annotation. Both numpy and builtin python types can be used. + +A tuple of types can also be passed: + +```python +field: NDArray[Shape["2, 3"], (np.int8, np.uint8)] +``` + +Like `nptyping`, the {mod}`~numpydantic.dtype` module provides convenient access +and aliases to the common dtypes, but also provides "generic" dtypes like +{class}`~numpydantic.dtype.Float` that is a tuple of all subclasses of +{class}`numpy.floating`. Numpy interprets `float` as being equivalent to +{class}`numpy.float64`, and {class}`numpy.floating` is an abstract parent class, +so "generic" tuple dtypes fill that narrow gap. + +```{todo} +Future versions will support interfaces providing type maps for declaring +equality between dtypes that may be specific to that library but should be +considered equivalent to numpy or other library's dtypes. +``` + +```{todo} +Future versions will also support declaring minimum or maximum precisions, +so one might say "at least a 16-bit float" and also accept a 32-bit float. +``` + +## Shape + +Full documentation of nptyping's shape syntax is available in the [nptyping docs](https://github.com/ramonhagenaars/nptyping/blob/master/USERDOCS.md#Shape-expressions), +but for the sake of self-contained docs, the high points are: + +### Numerical Shape + +A comma-separated list of integers. 
+ +For a 2-dimensional, 3 x 4-shaped array: + +```python +Shape["3, 4"] +``` + +### Wildcards + +Wildcards indicate a dimension can be any size + +For a 2-dimensional, 3 x any-shaped array: + +```python +Shape["3, *"] +``` + +### Labels + +Dimensions can be given labels, and in future versions these labels will be +propagated to the generated JSON Schema + +```python +Shape["3 x, 4 y, 5 z"] +``` + +### Arbitrary dimensions + +After some specified dimensions, one can express that there can be any number +of additional dimensions with an `...` like + +```python +Shape["3, 4, ..."] +``` + +### Any-Shaped + +If `dtype` is also `Any`, one can just use + +```python +field: NDArray +``` + +If a `dtype` is being passed, use the `'*'` wildcard along with the `'...'` + +```python +field: NDArray[Shape['*, ...'], int] +``` + +## Caveats + +```{todo} +numpydantic currently does not support structured dtypes or {class}`numpy.recarray` +specifications like nptyping does. It will in future versions. +``` + +````{todo} +numpydantic also does not support the variable shape definition form like + +```python +Shape['Dim, Dim'] +``` + +where there are two dimensions of any shape as long as they are equal +because at the moment it appears impossible to express dynamic constraints +(ie. `minItems`/`maxItems` that depend on the shape of another array) +in JSON Schema. A future minor version will allow them by generating a JSON +schema with a warning that the equal shape constraint will not be represented. + +See: https://github.com/orgs/json-schema-org/discussions/730 + +```` \ No newline at end of file diff --git a/docs/todo.md b/docs/todo.md index 86d9c28..6d94b8f 100644 --- a/docs/todo.md +++ b/docs/todo.md @@ -1,15 +1,15 @@ # TODO -## Syntax -```{todo} +## v2 + We will be moving away from using nptyping in v2.0.0. It was written for an older era in python before the dramatic changes in the Python type system and is no longer actively maintained. 
We will be reimplementing a syntax that extends its array specification syntax to include things like ranges and extensible dtypes with varying precision (and is much less finnicky to deal with). -``` + ## Validation diff --git a/pyproject.toml b/pyproject.toml index cce549e..8fddbc3 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -10,8 +10,10 @@ dependencies = [ "nptyping>=2.5.0", "numpy>=1.24.0", ] +homepage = "https://numpydantic.readthedocs.io" requires-python = "<4.0,>=3.9" readme = "README.md" +repository = "https://github.com/p2p-ld/numpydantic" license = {text = "MIT"} diff --git a/src/numpydantic/dtype.py b/src/numpydantic/dtype.py index bbe91c3..55bcca6 100644 --- a/src/numpydantic/dtype.py +++ b/src/numpydantic/dtype.py @@ -9,7 +9,7 @@ interfaces. This module also allows for convenient access to all abstract dtypes in a single module, rather than needing to import each individually. -Some types like :ref:`Integer` are compound types - tuples of multiple dtypes. +Some types like `Integer` are compound types - tuples of multiple dtypes. Check these using ``in`` rather than ``==``. This interface will develop in future versions to allow a single dtype check. """ @@ -59,6 +59,7 @@ Timedelta64 = np.timedelta64 SignedInteger = (np.int8, np.int16, np.int32, np.int64, np.short) UnsignedInteger = (np.uint8, np.uint16, np.uint32, np.uint64, np.ushort) Integer = tuple([*SignedInteger, *UnsignedInteger]) +"""All integer types""" Int = Integer # Int should translate to the "generic" int type. 
Float16 = np.float16 diff --git a/src/numpydantic/ndarray.py b/src/numpydantic/ndarray.py index 2ef7da7..ddc1316 100644 --- a/src/numpydantic/ndarray.py +++ b/src/numpydantic/ndarray.py @@ -13,7 +13,7 @@ Extension of nptyping NDArray for pydantic that allows for JSON-Schema serializa """ -from typing import TYPE_CHECKING, Any, Tuple +from typing import Any, Tuple import numpy as np from nptyping.error import InvalidArgumentsError @@ -37,13 +37,6 @@ from numpydantic.schema import ( ) from numpydantic.types import DtypeType, ShapeType -if TYPE_CHECKING: # pragma: no cover - pass - -""" -python types that pydantic/json schema can't support (and Any will be used instead) -""" - class NDArrayMeta(_NDArrayMeta, implementation="NDArray"): """ @@ -90,12 +83,9 @@ class NDArray(NPTypingType, metaclass=NDArrayMeta): Constrained array type allowing npytyping syntax for dtype and shape validation and serialization. - Integrates with pydantic such that - - JSON schema for list of list encoding - - Serialized as LoL, with automatic compression for large arrays - - Automatic coercion from lists on instantiation - - Also supports validation on :class:`.NDArrayProxy` types for lazy loading. + This class is not intended to be instantiated or used for type checking, it + implements the ``__get_pydantic_core_schema__`` method to invoke + the relevant :ref:`interface ` for validation and serialization. References: - https://docs.pydantic.dev/latest/usage/types/custom/#handling-third-party-types diff --git a/src/numpydantic/schema.py b/src/numpydantic/schema.py index cf430a8..0233610 100644 --- a/src/numpydantic/schema.py +++ b/src/numpydantic/schema.py @@ -129,7 +129,16 @@ def list_of_lists_schema(shape: Shape, array_type: CoreSchema) -> ListSchema: elif arg == "...": list_schema = _unbounded_shape(inner_schema, metadata=metadata) else: - arg = int(arg) + try: + arg = int(arg) + except ValueError as e: + raise ValueError( + "Array shapes must be integers, wildcards, or ellipses. 
" + "Shape variables (for declaring that one dimension must be the " + "same size as another) are not supported because it is " + "impossible to express dynamic minItems/maxItems in JSON Schema. " + "See: https://github.com/orgs/json-schema-org/discussions/730" + ) from e list_schema = core_schema.list_schema( inner_schema, min_length=arg, max_length=arg, metadata=metadata )