From d6750f8df1567f0c33f3fe885452d9fe406feb3a Mon Sep 17 00:00:00 2001
From: sneakers-the-rat
Date: Wed, 10 Jul 2024 00:09:17 -0700
Subject: [PATCH] we don't need blosc tho

---
 docs/meta/todo.md                           |   4 +
 nwb_linkml/pyproject.toml                   |   1 -
 nwb_linkml/src/nwb_linkml/types/ndarray.py  | 199 --------------------
 nwb_linkml/src/nwb_linkml/types/ndarray.pyi |   4 -
 4 files changed, 4 insertions(+), 204 deletions(-)
 delete mode 100644 nwb_linkml/src/nwb_linkml/types/ndarray.py
 delete mode 100644 nwb_linkml/src/nwb_linkml/types/ndarray.pyi

diff --git a/docs/meta/todo.md b/docs/meta/todo.md
index 2c9064a..a5bb947 100644
--- a/docs/meta/todo.md
+++ b/docs/meta/todo.md
@@ -12,6 +12,9 @@ Cleanup
 - [ ] Update pydantic generator
   - [ ] Make a minimal pydanticgen-only package to slim linkml deps?
 - [ ] Disambiguate "maps" terminology - split out simple maps from the eg. dataset mapping classes
+- [ ] Remove unnecessary imports
+  - dask
+  - nptyping
 
 Important things that are not implemented yet!
 
@@ -25,6 +28,7 @@ Important things that are not implemented yet!
   - Or do we want to just say "no dynamictables, just subclass and add more slots since it's super easy to do that."
   - method to return a dataframe
   - append rows/this should just be a df basically.
+  - existing handler is broken, for example, in `maps/hdmf`
 - [ ] Handle indirect indexing eg. https://pynwb.readthedocs.io/en/stable/tutorials/general/plot_timeintervals.html#accessing-referenced-timeseries
 
 ## Docs TODOs
diff --git a/nwb_linkml/pyproject.toml b/nwb_linkml/pyproject.toml
index 2fcb0ca..9ac8d79 100644
--- a/nwb_linkml/pyproject.toml
+++ b/nwb_linkml/pyproject.toml
@@ -20,7 +20,6 @@ dependencies = [
     "h5py>=3.9.0",
     "pydantic-settings>=2.0.3",
     "dask>=2023.9.2",
-    "blosc2>=2.2.7",
     "tqdm>=4.66.1",
     'typing-extensions>=4.12.2;python_version<"3.11"',
     "numpydantic>=1.2.1",
diff --git a/nwb_linkml/src/nwb_linkml/types/ndarray.py b/nwb_linkml/src/nwb_linkml/types/ndarray.py
deleted file mode 100644
index 60c614a..0000000
--- a/nwb_linkml/src/nwb_linkml/types/ndarray.py
+++ /dev/null
@@ -1,199 +0,0 @@
-"""
-Extension of nptyping NDArray for pydantic that allows for JSON-Schema serialization
-
-* Order to store data in (row first)
-"""
-
-# ruff: noqa: ANN001
-# ruff: noqa: ANN202
-# FIXME: this has been moved to numpydantic, remove.
-
-import base64
-import sys
-from copy import copy
-from pathlib import Path
-from typing import Any, Callable
-
-import blosc2
-import h5py
-import nptyping.structure
-import numpy as np
-from dask.array.core import Array as DaskArray
-from nptyping import NDArray as _NDArray
-from nptyping.ndarray import NDArrayMeta as _NDArrayMeta
-from nptyping.nptyping_type import NPTypingType
-from nptyping.shape_expression import check_shape
-from pydantic_core import core_schema
-
-from nwb_linkml.maps.dtype import allowed_precisions, np_to_python
-
-
-def _list_of_lists_schema(shape, array_type_handler):
-    """
-    Make a pydantic JSON schema for an array as a list of lists
-    """
-    shape_parts = shape.__args__[0].split(",")
-    split_parts = [p.split(" ")[1] if len(p.split(" ")) == 2 else None for p in shape_parts]
-
-    # Construct a list of list schema
-    # go in reverse order - construct list schemas such that
-    # the final schema is the one that checks the first dimension
-    shape_labels = reversed(split_parts)
-    shape_args = reversed(shape.prepared_args)
-    list_schema = None
-    for arg, label in zip(shape_args, shape_labels):
-        # which handler to use? for the first we use the actual type
-        # handler, everywhere else we use the prior list handler
-        inner_schema = array_type_handler if list_schema is None else list_schema
-
-        # make a label annotation, if we have one
-        metadata = {"name": label} if label is not None else None
-
-        # make the current level list schema, accounting for shape
-        if arg == "*":
-            list_schema = core_schema.list_schema(inner_schema, metadata=metadata)
-        else:
-            arg = int(arg)
-            list_schema = core_schema.list_schema(
-                inner_schema, min_length=arg, max_length=arg, metadata=metadata
-            )
-    return list_schema
-
-
-class NDArrayMeta(_NDArrayMeta, implementation="NDArray"):
-    """
-    Kept here to allow for hooking into metaclass, which has
-    been necessary on and off as we work this class into a stable
-    state
-    """
-
-
-class NDArray(NPTypingType, metaclass=NDArrayMeta):
-    """
-    Following the example here: https://docs.pydantic.dev/latest/usage/types/custom/#handling-third-party-types
-    """
-
-    __args__ = (Any, Any)
-
-    @classmethod
-    def __get_pydantic_core_schema__(
-        cls,
-        _source_type: "NDArray",
-        _handler: Callable[[Any], core_schema.CoreSchema],
-    ) -> core_schema.CoreSchema:
-
-        shape, dtype = _source_type.__args__
-        # get pydantic core schema for the given specified type
-        if isinstance(dtype, nptyping.structure.StructureMeta):
-            raise NotImplementedError("Jonny finish this")
-            # functools.reduce(operator.or_, [int, float, str])
-        else:
-            array_type_handler = _handler.generate_schema(np_to_python[dtype])
-
-        def validate_dtype(value: np.ndarray) -> np.ndarray:
-            if dtype is Any:
-                return value
-
-            assert (
-                value.dtype == dtype or value.dtype.name in allowed_precisions[dtype.__name__]
-            ), f"Invalid dtype! expected {dtype}, got {value.dtype}"
-            return value
-
-        def validate_shape(value: Any) -> np.ndarray:
-            assert shape is Any or check_shape(
-                value.shape, shape
-            ), f"Invalid shape! expected shape {shape.prepared_args}, got shape {value.shape}"
-            return value
-
-        def coerce_list(value: Any) -> np.ndarray:
-            if isinstance(value, list):
-                value = np.array(value)
-            return value
-
-        # get the names of the shape constraints, if any
-        if shape is Any:
-            list_schema = core_schema.list_schema(core_schema.any_schema())
-        else:
-            list_schema = _list_of_lists_schema(shape, array_type_handler)
-
-        def array_to_list(instance: np.ndarray | DaskArray) -> list | dict:
-            if isinstance(instance, DaskArray):
-                arr = instance.__array__()
-            elif isinstance(instance, NDArrayProxy):
-                arr = instance[:]
-            else:
-                arr = instance
-
-            # If we're larger than 16kB then compress array!
-            if sys.getsizeof(arr) > 16 * 1024:
-                packed = blosc2.pack_array2(arr)
-                packed = base64.b64encode(packed)
-                ret = {
-                    "array": packed,
-                    "shape": copy(arr.shape),
-                    "dtype": copy(arr.dtype.name),
-                    "unpack_fns": ["base64.b64decode", "blosc2.unpack_array2"],
-                }
-                return ret
-            else:
-                return arr.tolist()
-
-        return core_schema.json_or_python_schema(
-            json_schema=list_schema,
-            python_schema=core_schema.chain_schema(
-                [
-                    core_schema.no_info_plain_validator_function(coerce_list),
-                    core_schema.union_schema(
-                        [
-                            core_schema.is_instance_schema(cls=np.ndarray),
-                            core_schema.is_instance_schema(cls=DaskArray),
-                            core_schema.is_instance_schema(cls=NDArrayProxy),
-                        ]
-                    ),
-                    core_schema.no_info_plain_validator_function(validate_dtype),
-                    core_schema.no_info_plain_validator_function(validate_shape),
-                ]
-            ),
-            serialization=core_schema.plain_serializer_function_ser_schema(
-                array_to_list, when_used="json"
-            ),
-        )
-
-
-class NDArrayProxy:
-    """
-    Thin proxy to numpy arrays stored within hdf5 files,
-    only read into memory when accessed, but otherwise
-    passthrough all attempts to access attributes.
-    """
-
-    def __init__(self, h5f_file: Path | str, path: str):
-        """
-        Args:
-            h5f_file (:class:`pathlib.Path`): Path to source HDF5 file
-            path (str): Location within HDF5 file where this array is located
-        """
-        self.h5f_file = Path(h5f_file)
-        self.path = path
-
-    def __getattr__(self, item):
-        with h5py.File(self.h5f_file, "r") as h5f:
-            obj = h5f.get(self.path)
-            return getattr(obj, item)
-
-    def __getitem__(self, slice) -> np.ndarray:
-        with h5py.File(self.h5f_file, "r") as h5f:
-            obj = h5f.get(self.path)
-            return obj[slice]
-
-    def __setitem__(self, slice, value):
-        raise NotImplementedError("Can't write into an arrayproxy yet!")
-
-    @classmethod
-    def __get_pydantic_core_schema__(
-        cls,
-        _source_type: _NDArray,
-        _handler: Callable[[Any], core_schema.CoreSchema],
-    ) -> core_schema.CoreSchema:
-
-        return NDArray.__get_pydantic_core_schema__(cls, _source_type, _handler)
diff --git a/nwb_linkml/src/nwb_linkml/types/ndarray.pyi b/nwb_linkml/src/nwb_linkml/types/ndarray.pyi
deleted file mode 100644
index a90b8d3..0000000
--- a/nwb_linkml/src/nwb_linkml/types/ndarray.pyi
+++ /dev/null
@@ -1,4 +0,0 @@
-import numpy as np
-
-NDArray = np.ndarray
-NDArrayProxy = np.ndarray
\ No newline at end of file
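---

Reviewer note: the deleted `NDArray` / `NDArrayProxy` types live on in numpydantic, which is already in the dependency list above. A minimal sketch of what model code uses instead of this module — the `Image` class and its field are illustrative examples, not names from nwb_linkml:

```python
# Sketch of declaring a constrained array field via numpydantic, which
# replaces the deleted nptyping-based NDArray. Model and field names here
# are illustrative only.
import numpy as np
from pydantic import BaseModel

from numpydantic import NDArray, Shape


class Image(BaseModel):
    # any-sized 2D array of uint8; shape and dtype are checked at validation
    array: NDArray[Shape["* x, * y"], np.uint8]


img = Image(array=np.zeros((3, 4), dtype=np.uint8))
print(img.model_dump_json())  # arrays serialize to nested lists in JSON
```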
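Dropping blosc2 also drops the serialization branch deleted above: arrays larger than 16kB were packed with `blosc2.pack_array2`, base64-encoded, and emitted as a dict carrying an `unpack_fns` list; everything now goes through `tolist()`. For anyone still holding JSON written by the old path, a hedged reconstruction of the unpack step that dict implied — this mirrors the deleted code, and is not an API nwb_linkml provides:

```python
# Sketch: invert the deleted >16kB serialization branch. Requires the
# python-blosc2 package that this patch removes from the dependencies.
import base64

import blosc2
import numpy as np


def unpack_legacy_array(payload: dict) -> np.ndarray:
    # payload: {"array": <b64 str>, "shape": [...], "dtype": "...",
    #           "unpack_fns": ["base64.b64decode", "blosc2.unpack_array2"]}
    packed = base64.b64decode(payload["array"])
    arr = blosc2.unpack_array2(packed)
    # shape and dtype were stored redundantly; check they match what we decoded
    assert tuple(payload["shape"]) == arr.shape
    assert payload["dtype"] == arr.dtype.name
    return arr
```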
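For reference, the lazy-loading behavior that leaves with `NDArrayProxy` (numpydantic ships its own proxy types): each attribute access or slice opened the HDF5 file, touched the dataset, and closed it again. A usage sketch against the pre-removal import path; the file and dataset are made up for the demo:

```python
# Usage sketch for the deleted NDArrayProxy; this import path only exists
# before the patch is applied, and the example file is created on the spot.
import h5py
import numpy as np

from nwb_linkml.types.ndarray import NDArrayProxy  # pre-removal import path

with h5py.File("example.h5", "w") as f:
    f.create_dataset("/data", data=np.arange(12).reshape(3, 4))

proxy = NDArrayProxy("example.h5", "/data")
print(proxy.shape)  # attribute access forwarded to the h5py Dataset: (3, 4)
print(proxy[0])     # reads only the requested slice into memory: [0 1 2 3]
```

Writes through the proxy raise `NotImplementedError`, so it was strictly a read-side convenience.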