diff --git a/nwb_linkml/.coveragerc b/nwb_linkml/.coveragerc
index 99fa480..e36cbf1 100644
--- a/nwb_linkml/.coveragerc
+++ b/nwb_linkml/.coveragerc
@@ -5,4 +5,5 @@ omit =
     */nwb_schema_language/*
     */nwb_linkml/models/*
     */tests/*
-    */plot.py
\ No newline at end of file
+    */plot.py
+    */nwb_linkml/types/df.py
\ No newline at end of file
diff --git a/nwb_linkml/poetry.lock b/nwb_linkml/poetry.lock
index 2cfded1..d70db4e 100644
--- a/nwb_linkml/poetry.lock
+++ b/nwb_linkml/poetry.lock
@@ -1,4 +1,4 @@
-# This file is automatically @generated by Poetry 1.5.1 and should not be changed by hand.
+# This file is automatically @generated by Poetry 1.6.1 and should not be changed by hand.
 
 [[package]]
 name = "annotated-types"
@@ -2364,7 +2364,7 @@ files = [
 ]
 
 [package.dependencies]
-greenlet = {version = "!=0.4.17", markers = "platform_machine == \"win32\" or platform_machine == \"WIN32\" or platform_machine == \"AMD64\" or platform_machine == \"amd64\" or platform_machine == \"x86_64\" or platform_machine == \"ppc64le\" or platform_machine == \"aarch64\""}
+greenlet = {version = "!=0.4.17", markers = "platform_machine == \"aarch64\" or platform_machine == \"ppc64le\" or platform_machine == \"x86_64\" or platform_machine == \"amd64\" or platform_machine == \"AMD64\" or platform_machine == \"win32\" or platform_machine == \"WIN32\""}
 typing-extensions = ">=4.2.0"
 
 [package.extras]
@@ -2416,6 +2416,26 @@ files = [
     {file = "toolz-0.12.0.tar.gz", hash = "sha256:88c570861c440ee3f2f6037c4654613228ff40c93a6c25e0eba70d17282c6194"},
 ]
 
+[[package]]
+name = "tqdm"
+version = "4.66.1"
+description = "Fast, Extensible Progress Meter"
+optional = false
+python-versions = ">=3.7"
+files = [
+    {file = "tqdm-4.66.1-py3-none-any.whl", hash = "sha256:d302b3c5b53d47bce91fea46679d9c3c6508cf6332229aa1e7d8653723793386"},
+    {file = "tqdm-4.66.1.tar.gz", hash = "sha256:d88e651f9db8d8551a62556d3cff9e3034274ca5d66e93197cf2490e2dcb69c7"},
+]
+
+[package.dependencies]
+colorama = {version = "*", markers = "platform_system == \"Windows\""}
+
+[package.extras]
+dev = ["pytest (>=6)", "pytest-cov", "pytest-timeout", "pytest-xdist"]
+notebook = ["ipywidgets (>=6)"]
+slack = ["slack-sdk"]
+telegram = ["requests"]
+
 [[package]]
 name = "typing-extensions"
 version = "4.8.0"
@@ -2635,4 +2655,4 @@ tests = ["coverage", "coveralls", "pytest", "pytest-cov", "pytest-depends", "pyt
 [metadata]
 lock-version = "2.0"
 python-versions = ">=3.11,<3.13"
-content-hash = "6c8e41c25a97368f31f190b4d76c3dd839a3b2b364c8653dcdd00ffc258a2547"
+content-hash = "b4afc2881969650e4a9f75c4148aeaa1d1af308f27170ebd76a1c79fcdf71555"
diff --git a/nwb_linkml/pyproject.toml b/nwb_linkml/pyproject.toml
index 196319c..39e7fb6 100644
--- a/nwb_linkml/pyproject.toml
+++ b/nwb_linkml/pyproject.toml
@@ -32,6 +32,7 @@ pytest-profiling = {version = "^1.7.0", optional = true}
 pydantic-settings = "^2.0.3"
 dask = "^2023.9.2"
 blosc2 = "^2.2.7"
+tqdm = "^4.66.1"
 
 
 [tool.poetry.extras]
diff --git a/nwb_linkml/src/nwb_linkml/__init__.py b/nwb_linkml/src/nwb_linkml/__init__.py
index 93d0c17..4d5d3cf 100644
--- a/nwb_linkml/src/nwb_linkml/__init__.py
+++ b/nwb_linkml/src/nwb_linkml/__init__.py
@@ -1,5 +1,3 @@
 from nwb_linkml.monkeypatch import apply_patches
 apply_patches()
 
-from nwb_linkml.maps import preload
-
diff --git a/nwb_linkml/src/nwb_linkml/io/hdf5.py b/nwb_linkml/src/nwb_linkml/io/hdf5.py
index 177908c..7757059 100644
--- a/nwb_linkml/src/nwb_linkml/io/hdf5.py
+++ b/nwb_linkml/src/nwb_linkml/io/hdf5.py
@@ -20,16 +20,19 @@ Other TODO:
 """
 import pdb
 import warnings
-from typing import Optional, Dict, overload, Type, Union
+from typing import Optional, Dict, overload, Type, Union, List
 from pathlib import Path
 from types import ModuleType
 from typing import TYPE_CHECKING, NamedTuple
 import json
 import subprocess
 import shutil
+import os
 
 import h5py
 from pydantic import BaseModel
+from tqdm import tqdm
+import numpy as np
 
 from nwb_linkml.maps.hdf5 import H5SourceItem, flatten_hdf, ReadPhases, ReadQueue
 from nwb_linkml.translate import generate_from_nwbfile
@@ -241,6 +244,62 @@ def get_model(cls: h5py.Group | h5py.Dataset) -> Type[BaseModel]:
         mod = get_model(cls.parent)
         return mod.model_fields[cls.name.split('/')[-1]].annotation
 
+def find_references(h5f: h5py.File, path: str) -> List[str]:
+    """
+    Find all objects that make a reference to a given object in
+
+    * Attributes
+    * Dataset-level dtype (a dataset of references)
+    * Compound datasets (a dataset with one "column" of references)
+
+    Notes:
+        This is extremely slow because we collect all references first,
+        rather than checking them as we go and quitting early. PR if you want to make this faster!
+
+    Args:
+        h5f (:class:`h5py.File`): Open hdf5 file
+        path (str): Path to search for references to
+
+    Returns:
+        list[str]: List of paths that reference the given path
+    """
+    references = []
+
+    def _find_references(name, obj: h5py.Group | h5py.Dataset):
+        pbar.update()
+        refs = []
+        for attr in obj.attrs.values():
+            if isinstance(attr, h5py.h5r.Reference):
+                refs.append(attr)
+
+        if isinstance(obj, h5py.Dataset):
+            # dataset is all references
+            if obj.dtype.metadata is not None and isinstance(obj.dtype.metadata.get('ref', None), h5py.h5r.Reference):
+                refs.extend(obj[:].tolist())
+            # compound dtype: check each column for references
+            elif isinstance(obj.dtype, np.dtypes.VoidDType):
+                for col in obj.dtype.names:
+                    if isinstance(obj[col][0], h5py.h5r.Reference):
+                        refs.extend(obj[col].tolist())
+
+        for ref in refs:
+            assert isinstance(ref, h5py.h5r.Reference)
+            refname = h5f[ref].name
+            if refname == path:
+                references.append(name)
+                return
+
+    pbar = tqdm()
+    try:
+        h5f.visititems(_find_references)
+    finally:
+        pbar.close()
+    return references
+
+
 
 def truncate_file(source: Path, target: Optional[Path] = None, n:int=10) -> Path:
     """
@@ -266,33 +325,41 @@ def truncate_file(source: Path, target: Optional[Path] = None, n:int=10) -> Path
     # and also a temporary file that we'll make with h5repack
     target_tmp = target.parent / (target.stem + '_tmp.hdf5')
 
-    # copy the whole thing
     if target.exists():
         target.unlink()
+    print(f'Copying {source} to {target}...')
     shutil.copy(source, target)
+    os.chmod(target, 0o774)
 
-    h5f_target = h5py.File(str(target), 'r+')
-    def _prune_dataset(name:str, obj: h5py.Dataset | h5py.Group):
-
+    to_resize = []
+    def _need_resizing(name:str, obj: h5py.Dataset | h5py.Group):
         if isinstance(obj, h5py.Dataset):
-            if obj.size > 10:
-                try:
-                    obj.resize(n, axis=0)
-                except TypeError:
-                    # contiguous arrays cant be resized directly
-                    # so we have to jank our way through it
-                    tmp_name = obj.name + '__tmp'
-                    original_name = obj.name
-                    obj.parent.move(obj.name, tmp_name)
-                    old_obj = obj.parent.get(tmp_name)
-                    new_obj = obj.parent.create_dataset(original_name, data=old_obj[0:n])
-                    for k, v in old_obj.attrs.items():
-                        new_obj.attrs[k] = v
-                    del new_obj.parent[tmp_name]
+            if obj.size > n:
+                to_resize.append(name)
+
+    print('Resizing datasets...')
+    # first we get the items that need to be resized and then resize them below,
+    # because writing to the file from within the visititems call causes problems
+    h5f_target = h5py.File(str(target), 'r+')
+    h5f_target.visititems(_need_resizing)
 
-    h5f_target.visititems(_prune_dataset)
+    for resize in to_resize:
+        obj = h5f_target.get(resize)
+        try:
+            obj.resize(n, axis=0)
+        except TypeError:
+            # contiguous arrays can't be trivially resized, so we have to copy and create a new dataset
+            tmp_name = obj.name + '__tmp'
+            original_name = obj.name
+            obj.parent.move(obj.name, tmp_name)
+            old_obj = obj.parent.get(tmp_name)
+            new_obj = obj.parent.create_dataset(original_name, data=old_obj[0:n])
+            for k, v in old_obj.attrs.items():
+                new_obj.attrs[k] = v
+            del new_obj.parent[tmp_name]
+
     h5f_target.flush()
     h5f_target.close()
@@ -301,6 +368,7 @@ def truncate_file(source: Path, target: Optional[Path] = None, n:int=10) -> Path
         warnings.warn('Truncated file made, but since h5repack not found in path, file wont be any smaller')
         return target
 
+    print('Repacking hdf5...')
     res = subprocess.run(
         ['h5repack', '-f', 'GZIP=9', str(target), str(target_tmp)],
         capture_output=True
diff --git a/nwb_linkml/src/nwb_linkml/maps/dtype.py b/nwb_linkml/src/nwb_linkml/maps/dtype.py
index 36ce5d6..9419f01 100644
--- a/nwb_linkml/src/nwb_linkml/maps/dtype.py
+++ b/nwb_linkml/src/nwb_linkml/maps/dtype.py
@@ -82,7 +82,8 @@ allowed_precisions = {
     'uint32': ['uint32', 'uint64'],
     'float16': ['float16', 'float32', 'float64'],
     'float32': ['float32', 'float64'],
-    'utf': ['ascii']
+    'utf': ['ascii'],
+    'number': ['short', 'int', 'long', 'int16', 'int32', 'int64', 'uint', 'uint8', 'uint16', 'uint32', 'uint64', 'float', 'float16', 'float32', 'float64']
 }
 """
 Following HDMF, it turns out that specifying precision actually specifies minimum precision
diff --git a/nwb_linkml/src/nwb_linkml/types/ndarray.py b/nwb_linkml/src/nwb_linkml/types/ndarray.py
index 70d48c9..e1265ce 100644
--- a/nwb_linkml/src/nwb_linkml/types/ndarray.py
+++ b/nwb_linkml/src/nwb_linkml/types/ndarray.py
@@ -4,14 +4,10 @@ Extension of nptyping NDArray for pydantic that allows for JSON-Schema serializa
 * Order to store data in (row first)
 """
 import base64
-import pdb
 from pathlib import Path
 from typing import (
     Any,
     Callable,
-    Annotated,
-Generic,
-TypeVar
 )
 import sys
 from copy import copy
@@ -60,6 +56,7 @@ class NDArray(_NDArray):
         def validate_dtype(value: np.ndarray) -> np.ndarray:
             if dtype is Any:
                 return value
+            assert value.dtype == dtype or value.dtype.name in allowed_precisions[dtype.__name__], f"Invalid dtype! expected {dtype}, got {value.dtype}"
             return value
 
         def validate_array(value: Any) -> np.ndarray:
diff --git a/nwb_linkml/tests/data/aibs.nwb b/nwb_linkml/tests/data/aibs.nwb
new file mode 100644
index 0000000..6000e09
Binary files /dev/null and b/nwb_linkml/tests/data/aibs.nwb differ
diff --git a/nwb_linkml/tests/data/aibs_ecephys.nwb b/nwb_linkml/tests/data/aibs_ecephys.nwb
index b760cd4..4a5ad9c 100644
Binary files a/nwb_linkml/tests/data/aibs_ecephys.nwb and b/nwb_linkml/tests/data/aibs_ecephys.nwb differ
diff --git a/nwb_linkml/tests/data/sources.txt b/nwb_linkml/tests/data/sources.txt
index 559aafb..a12e522 100644
--- a/nwb_linkml/tests/data/sources.txt
+++ b/nwb_linkml/tests/data/sources.txt
@@ -1,4 +1,9 @@
+aibs.nwb
+    - https://dandiarchive.org/dandiset/000021/
+    - 000021/sub-738651046/sub-738651046_ses-760693773.nwb
+    - truncated datasets to length 10
+
 aibs_ecephys.nwb
     - https://dandiarchive.org/dandiset/000021/
     - 000021/sub-738651046/sub-738651046_ses-760693773_probe-769322820_ecephys.nwb
-    - truncated datasets to length 10
\ No newline at end of file
+    - truncated datasets to length 10
diff --git a/nwb_linkml/tests/test_io/test_io_hdf5.py b/nwb_linkml/tests/test_io/test_io_hdf5.py
index d74e23c..e592c77 100644
--- a/nwb_linkml/tests/test_io/test_io_hdf5.py
+++ b/nwb_linkml/tests/test_io/test_io_hdf5.py
@@ -18,7 +18,6 @@ def test_hdf_read():
 
     io = HDF5IO(path=NWBFILE)
     model = io.read()
-    pdb.set_trace()
 
 
 @pytest.mark.skip()
 def test_truncate_file(tmp_output_dir):
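
Usage sketch (not part of the patch): one way the two helpers this diff adds to `nwb_linkml.io.hdf5` could be called once it is applied. The input path and the `/acquisition` target are hypothetical examples, and `h5repack` must be on the PATH for `truncate_file` to recompress the copy; without it the truncated copy is still produced, just not shrunk.

```python
from pathlib import Path

import h5py

from nwb_linkml.io.hdf5 import find_references, truncate_file

# hypothetical input file; any NWB/HDF5 file on disk works the same way
source = Path("nwb_linkml/tests/data/aibs_ecephys.nwb")

# copy the file and cut every dataset larger than n down to n rows,
# then (if h5repack is available) repack it with gzip to shrink it
small = truncate_file(source, target=source.parent / "aibs_ecephys_small.nwb", n=10)

# list every object that holds a reference to /acquisition, whether in an
# attribute, a dataset of references, or a compound-dtype column
with h5py.File(str(small), "r") as h5f:
    referrers = find_references(h5f, "/acquisition")

print(referrers)
```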
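
Similarly, a small illustrative sketch (also not part of the patch) of what the widened `allowed_precisions` table in `nwb_linkml.maps.dtype` expresses; this is the table the new `validate_dtype` assert in `types/ndarray.py` consults, and it treats a spec'd dtype as a minimum precision.

```python
from nwb_linkml.maps.dtype import allowed_precisions

# the generic NWB "number" dtype now matches any integer or float precision
assert "uint8" in allowed_precisions["number"]
assert "float64" in allowed_precisions["number"]

# precision is a minimum: float32 data may also arrive as float64, but not float16
assert "float64" in allowed_precisions["float32"]
assert "float16" not in allowed_precisions["float32"]
```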