fix truncate hdf5 file to not try to write while iterating

This commit is contained in:
sneakers-the-rat 2023-10-04 20:19:54 -07:00
parent ab63ea071c
commit 3b11afded6
11 changed files with 123 additions and 33 deletions
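
The heart of the fix: h5py's visititems() walks the file's object tree with a callback, and resizing or replacing datasets from inside that callback means writing to the file while it is still being iterated. The commit switches to a two-pass approach: record the names of oversized datasets during traversal, then resize them after the traversal finishes. Below is a minimal, hypothetical sketch of that pattern (the function name and structure are illustrative, not the project's actual code):

import h5py

def shrink_datasets(path: str, n: int = 10) -> None:
    """Truncate every dataset longer than n rows without writing mid-iteration."""
    to_resize: list[str] = []

    def _collect(name: str, obj) -> None:
        # pass 1: read-only inspection inside the visititems callback
        if isinstance(obj, h5py.Dataset) and obj.size > n:
            to_resize.append(name)

    with h5py.File(path, "r+") as h5f:
        h5f.visititems(_collect)      # iterate; no writes happen here
        for name in to_resize:        # pass 2: mutate only after iteration is done
            dset = h5f[name]
            try:
                dset.resize(n, axis=0)
            except TypeError:
                # only chunked datasets are resizable in place; a fuller version
                # would copy the first n rows into a replacement dataset instead
                pass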


@@ -5,4 +5,5 @@ omit =
*/nwb_schema_language/*
*/nwb_linkml/models/*
*/tests/*
*/plot.py
*/plot.py
*/nwb_linkml/types/df.py

nwb_linkml/poetry.lock (generated, 26 lines changed)

@@ -1,4 +1,4 @@
# This file is automatically @generated by Poetry 1.5.1 and should not be changed by hand.
# This file is automatically @generated by Poetry 1.6.1 and should not be changed by hand.
[[package]]
name = "annotated-types"
@@ -2364,7 +2364,7 @@ files = [
]
[package.dependencies]
greenlet = {version = "!=0.4.17", markers = "platform_machine == \"win32\" or platform_machine == \"WIN32\" or platform_machine == \"AMD64\" or platform_machine == \"amd64\" or platform_machine == \"x86_64\" or platform_machine == \"ppc64le\" or platform_machine == \"aarch64\""}
greenlet = {version = "!=0.4.17", markers = "platform_machine == \"aarch64\" or platform_machine == \"ppc64le\" or platform_machine == \"x86_64\" or platform_machine == \"amd64\" or platform_machine == \"AMD64\" or platform_machine == \"win32\" or platform_machine == \"WIN32\""}
typing-extensions = ">=4.2.0"
[package.extras]
@@ -2416,6 +2416,26 @@ files = [
{file = "toolz-0.12.0.tar.gz", hash = "sha256:88c570861c440ee3f2f6037c4654613228ff40c93a6c25e0eba70d17282c6194"},
]
[[package]]
name = "tqdm"
version = "4.66.1"
description = "Fast, Extensible Progress Meter"
optional = false
python-versions = ">=3.7"
files = [
{file = "tqdm-4.66.1-py3-none-any.whl", hash = "sha256:d302b3c5b53d47bce91fea46679d9c3c6508cf6332229aa1e7d8653723793386"},
{file = "tqdm-4.66.1.tar.gz", hash = "sha256:d88e651f9db8d8551a62556d3cff9e3034274ca5d66e93197cf2490e2dcb69c7"},
]
[package.dependencies]
colorama = {version = "*", markers = "platform_system == \"Windows\""}
[package.extras]
dev = ["pytest (>=6)", "pytest-cov", "pytest-timeout", "pytest-xdist"]
notebook = ["ipywidgets (>=6)"]
slack = ["slack-sdk"]
telegram = ["requests"]
[[package]]
name = "typing-extensions"
version = "4.8.0"
@@ -2635,4 +2655,4 @@ tests = ["coverage", "coveralls", "pytest", "pytest-cov", "pytest-depends", "pyt
[metadata]
lock-version = "2.0"
python-versions = ">=3.11,<3.13"
content-hash = "6c8e41c25a97368f31f190b4d76c3dd839a3b2b364c8653dcdd00ffc258a2547"
content-hash = "b4afc2881969650e4a9f75c4148aeaa1d1af308f27170ebd76a1c79fcdf71555"


@@ -32,6 +32,7 @@ pytest-profiling = {version = "^1.7.0", optional = true}
pydantic-settings = "^2.0.3"
dask = "^2023.9.2"
blosc2 = "^2.2.7"
tqdm = "^4.66.1"
[tool.poetry.extras]


@@ -1,5 +1,3 @@
from nwb_linkml.monkeypatch import apply_patches
apply_patches()
from nwb_linkml.maps import preload


@@ -20,16 +20,19 @@ Other TODO:
"""
import pdb
import warnings
from typing import Optional, Dict, overload, Type, Union
from typing import Optional, Dict, overload, Type, Union, List
from pathlib import Path
from types import ModuleType
from typing import TYPE_CHECKING, NamedTuple
import json
import subprocess
import shutil
import os
import h5py
from pydantic import BaseModel
from tqdm import tqdm
import numpy as np
from nwb_linkml.maps.hdf5 import H5SourceItem, flatten_hdf, ReadPhases, ReadQueue
from nwb_linkml.translate import generate_from_nwbfile
@@ -241,6 +244,62 @@ def get_model(cls: h5py.Group | h5py.Dataset) -> Type[BaseModel]:
        mod = get_model(cls.parent)
        return mod.model_fields[cls.name.split('/')[-1]].annotation

def find_references(h5f: h5py.File, path: str) -> List[str]:
    """
    Find all objects that make a reference to a given object in

    * Attributes
    * Dataset-level dtype (a dataset of references)
    * Compound datasets (a dataset with one "column" of references)

    Notes:
        This is extremely slow because we collect all references first,
        rather than checking them as we go and quitting early. PR if you want to make this faster!

    Args:
        h5f (:class:`h5py.File`): Open hdf5 file
        path (str): Path to search for references to

    Returns:
        list[str]: List of paths that reference the given path
    """
    references = []

    def _find_references(name, obj: h5py.Group | h5py.Dataset):
        pbar.update()
        refs = []
        for attr in obj.attrs.values():
            if isinstance(attr, h5py.h5r.Reference):
                refs.append(attr)

        if isinstance(obj, h5py.Dataset):
            # dataset is all references
            if obj.dtype.metadata is not None and isinstance(obj.dtype.metadata.get('ref', None), h5py.h5r.Reference):
                refs.extend(obj[:].tolist())
            # compound dtype
            elif isinstance(obj.dtype, np.dtypes.VoidDType):
                for col in obj.dtype.names:
                    if isinstance(obj[col][0], h5py.h5r.Reference):
                        refs.extend(obj[col].tolist())

        for ref in refs:
            assert isinstance(ref, h5py.h5r.Reference)
            refname = h5f[ref].name
            if refname == path:
                references.append(name)
                return

    pbar = tqdm()
    try:
        h5f.visititems(_find_references)
    finally:
        pbar.close()
    return references

def truncate_file(source: Path, target: Optional[Path] = None, n:int=10) -> Path:
    """
@@ -266,33 +325,41 @@ def truncate_file(source: Path, target: Optional[Path] = None, n:int=10) -> Path
    # and also a temporary file that we'll make with h5repack
    target_tmp = target.parent / (target.stem + '_tmp.hdf5')

    # copy the whole thing
    if target.exists():
        target.unlink()
    print(f'Copying {source} to {target}...')
    shutil.copy(source, target)
    os.chmod(target, 0o774)

    h5f_target = h5py.File(str(target), 'r+')
    def _prune_dataset(name:str, obj: h5py.Dataset | h5py.Group):
    to_resize = []
    def _need_resizing(name:str, obj: h5py.Dataset | h5py.Group):
        if isinstance(obj, h5py.Dataset):
            if obj.size > 10:
                try:
                    obj.resize(n, axis=0)
                except TypeError:
                    # contiguous arrays cant be resized directly
                    # so we have to jank our way through it
                    tmp_name = obj.name + '__tmp'
                    original_name = obj.name
                    obj.parent.move(obj.name, tmp_name)
                    old_obj = obj.parent.get(tmp_name)
                    new_obj = obj.parent.create_dataset(original_name, data=old_obj[0:n])
                    for k, v in old_obj.attrs.items():
                        new_obj.attrs[k] = v
                    del new_obj.parent[tmp_name]
            if obj.size > n:
                to_resize.append(name)

    print('Resizing datasets...')
    # first we get the items that need to be resized and then resize them below
    # problems with writing to the file from within the visititems call
    h5f_target = h5py.File(str(target), 'r+')
    h5f_target.visititems(_need_resizing)
    h5f_target.visititems(_prune_dataset)

    for resize in to_resize:
        obj = h5f_target.get(resize)
        try:
            obj.resize(n, axis=0)
        except TypeError:
            # contiguous arrays cant be trivially resized, so we have to copy and create a new dataset
            tmp_name = obj.name + '__tmp'
            original_name = obj.name
            obj.parent.move(obj.name, tmp_name)
            old_obj = obj.parent.get(tmp_name)
            new_obj = obj.parent.create_dataset(original_name, data=old_obj[0:n])
            for k, v in old_obj.attrs.items():
                new_obj.attrs[k] = v
            del new_obj.parent[tmp_name]

    h5f_target.flush()
    h5f_target.close()
@@ -301,6 +368,7 @@ def truncate_file(source: Path, target: Optional[Path] = None, n:int=10) -> Path
        warnings.warn('Truncated file made, but since h5repack not found in path, file wont be any smaller')
        return target

    print('Repacking hdf5...')
    res = subprocess.run(
        ['h5repack', '-f', 'GZIP=9', str(target), str(target_tmp)],
        capture_output=True
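
For orientation, a hypothetical example of how the two helpers touched in this file might be called; the import path and the group path passed to find_references are assumptions, and the .nwb filename is a placeholder:

from pathlib import Path
import h5py
from nwb_linkml.io.hdf5 import truncate_file, find_references  # module path assumed

# make a small copy of a large NWB file, keeping only the first 10 rows of big datasets
small = truncate_file(Path("example.nwb"), n=10)

# then list every object that still holds a reference to a given group (path is an example)
with h5py.File(str(small), "r") as h5f:
    print(find_references(h5f, "/acquisition"))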


@@ -82,7 +82,8 @@ allowed_precisions = {
    'uint32': ['uint32', 'uint64'],
    'float16': ['float16', 'float32', 'float64'],
    'float32': ['float32', 'float64'],
    'utf': ['ascii']
    'utf': ['ascii'],
    'number': ['short', 'int', 'long', 'int16', 'int32', 'int64', 'uint', 'uint8', 'uint16', 'uint32', 'uint64', 'float', 'float16', 'float32', 'float64']
}
"""
Following HDMF, it turns out that specifying precision actually specifies minimum precision


@@ -4,14 +4,10 @@ Extension of nptyping NDArray for pydantic that allows for JSON-Schema serializa
* Order to store data in (row first)
"""
import base64
import pdb
from pathlib import Path
from typing import (
Any,
Callable,
Annotated,
Generic,
TypeVar
)
import sys
from copy import copy
@@ -60,6 +56,7 @@ class NDArray(_NDArray):
        def validate_dtype(value: np.ndarray) -> np.ndarray:
            if dtype is Any:
                return value
            assert value.dtype == dtype or value.dtype.name in allowed_precisions[dtype.__name__], f"Invalid dtype! expected {dtype}, got {value.dtype}"
            return value

        def validate_array(value: Any) -> np.ndarray:
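
The new assert applies the "minimum precision" rule described above: an array passes if its dtype either matches the spec'd dtype exactly or appears in that dtype's allowed_precisions list. A standalone sketch of the same check, using a trimmed-down stand-in table (the real table lives in the dtype map module):

import numpy as np

# trimmed-down stand-in for the allowed_precisions table shown above
allowed_precisions = {
    "float32": ["float32", "float64"],
    "uint32": ["uint32", "uint64"],
}

def dtype_ok(expected: str, value: np.ndarray) -> bool:
    # exact match, or a higher precision the spec accepts
    return value.dtype.name == expected or value.dtype.name in allowed_precisions.get(expected, [])

assert dtype_ok("float32", np.zeros(3, dtype=np.float64))       # upcast is allowed
assert not dtype_ok("float32", np.zeros(3, dtype=np.float16))   # lower precision is rejected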

Binary file not shown.


@@ -1,4 +1,9 @@
aibs.nwb
- https://dandiarchive.org/dandiset/000021/
- 000021/sub-738651046/sub-738651046_ses-760693773.nwb
- truncated datasets to length 10
aibs_ecephys.nwb
- https://dandiarchive.org/dandiset/000021/
- 000021/sub-738651046/sub-738651046_ses-760693773_probe-769322820_ecephys.nwb
- truncated datasets to length 10
- truncated datasets to length 10


@@ -18,7 +18,6 @@ def test_hdf_read():
    io = HDF5IO(path=NWBFILE)
    model = io.read()
    pdb.set_trace()

@pytest.mark.skip()
def test_truncate_file(tmp_output_dir):