Fix HDF5 file truncation so it no longer writes to the file while iterating over it

This commit is contained in:
sneakers-the-rat 2023-10-04 20:19:54 -07:00
parent ab63ea071c
commit 3b11afded6
11 changed files with 123 additions and 33 deletions

View file

@ -6,3 +6,4 @@ omit =
*/nwb_linkml/models/*
*/tests/*
*/plot.py
*/nwb_linkml/types/df.py

26
nwb_linkml/poetry.lock generated
View file

@ -1,4 +1,4 @@
# This file is automatically @generated by Poetry 1.5.1 and should not be changed by hand.
# This file is automatically @generated by Poetry 1.6.1 and should not be changed by hand.
[[package]]
name = "annotated-types"
@ -2364,7 +2364,7 @@ files = [
]
[package.dependencies]
greenlet = {version = "!=0.4.17", markers = "platform_machine == \"win32\" or platform_machine == \"WIN32\" or platform_machine == \"AMD64\" or platform_machine == \"amd64\" or platform_machine == \"x86_64\" or platform_machine == \"ppc64le\" or platform_machine == \"aarch64\""}
greenlet = {version = "!=0.4.17", markers = "platform_machine == \"aarch64\" or platform_machine == \"ppc64le\" or platform_machine == \"x86_64\" or platform_machine == \"amd64\" or platform_machine == \"AMD64\" or platform_machine == \"win32\" or platform_machine == \"WIN32\""}
typing-extensions = ">=4.2.0"
[package.extras]
@ -2416,6 +2416,26 @@ files = [
{file = "toolz-0.12.0.tar.gz", hash = "sha256:88c570861c440ee3f2f6037c4654613228ff40c93a6c25e0eba70d17282c6194"},
]
[[package]]
name = "tqdm"
version = "4.66.1"
description = "Fast, Extensible Progress Meter"
optional = false
python-versions = ">=3.7"
files = [
{file = "tqdm-4.66.1-py3-none-any.whl", hash = "sha256:d302b3c5b53d47bce91fea46679d9c3c6508cf6332229aa1e7d8653723793386"},
{file = "tqdm-4.66.1.tar.gz", hash = "sha256:d88e651f9db8d8551a62556d3cff9e3034274ca5d66e93197cf2490e2dcb69c7"},
]
[package.dependencies]
colorama = {version = "*", markers = "platform_system == \"Windows\""}
[package.extras]
dev = ["pytest (>=6)", "pytest-cov", "pytest-timeout", "pytest-xdist"]
notebook = ["ipywidgets (>=6)"]
slack = ["slack-sdk"]
telegram = ["requests"]
[[package]]
name = "typing-extensions"
version = "4.8.0"
@ -2635,4 +2655,4 @@ tests = ["coverage", "coveralls", "pytest", "pytest-cov", "pytest-depends", "pyt
[metadata]
lock-version = "2.0"
python-versions = ">=3.11,<3.13"
content-hash = "6c8e41c25a97368f31f190b4d76c3dd839a3b2b364c8653dcdd00ffc258a2547"
content-hash = "b4afc2881969650e4a9f75c4148aeaa1d1af308f27170ebd76a1c79fcdf71555"

View file

@ -32,6 +32,7 @@ pytest-profiling = {version = "^1.7.0", optional = true}
pydantic-settings = "^2.0.3"
dask = "^2023.9.2"
blosc2 = "^2.2.7"
tqdm = "^4.66.1"
[tool.poetry.extras]

View file

@ -1,5 +1,3 @@
from nwb_linkml.monkeypatch import apply_patches
apply_patches()
from nwb_linkml.maps import preload

View file

@ -20,16 +20,19 @@ Other TODO:
"""
import pdb
import warnings
from typing import Optional, Dict, overload, Type, Union
from typing import Optional, Dict, overload, Type, Union, List
from pathlib import Path
from types import ModuleType
from typing import TYPE_CHECKING, NamedTuple
import json
import subprocess
import shutil
import os
import h5py
from pydantic import BaseModel
from tqdm import tqdm
import numpy as np
from nwb_linkml.maps.hdf5 import H5SourceItem, flatten_hdf, ReadPhases, ReadQueue
from nwb_linkml.translate import generate_from_nwbfile
@ -241,6 +244,62 @@ def get_model(cls: h5py.Group | h5py.Dataset) -> Type[BaseModel]:
mod = get_model(cls.parent)
return mod.model_fields[cls.name.split('/')[-1]].annotation
def find_references(h5f: h5py.File, path: str) -> List[str]:
    """
    Find all objects that make a reference to a given object in

    * Attributes
    * Dataset-level dtype (a dataset of references)
    * Compound datasets (a dataset with one "column" of references)

    Notes:
        This is extremely slow because every object in the file is visited and
        every reference it holds is dereferenced.
        PR if you want to make this faster!

    Args:
        h5f (:class:`h5py.File`): Open hdf5 file
        path (str): Path to search for references to

    Returns:
        list[str]: List of paths that reference the given path
    """
    references = []

    def _find_references(name, obj: h5py.Group | h5py.Dataset):
        pbar.update()
        refs = []

        # references stored directly as attribute values
        for attr in obj.attrs.values():
            if isinstance(attr, h5py.h5r.Reference):
                refs.append(attr)

        if isinstance(obj, h5py.Dataset):
            if h5py.check_ref_dtype(obj.dtype) is not None:
                # dataset is all references.
                # The dtype metadata stores the reference *class*, so an
                # ``isinstance`` test against it never matches;
                # ``check_ref_dtype`` is the supported way to ask.
                refs.extend(np.atleast_1d(obj[()]).ravel().tolist())
            elif obj.dtype.names:
                # compound dtype: inspect each field ("column") separately.
                # The loop variable must not be called ``name``, otherwise it
                # clobbers the path of the object currently being visited.
                for field in obj.dtype.names:
                    column = np.atleast_1d(obj[field]).ravel()
                    # guard against empty datasets before peeking at item 0
                    if column.size > 0 and isinstance(column[0], h5py.h5r.Reference):
                        refs.extend(column.tolist())

        for ref in refs:
            # skip anything that is not a reference, and null references,
            # which cannot be dereferenced
            if not isinstance(ref, h5py.h5r.Reference) or not ref:
                continue
            refname = h5f[ref].name
            # compare the *target* of the reference to the searched path,
            # and record the object that holds the reference
            if refname == path:
                references.append(name)
                # one hit is enough for this object, avoid duplicates
                return

    # the progress bar has no known total: we don't know how many objects
    # the file contains before visiting them
    with tqdm() as pbar:
        h5f.visititems(_find_references)

    return references
def truncate_file(source: Path, target: Optional[Path] = None, n:int=10) -> Path:
"""
@ -266,22 +325,32 @@ def truncate_file(source: Path, target: Optional[Path] = None, n:int=10) -> Path
# and also a temporary file that we'll make with h5repack
target_tmp = target.parent / (target.stem + '_tmp.hdf5')
# copy the whole thing
if target.exists():
target.unlink()
print(f'Copying {source} to {target}...')
shutil.copy(source, target)
os.chmod(target, 0o774)
h5f_target = h5py.File(str(target), 'r+')
def _prune_dataset(name:str, obj: h5py.Dataset | h5py.Group):
to_resize = []
def _need_resizing(name:str, obj: h5py.Dataset | h5py.Group):
if isinstance(obj, h5py.Dataset):
if obj.size > 10:
if obj.size > n:
to_resize.append(name)
print('Resizing datasets...')
# first we get the items that need to be resized and then resize them below
# problems with writing to the file from within the visititems call
h5f_target = h5py.File(str(target), 'r+')
h5f_target.visititems(_need_resizing)
for resize in to_resize:
obj = h5f_target.get(resize)
try:
obj.resize(n, axis=0)
except TypeError:
# contiguous arrays cant be resized directly
# so we have to jank our way through it
# contiguous arrays cant be trivially resized, so we have to copy and create a new dataset
tmp_name = obj.name + '__tmp'
original_name = obj.name
obj.parent.move(obj.name, tmp_name)
@ -291,8 +360,6 @@ def truncate_file(source: Path, target: Optional[Path] = None, n:int=10) -> Path
new_obj.attrs[k] = v
del new_obj.parent[tmp_name]
h5f_target.visititems(_prune_dataset)
h5f_target.flush()
h5f_target.close()
@ -301,6 +368,7 @@ def truncate_file(source: Path, target: Optional[Path] = None, n:int=10) -> Path
warnings.warn('Truncated file made, but since h5repack not found in path, file wont be any smaller')
return target
print('Repacking hdf5...')
res = subprocess.run(
['h5repack', '-f', 'GZIP=9', str(target), str(target_tmp)],
capture_output=True

View file

@ -82,7 +82,8 @@ allowed_precisions = {
'uint32': ['uint32', 'uint64'],
'float16': ['float16', 'float32', 'float64'],
'float32': ['float32', 'float64'],
'utf': ['ascii']
'utf': ['ascii'],
'number': ['short', 'int', 'long', 'int16', 'int32', 'int64', 'uint', 'uint8', 'uint16', 'uint32', 'uint64', 'float', 'float16', 'float32', 'float64']
}
"""
Following HDMF, it turns out that specifying precision actually specifies minimum precision

View file

@ -4,14 +4,10 @@ Extension of nptyping NDArray for pydantic that allows for JSON-Schema serializa
* Order to store data in (row first)
"""
import base64
import pdb
from pathlib import Path
from typing import (
Any,
Callable,
Annotated,
Generic,
TypeVar
)
import sys
from copy import copy
@ -60,6 +56,7 @@ class NDArray(_NDArray):
def validate_dtype(value: np.ndarray) -> np.ndarray:
if dtype is Any:
return value
assert value.dtype == dtype or value.dtype.name in allowed_precisions[dtype.__name__], f"Invalid dtype! expected {dtype}, got {value.dtype}"
return value
def validate_array(value: Any) -> np.ndarray:

Binary file not shown.

View file

@ -1,3 +1,8 @@
aibs.nwb
- https://dandiarchive.org/dandiset/000021/
- 000021/sub-738651046/sub-738651046_ses-760693773.nwb
- truncated datasets to length 10
aibs_ecephys.nwb
- https://dandiarchive.org/dandiset/000021/
- 000021/sub-738651046/sub-738651046_ses-760693773_probe-769322820_ecephys.nwb

View file

@ -18,7 +18,6 @@ def test_hdf_read():
io = HDF5IO(path=NWBFILE)
model = io.read()
pdb.set_trace()
@pytest.mark.skip()
def test_truncate_file(tmp_output_dir):