fix truncate hdf5 file to not try and write while iterating
parent: ab63ea071c
commit: 3b11afded6
11 changed files with 123 additions and 33 deletions
@@ -5,4 +5,5 @@ omit =
 */nwb_schema_language/*
 */nwb_linkml/models/*
 */tests/*
 */plot.py
+*/nwb_linkml/types/df.py
nwb_linkml/poetry.lock (generated): 26 lines changed
@@ -1,4 +1,4 @@
-# This file is automatically @generated by Poetry 1.5.1 and should not be changed by hand.
+# This file is automatically @generated by Poetry 1.6.1 and should not be changed by hand.
 
 [[package]]
 name = "annotated-types"
@@ -2364,7 +2364,7 @@ files = [
 ]
 
 [package.dependencies]
-greenlet = {version = "!=0.4.17", markers = "platform_machine == \"win32\" or platform_machine == \"WIN32\" or platform_machine == \"AMD64\" or platform_machine == \"amd64\" or platform_machine == \"x86_64\" or platform_machine == \"ppc64le\" or platform_machine == \"aarch64\""}
+greenlet = {version = "!=0.4.17", markers = "platform_machine == \"aarch64\" or platform_machine == \"ppc64le\" or platform_machine == \"x86_64\" or platform_machine == \"amd64\" or platform_machine == \"AMD64\" or platform_machine == \"win32\" or platform_machine == \"WIN32\""}
 typing-extensions = ">=4.2.0"
 
 [package.extras]
|
@ -2416,6 +2416,26 @@ files = [
|
|||
{file = "toolz-0.12.0.tar.gz", hash = "sha256:88c570861c440ee3f2f6037c4654613228ff40c93a6c25e0eba70d17282c6194"},
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "tqdm"
|
||||
version = "4.66.1"
|
||||
description = "Fast, Extensible Progress Meter"
|
||||
optional = false
|
||||
python-versions = ">=3.7"
|
||||
files = [
|
||||
{file = "tqdm-4.66.1-py3-none-any.whl", hash = "sha256:d302b3c5b53d47bce91fea46679d9c3c6508cf6332229aa1e7d8653723793386"},
|
||||
{file = "tqdm-4.66.1.tar.gz", hash = "sha256:d88e651f9db8d8551a62556d3cff9e3034274ca5d66e93197cf2490e2dcb69c7"},
|
||||
]
|
||||
|
||||
[package.dependencies]
|
||||
colorama = {version = "*", markers = "platform_system == \"Windows\""}
|
||||
|
||||
[package.extras]
|
||||
dev = ["pytest (>=6)", "pytest-cov", "pytest-timeout", "pytest-xdist"]
|
||||
notebook = ["ipywidgets (>=6)"]
|
||||
slack = ["slack-sdk"]
|
||||
telegram = ["requests"]
|
||||
|
||||
[[package]]
|
||||
name = "typing-extensions"
|
||||
version = "4.8.0"
|
||||
|
@@ -2635,4 +2655,4 @@ tests = ["coverage", "coveralls", "pytest", "pytest-cov", "pytest-depends", "pyt
 [metadata]
 lock-version = "2.0"
 python-versions = ">=3.11,<3.13"
-content-hash = "6c8e41c25a97368f31f190b4d76c3dd839a3b2b364c8653dcdd00ffc258a2547"
+content-hash = "b4afc2881969650e4a9f75c4148aeaa1d1af308f27170ebd76a1c79fcdf71555"
@@ -32,6 +32,7 @@ pytest-profiling = {version = "^1.7.0", optional = true}
 pydantic-settings = "^2.0.3"
 dask = "^2023.9.2"
 blosc2 = "^2.2.7"
+tqdm = "^4.66.1"
 
 
 [tool.poetry.extras]
@@ -1,5 +1,3 @@
from nwb_linkml.monkeypatch import apply_patches
apply_patches()

from nwb_linkml.maps import preload
@@ -20,16 +20,19 @@ Other TODO:
 """
 import pdb
 import warnings
-from typing import Optional, Dict, overload, Type, Union
+from typing import Optional, Dict, overload, Type, Union, List
 from pathlib import Path
 from types import ModuleType
 from typing import TYPE_CHECKING, NamedTuple
 import json
 import subprocess
 import shutil
 import os
 
 import h5py
 from pydantic import BaseModel
+from tqdm import tqdm
+import numpy as np
 
 from nwb_linkml.maps.hdf5 import H5SourceItem, flatten_hdf, ReadPhases, ReadQueue
 from nwb_linkml.translate import generate_from_nwbfile
@@ -241,6 +244,62 @@ def get_model(cls: h5py.Group | h5py.Dataset) -> Type[BaseModel]:
         mod = get_model(cls.parent)
         return mod.model_fields[cls.name.split('/')[-1]].annotation
 
+
+def find_references(h5f: h5py.File, path: str) -> List[str]:
+    """
+    Find all objects that make a reference to a given object in
+
+    * Attributes
+    * Dataset-level dtype (a dataset of references)
+    * Compound datasets (a dataset with one "column" of references)
+
+    Notes:
+        This is extremely slow because we collect all references first,
+        rather than checking them as we go and quitting early. PR if you want to make this faster!
+
+    Args:
+        h5f (:class:`h5py.File`): Open hdf5 file
+        path (str): Path to search for references to
+
+    Returns:
+        list[str]: List of paths that reference the given path
+    """
+    references = []
+
+    def _find_references(name, obj: h5py.Group | h5py.Dataset):
+        pbar.update()
+        refs = []
+        for attr in obj.attrs.values():
+            if isinstance(attr, h5py.h5r.Reference):
+                refs.append(attr)
+
+        if isinstance(obj, h5py.Dataset):
+            # dataset is all references
+            if obj.dtype.metadata is not None and isinstance(obj.dtype.metadata.get('ref', None), h5py.h5r.Reference):
+                refs.extend(obj[:].tolist())
+            # compound dtype
+            elif isinstance(obj.dtype, np.dtypes.VoidDType):
+                # use a separate loop variable so the outer `name` argument is not shadowed
+                for col in obj.dtype.names:
+                    if isinstance(obj[col][0], h5py.h5r.Reference):
+                        refs.extend(obj[col].tolist())
+
+        for ref in refs:
+            assert isinstance(ref, h5py.h5r.Reference)
+            refname = h5f[ref].name
+            if refname == path:
+                references.append(name)
+                return
+
+    pbar = tqdm()
+    try:
+        h5f.visititems(_find_references)
+    finally:
+        pbar.close()
+    return references
+
+
 def truncate_file(source: Path, target: Optional[Path] = None, n:int=10) -> Path:
     """
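A minimal sketch of how the new helper might be called. The module path and the example file and dataset paths are assumptions for illustration, not part of the commit:

import h5py

from nwb_linkml.io.hdf5 import find_references  # module path assumed

# open an NWB file read-only and list every object that holds a reference
# to a hypothetical electrode-table path
with h5py.File("example.nwb", "r") as h5f:
    referrers = find_references(h5f, "/general/extracellular_ephys/electrodes")
    print(referrers)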
@@ -266,33 +325,41 @@ def truncate_file(source: Path, target: Optional[Path] = None, n:int=10) -> Path
     # and also a temporary file that we'll make with h5repack
     target_tmp = target.parent / (target.stem + '_tmp.hdf5')
 
     # copy the whole thing
     if target.exists():
         target.unlink()
     print(f'Copying {source} to {target}...')
     shutil.copy(source, target)
     os.chmod(target, 0o774)
 
-    h5f_target = h5py.File(str(target), 'r+')
-    def _prune_dataset(name:str, obj: h5py.Dataset | h5py.Group):
+    to_resize = []
+    def _need_resizing(name:str, obj: h5py.Dataset | h5py.Group):
         if isinstance(obj, h5py.Dataset):
-            if obj.size > 10:
-                try:
-                    obj.resize(n, axis=0)
-                except TypeError:
-                    # contiguous arrays cant be resized directly
-                    # so we have to jank our way through it
-                    tmp_name = obj.name + '__tmp'
-                    original_name = obj.name
-                    obj.parent.move(obj.name, tmp_name)
-                    old_obj = obj.parent.get(tmp_name)
-                    new_obj = obj.parent.create_dataset(original_name, data=old_obj[0:n])
-                    for k, v in old_obj.attrs.items():
-                        new_obj.attrs[k] = v
-                    del new_obj.parent[tmp_name]
+            if obj.size > n:
+                to_resize.append(name)
 
-    h5f_target.visititems(_prune_dataset)
+    print('Resizing datasets...')
+    # first we get the items that need to be resized and then resize them below,
+    # to avoid problems with writing to the file from within the visititems call
+    h5f_target = h5py.File(str(target), 'r+')
+    h5f_target.visititems(_need_resizing)
+
+    for resize in to_resize:
+        obj = h5f_target.get(resize)
+        try:
+            obj.resize(n, axis=0)
+        except TypeError:
+            # contiguous arrays cant be trivially resized, so we have to copy and create a new dataset
+            tmp_name = obj.name + '__tmp'
+            original_name = obj.name
+            obj.parent.move(obj.name, tmp_name)
+            old_obj = obj.parent.get(tmp_name)
+            new_obj = obj.parent.create_dataset(original_name, data=old_obj[0:n])
+            for k, v in old_obj.attrs.items():
+                new_obj.attrs[k] = v
+            del new_obj.parent[tmp_name]
 
     h5f_target.flush()
     h5f_target.close()
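The shape of the fix, reduced to a self-contained toy sketch (the file name and dataset layout are made up, and this is not the commit's own code): record the paths of oversized datasets while visititems iterates, and only mutate the file once iteration has finished.

import h5py
import numpy as np

N = 10

# build a throwaway file with one oversized and one small dataset
with h5py.File("toy.hdf5", "w") as f:
    f.create_dataset("big", data=np.arange(100), maxshape=(None,))
    f.create_dataset("small", data=np.arange(5), maxshape=(None,))

to_resize = []

def _need_resizing(name, obj):
    # pass 1: only record paths, no writes while visititems is iterating
    if isinstance(obj, h5py.Dataset) and obj.size > N:
        to_resize.append(name)

with h5py.File("toy.hdf5", "r+") as f:
    f.visititems(_need_resizing)
    # pass 2: iteration is over, so it is now safe to modify the file
    for name in to_resize:
        f[name].resize(N, axis=0)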
@@ -301,6 +368,7 @@ def truncate_file(source: Path, target: Optional[Path] = None, n:int=10) -> Path
         warnings.warn('Truncated file made, but since h5repack not found in path, file wont be any smaller')
         return target
 
+    print('Repacking hdf5...')
     res = subprocess.run(
         ['h5repack', '-f', 'GZIP=9', str(target), str(target_tmp)],
         capture_output=True
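For context, a hedged usage sketch of truncate_file as a test-fixture helper. The module path is assumed, the source file name is taken from the test data README further down, and the target location is only illustrative:

from pathlib import Path

from nwb_linkml.io.hdf5 import truncate_file  # module path assumed

# shrink a full-size DANDI file into a small fixture with 10-element datasets
fixture = truncate_file(
    source=Path("sub-738651046_ses-760693773.nwb"),
    target=Path("nwb_linkml/tests/data/aibs.nwb"),  # illustrative target
    n=10,
)
print(fixture)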
@@ -82,7 +82,8 @@ allowed_precisions = {
     'uint32': ['uint32', 'uint64'],
     'float16': ['float16', 'float32', 'float64'],
     'float32': ['float32', 'float64'],
-    'utf': ['ascii']
+    'utf': ['ascii'],
+    'number': ['short', 'int', 'long', 'int16', 'int32', 'int64', 'uint', 'uint8', 'uint16', 'uint32', 'uint64', 'float', 'float16', 'float32', 'float64']
 }
 """
 Following HDMF, it turns out that specifying precision actually specifies minimum precision
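Per the docstring quoted in this hunk, the lists are read as minimum precisions: a value declared at one dtype may be stored at that dtype or anything wider. A rough sketch of the check, with a trimmed excerpt of the mapping above and a made-up array:

import numpy as np

# trimmed excerpt of the mapping edited above
allowed_precisions = {
    'float32': ['float32', 'float64'],
    'utf': ['ascii'],
}

spec_dtype = 'float32'                   # dtype named by a schema
value = np.zeros(3, dtype='float64')     # made-up data stored at higher precision
assert value.dtype.name in allowed_precisions[spec_dtype]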
@@ -4,14 +4,10 @@ Extension of nptyping NDArray for pydantic that allows for JSON-Schema serializa
* Order to store data in (row first)
"""
import base64
import pdb
from pathlib import Path
from typing import (
    Any,
    Callable,
    Annotated,
    Generic,
    TypeVar
)
import sys
from copy import copy

@@ -60,6 +56,7 @@ class NDArray(_NDArray):
    def validate_dtype(value: np.ndarray) -> np.ndarray:
        if dtype is Any:
            return value

        assert value.dtype == dtype or value.dtype.name in allowed_precisions[dtype.__name__], f"Invalid dtype! expected {dtype}, got {value.dtype}"
        return value
    def validate_array(value: Any) -> np.ndarray:
BIN  nwb_linkml/tests/data/aibs.nwb  (new file)
Binary file not shown.
@@ -1,4 +1,9 @@
+aibs.nwb
+- https://dandiarchive.org/dandiset/000021/
+- 000021/sub-738651046/sub-738651046_ses-760693773.nwb
+- truncated datasets to length 10
+
 aibs_ecephys.nwb
 - https://dandiarchive.org/dandiset/000021/
 - 000021/sub-738651046/sub-738651046_ses-760693773_probe-769322820_ecephys.nwb
 - truncated datasets to length 10
@@ -18,7 +18,6 @@ def test_hdf_read():
     io = HDF5IO(path=NWBFILE)
     model = io.read()
 
-    pdb.set_trace()
 
 @pytest.mark.skip()
 def test_truncate_file(tmp_output_dir):