mirror of
https://github.com/p2p-ld/nwb-linkml.git
synced 2025-01-10 14:14:27 +00:00
fix truncate hdf5 file to not try and write while iterating
This commit is contained in:
parent
ab63ea071c
commit
3b11afded6
11 changed files with 123 additions and 33 deletions
|
@ -6,3 +6,4 @@ omit =
|
||||||
*/nwb_linkml/models/*
|
*/nwb_linkml/models/*
|
||||||
*/tests/*
|
*/tests/*
|
||||||
*/plot.py
|
*/plot.py
|
||||||
|
*/nwb_linkml/types/df.py
|
26
nwb_linkml/poetry.lock
generated
26
nwb_linkml/poetry.lock
generated
|
@ -1,4 +1,4 @@
|
||||||
# This file is automatically @generated by Poetry 1.5.1 and should not be changed by hand.
|
# This file is automatically @generated by Poetry 1.6.1 and should not be changed by hand.
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "annotated-types"
|
name = "annotated-types"
|
||||||
|
@ -2364,7 +2364,7 @@ files = [
|
||||||
]
|
]
|
||||||
|
|
||||||
[package.dependencies]
|
[package.dependencies]
|
||||||
greenlet = {version = "!=0.4.17", markers = "platform_machine == \"win32\" or platform_machine == \"WIN32\" or platform_machine == \"AMD64\" or platform_machine == \"amd64\" or platform_machine == \"x86_64\" or platform_machine == \"ppc64le\" or platform_machine == \"aarch64\""}
|
greenlet = {version = "!=0.4.17", markers = "platform_machine == \"aarch64\" or platform_machine == \"ppc64le\" or platform_machine == \"x86_64\" or platform_machine == \"amd64\" or platform_machine == \"AMD64\" or platform_machine == \"win32\" or platform_machine == \"WIN32\""}
|
||||||
typing-extensions = ">=4.2.0"
|
typing-extensions = ">=4.2.0"
|
||||||
|
|
||||||
[package.extras]
|
[package.extras]
|
||||||
|
@ -2416,6 +2416,26 @@ files = [
|
||||||
{file = "toolz-0.12.0.tar.gz", hash = "sha256:88c570861c440ee3f2f6037c4654613228ff40c93a6c25e0eba70d17282c6194"},
|
{file = "toolz-0.12.0.tar.gz", hash = "sha256:88c570861c440ee3f2f6037c4654613228ff40c93a6c25e0eba70d17282c6194"},
|
||||||
]
|
]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "tqdm"
|
||||||
|
version = "4.66.1"
|
||||||
|
description = "Fast, Extensible Progress Meter"
|
||||||
|
optional = false
|
||||||
|
python-versions = ">=3.7"
|
||||||
|
files = [
|
||||||
|
{file = "tqdm-4.66.1-py3-none-any.whl", hash = "sha256:d302b3c5b53d47bce91fea46679d9c3c6508cf6332229aa1e7d8653723793386"},
|
||||||
|
{file = "tqdm-4.66.1.tar.gz", hash = "sha256:d88e651f9db8d8551a62556d3cff9e3034274ca5d66e93197cf2490e2dcb69c7"},
|
||||||
|
]
|
||||||
|
|
||||||
|
[package.dependencies]
|
||||||
|
colorama = {version = "*", markers = "platform_system == \"Windows\""}
|
||||||
|
|
||||||
|
[package.extras]
|
||||||
|
dev = ["pytest (>=6)", "pytest-cov", "pytest-timeout", "pytest-xdist"]
|
||||||
|
notebook = ["ipywidgets (>=6)"]
|
||||||
|
slack = ["slack-sdk"]
|
||||||
|
telegram = ["requests"]
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "typing-extensions"
|
name = "typing-extensions"
|
||||||
version = "4.8.0"
|
version = "4.8.0"
|
||||||
|
@ -2635,4 +2655,4 @@ tests = ["coverage", "coveralls", "pytest", "pytest-cov", "pytest-depends", "pyt
|
||||||
[metadata]
|
[metadata]
|
||||||
lock-version = "2.0"
|
lock-version = "2.0"
|
||||||
python-versions = ">=3.11,<3.13"
|
python-versions = ">=3.11,<3.13"
|
||||||
content-hash = "6c8e41c25a97368f31f190b4d76c3dd839a3b2b364c8653dcdd00ffc258a2547"
|
content-hash = "b4afc2881969650e4a9f75c4148aeaa1d1af308f27170ebd76a1c79fcdf71555"
|
||||||
|
|
|
@ -32,6 +32,7 @@ pytest-profiling = {version = "^1.7.0", optional = true}
|
||||||
pydantic-settings = "^2.0.3"
|
pydantic-settings = "^2.0.3"
|
||||||
dask = "^2023.9.2"
|
dask = "^2023.9.2"
|
||||||
blosc2 = "^2.2.7"
|
blosc2 = "^2.2.7"
|
||||||
|
tqdm = "^4.66.1"
|
||||||
|
|
||||||
|
|
||||||
[tool.poetry.extras]
|
[tool.poetry.extras]
|
||||||
|
|
|
@ -1,5 +1,3 @@
|
||||||
from nwb_linkml.monkeypatch import apply_patches
|
from nwb_linkml.monkeypatch import apply_patches
|
||||||
apply_patches()
|
apply_patches()
|
||||||
|
|
||||||
from nwb_linkml.maps import preload
|
|
||||||
|
|
||||||
|
|
|
@ -20,16 +20,19 @@ Other TODO:
|
||||||
"""
|
"""
|
||||||
import pdb
|
import pdb
|
||||||
import warnings
|
import warnings
|
||||||
from typing import Optional, Dict, overload, Type, Union
|
from typing import Optional, Dict, overload, Type, Union, List
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
from types import ModuleType
|
from types import ModuleType
|
||||||
from typing import TYPE_CHECKING, NamedTuple
|
from typing import TYPE_CHECKING, NamedTuple
|
||||||
import json
|
import json
|
||||||
import subprocess
|
import subprocess
|
||||||
import shutil
|
import shutil
|
||||||
|
import os
|
||||||
|
|
||||||
import h5py
|
import h5py
|
||||||
from pydantic import BaseModel
|
from pydantic import BaseModel
|
||||||
|
from tqdm import tqdm
|
||||||
|
import numpy as np
|
||||||
|
|
||||||
from nwb_linkml.maps.hdf5 import H5SourceItem, flatten_hdf, ReadPhases, ReadQueue
|
from nwb_linkml.maps.hdf5 import H5SourceItem, flatten_hdf, ReadPhases, ReadQueue
|
||||||
from nwb_linkml.translate import generate_from_nwbfile
|
from nwb_linkml.translate import generate_from_nwbfile
|
||||||
|
@ -241,6 +244,62 @@ def get_model(cls: h5py.Group | h5py.Dataset) -> Type[BaseModel]:
|
||||||
mod = get_model(cls.parent)
|
mod = get_model(cls.parent)
|
||||||
return mod.model_fields[cls.name.split('/')[-1]].annotation
|
return mod.model_fields[cls.name.split('/')[-1]].annotation
|
||||||
|
|
||||||
|
def find_references(h5f: h5py.File, path: str) -> List[str]:
    """
    Find all objects in an open hdf5 file that hold a reference to a given object.

    References are looked for in:

    * Attributes
    * Dataset-level dtype (a dataset of references)
    * Compound datasets (a dataset with one "column" of references)

    Notes:
        This is extremely slow because every object in the file is visited and
        every reference it holds is dereferenced; the walk does not quit early.
        PR if you want to make this faster!

    Args:
        h5f (:class:`h5py.File`): Open hdf5 file
        path (str): Path to search for references to. Compared against the
            ``.name`` of each dereferenced target, so it should be given in the
            same form h5py reports names (presumably absolute, e.g. ``/a/b`` --
            confirm against callers).

    Returns:
        list[str]: Names of the objects that reference the given path, as passed
        to the ``visititems`` callback (i.e. relative to ``h5f``).
    """
    references = []

    def _as_reference_list(values) -> list:
        # Flatten to 1-D so N-d (and scalar) reads yield a flat list of objects
        # rather than nested lists.
        return np.asarray(values, dtype=object).ravel().tolist()

    def _find_references(name, obj: h5py.Group | h5py.Dataset):
        pbar.update()
        refs = []
        for attr in obj.attrs.values():
            if isinstance(attr, h5py.h5r.Reference):
                refs.append(attr)

        # Datasets with a null dataspace (shape is None) have no elements to read.
        if isinstance(obj, h5py.Dataset) and obj.shape is not None:
            if h5py.check_ref_dtype(obj.dtype) is not None:
                # dataset is all references. The dtype metadata stores the
                # reference *class*, so it has to be checked with
                # check_ref_dtype rather than isinstance.
                refs.extend(_as_reference_list(obj[()]))
            elif obj.dtype.names is not None:
                # compound dtype: check each field ("column") separately.
                # The loop variable must not shadow ``name``, which is the
                # visited object's name that gets reported below.
                for field in obj.dtype.names:
                    column = _as_reference_list(obj[field])
                    # guard against empty datasets before peeking at the first item
                    if column and isinstance(column[0], h5py.h5r.Reference):
                        refs.extend(column)

        for ref in refs:
            # Null (unset) references are falsy and cannot be dereferenced;
            # skip them along with anything that is not a reference at all.
            if not isinstance(ref, h5py.h5r.Reference) or not ref:
                continue
            refname = h5f[ref].name
            if refname == path:
                references.append(name)
                # One hit is enough for this object. Returning None keeps
                # visititems walking the rest of the file.
                return

    pbar = tqdm()
    try:
        h5f.visititems(_find_references)
    finally:
        pbar.close()
    return references
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
def truncate_file(source: Path, target: Optional[Path] = None, n:int=10) -> Path:
|
def truncate_file(source: Path, target: Optional[Path] = None, n:int=10) -> Path:
|
||||||
"""
|
"""
|
||||||
|
@ -266,33 +325,41 @@ def truncate_file(source: Path, target: Optional[Path] = None, n:int=10) -> Path
|
||||||
# and also a temporary file that we'll make with h5repack
|
# and also a temporary file that we'll make with h5repack
|
||||||
target_tmp = target.parent / (target.stem + '_tmp.hdf5')
|
target_tmp = target.parent / (target.stem + '_tmp.hdf5')
|
||||||
|
|
||||||
|
|
||||||
# copy the whole thing
|
# copy the whole thing
|
||||||
if target.exists():
|
if target.exists():
|
||||||
target.unlink()
|
target.unlink()
|
||||||
|
print(f'Copying {source} to {target}...')
|
||||||
shutil.copy(source, target)
|
shutil.copy(source, target)
|
||||||
|
os.chmod(target, 0o774)
|
||||||
|
|
||||||
h5f_target = h5py.File(str(target), 'r+')
|
to_resize = []
|
||||||
def _prune_dataset(name:str, obj: h5py.Dataset | h5py.Group):
|
def _need_resizing(name:str, obj: h5py.Dataset | h5py.Group):
|
||||||
|
|
||||||
if isinstance(obj, h5py.Dataset):
|
if isinstance(obj, h5py.Dataset):
|
||||||
if obj.size > 10:
|
if obj.size > n:
|
||||||
try:
|
to_resize.append(name)
|
||||||
obj.resize(n, axis=0)
|
|
||||||
except TypeError:
|
print('Resizing datasets...')
|
||||||
# contiguous arrays cant be resized directly
|
# first we get the items that need to be resized and then resize them below
|
||||||
# so we have to jank our way through it
|
# problems with writing to the file from within the visititems call
|
||||||
tmp_name = obj.name + '__tmp'
|
h5f_target = h5py.File(str(target), 'r+')
|
||||||
original_name = obj.name
|
h5f_target.visititems(_need_resizing)
|
||||||
obj.parent.move(obj.name, tmp_name)
|
|
||||||
old_obj = obj.parent.get(tmp_name)
|
|
||||||
new_obj = obj.parent.create_dataset(original_name, data=old_obj[0:n])
|
|
||||||
for k, v in old_obj.attrs.items():
|
|
||||||
new_obj.attrs[k] = v
|
|
||||||
del new_obj.parent[tmp_name]
|
|
||||||
|
|
||||||
|
|
||||||
h5f_target.visititems(_prune_dataset)
|
for resize in to_resize:
|
||||||
|
obj = h5f_target.get(resize)
|
||||||
|
try:
|
||||||
|
obj.resize(n, axis=0)
|
||||||
|
except TypeError:
|
||||||
|
# contiguous arrays cant be trivially resized, so we have to copy and create a new dataset
|
||||||
|
tmp_name = obj.name + '__tmp'
|
||||||
|
original_name = obj.name
|
||||||
|
obj.parent.move(obj.name, tmp_name)
|
||||||
|
old_obj = obj.parent.get(tmp_name)
|
||||||
|
new_obj = obj.parent.create_dataset(original_name, data=old_obj[0:n])
|
||||||
|
for k, v in old_obj.attrs.items():
|
||||||
|
new_obj.attrs[k] = v
|
||||||
|
del new_obj.parent[tmp_name]
|
||||||
|
|
||||||
h5f_target.flush()
|
h5f_target.flush()
|
||||||
h5f_target.close()
|
h5f_target.close()
|
||||||
|
|
||||||
|
@ -301,6 +368,7 @@ def truncate_file(source: Path, target: Optional[Path] = None, n:int=10) -> Path
|
||||||
warnings.warn('Truncated file made, but since h5repack not found in path, file wont be any smaller')
|
warnings.warn('Truncated file made, but since h5repack not found in path, file wont be any smaller')
|
||||||
return target
|
return target
|
||||||
|
|
||||||
|
print('Repacking hdf5...')
|
||||||
res = subprocess.run(
|
res = subprocess.run(
|
||||||
['h5repack', '-f', 'GZIP=9', str(target), str(target_tmp)],
|
['h5repack', '-f', 'GZIP=9', str(target), str(target_tmp)],
|
||||||
capture_output=True
|
capture_output=True
|
||||||
|
|
|
@ -82,7 +82,8 @@ allowed_precisions = {
|
||||||
'uint32': ['uint32', 'uint64'],
|
'uint32': ['uint32', 'uint64'],
|
||||||
'float16': ['float16', 'float32', 'float64'],
|
'float16': ['float16', 'float32', 'float64'],
|
||||||
'float32': ['float32', 'float64'],
|
'float32': ['float32', 'float64'],
|
||||||
'utf': ['ascii']
|
'utf': ['ascii'],
|
||||||
|
'number': ['short', 'int', 'long', 'int16', 'int32', 'int64', 'uint', 'uint8', 'uint16', 'uint32', 'uint64', 'float', 'float16', 'float32', 'float64']
|
||||||
}
|
}
|
||||||
"""
|
"""
|
||||||
Following HDMF, it turns out that specifying precision actually specifies minimum precision
|
Following HDMF, it turns out that specifying precision actually specifies minimum precision
|
||||||
|
|
|
@ -4,14 +4,10 @@ Extension of nptyping NDArray for pydantic that allows for JSON-Schema serializa
|
||||||
* Order to store data in (row first)
|
* Order to store data in (row first)
|
||||||
"""
|
"""
|
||||||
import base64
|
import base64
|
||||||
import pdb
|
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
from typing import (
|
from typing import (
|
||||||
Any,
|
Any,
|
||||||
Callable,
|
Callable,
|
||||||
Annotated,
|
|
||||||
Generic,
|
|
||||||
TypeVar
|
|
||||||
)
|
)
|
||||||
import sys
|
import sys
|
||||||
from copy import copy
|
from copy import copy
|
||||||
|
@ -60,6 +56,7 @@ class NDArray(_NDArray):
|
||||||
def validate_dtype(value: np.ndarray) -> np.ndarray:
|
def validate_dtype(value: np.ndarray) -> np.ndarray:
|
||||||
if dtype is Any:
|
if dtype is Any:
|
||||||
return value
|
return value
|
||||||
|
|
||||||
assert value.dtype == dtype or value.dtype.name in allowed_precisions[dtype.__name__], f"Invalid dtype! expected {dtype}, got {value.dtype}"
|
assert value.dtype == dtype or value.dtype.name in allowed_precisions[dtype.__name__], f"Invalid dtype! expected {dtype}, got {value.dtype}"
|
||||||
return value
|
return value
|
||||||
def validate_array(value: Any) -> np.ndarray:
|
def validate_array(value: Any) -> np.ndarray:
|
||||||
|
|
BIN
nwb_linkml/tests/data/aibs.nwb
Normal file
BIN
nwb_linkml/tests/data/aibs.nwb
Normal file
Binary file not shown.
Binary file not shown.
|
@ -1,3 +1,8 @@
|
||||||
|
aibs.nwb
|
||||||
|
- https://dandiarchive.org/dandiset/000021/
|
||||||
|
- 000021/sub-738651046/sub-738651046_ses-760693773.nwb
|
||||||
|
- truncated datasets to length 10
|
||||||
|
|
||||||
aibs_ecephys.nwb
|
aibs_ecephys.nwb
|
||||||
- https://dandiarchive.org/dandiset/000021/
|
- https://dandiarchive.org/dandiset/000021/
|
||||||
- 000021/sub-738651046/sub-738651046_ses-760693773_probe-769322820_ecephys.nwb
|
- 000021/sub-738651046/sub-738651046_ses-760693773_probe-769322820_ecephys.nwb
|
||||||
|
|
|
@ -18,7 +18,6 @@ def test_hdf_read():
|
||||||
io = HDF5IO(path=NWBFILE)
|
io = HDF5IO(path=NWBFILE)
|
||||||
model = io.read()
|
model = io.read()
|
||||||
|
|
||||||
pdb.set_trace()
|
|
||||||
|
|
||||||
@pytest.mark.skip()
|
@pytest.mark.skip()
|
||||||
def test_truncate_file(tmp_output_dir):
|
def test_truncate_file(tmp_output_dir):
|
||||||
|
|
Loading…
Reference in a new issue