nwb-linkml/nwb_linkml/tests/test_io/test_io_hdf5.py

101 lines
3.4 KiB
Python
Raw Normal View History

import pdb
2024-07-02 04:44:35 +00:00
import h5py
import numpy as np
2024-07-02 04:44:35 +00:00
import pytest
2024-07-02 04:44:35 +00:00
from nwb_linkml.io.hdf5 import HDF5IO, truncate_file
2024-07-02 01:59:21 +00:00
@pytest.mark.xfail()
@pytest.mark.parametrize("dset", ["aibs.nwb", "aibs_ecephys.nwb"])
def test_hdf_read(data_dir, dset):
    """Smoke-test that ``HDF5IO`` can read each sample NWB file.

    Parametrized over the sample dataset filenames and currently marked
    ``xfail``. There are no assertions on the loaded result: the test only
    checks that ``read()`` runs to completion.

    Args:
        data_dir: fixture; joined with ``dset`` to build the file path.
        dset: filename of the NWB file to read.
    """
    nwb_path = data_dir / dset
    reader = HDF5IO(path=nwb_path)

    # the test for now is just whether we can read it at all
    reader.read()
2024-07-02 04:23:31 +00:00
def test_truncate_file(tmp_output_dir):
    """Check that ``truncate_file`` shrinks datasets and keeps references working.

    Builds an HDF5 file with two large gzip-compressed datasets that point at
    each other through object-reference attributes, plus a nested group whose
    attributes reference both datasets. The file is truncated to ``n`` rows
    and the output is checked for: the default output filename, a smaller
    file size, truncated dataset shapes, references that still resolve to the
    right objects, and a preserved plain attribute.

    Args:
        tmp_output_dir: fixture; directory the source file is written into.
    """
    source = tmp_output_dir / "truncate_source.hdf5"

    # create an hdf5 file with big datasets and some object references so we
    # can make sure we truncate the datasets and preserve the references.
    # The context manager guarantees the file is closed (and so fully written)
    # before its size is measured, even if setup fails part-way.
    with h5py.File(str(source), "w") as h5f:
        h5f.create_group("data")
        dataset_contig = h5f.create_dataset(
            "/data/dataset_contig",
            data=np.zeros((1000, 30, 40), dtype=np.float64),
            compression="gzip",
            compression_opts=9,
        )
        dataset_chunked = h5f.create_dataset(
            "/data/dataset_chunked",
            data=np.zeros((1000, 40, 50), dtype=np.float64),
            compression="gzip",
            compression_opts=9,
            chunks=True,
        )

        # the datasets reference each other ...
        dataset_contig.attrs["reference_other"] = dataset_chunked.ref
        dataset_chunked.attrs["reference_other"] = dataset_contig.ref
        dataset_contig.attrs["anattr"] = 1

        # ... and a nested group references both of them
        link_group = h5f.create_group("link/child")
        link_group.attrs["reference_contig"] = dataset_contig.ref
        link_group.attrs["reference_chunked"] = dataset_chunked.ref

    source_size = source.stat().st_size

    # do it without providing target to check that we make filename correctly
    n = 10
    target_output = truncate_file(source, n=n)
    assert target_output == source.parent / (source.stem + "_truncated.hdf5")

    # check that we actually made it smaller
    target_size = target_output.stat().st_size
    # empirically, the source dataset is ~125KB and truncated is ~17KB
    assert target_size < source_size / 5

    # then check that we have what's expected in the file
    with h5py.File(target_output, "r") as target_h5f:
        contig = target_h5f["data"]["dataset_contig"]
        chunked = target_h5f["data"]["dataset_chunked"]
        child = target_h5f["link"]["child"]

        # truncation happened
        assert contig.shape == (n, 30, 40)
        assert chunked.shape == (n, 40, 50)

        # references still work
        # can't directly assess object identity equality with "is", so this
        # tests that the references dereference, and that they dereference
        # to the right place
        assert target_h5f[contig.attrs["reference_other"]].name == chunked.name
        assert target_h5f[chunked.attrs["reference_other"]].name == contig.name
        assert target_h5f[child.attrs["reference_contig"]].name == contig.name
        assert target_h5f[child.attrs["reference_chunked"]].name == chunked.name

        # plain (non-reference) attributes survive too
        assert contig.attrs["anattr"] == 1
@pytest.mark.skip()
def test_flatten_hdf():
    """Local-only stub: flatten an NWB file and check nothing under "specifications" remains.

    Skipped unconditionally: it reads a file from a hard-coded absolute local
    path and is unfinished, ending by raising ``NotImplementedError``.

    Raises:
        NotImplementedError: always, once the checks above it have run.
    """
    import h5py

    from nwb_linkml.maps.hdf5 import flatten_hdf

    path = "/Users/jonny/Dropbox/lab/p2p_ld/data/nwb/sub-738651046_ses-760693773.nwb"

    # context manager so the file handle is released even when the assertion
    # fails or the stub raises below
    with h5py.File(path) as h5f:
        flat = flatten_hdf(h5f)

        assert not any("specifications" in v.path for v in flat.values())
        # NOTE(review): interactive breakpoint kept on purpose for local
        # exploration of `flat`; remove it when this stub becomes a real test.
        pdb.set_trace()

    raise NotImplementedError("Just a stub for local testing for now, finish me!")