# nwb-linkml/nwb_linkml/tests/test_io/test_io_hdf5.py

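"""
Tests for nwb_linkml.io.hdf5: reading NWB files, truncating HDF5 files into
small test fixtures, and building dependency graphs of HDF5 objects.
"""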
import h5py
import networkx as nx
import numpy as np
import pytest
from nwb_linkml.io.hdf5 import (
    HDF5IO,
    filter_dependency_graph,
    hdf_dependency_graph,
    resolve_hardlink,
    truncate_file,
)


@pytest.mark.skip()
@pytest.mark.parametrize("dset", ["aibs.nwb", "aibs_ecephys.nwb"])
def test_hdf_read(data_dir, dset):
    NWBFILE = data_dir / dset
    io = HDF5IO(path=NWBFILE)
    # the test for now is just whether we can read it lol
    _ = io.read()


def test_truncate_file(tmp_output_dir):
    source = tmp_output_dir / "truncate_source.hdf5"

    # create a dang ol hdf5 file with some big datasets and object references, and
    # make sure we truncate the datasets and preserve the references
    h5f = h5py.File(str(source), "w")
    data_group = h5f.create_group("data")
    dataset_contig = h5f.create_dataset(
        "/data/dataset_contig",
        data=np.zeros((1000, 30, 40), dtype=np.float64),
        compression="gzip",
        compression_opts=9,
    )
    dataset_chunked = h5f.create_dataset(
        "/data/dataset_chunked",
        data=np.zeros((1000, 40, 50), dtype=np.float64),
        compression="gzip",
        compression_opts=9,
        chunks=True,
    )
    dataset_contig.attrs["reference_other"] = dataset_chunked.ref
    dataset_chunked.attrs["reference_other"] = dataset_contig.ref
    dataset_contig.attrs["anattr"] = 1
    link_group = h5f.create_group("link/child")
    link_group.attrs["reference_contig"] = dataset_contig.ref
    link_group.attrs["reference_chunked"] = dataset_chunked.ref
    h5f.flush()
    h5f.close()
    source_size = source.stat().st_size

    # call it without providing a target, to check that we construct the target filename correctly
    n = 10
    target_output = truncate_file(source, n=n)
    assert target_output == source.parent / (source.stem + "_truncated.hdf5")

    # check that we actually made it smaller
    target_size = target_output.stat().st_size
    # empirically, the source file is ~125KiB and the truncated file is ~17KiB
    assert target_size < source_size / 5

    # then check that we have what's expected in the file
    target_h5f = h5py.File(target_output, "r")

    # truncation happened
    assert target_h5f["data"]["dataset_contig"].shape == (n, 30, 40)
    assert target_h5f["data"]["dataset_chunked"].shape == (n, 40, 50)

    # references still work.
    # object identity can't be checked directly with "is", so instead check
    # that the references dereference, and dereference to the right place
    assert (
        target_h5f[target_h5f["data"]["dataset_contig"].attrs["reference_other"]].name
        == target_h5f["data"]["dataset_chunked"].name
    )
    assert (
        target_h5f[target_h5f["data"]["dataset_chunked"].attrs["reference_other"]].name
        == target_h5f["data"]["dataset_contig"].name
    )
    assert (
        target_h5f[target_h5f["link"]["child"].attrs["reference_contig"]].name
        == target_h5f["data"]["dataset_contig"].name
    )
    assert (
        target_h5f[target_h5f["link"]["child"].attrs["reference_chunked"]].name
        == target_h5f["data"]["dataset_chunked"].name
    )
    assert target_h5f["data"]["dataset_contig"].attrs["anattr"] == 1
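

# For orientation: the core of what truncate_file does to each dataset is
# roughly the copy below. A minimal sketch using only h5py (an assumption
# about the approach, not nwb_linkml's actual implementation, which also
# has to rewrite the attributes, references, and links asserted on above):
def _truncate_dataset_sketch(src: h5py.Dataset, dst: h5py.File, n: int) -> h5py.Dataset:
    """Copy the first ``n`` rows of ``src`` into ``dst`` at the same path."""
    # create_dataset with an absolute path creates intermediate groups as needed
    return dst.create_dataset(
        src.name,
        data=src[:n],
        compression=src.compression,
        compression_opts=src.compression_opts,
    )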


def test_dependencies_hardlink(nwb_file):
    """
    Test that hardlinks are resolved, e.g. from
    /processing/ecephys/LFP/ElectricalSeries/electrodes
    to /acquisition/ElectricalSeries/electrodes

    Args:
        nwb_file: fixture path to the test NWB file
    """
    parent = "/processing/ecephys/LFP/ElectricalSeries"
    source = "/processing/ecephys/LFP/ElectricalSeries/electrodes"
    target = "/acquisition/ElectricalSeries/electrodes"

    # assert that the hardlink exists in the test file
    with h5py.File(str(nwb_file), "r") as h5f:
        node = h5f.get(source)
        linked_node = resolve_hardlink(node)
        assert linked_node == target

    graph = hdf_dependency_graph(nwb_file)
    # the parent should link to the target as a child
    assert (parent, target) in graph.edges([parent])
    assert graph.edges[parent, target]["label"] == "child"
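

# Aside on the mechanics: an HDF5 hardlink is just a second name for the same
# object, so there is no link path to read back; identity has to be checked on
# the objects themselves. A minimal sketch of that check (h5py compares
# Group/Dataset objects by underlying HDF5 object identity, so == is True for
# two names hardlinked to one object); illustrative, not part of nwb_linkml:
def _is_hardlinked_sketch(h5f: h5py.File, name_a: str, name_b: str) -> bool:
    """True if both names dereference to the same underlying HDF5 object."""
    return h5f[name_a] == h5f[name_b]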


@pytest.mark.dev
def test_dependency_graph_images(nwb_file, tmp_output_dir):
    """
    Generate images of the dependency graph
    """
    graph = hdf_dependency_graph(nwb_file)
    A_unfiltered = nx.nx_agraph.to_agraph(graph)
    A_unfiltered.draw(tmp_output_dir / "test_nwb_unfiltered.png", prog="dot")

    graph = filter_dependency_graph(graph)
    A_filtered = nx.nx_agraph.to_agraph(graph)
    A_filtered.draw(tmp_output_dir / "test_nwb_filtered.png", prog="dot")


@pytest.mark.parametrize(
    "dset",
    [
        {"name": "aibs.nwb", "source": "sub-738651046_ses-760693773.nwb"},
        {
            "name": "aibs_ecephys.nwb",
            "source": "sub-738651046_ses-760693773_probe-769322820_ecephys.nwb",
        },
    ],
)
@pytest.mark.dev
def test_make_truncated_datasets(tmp_output_dir, data_dir, dset):
    input_file = tmp_output_dir / dset["source"]
    output_file = data_dir / dset["name"]
    if not input_file.exists():
        pytest.skip(f"truncation source {input_file} does not exist")

    truncate_file(input_file, output_file, 10)
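    # note: the outputs land in data_dir under the same names ("aibs.nwb",
    # "aibs_ecephys.nwb") that test_hdf_read above parametrizes over, so this
    # dev test regenerates the fixtures that test_hdf_read reads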