nwb-linkml/nwb_linkml/tests/test_io/test_io_hdf5.py

import h5py
import networkx as nx
import numpy as np
import pytest

from nwb_linkml.io.hdf5 import (
    HDF5IO,
    filter_dependency_graph,
    hdf_dependency_graph,
    truncate_file,
    resolve_hardlink,
)


@pytest.mark.skip()
@pytest.mark.parametrize("dset", ["aibs.nwb", "aibs_ecephys.nwb"])
def test_hdf_read(data_dir, dset):
    """
    Smoke test: read one of the sample NWB files in ``data_dir`` through ``HDF5IO``.

    The test passes as long as ``read()`` completes without raising;
    the returned value is not inspected.
    """
    reader = HDF5IO(path=data_dir / dset)
    reader.read()


def test_truncate_file(tmp_output_dir):
    """
    Check that ``truncate_file`` shrinks datasets while keeping references and attributes.

    A source file is built with two large, gzip-compressed datasets that hold object
    references to one another, plus a nested group holding references to both datasets.
    After truncating to ``n`` items along the first axis the test asserts that:

    * the default output path is ``<source stem>_truncated.hdf5`` next to the source
    * the output file is substantially smaller than the source
    * both datasets have ``n`` as their first dimension, other dimensions unchanged
    * every object reference still dereferences to the expected object
    * a plain scalar attribute is preserved
    """
    source = tmp_output_dir / "truncate_source.hdf5"
    n = 10

    # Build the source file. The context manager guarantees the handle is flushed and
    # closed even if one of the creation calls raises.
    with h5py.File(str(source), "w") as h5f:
        h5f.create_group("data")
        dataset_contig = h5f.create_dataset(
            "/data/dataset_contig",
            data=np.zeros((1000, 30, 40), dtype=np.float64),
            compression="gzip",
            compression_opts=9,
        )
        dataset_chunked = h5f.create_dataset(
            "/data/dataset_chunked",
            data=np.zeros((1000, 40, 50), dtype=np.float64),
            compression="gzip",
            compression_opts=9,
            chunks=True,
        )
        # object references between the datasets (these are references, not soft links)
        dataset_contig.attrs["reference_other"] = dataset_chunked.ref
        dataset_chunked.attrs["reference_other"] = dataset_contig.ref
        dataset_contig.attrs["anattr"] = 1

        # references held by a group elsewhere in the hierarchy
        link_group = h5f.create_group("link/child")
        link_group.attrs["reference_contig"] = dataset_contig.ref
        link_group.attrs["reference_chunked"] = dataset_chunked.ref

    source_size = source.stat().st_size

    # do it without providing target to check that we make filename correctly
    target_output = truncate_file(source, n=n)
    assert target_output == source.parent / (source.stem + "_truncated.hdf5")

    # check that we actually made it smaller
    # empirically, the source dataset is ~125KB and truncated is ~17KB
    target_size = target_output.stat().st_size
    assert target_size < source_size / 5

    # then check that we have what's expected in the file; close the handle when done
    # so the output file is not left open after the test
    with h5py.File(target_output, "r") as target_h5f:
        contig = target_h5f["data"]["dataset_contig"]
        chunked = target_h5f["data"]["dataset_chunked"]
        child = target_h5f["link"]["child"]

        # truncation happened
        assert contig.shape == (n, 30, 40)
        assert chunked.shape == (n, 40, 50)

        # references still work
        # can't directly assess object identity equality with "is", so compare names:
        # this tests that the references dereference, and to the right place
        assert target_h5f[contig.attrs["reference_other"]].name == chunked.name
        assert target_h5f[chunked.attrs["reference_other"]].name == contig.name
        assert target_h5f[child.attrs["reference_contig"]].name == contig.name
        assert target_h5f[child.attrs["reference_chunked"]].name == chunked.name

        # plain attributes are carried over
        assert contig.attrs["anattr"] == 1


def test_dependencies_hardlink(nwb_file):
    """
    Hardlinked nodes should appear in the dependency graph under their resolved target.

    The test file is expected to contain a hardlink from
    ``/processing/ecephys/LFP/ElectricalSeries/electrodes`` to
    ``/acquisition/ElectricalSeries/electrodes``. The test first confirms that
    ``resolve_hardlink`` reports that target, then checks that the dependency graph
    has a ``"child"``-labelled edge from the link's parent to the target.
    """
    target = "/acquisition/ElectricalSeries/electrodes"
    source = "/processing/ecephys/LFP/ElectricalSeries/electrodes"
    parent = "/processing/ecephys/LFP/ElectricalSeries"

    # precondition: the hardlink exists in the test file
    with h5py.File(str(nwb_file), "r") as h5f:
        assert resolve_hardlink(h5f.get(source)) == target

    # the parent should link to the resolved target as a child
    graph = hdf_dependency_graph(nwb_file)
    edge = (parent, target)
    assert edge in graph.edges([parent])
    assert graph.edges[edge]["label"] == "child"


@pytest.mark.dev
def test_dependency_graph_images(nwb_file, tmp_output_dir):
    """
    Render the dependency graph of ``nwb_file`` to PNG, before and after filtering.

    Writes ``test_nwb_unfiltered.png`` and ``test_nwb_filtered.png`` into
    ``tmp_output_dir`` using the ``dot`` layout program. Nothing is asserted.
    """
    unfiltered = hdf_dependency_graph(nwb_file)
    # draw the raw graph first, before the filtering step runs
    nx.nx_agraph.to_agraph(unfiltered).draw(
        tmp_output_dir / "test_nwb_unfiltered.png", prog="dot"
    )

    filtered = filter_dependency_graph(unfiltered)
    nx.nx_agraph.to_agraph(filtered).draw(tmp_output_dir / "test_nwb_filtered.png", prog="dot")


@pytest.mark.parametrize(
    "dset",
    [
        {"name": "aibs.nwb", "source": "sub-738651046_ses-760693773.nwb"},
        {
            "name": "aibs_ecephys.nwb",
            "source": "sub-738651046_ses-760693773_probe-769322820_ecephys.nwb",
        },
    ],
)
@pytest.mark.dev
def test_make_truncated_datasets(tmp_output_dir, data_dir, dset):
    """
    Write a truncated copy of a source NWB file into ``data_dir``.

    The source file is looked up in ``tmp_output_dir`` under ``dset["source"]`` and the
    truncated copy is written to ``data_dir / dset["name"]``. When the source file is not
    present the test is reported as skipped rather than silently passing.
    """
    input_file = tmp_output_dir / dset["source"]
    output_file = data_dir / dset["name"]
    if not input_file.exists():
        pytest.skip(f"source file not found: {input_file}")

    # third positional argument is presumably the truncation length ``n`` -- confirm
    # against truncate_file's signature
    truncate_file(input_file, output_file, 10)
ruff automatic fixes 2024-07-02 04:44:35 +00:00			`import h5py`
checkpointing working on model loading. it's a sloggggggggg 2024-09-03 07:54:56 +00:00			`import networkx as nx`
figuring out the strategy here... - added linkml_meta classvar to store additional linkml properties if needed - injecting path field to metaclass - sketch of doing a queue-based read - prune datasets & example allen institute data 2023-09-22 07:31:34 +00:00			`import numpy as np`
ruff automatic fixes 2024-07-02 04:44:35 +00:00			`import pytest`
i'm just sorta making a mess here. coming back tomorrow 2023-09-06 07:50:49 +00:00
fix generics with defaults, use typing_extensions 2024-10-03 07:11:38 +00:00			`from nwb_linkml.io.hdf5 import (`
			`HDF5IO,`
			`filter_dependency_graph,`
			`hdf_dependency_graph,`
			`truncate_file,`
			`resolve_hardlink,`
			`)`
Working on finalizing the mapping operation... doing it single threaded for now and it's very slow but it completes up until the stage where we need to zip up the orphaned objects and other things that can be inferred from the model. Need to make a proxytable model like proxyarray because reading all these tables takes way too fuckin long and it's not what we want to do anyway. 2023-09-26 05:03:53 +00:00
holy hell it was a TYPE COERCION in the way linkml handles annotations and a version mismatch between CI and local https://github.com/linkml/linkml-model/pull/162 2023-10-12 05:30:26 +00:00
requests caching, python 10 2024-07-02 07:34:01 +00:00			`@pytest.mark.skip()`
black formatting 2024-07-02 04:23:31 +00:00			`@pytest.mark.parametrize("dset", ["aibs.nwb", "aibs_ecephys.nwb"])`
[tests] cheap read test 2023-10-06 05:12:27 +00:00			`def test_hdf_read(data_dir, dset):`
			`NWBFILE = data_dir / dset`
i'm just sorta making a mess here. coming back tomorrow 2023-09-06 07:50:49 +00:00			`io = HDF5IO(path=NWBFILE)`
[tests] cheap read test 2023-10-06 05:12:27 +00:00			`# the test for now is just whether we can read it lol`
clean up old hdf5 reader methods, fix truncate_hdf5 method, make proper'd test data files with working references 2024-09-12 02:02:15 +00:00			`_ = io.read()`
successfully building many versions of nwb schema. working on hdf5 importing, come back to it when fresh, just sorta poking at it because it's so close. 2023-09-14 09:45:01 +00:00
black formatting 2024-07-02 04:23:31 +00:00
figuring out the strategy here... - added linkml_meta classvar to store additional linkml properties if needed - injecting path field to metaclass - sketch of doing a queue-based read - prune datasets & example allen institute data 2023-09-22 07:31:34 +00:00			`def test_truncate_file(tmp_output_dir):`
black formatting 2024-07-02 04:23:31 +00:00			`source = tmp_output_dir / "truncate_source.hdf5"`
figuring out the strategy here... - added linkml_meta classvar to store additional linkml properties if needed - injecting path field to metaclass - sketch of doing a queue-based read - prune datasets & example allen institute data 2023-09-22 07:31:34 +00:00
			`# create a dang ol hdf5 file with a big dataset and some softlinks and make sure`
			`# we truncate the dataset and preserve softlink`

black formatting 2024-07-02 04:23:31 +00:00			`h5f = h5py.File(str(source), "w")`
			`data_group = h5f.create_group("data")`
figuring out the strategy here... - added linkml_meta classvar to store additional linkml properties if needed - injecting path field to metaclass - sketch of doing a queue-based read - prune datasets & example allen institute data 2023-09-22 07:31:34 +00:00			`dataset_contig = h5f.create_dataset(`
black formatting 2024-07-02 04:23:31 +00:00			`"/data/dataset_contig",`
			`data=np.zeros((1000, 30, 40), dtype=np.float64),`
			`compression="gzip",`
			`compression_opts=9,`
figuring out the strategy here... - added linkml_meta classvar to store additional linkml properties if needed - injecting path field to metaclass - sketch of doing a queue-based read - prune datasets & example allen institute data 2023-09-22 07:31:34 +00:00			`)`
			`dataset_chunked = h5f.create_dataset(`
black formatting 2024-07-02 04:23:31 +00:00			`"/data/dataset_chunked",`
figuring out the strategy here... - added linkml_meta classvar to store additional linkml properties if needed - injecting path field to metaclass - sketch of doing a queue-based read - prune datasets & example allen institute data 2023-09-22 07:31:34 +00:00			`data=np.zeros((1000, 40, 50), dtype=np.float64),`
			`compression="gzip",`
			`compression_opts=9,`
black formatting 2024-07-02 04:23:31 +00:00			`chunks=True,`
figuring out the strategy here... - added linkml_meta classvar to store additional linkml properties if needed - injecting path field to metaclass - sketch of doing a queue-based read - prune datasets & example allen institute data 2023-09-22 07:31:34 +00:00			`)`
black formatting 2024-07-02 04:23:31 +00:00			`dataset_contig.attrs["reference_other"] = dataset_chunked.ref`
			`dataset_chunked.attrs["reference_other"] = dataset_contig.ref`
			`dataset_contig.attrs["anattr"] = 1`
figuring out the strategy here... - added linkml_meta classvar to store additional linkml properties if needed - injecting path field to metaclass - sketch of doing a queue-based read - prune datasets & example allen institute data 2023-09-22 07:31:34 +00:00
black formatting 2024-07-02 04:23:31 +00:00			`link_group = h5f.create_group("link/child")`
			`link_group.attrs["reference_contig"] = dataset_contig.ref`
			`link_group.attrs["reference_chunked"] = dataset_chunked.ref`
figuring out the strategy here... - added linkml_meta classvar to store additional linkml properties if needed - injecting path field to metaclass - sketch of doing a queue-based read - prune datasets & example allen institute data 2023-09-22 07:31:34 +00:00			`h5f.flush()`
			`h5f.close()`

			`source_size = source.stat().st_size`

			`# do it without providing target to check that we make filename correctly`
			`n = 10`
			`target_output = truncate_file(source, n=n)`
black formatting 2024-07-02 04:23:31 +00:00			`assert target_output == source.parent / (source.stem + "_truncated.hdf5")`
figuring out the strategy here... - added linkml_meta classvar to store additional linkml properties if needed - injecting path field to metaclass - sketch of doing a queue-based read - prune datasets & example allen institute data 2023-09-22 07:31:34 +00:00			`# check that we actually made it smaller`
			`target_size = target_output.stat().st_size`
			`# empirically, the source dataset is ~125KB and truncated is ~17KB`
			`assert target_size < source_size / 5`

			`# then check that we have what's expected in the file`
black formatting 2024-07-02 04:23:31 +00:00			`target_h5f = h5py.File(target_output, "r")`
figuring out the strategy here... - added linkml_meta classvar to store additional linkml properties if needed - injecting path field to metaclass - sketch of doing a queue-based read - prune datasets & example allen institute data 2023-09-22 07:31:34 +00:00
			`# truncation happened`
black formatting 2024-07-02 04:23:31 +00:00			`assert target_h5f["data"]["dataset_contig"].shape == (n, 30, 40)`
			`assert target_h5f["data"]["dataset_chunked"].shape == (n, 40, 50)`
figuring out the strategy here... - added linkml_meta classvar to store additional linkml properties if needed - injecting path field to metaclass - sketch of doing a queue-based read - prune datasets & example allen institute data 2023-09-22 07:31:34 +00:00			`# references still work`
			`# can't directly assess object identity equality with "is"`
			`# so this tests if the referenced dereference and that they dereference to the right place`
black formatting 2024-07-02 04:23:31 +00:00			`assert (`
			`target_h5f[target_h5f["data"]["dataset_contig"].attrs["reference_other"]].name`
			`== target_h5f["data"]["dataset_chunked"].name`
			`)`
			`assert (`
			`target_h5f[target_h5f["data"]["dataset_chunked"].attrs["reference_other"]].name`
			`== target_h5f["data"]["dataset_contig"].name`
			`)`
			`assert (`
			`target_h5f[target_h5f["link"]["child"].attrs["reference_contig"]].name`
			`== target_h5f["data"]["dataset_contig"].name`
			`)`
			`assert (`
			`target_h5f[target_h5f["link"]["child"].attrs["reference_chunked"]].name`
			`== target_h5f["data"]["dataset_chunked"].name`
			`)`
			`assert target_h5f["data"]["dataset_contig"].attrs["anattr"] == 1`

Working on finalizing the mapping operation... doing it single threaded for now and it's very slow but it completes up until the stage where we need to zip up the orphaned objects and other things that can be inferred from the model. Need to make a proxytable model like proxyarray because reading all these tables takes way too fuckin long and it's not what we want to do anyway. 2023-09-26 05:03:53 +00:00
clean up old hdf5 reader methods, fix truncate_hdf5 method, make proper'd test data files with working references 2024-09-12 02:02:15 +00:00			`def test_dependencies_hardlink(nwb_file):`
			`"""`
			`Test that hardlinks are resolved (eg. from /processing/ecephys/LFP/ElectricalSeries/electrodes`
			`to /acquisition/ElectricalSeries/electrodes`
			`Args:`
			`nwb_file:`

			`Returns:`
black formatting 2024-07-02 04:23:31 +00:00
clean up old hdf5 reader methods, fix truncate_hdf5 method, make proper'd test data files with working references 2024-09-12 02:02:15 +00:00			`"""`
			`parent = "/processing/ecephys/LFP/ElectricalSeries"`
			`source = "/processing/ecephys/LFP/ElectricalSeries/electrodes"`
			`target = "/acquisition/ElectricalSeries/electrodes"`
black formatting 2024-07-02 04:23:31 +00:00
clean up old hdf5 reader methods, fix truncate_hdf5 method, make proper'd test data files with working references 2024-09-12 02:02:15 +00:00			`# assert that the hardlink exists in the test file`
			`with h5py.File(str(nwb_file), "r") as h5f:`
			`node = h5f.get(source)`
			`linked_node = resolve_hardlink(node)`
			`assert linked_node == target`

			`graph = hdf_dependency_graph(nwb_file)`
			`# the parent should link to the target as a child`
			`assert (parent, target) in graph.edges([parent])`
			`assert graph.edges[parent, target]["label"] == "child"`
working on graph loading of nwb files 2024-08-31 08:47:42 +00:00

checkpointing working on model loading. it's a sloggggggggg 2024-09-03 07:54:56 +00:00			`@pytest.mark.dev`
clean up old hdf5 reader methods, fix truncate_hdf5 method, make proper'd test data files with working references 2024-09-12 02:02:15 +00:00			`def test_dependency_graph_images(nwb_file, tmp_output_dir):`
working on graph loading of nwb files 2024-08-31 08:47:42 +00:00			`"""`
clean up old hdf5 reader methods, fix truncate_hdf5 method, make proper'd test data files with working references 2024-09-12 02:02:15 +00:00			`Generate images of the dependency graph`
working on graph loading of nwb files 2024-08-31 08:47:42 +00:00			`"""`
			`graph = hdf_dependency_graph(nwb_file)`
checkpointing working on model loading. it's a sloggggggggg 2024-09-03 07:54:56 +00:00			`A_unfiltered = nx.nx_agraph.to_agraph(graph)`
			`A_unfiltered.draw(tmp_output_dir / "test_nwb_unfiltered.png", prog="dot")`
working on graph loading of nwb files 2024-08-31 08:47:42 +00:00			`graph = filter_dependency_graph(graph)`
checkpointing working on model loading. it's a sloggggggggg 2024-09-03 07:54:56 +00:00			`A_filtered = nx.nx_agraph.to_agraph(graph)`
			`A_filtered.draw(tmp_output_dir / "test_nwb_filtered.png", prog="dot")`
CHECKPOINT WITH IT WORKING before cleanup and model regeneration 2024-09-04 00:48:36 +00:00

clean up old hdf5 reader methods, fix truncate_hdf5 method, make proper'd test data files with working references 2024-09-12 02:02:15 +00:00			`@pytest.mark.parametrize(`
			`"dset",`
			`[`
			`{"name": "aibs.nwb", "source": "sub-738651046_ses-760693773.nwb"},`
			`{`
			`"name": "aibs_ecephys.nwb",`
			`"source": "sub-738651046_ses-760693773_probe-769322820_ecephys.nwb",`
			`},`
			`],`
			`)`
			`@pytest.mark.dev`
			`def test_make_truncated_datasets(tmp_output_dir, data_dir, dset):`
			`input_file = tmp_output_dir / dset["source"]`
			`output_file = data_dir / dset["name"]`
			`if not input_file.exists():`
			`return`
CHECKPOINT WITH IT WORKING before cleanup and model regeneration 2024-09-04 00:48:36 +00:00
clean up old hdf5 reader methods, fix truncate_hdf5 method, make proper'd test data files with working references 2024-09-12 02:02:15 +00:00			`truncate_file(input_file, output_file, 10)`