From 49585e467a27805494ee7fcd03dc616ceda0a0f7 Mon Sep 17 00:00:00 2001
From: sneakers-the-rat
Date: Sat, 31 Aug 2024 01:47:42 -0700
Subject: [PATCH] working on graph loading of nwb files

---
 nwb_linkml/pdm.lock                           |  4 +-
 nwb_linkml/pyproject.toml                     |  1 +
 nwb_linkml/src/nwb_linkml/io/hdf5.py          | 83 ++++++++++++++++++-
 nwb_linkml/src/nwb_linkml/maps/hdf5.py        |  2 +-
 nwb_linkml/tests/data/test_nwb.yaml           | 63 +++++---------
 .../tests/data/test_nwb_condensed_sketch.yaml | 76 +++++++++++++++++
 nwb_linkml/tests/fixtures.py                  |  2 +
 nwb_linkml/tests/test_io/test_io_hdf5.py      | 11 ++-
 8 files changed, 197 insertions(+), 45 deletions(-)
 create mode 100644 nwb_linkml/tests/data/test_nwb_condensed_sketch.yaml

diff --git a/nwb_linkml/pdm.lock b/nwb_linkml/pdm.lock
index e4f43f3..f6f2c7c 100644
--- a/nwb_linkml/pdm.lock
+++ b/nwb_linkml/pdm.lock
@@ -5,7 +5,7 @@ groups = ["default", "dev", "plot", "tests"]
 strategy = ["inherit_metadata"]
 lock_version = "4.5.0"
 
-content_hash = "sha256:aaf3c34a5f39fc7db0c5dce91a0693eb78358a255d6b0a72f2e1f988eb7e899f"
+content_hash = "sha256:1c297e11f6dc9e4f6b8d29df872177d2ce65bbd334c0b65aa5175dfb125c4d9f"
 
 [[metadata.targets]]
 requires_python = ">=3.10,<3.13"
@@ -996,7 +996,7 @@ name = "networkx"
 version = "3.3"
 requires_python = ">=3.10"
 summary = "Python package for creating and manipulating graphs and networks"
-groups = ["dev", "tests"]
+groups = ["default", "dev", "tests"]
 files = [
     {file = "networkx-3.3-py3-none-any.whl", hash = "sha256:28575580c6ebdaf4505b22c6256a2b9de86b316dc63ba9e93abde3d78dfdbcf2"},
     {file = "networkx-3.3.tar.gz", hash = "sha256:0c127d8b2f4865f59ae9cb8aafcd60b5c70f3241ebd66f7defad7c4ab90126c9"},
diff --git a/nwb_linkml/pyproject.toml b/nwb_linkml/pyproject.toml
index 91aed24..6e86158 100644
--- a/nwb_linkml/pyproject.toml
+++ b/nwb_linkml/pyproject.toml
@@ -25,6 +25,7 @@ dependencies = [
     "numpydantic>=1.3.3",
     "black>=24.4.2",
     "pandas>=2.2.2",
+    "networkx>=3.3",
 ]
 
 [project.urls]
diff --git a/nwb_linkml/src/nwb_linkml/io/hdf5.py b/nwb_linkml/src/nwb_linkml/io/hdf5.py
index ade89d9..2da77f4 100644
--- a/nwb_linkml/src/nwb_linkml/io/hdf5.py
+++ b/nwb_linkml/src/nwb_linkml/io/hdf5.py
@@ -22,6 +22,7 @@ Other TODO:
 
 import json
 import os
+import re
 import shutil
 import subprocess
 import sys
@@ -31,11 +32,12 @@ from types import ModuleType
 from typing import TYPE_CHECKING, Dict, List, Optional, Union, overload
 
 import h5py
+import networkx as nx
 import numpy as np
 from pydantic import BaseModel
 from tqdm import tqdm
 
-from nwb_linkml.maps.hdf5 import ReadPhases, ReadQueue, flatten_hdf
+from nwb_linkml.maps.hdf5 import ReadPhases, ReadQueue, flatten_hdf, get_references
 
 if TYPE_CHECKING:
     from nwb_linkml.providers.schema import SchemaProvider
@@ -47,6 +49,85 @@ else:
     from typing_extensions import Never
 
 
+def hdf_dependency_graph(h5f: Path | h5py.File) -> nx.DiGraph:
+    """
+    Directed dependency graph of dataset and group nodes in an NWBFile such that
+    each node ``n_i`` is connected to node ``n_j`` if
+
+    * ``n_j`` is ``n_i``'s child
+    * ``n_i`` contains a reference to ``n_j``
+
+    Resolve references in
+
+    * Attributes
+    * Dataset columns
+    * Compound dtypes
+
+    Args:
+        h5f (:class:`pathlib.Path` | :class:`h5py.File`): NWB file to graph
+
+    Returns:
+        :class:`networkx.DiGraph`
+    """
+    # detect nodes to skip
+    skip_pattern = re.compile("^/specifications.*")
+
+    if isinstance(h5f, (Path, str)):
+        h5f = h5py.File(h5f, "r")
+
+    g = nx.DiGraph()
+
+    def _visit_item(name: str, node: h5py.Dataset | h5py.Group) -> None:
+        if skip_pattern.match(name):
+            return
+        # find references in attributes
+        refs = get_references(node)
+        if isinstance(node, h5py.Group):
+            refs.extend([child.name for child in node.values()])
+        refs = set(refs)
+
+        # add edges
+        edges = [(node.name, ref) for ref in refs]
+        g.add_edges_from(edges)
+
+        # ensure node added to graph
+        if len(edges) == 0:
+            g.add_node(node.name)
+
+        # store attrs in node
+        g.nodes[node.name].update(node.attrs)
+
+    # apply to root
+    _visit_item(h5f.name, h5f)
+
+    h5f.visititems(_visit_item)
+    return g
+
+
+def filter_dependency_graph(g: nx.DiGraph) -> nx.DiGraph:
+    """
+    Remove nodes from a dependency graph if they
+
+    * have no neurodata type AND
+    * have no outbound edges
+
+    OR
+
+    * are a VectorIndex (which are handled by the dynamictable mixins)
+    """
+    remove_nodes = []
+    node: str
+    for node in g.nodes.keys():
+        ndtype = g.nodes[node].get("neurodata_type", None)
+        if ndtype == "VectorData":
+            remove_nodes.append(node)
+        elif not ndtype and g.out_degree(node) == 0:
+            remove_nodes.append(node)
+
+    g.remove_nodes_from(remove_nodes)
+    return g
+
+
 class HDF5IO:
     """
     Read (and eventually write) from an NWB HDF5 file.
diff --git a/nwb_linkml/src/nwb_linkml/maps/hdf5.py b/nwb_linkml/src/nwb_linkml/maps/hdf5.py
index a7b052f..e554dc3 100644
--- a/nwb_linkml/src/nwb_linkml/maps/hdf5.py
+++ b/nwb_linkml/src/nwb_linkml/maps/hdf5.py
@@ -859,7 +859,7 @@ def get_references(obj: h5py.Dataset | h5py.Group) -> List[str]:
         # scalar
         if isinstance(obj[()], h5py.h5r.Reference):
             refs.append(obj[()])
-        elif isinstance(obj[0], h5py.h5r.Reference):
+        elif len(obj) > 0 and isinstance(obj[0], h5py.h5r.Reference):
             # single-column
             refs.extend(obj[:].tolist())
         elif len(obj.dtype) > 1:
diff --git a/nwb_linkml/tests/data/test_nwb.yaml b/nwb_linkml/tests/data/test_nwb.yaml
index dfcb722..defa118 100644
--- a/nwb_linkml/tests/data/test_nwb.yaml
+++ b/nwb_linkml/tests/data/test_nwb.yaml
@@ -1,46 +1,28 @@
 # manually transcribed target version of nwb-linkml dataset
 # matching the one created by fixtures.py:nwb_file
----
-id: my_dataset
+meta:
+  id: my_dataset
 
-prefixes:
-  nwbfile:
-    - path: "test_nwb.nwb"
-    - hash: "blake2b:blahblahblahblah"
+  prefixes:
+    nwbfile:
+      - path: "test_nwb.nwb"
+      - hash: "blake2b:blahblahblahblah"
 
-imports:
-  core:
-    as: nwb
-    version: "2.7.0"
-    from:
-      - pypi:
-          package: nwb-models
----
-
-  hdmf-common:
-    as: hdmf
-    version: "1.8.0"
-    from:
-      - pypi:
-          package: nwb-models
----
+  imports:
+    core:
+      as: nwb
+      version: "2.7.0"
+      from:
+        - pypi:
+            package: nwb-models
+    hdmf-common:
+      as: hdmf
+      version: "1.8.0"
+      from:
+        - pypi:
+            package: nwb-models
 
 extracellular_ephys: &ecephys
-  electrodes:
-    group:
-      - @shank{{i}}
-      - @shank{{i}}
-      - @shank{{i}}
-      # could have expression here like { range(3) } => i
-      # - ... { range(3) } => i
-      # or blank ... implies use expression from outer scope
-      - ...
-  shank{{i}}:
-    device: @general.devices.array
-  ...: { range(3) } => i
-
-# expands to
-extracellular_ephys:
   electrodes:
     group:
       - @shank0
@@ -54,7 +36,7 @@ extracellular_ephys:
     device: @general.devices.array
   # etc.
 
-data: !{{ nwb.NWBFile }} <== :nwbfile
+data: !nwb.NWBFile
   file_create_date: [ 2024-01-01 ]
   identifier: "1111-1111-1111-1111"
   session_description: All that you touch, you change.
@@ -63,11 +45,12 @@ data: !{{ nwb.NWBFile }} <== :nwbfile
     devices:
       - Heka ITC-1600:
       - Microscope:
+          description: My two-photon microscope
+          manufacturer: The best microscope manufacturer
       - array:
           description: old reliable
          manufacturer: diy
-    extracellular_ephys: *ecephys
-
+    extracellular_ephys: nwbfile:/general/extracellular_ephys
   experiment_description: All that you change, changes you.
   experimenter: [ "Lauren Oya Olamina" ]
   institution: Earthseed Research Institute
diff --git a/nwb_linkml/tests/data/test_nwb_condensed_sketch.yaml b/nwb_linkml/tests/data/test_nwb_condensed_sketch.yaml
new file mode 100644
index 0000000..150378b
--- /dev/null
+++ b/nwb_linkml/tests/data/test_nwb_condensed_sketch.yaml
@@ -0,0 +1,76 @@
+# Sketch of a condensed expression syntax for creation with nwb-linkml
+# just a sketch! keeping here for continued work but currently unused.
+---
+id: my_dataset
+
+prefixes:
+  nwbfile:
+    - path: "test_nwb.nwb"
+    - hash: "blake2b:blahblahblahblah"
+
+imports:
+  core:
+    as: nwb
+    version: "2.7.0"
+    from:
+      - pypi:
+          package: nwb-models
+  hdmf-common:
+    as: hdmf
+    version: "1.8.0"
+    from:
+      - pypi:
+          package: nwb-models
+---
+
+extracellular_ephys: &ecephys
+  electrodes:
+    group:
+      - @shank{{i}}
+      - @shank{{i}}
+      - @shank{{i}}
+      # could have expression here like { range(3) } => i
+      # - ... { range(3) } => i
+      # or blank ... implies use expression from outer scope
+      - ...
+  shank{{i}}:
+    device: @general.devices.array
+  ...: { range(3) } => i
+
+# expands to
+extracellular_ephys:
+  electrodes:
+    group:
+      - @shank0
+      - @shank0
+      - @shank0
+      - @shank1
+      - # etc.
+  shank0:
+    device: @general.devices.array
+  shank1:
+    device: @general.devices.array
+  # etc.
+
+data: !{{ nwb.NWBFile }} <== :nwbfile
+  file_create_date: [ 2024-01-01 ]
+  identifier: "1111-1111-1111-1111"
+  session_description: All that you touch, you change.
+  session_start_time: 2024-01-01T01:01:01
+  general:
+    devices:
+      - Heka ITC-1600:
+      - Microscope:
+      - array:
+          description: old reliable
+          manufacturer: diy
+    extracellular_ephys: *ecephys
+
+  experiment_description: All that you change, changes you.
+ experimenter: [ "Lauren Oya Olamina" ] + institution: Earthseed Research Institute + keywords: + - behavior + - belief + related_publications: doi:10.1016/j.neuron.2016.12.011 + diff --git a/nwb_linkml/tests/fixtures.py b/nwb_linkml/tests/fixtures.py index c4a1c36..1f31a5c 100644 --- a/nwb_linkml/tests/fixtures.py +++ b/nwb_linkml/tests/fixtures.py @@ -349,6 +349,8 @@ def nwb_file(tmp_output_dir) -> Path: generator = np.random.default_rng() nwb_path = tmp_output_dir / "test_nwb.nwb" + if nwb_path.exists(): + return nwb_path nwbfile = NWBFile( session_description="All that you touch, you change.", # required diff --git a/nwb_linkml/tests/test_io/test_io_hdf5.py b/nwb_linkml/tests/test_io/test_io_hdf5.py index c64cf48..82e2a36 100644 --- a/nwb_linkml/tests/test_io/test_io_hdf5.py +++ b/nwb_linkml/tests/test_io/test_io_hdf5.py @@ -4,7 +4,7 @@ import h5py import numpy as np import pytest -from nwb_linkml.io.hdf5 import HDF5IO, truncate_file +from nwb_linkml.io.hdf5 import HDF5IO, truncate_file, hdf_dependency_graph, filter_dependency_graph @pytest.mark.skip() @@ -98,3 +98,12 @@ def test_flatten_hdf(): assert not any(["specifications" in v.path for v in flat.values()]) pdb.set_trace() raise NotImplementedError("Just a stub for local testing for now, finish me!") + + +def test_dependency_graph(nwb_file): + """ + dependency graph is correctly constructed from an HDF5 file + """ + graph = hdf_dependency_graph(nwb_file) + graph = filter_dependency_graph(graph) + pass
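
Usage note: the snippet below is illustrative only and is not part of the patch. It sketches how the new hdf_dependency_graph / filter_dependency_graph helpers could be consumed to derive a "dependencies first" read order, assuming a local NWB file such as the test_nwb.nwb fixture written by fixtures.py:nwb_file. Edges point from a parent or referrer to its child or referent, so reversing a topological sort puts referenced objects before the objects that use them; reference cycles are possible in NWB files, so the fallback branch here is deliberately naive.

    from pathlib import Path

    import networkx as nx

    from nwb_linkml.io.hdf5 import filter_dependency_graph, hdf_dependency_graph

    # hypothetical path -- any NWB file readable by h5py works
    nwb_path = Path("test_nwb.nwb")

    # build the graph of groups/datasets (children + resolved references),
    # then drop untyped leaf nodes and VectorData nodes
    graph = hdf_dependency_graph(nwb_path)
    graph = filter_dependency_graph(graph)

    try:
        # topological order lists referrers before referents; reverse it so
        # every object appears after everything it depends on
        load_order = list(reversed(list(nx.topological_sort(graph))))
    except nx.NetworkXUnfeasible:
        # the graph contained a reference cycle; a real implementation would
        # need a smarter strategy, e.g. condensing strongly connected components
        load_order = list(graph.nodes)

    for name in load_order:
        print(name, graph.nodes[name].get("neurodata_type"))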