Mirror of https://github.com/p2p-ld/nwb-linkml.git, synced 2024-11-10 00:34:29 +00:00
Need to stop for the night, but it's sort of happening
This commit is contained in:
parent 40984a6582
commit aac0c7abdd
3 changed files with 169 additions and 7 deletions
@@ -54,9 +54,8 @@ class H5SourceItem(BaseModel):
    """What kind of hdf5 element this is"""
    depends: List[str] = Field(default_factory=list)
    """Paths of other source items that this item depends on before it can be instantiated. eg. from softlinks"""
    attrs: dict = Field(default_factory=dict)
    """Any static attrs that can be had from the element"""
    model_config = ConfigDict(arbitrary_types_allowed=True)

    @property
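For context on the new depends field: flatten_hdf (below) collects hdf5 references from an element and records the referenced paths, so an item that links to another object depends on it. A minimal standalone sketch of how dereferencing an object reference through the open file recovers such a path; the file and names here are hypothetical and not part of the commit:

import h5py
import numpy as np

# hypothetical layout: one dataset, and a group whose attr references it
with h5py.File('refs_scratch.h5', 'w') as h5f:
    source = h5f.create_dataset('/data/source', data=np.arange(3))
    group = h5f.create_group('/linking_group')
    group.attrs['target'] = source.ref  # object reference stored as an attr

with h5py.File('refs_scratch.h5', 'r') as h5f:
    ref = h5f['/linking_group'].attrs['target']
    # dereferencing through the file gives the target back; its .name is the
    # kind of path that would end up in H5SourceItem.depends
    print(h5f[ref].name)  # '/data/source'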
@@ -64,16 +63,23 @@ class H5SourceItem(BaseModel):
        """path split by /"""
        return self.path.split('/')


FlatH5 = Dict[str, H5SourceItem]


class ReadQueue(BaseModel):
    """Container model to store items as they are built"""
    h5f: h5py.File = Field(
        description="Open hdf5 file used when resolving the queue!"
    )
    queue: Dict[str, H5SourceItem] = Field(
        default_factory=dict,
        description="Items left to be instantiated, keyed by hdf5 path",
    )
-   completed: Dict[str, BaseModel] = Field(
+   completed: Dict[str, Any] = Field(
        default_factory=dict,
        description="Items that have already been instantiated, keyed by hdf5 path"
    )
    model_config = ConfigDict(arbitrary_types_allowed=True)


class HDF5IO():
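A small usage sketch of the ReadQueue changed above: resolving an item means popping it from queue and storing whatever came out of the file under the same path in completed, which is why completed is now typed Dict[str, Any] rather than Dict[str, BaseModel]. The in-memory file and path below are made up for illustration and are not part of the commit:

import h5py
from nwb_linkml.io.hdf5 import H5SourceItem, ReadQueue

# throwaway in-memory file with a single scalar dataset
h5f = h5py.File('queue_scratch.h5', 'w', driver='core', backing_store=False)
h5f.create_dataset('/scalar', data=1)

res = ReadQueue(
    h5f=h5f,
    queue={'/scalar': H5SourceItem.model_construct(
        path='/scalar', h5_type='dataset', leaf=True)},
)

# a resolution pass moves the item: the raw scalar value (not a pydantic
# model) lands in completed, hence Dict[str, Any]
res.completed['/scalar'] = res.h5f['/scalar'][()]
res.queue.pop('/scalar')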
@@ -357,13 +363,36 @@ def flatten_hdf(h5f:h5py.File | h5py.Group, skip='specifications') -> Dict[str,
            h5_type = 'group'
        # dereference and get name of reference
        depends = list(set([h5f[i].name for i in refs]))
        if not name.startswith('/'):
            name = '/' + name
        items[name] = H5SourceItem.model_construct(
            path = name,
            leaf = leaf,
-           depends = depends
+           depends = depends,
            h5_type=h5_type,
            attrs = dict(obj.attrs.items())
        )

    h5f.visititems(_itemize)
    return items


def sort_flat_hdf(flat: Dict[str, H5SourceItem]) -> Dict[str, H5SourceItem]:
    """
    Sort flat hdf5 file in a rough order of solvability

    * First process any leaf items
    * Put any items with dependencies at the end

    Args:
        flat:

    Returns:

    """
    class Rank(NamedTuple):
        has_depends: bool
        not_leaf: bool
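sort_flat_hdf's docstring above boils down to tuple ordering: Python compares NamedTuples element-wise and False sorts before True, so a tuple of booleans like Rank pushes dependency-free leaf items to the front and dependent items to the back. A tiny self-contained illustration with made-up paths, not part of the commit:

from typing import NamedTuple

class Rank(NamedTuple):
    has_depends: bool  # True sorts after False, so dependent items go last
    not_leaf: bool     # leaves (False here) come before groups with children

# stand-ins for H5SourceItems, keyed by hdf5 path
items = {
    '/acquisition/ts':      {'depends': ['/acquisition/ts/data'], 'leaf': False},
    '/acquisition/ts/data': {'depends': [], 'leaf': True},
}

def sort_key(entry):
    path, meta = entry
    return Rank(has_depends=len(meta['depends']) > 0, not_leaf=not meta['leaf'])

ordered = dict(sorted(items.items(), key=sort_key))
print(list(ordered))  # ['/acquisition/ts/data', '/acquisition/ts']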
134  nwb_linkml/src/nwb_linkml/io/hdf5_scratch.py  Normal file

@@ -0,0 +1,134 @@
"""
Temporary scratch file where I was trying a different strategy: rather than
doing one big recursive pass through the file, try to solve subsections of the
tree and then piece them together once the others are done.

Sort of working. I think what I need to do is populate the 'depends' field more
fully, so that on each pass I can work through the items whose dependencies have
already been solved, from the bottom up.
"""

import h5py
from typing import NamedTuple, Tuple

from nwb_linkml.io.hdf5 import HDF5IO, flatten_hdf, H5SourceItem, FlatH5, ReadQueue
from nwb_linkml.providers.schema import SchemaProvider
from pydantic import BaseModel
from rich import print


class Rank(NamedTuple):
    has_depends: bool
    not_leaf: bool
    not_dataset: bool
    has_type: bool


def sort_flat(item: Tuple[str, H5SourceItem]) -> Rank:
    return Rank(
        has_depends=len(item[1].depends) > 0,
        not_leaf=not item[1].leaf,  # `not`, not bitwise `~`, which maps bools to -2/-1
        not_dataset=item[1].h5_type != 'dataset',
        has_type='neurodata_type' in item[1].attrs
    )


def prune_empty(flat: FlatH5) -> FlatH5:
    """
    Groups without children or attrs can be removed
    """
    deletes = []
    for k, v in flat.items():
        if v.leaf and v.h5_type == 'group' and len(v.attrs) == 0:
            deletes.append(k)

    for k in deletes:
        del flat[k]

    return flat


def resolve_scalars(res: ReadQueue) -> ReadQueue:
    for path, item in res.queue.copy().items():
        if item.h5_type == 'group':
            continue
        dset = res.h5f.get(path)
        if dset.shape == ():
            res.completed[path] = dset[()]
            res.queue.pop(path)
    return res


def resolve_terminal_arrays(res: ReadQueue) -> ReadQueue:
    """Terminal arrays can just get loaded as a dict"""
    for path, item in res.queue.copy().items():
        if item.h5_type != 'dataset' or not item.leaf or len(item.depends) > 0:
            continue
        h5_object = res.h5f.get(path)
        item_dict = {
            'name': path.split('/')[-1],
            'array': h5_object[:],
            **h5_object.attrs,
        }
        res.completed[path] = item_dict
        res.queue.pop(path)
    return res


def attempt_parentless(res: ReadQueue, provider: SchemaProvider) -> ReadQueue:
    """Try the groups whose parents have no neurodata type (ie. acquisition)"""
    for path, item in res.queue.copy().items():
        if item.h5_type == 'dataset':
            continue
        group = res.h5f.get(path)
        if 'neurodata_type' in group.parent.attrs.keys() or 'neurodata_type' not in group.attrs.keys():
            continue
        model = provider.get_class(group.attrs['namespace'], group.attrs['neurodata_type'])
        res = naive_instantiation(group, model, res)
    return res


def naive_instantiation(element: h5py.Group | h5py.Dataset, model: BaseModel, res: ReadQueue) -> ReadQueue:
    """
    Try to instantiate model with just the attrs and any resolved children
    """
    print(element)
    kwargs = {}
    kwargs['name'] = element.name.split('/')[-1]
    for k in element.attrs.keys():
        try:
            kwargs[k] = element.attrs[k]
        except Exception as e:
            print(f"couldn't load attr: {e}")
    for key, child in element.items():
        if child.name in res.completed:
            # model fields are keyed by the child's relative name; completed is keyed by full path
            kwargs[key] = res.completed[child.name]

    kwargs = {k: v for k, v in kwargs.items() if k in model.model_fields.keys()}

    try:
        instance = model(**kwargs)
        res.queue.pop(element.name)
        res.completed[element.name] = instance
        print('succeeded')
        return res
    except Exception as e:
        print(f'failed: {e}')
        return res


# --------------------------------------------------
path = '/Users/jonny/Dropbox/lab/p2p_ld/data/nwb/sub-738651046_ses-760693773_probe-769322820_ecephys.nwb'

h5io = HDF5IO(path)
provider = h5io.make_provider()

h5f = h5py.File(path)
flat = flatten_hdf(h5f)

flat = prune_empty(flat)
flat_sorted = dict(sorted(flat.items(), key=sort_flat))

res = ReadQueue(h5f=h5f, queue=flat_sorted.copy())

res = resolve_scalars(res)
res = resolve_terminal_arrays(res)
res = attempt_parentless(res, provider)
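The module docstring frames the missing piece as repeating these passes from the bottom up once depends is fully populated. One hypothetical way to continue the driver above, not part of the commit, is to loop until a pass stops shrinking the queue:

# keep making passes; stop once a full pass resolves nothing new
while res.queue:
    remaining = len(res.queue)
    res = resolve_scalars(res)
    res = resolve_terminal_arrays(res)
    res = attempt_parentless(res, provider)
    if len(res.queue) == remaining:
        # everything left still has unmet depends; needs a smarter pass
        break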
@@ -76,7 +76,6 @@ def test_truncate_file(tmp_output_dir):
    assert target_h5f[target_h5f['link']['child'].attrs['reference_contig']].name == target_h5f['data']['dataset_contig'].name
    assert target_h5f[target_h5f['link']['child'].attrs['reference_chunked']].name == target_h5f['data']['dataset_chunked'].name
    assert target_h5f['data']['dataset_contig'].attrs['anattr'] == 1


@pytest.mark.skip()
def test_flatten_hdf():
    from nwb_linkml.io.hdf5 import HDF5IO, flatten_hdf
@@ -85,6 +84,6 @@ def test_flatten_hdf():
    h5f = h5py.File(path)
    flat = flatten_hdf(h5f)
    assert not any(['specifications' in v.path for v in flat.values()])

    pdb.set_trace()
    raise NotImplementedError('Just a stub for local testing for now, finish me!')