Removing DataFrame type from imports, leaving as stub

This commit is contained in:
sneakers-the-rat 2023-10-04 17:59:10 -07:00
parent f682105c1a
commit 9947edfed2
6 changed files with 95 additions and 271 deletions

View file

@@ -40,7 +40,8 @@ intersphinx_mapping = {
    'numpy': ('https://numpy.org/doc/stable/', None),
    'pandas': ('https://pandas.pydata.org/docs/', None),
    'pydantic': ('https://docs.pydantic.dev/latest/', None),
    'h5py': ('https://docs.h5py.org/en/stable/', None)
    'h5py': ('https://docs.h5py.org/en/stable/', None),
    'dask': ('https://docs.dask.org/en/stable/', None)
}

View file

@@ -1,144 +0,0 @@
"""
Just saving a scratch file temporarily where i was trying a different strategy,
rather than doing one big recursive pass through, try and solve subsections
of the tree and then piece them together once you have the others done.
sort of working. I think what i need to do is populate the 'depends'
field more so that at each pass i can work through the items whose dependencies
have been solved from the bottom up.
"""
from typing import List
from nwb_linkml.types.df import DataFrame
class MyDf(DataFrame):
    ints: List[int]

a = MyDf(ints=[1, 2, 3])
from nwb_linkml.io.hdf5 import HDF5IO
import h5py
from typing import NamedTuple, Tuple, Optional, Type
from nwb_linkml.maps.hdf5 import H5SourceItem, FlatH5, ReadQueue, flatten_hdf
from nwb_linkml.providers.schema import SchemaProvider
from rich import print
from pydantic import BaseModel
class Rank(NamedTuple):
    has_depends: bool
    not_leaf: bool
    not_dataset: bool
    has_type: bool

def sort_flat(item: Tuple[str, H5SourceItem]) -> Rank:
    return Rank(
        has_depends=len(item[1].depends) > 0,
        # `not` rather than `~`: `~` on a Python bool yields -1/-2 and breaks the sort
        not_leaf=not item[1].leaf,
        not_dataset=item[1].h5_type != 'dataset',
        has_type='neurodata_type' in item[1].attrs
    )
def prune_empty(flat: FlatH5) -> FlatH5:
    """
    Groups without children or attrs can be removed
    """
    deletes = []
    for k, v in flat.items():
        if v.leaf and v.h5_type == 'group' and len(v.attrs) == 0:
            deletes.append(k)
    for k in deletes:
        del flat[k]
    return flat
def resolve_scalars(res: ReadQueue) -> ReadQueue:
    for path, item in res.queue.copy().items():
        if item.h5_type == 'group':
            continue
        dset = res.h5f.get(path)
        if dset.shape == ():
            # scalar datasets can be read and completed immediately
            res.completed[path] = dset[()]
            res.queue.pop(path)
    return res
def resolve_terminal_arrays(res: ReadQueue) -> ReadQueue:
    """Terminal arrays can just get loaded as a dict"""
    for path, item in res.queue.copy().items():
        if item.h5_type != 'dataset' or not item.leaf or len(item.depends) > 0:
            continue
        h5_object = res.h5f.get(path)
        item_dict = {
            'name': path.split('/')[-1],
            'array': h5_object[:],
            **h5_object.attrs,
        }
        res.completed[path] = item_dict
        res.queue.pop(path)
    return res
def attempt_parentless(res: ReadQueue, provider: SchemaProvider) -> ReadQueue:
    """Try the groups whose parents have no neurodata type (i.e. acquisition)"""
    for path, item in res.queue.copy().items():
        if item.h5_type == 'dataset':
            continue
        group = res.h5f.get(path)
        if 'neurodata_type' in group.parent.attrs.keys() or 'neurodata_type' not in group.attrs.keys():
            continue
        model = provider.get_class(group.attrs['namespace'], group.attrs['neurodata_type'])
        res = naive_instantiation(group, model, res)
    return res
def naive_instantiation(element: h5py.Group | h5py.Dataset, model: Type[BaseModel], res: ReadQueue) -> ReadQueue:
    """
    Try to instantiate the model with just the attrs and any resolved children
    """
    print(element)
    kwargs = {}
    kwargs['name'] = element.name.split('/')[-1]
    for k in element.attrs.keys():
        try:
            kwargs[k] = element.attrs[k]
        except Exception as e:
            print(f"couldn't load attr: {e}")
    for key, child in element.items():
        if child.name in res.completed:
            # key the kwarg by the relative name so it can match a model field
            kwargs[key] = res.completed[child.name]
    kwargs = {k: v for k, v in kwargs.items() if k in model.model_fields.keys()}
    try:
        instance = model(**kwargs)
        res.queue.pop(element.name)
        res.completed[element.name] = instance
        print('succeeded')
        return res
    except Exception as e:
        print(f'failed: {e}')
    return res
# --------------------------------------------------
path = '/Users/jonny/Dropbox/lab/p2p_ld/data/nwb/sub-738651046_ses-760693773_probe-769322820_ecephys.nwb'
h5io = HDF5IO(path)
provider = h5io.make_provider()
h5f = h5py.File(path)
flat = flatten_hdf(h5f)
flat = prune_empty(flat)
flat_sorted = dict(sorted(flat.items(), key=sort_flat))
res = ReadQueue(h5f=h5f, queue=flat_sorted.copy())
res = resolve_scalars(res)
res = resolve_terminal_arrays(res)
res = attempt_parentless(res, provider)
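
The bottom-up strategy the docstring above describes (repeatedly completing items whose dependencies are already resolved) could look something like the following sketch. It is illustrative only: resolve_item is a hypothetical helper standing in for the per-item logic, and only the queue/completed mappings used above are assumed on ReadQueue.

def resolve_bottom_up(res: ReadQueue) -> ReadQueue:
    # keep sweeping the queue, completing anything whose dependencies
    # are all resolved, until a full pass makes no progress
    progress = True
    while progress and res.queue:
        progress = False
        for path, item in list(res.queue.items()):
            if all(dep in res.completed for dep in item.depends):
                res.completed[path] = resolve_item(item)  # hypothetical helper
                res.queue.pop(path)
                progress = True
    return res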

View file

@@ -1,82 +1,36 @@
"""
Mapping functions for handling HDMF classes like DynamicTables
"""
import warnings
from typing import List, Type, Optional, Any
import ast
from nwb_linkml.types import DataFrame
import h5py
from pydantic import create_model, BaseModel
from nwb_linkml.maps import dtype
import numpy as np
from nwb_linkml.types.hdf5 import HDF5_Path
from nwb_linkml.types.ndarray import NDArray, NDArrayProxy
from nwb_linkml.annotations import get_inner_types
import dask.array as da
import nptyping
def model_from_dynamictable(group:h5py.Group, base:Optional[BaseModel] = None) -> Type[DataFrame]:
def model_from_dynamictable(group:h5py.Group, base:Optional[BaseModel] = None) -> Type[BaseModel]:
"""
Create a pydantic model from a dynamic table
"""
colnames = group.attrs['colnames']
types = {}
for col in colnames:
# idxname = col + '_index'
# if idxname in group.keys():
# idx = group.get(idxname)[0]
# dset = group.get(col)
# item = dset[idx]
# else:
# dset = group.get(col)
# item = dset[0]
# # read the first entry to see what we got
#
# if isinstance(item, bytes):
# item = item.decode('utf-8')
# if isinstance(item, str):
# # try to see if this is actually a list or smth encoded as a string
# try:
# item = ast.literal_eval(item)
# except (ValueError, SyntaxError):
# pass
# Get a nptypes type for the array
#pdb.set_trace()
# type_ = type(item)
# type_ = dtype.np_to_python.get(type_, type_)
# if type_ is h5py.h5r.Reference:
# #type_ = HDF5_Path
# type_ = 'String'
# elif type_ is np.ndarray:
# item: np.ndarray
# type_ = dtype.flat_to_npytyping[item.dtype.name]
#if type_ is not np.void:
#type_ = NDArray[Any, getattr(nptyping, dtype.flat_to_npytyping[item.dtype.name])]
#nptype = nptyping.typing_.name_per_dtype[group[col].dtype.type]
nptype = group[col].dtype.type
if nptype == np.void:
# warnings.warn(f"Cant handle numpy void type for column {col} in {group.name}")
warnings.warn(f"Cant handle numpy void type for column {col} in {group.name}")
continue
type_ = Optional[NDArray[Any, nptype]]
# FIXME: handling nested column types that appear only in some versions?
#types[col] = (List[type_ | None], ...)
types[col] = (type_, None)
# if base is None:
# #base = DataFrame
# base = BaseModel
# else:
# base = (BaseModel, base)
# #base = (DataFrame, base)
model = create_model(group.name.split('/')[-1], **types, __base__=base)
return model
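
For reference, a hedged usage sketch (the file path and group name here are hypothetical): given an open NWB file, a model can be generated directly from a DynamicTable group, with each column becoming an Optional NDArray field.

import h5py

with h5py.File('example.nwb', 'r') as h5f:  # hypothetical file
    units = h5f['/units']  # a DynamicTable group, if the file has one
    UnitsModel = model_from_dynamictable(units)
    print(UnitsModel.model_fields.keys())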
@@ -120,70 +74,6 @@ def dynamictable_to_model(
**items)
def dynamictable_to_df(group: h5py.Group,
                       model: Optional[Type[DataFrame]] = None,
                       base: Optional[BaseModel] = None) -> DataFrame:
    if model is None:
        model = model_from_dynamictable(group, base)

    items = {}
    for col, col_type in model.model_fields.items():
        if col not in group.keys():
            continue
        idxname = col + '_index'
        if idxname in group.keys():
            idx = group.get(idxname)[:]
            data = group.get(col)[idx - 1]
        else:
            data = group.get(col)[:]

        # Handle typing inside of list
        if isinstance(data[0], bytes):
            data = data.astype('unicode')
        if isinstance(data[0], str):
            # lists and other compound data types can get flattened out to strings when stored,
            # so we try to literal_eval and recover them
            try:
                eval_type = type(ast.literal_eval(data[0]))
            except (ValueError, SyntaxError):
                eval_type = str

            # if we've found one of those, get the data type within it.
            if eval_type is not str:
                eval_list = []
                for item in data.tolist():
                    try:
                        eval_list.append(ast.literal_eval(item))
                    except ValueError:
                        eval_list.append(None)
                data = eval_list
        elif isinstance(data[0], h5py.h5r.Reference):
            data = [HDF5_Path(group[d].name) for d in data]
        elif isinstance(data[0], tuple) and any([isinstance(d, h5py.h5r.Reference) for d in data[0]]):
            # references stored inside a tuple, reference + location.
            # dereference them!?
            dset = group.get(col)
            names = dset.dtype.names
            if names is not None and names[0] == 'idx_start' and names[1] == 'count':
                data = dereference_reference_vector(dset, data)
            else:
                data = data.tolist()

        # After list, check if we need to put this thing inside of
        # another class, as indicated by the enclosing model
        items[col] = data

    return model(hdf5_path=group.name,
                 name=group.name.split('/')[-1],
                 **items)
def dereference_reference_vector(dset: h5py.Dataset, data: Optional[List[Any]]) -> List:
    """
    Given a compound dataset with indices, counts, and object references, dereference to values
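
A hedged sketch of the compound layout that docstring refers to (field order and names assumed from the idx_start/count check above): each row carries a start index, a count, and an object reference into a target dataset.

# hypothetical reading of a compound region dataset
for idx_start, count, ref in dset[:]:
    target = dset.file[ref]                       # dereference the object reference
    values = target[idx_start:idx_start + count]  # the referenced slice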

View file

@@ -1,2 +1 @@
from nwb_linkml.types.ndarray import NDArray
from nwb_linkml.types.df import DataFrame

View file

@@ -1,8 +1,19 @@
"""
Pydantic models that behave like pandas dataframes
.. note::
This is currently unused but kept in place as a stub in case it is worth revisiting in the future.
It turned out to be too momentarily difficult to make lazy-loading work with dask arrays per column
while still keeping pandas-like API intact. In the future we should investigate modifying the
:func:`dask.dataframe.read_hdf` function to treat individual hdf5 datasets like columns
pandas has been removed from dependencies for now, as it not used elsewhere, but it is
left in this module since it is necessary for it to make sense.
"""
import ast
from typing import List, Any, get_origin, get_args, Union, Optional, Dict
from typing import List, Any, get_origin, get_args, Union, Optional, Dict, Type
from types import NoneType
import h5py
@@ -16,6 +27,10 @@ from pydantic import (
    model_validator
)
from nwb_linkml.maps.hdmf import model_from_dynamictable, dereference_reference_vector
from nwb_linkml.types.hdf5 import HDF5_Path
class DataFrame(BaseModel, pd.DataFrame):
    """
    Pydantic model root class that mimics a pandas dataframe.
@@ -116,3 +131,65 @@ class DataFrame(BaseModel, pd.DataFrame):
            for k, v in out.items()
        }
        return nxt(self.__class__(**out))
def dynamictable_to_df(group: h5py.Group,
                       model: Optional[Type[DataFrame]] = None,
                       base: Optional[BaseModel] = None) -> DataFrame:
    if model is None:
        model = model_from_dynamictable(group, base)

    items = {}
    for col, col_type in model.model_fields.items():
        if col not in group.keys():
            continue
        idxname = col + '_index'
        if idxname in group.keys():
            idx = group.get(idxname)[:]
            data = group.get(col)[idx - 1]
        else:
            data = group.get(col)[:]

        # Handle typing inside of list
        if isinstance(data[0], bytes):
            data = data.astype('unicode')
        if isinstance(data[0], str):
            # lists and other compound data types can get flattened out to strings when stored,
            # so we try to literal_eval and recover them
            try:
                eval_type = type(ast.literal_eval(data[0]))
            except (ValueError, SyntaxError):
                eval_type = str

            # if we've found one of those, get the data type within it.
            if eval_type is not str:
                eval_list = []
                for item in data.tolist():
                    try:
                        eval_list.append(ast.literal_eval(item))
                    except ValueError:
                        eval_list.append(None)
                data = eval_list
        elif isinstance(data[0], h5py.h5r.Reference):
            data = [HDF5_Path(group[d].name) for d in data]
        elif isinstance(data[0], tuple) and any([isinstance(d, h5py.h5r.Reference) for d in data[0]]):
            # references stored inside a tuple, reference + location.
            # dereference them!?
            dset = group.get(col)
            names = dset.dtype.names
            if names is not None and names[0] == 'idx_start' and names[1] == 'count':
                data = dereference_reference_vector(dset, data)
            else:
                data = data.tolist()

        # After list, check if we need to put this thing inside of
        # another class, as indicated by the enclosing model
        items[col] = data

    return model(hdf5_path=group.name,
                 name=group.name.split('/')[-1],
                 **items)
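
For context on the col + '_index' convention handled above: HDMF-style ragged columns are stored as a flat values dataset plus a dataset of cumulative end offsets, so row i spans values[index[i-1]:index[i]]. A toy example (data invented for illustration):

import numpy as np

values = np.array([1, 2, 3, 4, 5, 6])  # flat storage of all rows
index = np.array([2, 3, 6])            # cumulative end offset of each row

rows = [values[(0 if i == 0 else index[i - 1]):index[i]] for i in range(len(index))]
# rows == [array([1, 2]), array([3]), array([4, 5, 6])]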

View file

@@ -3,14 +3,15 @@ import pytest
import pandas as pd
from pydantic import BaseModel, ValidationError
from typing import List, Union, Optional
from nwb_linkml.types import DataFrame
@pytest.mark.skip()
def test_df():
    """
    Dataframe class should behave like both a pydantic model and a dataframe
    """
    from nwb_linkml.types.df import DataFrame

    class MyDf(DataFrame):
        ints: List[int]
        strings: List[str]
        multi: List[int | str]