Mirror of https://github.com/p2p-ld/nwb-linkml.git, synced 2025-01-09 21:54:27 +00:00

Removing DataFrame type from imports, leaving as stub

parent f682105c1a
commit 9947edfed2

6 changed files with 95 additions and 271 deletions
@@ -40,7 +40,8 @@ intersphinx_mapping = {
     'numpy': ('https://numpy.org/doc/stable/', None),
     'pandas': ('https://pandas.pydata.org/docs/', None),
     'pydantic': ('https://docs.pydantic.dev/latest/', None),
-    'h5py': ('https://docs.h5py.org/en/stable/', None)
+    'h5py': ('https://docs.h5py.org/en/stable/', None),
+    'dask': ('https://docs.dask.org/en/stable/', None)
 }
@@ -1,144 +0,0 @@
-"""
-Just saving a scratch file temporarily where i was trying a different strategy,
-rather than doing one big recursive pass through, try and solve subsections
-of the tree and then piece them together once you have the others done.
-
-sort of working. I think what i need to do is populate the 'depends'
-field more so that at each pass i can work through the items whose dependencies
-have been solved from the bottom up.
-"""
-
-from typing import List
-from nwb_linkml.types.df import DataFrame
-
-class MyDf(DataFrame):
-    ints: List[int]
-
-a = MyDf(ints=[1,2,3])
-
-
-from nwb_linkml.io.hdf5 import HDF5IO
-import h5py
-from typing import NamedTuple, Tuple, Optional
-from nwb_linkml.io.hdf5 import HDF5IO
-from nwb_linkml.maps.hdf5 import H5SourceItem, FlatH5, ReadQueue, flatten_hdf
-from nwb_linkml.providers.schema import SchemaProvider
-from rich import print
-from pydantic import BaseModel
-
-
-class Rank(NamedTuple):
-    has_depends: bool
-    not_leaf: bool
-    not_dataset: bool
-    has_type: bool
-
-
-def sort_flat(item:Tuple[str, H5SourceItem]):
-    return Rank(
-        has_depends=len(item[1].depends)>0,
-        not_leaf = ~item[1].leaf,
-        not_dataset = item[1].h5_type != 'dataset',
-        has_type = 'neurodata_type' in item[1].attrs
-    )
-
-
-def prune_empty(flat: FlatH5) -> FlatH5:
-    """
-    Groups without children or attrs can be removed
-    """
-    deletes = []
-    for k,v in flat.items():
-        if v.leaf and v.h5_type == 'group' and len(v.attrs) == 0:
-            deletes.append(k)
-
-    for k in deletes:
-        del flat[k]
-
-    return flat
-
-
-def resolve_scalars(res: ReadQueue) -> ReadQueue:
-    for path, item in res.queue.copy().items():
-        if item.h5_type == 'group':
-            continue
-        dset = res.h5f.get(path)
-        if dset.shape == ():
-            res.completed[path] = dset[()]
-            res.queue.pop(path)
-    return res
-
-
-def resolve_terminal_arrays(res:ReadQueue) -> ReadQueue:
-    """Terminal arrays can just get loaded as a dict"""
-    for path, item in res.queue.copy().items():
-        if item.h5_type != 'dataset' or not item.leaf or len(item.depends) > 0:
-            continue
-        h5_object = res.h5f.get(path)
-        item_dict = {
-            'name': path.split('/')[-1],
-            'array': h5_object[:],
-            **h5_object.attrs,
-        }
-        res.completed[path] = item_dict
-        res.queue.pop(path)
-    return res
-
-
-def attempt_parentless(res:ReadQueue, provider:SchemaProvider) -> ReadQueue:
-    """Try the groups whose parents have no neurodata type (ie. acquisition)"""
-    for path, item in res.queue.copy().items():
-        if item.h5_type == 'dataset':
-            continue
-        group = res.h5f.get(path)
-        if 'neurodata_type' in group.parent.attrs.keys() or 'neurodata_type' not in group.attrs.keys():
-            continue
-        model = provider.get_class(group.attrs['namespace'], group.attrs['neurodata_type'])
-        res = naive_instantiation(group, model, res)
-    return res
-
-
-def naive_instantiation(element: h5py.Group|h5py.Dataset, model:BaseModel, res:ReadQueue) -> Optional[BaseModel]:
-    """
-    Try to instantiate model with just the attrs and any resolved children
-    """
-    print(element)
-    kwargs = {}
-    kwargs['name'] = element.name.split('/')[-1]
-    for k in element.attrs.keys():
-        try:
-            kwargs[k] = element.attrs[k]
-        except Exception as e:
-            print(f'couldnt load attr: {e}')
-    for key, child in element.items():
-        if child.name in res.completed:
-            kwargs[child.name] = res.completed[child.name]
-
-    kwargs = {k:v for k,v in kwargs.items() if k in model.model_fields.keys()}
-
-    try:
-        instance = model(**kwargs)
-        res.queue.pop(element.name)
-        res.completed[element.name] = instance
-        print('succeeded')
-        return res
-    except Exception as e:
-        print(f'failed: {e}')
-        return res
-
-
-# --------------------------------------------------
-path = '/Users/jonny/Dropbox/lab/p2p_ld/data/nwb/sub-738651046_ses-760693773_probe-769322820_ecephys.nwb'
-
-h5io = HDF5IO(path)
-provider = h5io.make_provider()
-
-h5f = h5py.File(path)
-flat = flatten_hdf(h5f)
-
-flat = prune_empty(flat)
-flat_sorted = dict(sorted(flat.items(), key=sort_flat))
-
-res = ReadQueue(h5f=h5f, queue=flat_sorted.copy())
-
-res = resolve_scalars(res)
-res = resolve_terminal_arrays(res)
-res = attempt_parentless(res, provider)
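The docstring at the top of the removed scratch file describes the intended strategy: flatten the HDF5 file into a queue, then repeatedly resolve only the items whose dependencies have already been completed, working bottom-up. A minimal sketch of that pass loop under those assumptions (the names below are illustrative, not part of the repo's API):

    def resolve_bottom_up(queue: dict, completed: dict, resolve) -> dict:
        # Repeat passes until nothing new can be resolved; each pass handles only
        # the items whose dependencies are already present in `completed`.
        while queue:
            ready = [path for path, item in queue.items()
                     if all(dep in completed for dep in item.depends)]
            if not ready:
                break  # remaining items have unresolved (or circular) dependencies
            for path in ready:
                completed[path] = resolve(queue.pop(path))
        return completed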
@@ -1,82 +1,36 @@
 """
 Mapping functions for handling HDMF classes like DynamicTables
 """
-import pdb
-import warnings
 from typing import List, Type, Optional, Any
-import ast
+import warnings
-from nwb_linkml.types import DataFrame

 import h5py
 from pydantic import create_model, BaseModel
-from nwb_linkml.maps import dtype
 import numpy as np
 from nwb_linkml.types.hdf5 import HDF5_Path
 from nwb_linkml.types.ndarray import NDArray, NDArrayProxy
-from nwb_linkml.annotations import get_inner_types
 import dask.array as da
-import nptyping

-def model_from_dynamictable(group:h5py.Group, base:Optional[BaseModel] = None) -> Type[DataFrame]:
+def model_from_dynamictable(group:h5py.Group, base:Optional[BaseModel] = None) -> Type[BaseModel]:
     """
     Create a pydantic model from a dynamic table
     """
     colnames = group.attrs['colnames']
     types = {}
     for col in colnames:
-        # idxname = col + '_index'
-        # if idxname in group.keys():
-        #     idx = group.get(idxname)[0]
-        #     dset = group.get(col)
-        #     item = dset[idx]
-        # else:
-        #     dset = group.get(col)
-        #     item = dset[0]
-        # # read the first entry to see what we got
-        #
-        # if isinstance(item, bytes):
-        #     item = item.decode('utf-8')
-        # if isinstance(item, str):
-        #     # try to see if this is actually a list or smth encoded as a string
-        #     try:
-        #         item = ast.literal_eval(item)
-        #     except (ValueError, SyntaxError):
-        #         pass
-
-        # Get a nptypes type for the array
-        #pdb.set_trace()
-
-        # type_ = type(item)
-        # type_ = dtype.np_to_python.get(type_, type_)
-        # if type_ is h5py.h5r.Reference:
-        #     #type_ = HDF5_Path
-        #     type_ = 'String'
-        # elif type_ is np.ndarray:
-        #     item: np.ndarray
-        #     type_ = dtype.flat_to_npytyping[item.dtype.name]
-
-        #if type_ is not np.void:
-        #type_ = NDArray[Any, getattr(nptyping, dtype.flat_to_npytyping[item.dtype.name])]
-
-        #nptype = nptyping.typing_.name_per_dtype[group[col].dtype.type]
         nptype = group[col].dtype.type
         if nptype == np.void:
-            # warnings.warn(f"Cant handle numpy void type for column {col} in {group.name}")
+            warnings.warn(f"Cant handle numpy void type for column {col} in {group.name}")
             continue
         type_ = Optional[NDArray[Any, nptype]]

         # FIXME: handling nested column types that appear only in some versions?
         #types[col] = (List[type_ | None], ...)
         types[col] = (type_, None)

-    # if base is None:
-    #     #base = DataFrame
-    #     base = BaseModel
-    # else:
-    #     base = (BaseModel, base)
-    #     #base = (DataFrame, base)

     model = create_model(group.name.split('/')[-1], **types, __base__=base)
     return model
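For orientation, a hedged sketch of how the revised model_from_dynamictable is typically used: open an NWB file, point it at a DynamicTable group, and get back a pydantic model with one optional array field per column. The file name and group path below are examples only, not part of this diff:

    import h5py
    from nwb_linkml.maps.hdmf import model_from_dynamictable

    with h5py.File("example.nwb", "r") as h5f:
        trials = h5f["intervals/trials"]          # any DynamicTable group
        TrialsModel = model_from_dynamictable(trials)
        print(list(TrialsModel.model_fields))     # column names become model fields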
@@ -120,70 +74,6 @@ def dynamictable_to_model(
         **items)


-def dynamictable_to_df(group:h5py.Group,
-                       model:Optional[Type[DataFrame]]=None,
-                       base:Optional[BaseModel] = None) -> DataFrame:
-    if model is None:
-        model = model_from_dynamictable(group, base)
-
-    items = {}
-    for col, col_type in model.model_fields.items():
-        if col not in group.keys():
-            continue
-        idxname = col + '_index'
-        if idxname in group.keys():
-            idx = group.get(idxname)[:]
-            data = group.get(col)[idx-1]
-        else:
-            data = group.get(col)[:]
-
-        # Handle typing inside of list
-        if isinstance(data[0], bytes):
-            data = data.astype('unicode')
-        if isinstance(data[0], str):
-            # lists and other compound data types can get flattened out to strings when stored
-            # so we try and literal eval and recover them
-            try:
-                eval_type = type(ast.literal_eval(data[0]))
-            except (ValueError, SyntaxError):
-                eval_type = str
-
-            # if we've found one of those, get the data type within it.
-            if eval_type is not str:
-                eval_list = []
-                for item in data.tolist():
-                    try:
-                        eval_list.append(ast.literal_eval(item))
-                    except ValueError:
-                        eval_list.append(None)
-                data = eval_list
-        elif isinstance(data[0], h5py.h5r.Reference):
-            data = [HDF5_Path(group[d].name) for d in data]
-        elif isinstance(data[0], tuple) and any([isinstance(d, h5py.h5r.Reference) for d in data[0]]):
-            # references stored inside a tuple, reference + location.
-            # dereference them!?
-            dset = group.get(col)
-            names = dset.dtype.names
-            if names is not None and names[0] == 'idx_start' and names[1] == 'count':
-                data = dereference_reference_vector(dset, data)
-
-        else:
-            data = data.tolist()
-
-        # After list, check if we need to put this thing inside of
-        # another class, as indicated by the enclosing model
-
-        items[col] = data
-
-    return model(hdf5_path = group.name,
-                 name = group.name.split('/')[-1],
-                 **items)
-
-
 def dereference_reference_vector(dset: h5py.Dataset, data:Optional[List[Any]]) -> List:
     """
     Given a compound dataset with indices, counts, and object references, dereference to values
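The docstring cut off by this hunk describes dereferencing a compound "reference vector": each row holds an index, a count, and an object reference into another dataset, as checked by the idx_start/count test in dynamictable_to_df above. A rough sketch of what such a dereference can look like; the 'target' field name and the function name are assumptions for illustration, not the repo's implementation:

    import h5py

    def deref_region_rows(dset: h5py.Dataset) -> list:
        # Follow each row's object reference, then slice out
        # [idx_start : idx_start + count] from the referenced dataset.
        out = []
        for row in dset:
            target = dset.file[row['target']]     # assumed field holding the object reference
            start, count = int(row['idx_start']), int(row['count'])
            out.append(target[start:start + count])
        return out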
@@ -1,2 +1 @@
 from nwb_linkml.types.ndarray import NDArray
-from nwb_linkml.types.df import DataFrame
@@ -1,8 +1,19 @@
 """
 Pydantic models that behave like pandas dataframes

+.. note::
+
+    This is currently unused but kept in place as a stub in case it is worth revisiting in the future.
+    It turned out to be momentarily too difficult to make lazy-loading work with dask arrays per column
+    while still keeping a pandas-like API intact. In the future we should investigate modifying the
+    :func:`dask.dataframe.read_hdf` function to treat individual hdf5 datasets like columns.
+
+    pandas has been removed from dependencies for now, as it is not used elsewhere, but it is
+    left in this module since it is necessary for the module to make sense.
 """
+import ast
 import pdb
-from typing import List, Any, get_origin, get_args, Union, Optional, Dict
+from typing import List, Any, get_origin, get_args, Union, Optional, Dict, Type
 from types import NoneType

 import h5py
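A minimal sketch of the per-column lazy loading the note describes: wrap each HDF5 dataset in a dask array so nothing is read until it is computed. This is illustrative only, not part of the commit, and the group path in the usage comment is just an example:

    import dask.array as da
    import h5py

    def lazy_columns(group: h5py.Group) -> dict:
        # One dask array per child dataset; data stays on disk until .compute()
        return {
            name: da.from_array(dset, chunks="auto")
            for name, dset in group.items()
            if isinstance(dset, h5py.Dataset)
        }

    # usage: slice a column lazily, then materialize only that slice
    # cols = lazy_columns(h5f["intervals/trials"])
    # first_ten = cols["start_time"][:10].compute()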
@@ -16,6 +27,10 @@ from pydantic import (
     model_validator
 )

+from nwb_linkml.maps.hdmf import model_from_dynamictable, dereference_reference_vector
+from nwb_linkml.types.hdf5 import HDF5_Path
+
+
 class DataFrame(BaseModel, pd.DataFrame):
     """
     Pydantic model root class that mimics a pandas dataframe.
@@ -116,3 +131,65 @@ class DataFrame(BaseModel, pd.DataFrame):
             for k, v in out.items()
         }
         return nxt(self.__class__(**out))


+def dynamictable_to_df(group:h5py.Group,
+                       model:Optional[Type[DataFrame]]=None,
+                       base:Optional[BaseModel] = None) -> DataFrame:
+    if model is None:
+        model = model_from_dynamictable(group, base)
+
+    items = {}
+    for col, col_type in model.model_fields.items():
+        if col not in group.keys():
+            continue
+        idxname = col + '_index'
+        if idxname in group.keys():
+            idx = group.get(idxname)[:]
+            data = group.get(col)[idx-1]
+        else:
+            data = group.get(col)[:]
+
+        # Handle typing inside of list
+        if isinstance(data[0], bytes):
+            data = data.astype('unicode')
+        if isinstance(data[0], str):
+            # lists and other compound data types can get flattened out to strings when stored
+            # so we try and literal eval and recover them
+            try:
+                eval_type = type(ast.literal_eval(data[0]))
+            except (ValueError, SyntaxError):
+                eval_type = str
+
+            # if we've found one of those, get the data type within it.
+            if eval_type is not str:
+                eval_list = []
+                for item in data.tolist():
+                    try:
+                        eval_list.append(ast.literal_eval(item))
+                    except ValueError:
+                        eval_list.append(None)
+                data = eval_list
+        elif isinstance(data[0], h5py.h5r.Reference):
+            data = [HDF5_Path(group[d].name) for d in data]
+        elif isinstance(data[0], tuple) and any([isinstance(d, h5py.h5r.Reference) for d in data[0]]):
+            # references stored inside a tuple, reference + location.
+            # dereference them!?
+            dset = group.get(col)
+            names = dset.dtype.names
+            if names is not None and names[0] == 'idx_start' and names[1] == 'count':
+                data = dereference_reference_vector(dset, data)
+
+        else:
+            data = data.tolist()
+
+        # After list, check if we need to put this thing inside of
+        # another class, as indicated by the enclosing model
+
+        items[col] = data
+
+    return model(hdf5_path = group.name,
+                 name = group.name.split('/')[-1],
+                 **items)
@@ -3,18 +3,19 @@ import pytest
 import pandas as pd
 from pydantic import BaseModel, ValidationError
 from typing import List, Union, Optional
-from nwb_linkml.types import DataFrame

+@pytest.mark.skip()
 def test_df():
     """
     Dataframe class should behave like both a pydantic model and a dataframe
     """
+    from nwb_linkml.types.df import DataFrame
+
     class MyDf(DataFrame):
         ints: List[int]
         strings: List[str]
         multi: List[int | str]
         opts: Optional[List[int]] = None

     good_kwargs = {
         'ints': [1,2,3],