diff --git a/docs/conf.py b/docs/conf.py
index c4286b0..3028388 100644
--- a/docs/conf.py
+++ b/docs/conf.py
@@ -40,7 +40,8 @@ intersphinx_mapping = {
     'numpy': ('https://numpy.org/doc/stable/', None),
     'pandas': ('https://pandas.pydata.org/docs/', None),
     'pydantic': ('https://docs.pydantic.dev/latest/', None),
-    'h5py': ('https://docs.h5py.org/en/stable/', None)
+    'h5py': ('https://docs.h5py.org/en/stable/', None),
+    'dask': ('https://docs.dask.org/en/stable/', None)
 }
diff --git a/nwb_linkml/src/nwb_linkml/io/hdf5_scratch.py b/nwb_linkml/src/nwb_linkml/io/hdf5_scratch.py
deleted file mode 100644
index afbff5b..0000000
--- a/nwb_linkml/src/nwb_linkml/io/hdf5_scratch.py
+++ /dev/null
@@ -1,144 +0,0 @@
-"""
-Just saving a scratch file temporarily where i was trying a different strategy,
-rather than doing one big recursive pass through, try and solve subsections
-of the tree and then piece them together once you have the others done.
-
-sort of working. I think what i need to do is populate the 'depends'
-field more so that at each pass i can work through the items whose dependencies
-have been solved from the bottom up.
-"""
-
-from typing import List
-from nwb_linkml.types.df import DataFrame
-
-class MyDf(DataFrame):
-    ints: List[int]
-
-a = MyDf(ints=[1,2,3])
-
-
-from nwb_linkml.io.hdf5 import HDF5IO
-import h5py
-from typing import NamedTuple, Tuple, Optional
-from nwb_linkml.io.hdf5 import HDF5IO
-from nwb_linkml.maps.hdf5 import H5SourceItem, FlatH5, ReadQueue, flatten_hdf
-from nwb_linkml.providers.schema import SchemaProvider
-from rich import print
-from pydantic import BaseModel
-
-
-class Rank(NamedTuple):
-    has_depends: bool
-    not_leaf: bool
-    not_dataset: bool
-    has_type: bool
-
-def sort_flat(item:Tuple[str, H5SourceItem]):
-
-    return Rank(
-        has_depends=len(item[1].depends)>0,
-        not_leaf = ~item[1].leaf,
-        not_dataset = item[1].h5_type != 'dataset',
-        has_type = 'neurodata_type' in item[1].attrs
-    )
-
-def prune_empty(flat: FlatH5) -> FlatH5:
-    """
-    Groups without children or attrs can be removed
-    """
-    deletes = []
-    for k,v in flat.items():
-        if v.leaf and v.h5_type == 'group' and len(v.attrs) == 0:
-            deletes.append(k)
-
-    for k in deletes:
-        del flat[k]
-
-    return flat
-
-def resolve_scalars(res: ReadQueue) -> ReadQueue:
-    for path, item in res.queue.copy().items():
-        if item.h5_type == 'group':
-            continue
-        dset = res.h5f.get(path)
-        if dset.shape == ():
-            res.completed[path] = dset[()]
-            res.queue.pop(path)
-    return res
-
-def resolve_terminal_arrays(res:ReadQueue) -> ReadQueue:
-    """Terminal arrays can just get loaded as a dict"""
-    for path, item in res.queue.copy().items():
-        if item.h5_type != 'dataset' or not item.leaf or len(item.depends) > 0:
-            continue
-        h5_object = res.h5f.get(path)
-        item_dict = {
-            'name': path.split('/')[-1],
-            'array': h5_object[:],
-            **h5_object.attrs,
-        }
-        res.completed[path] = item_dict
-        res.queue.pop(path)
-    return res
-
-def attempt_parentless(res:ReadQueue, provider:SchemaProvider) -> ReadQueue:
-    """Try the groups whose parents have no neurodata type (ie. acquisition)"""
-    for path, item in res.queue.copy().items():
-        if item.h5_type == 'dataset':
-            continue
-        group = res.h5f.get(path)
-        if 'neurodata_type' in group.parent.attrs.keys() or 'neurodata_type' not in group.attrs.keys():
-            continue
-        model = provider.get_class(group.attrs['namespace'], group.attrs['neurodata_type'])
-        res = naive_instantiation(group, model, res)
-    return res
-
-
-def naive_instantiation(element: h5py.Group|h5py.Dataset, model:BaseModel, res:ReadQueue) -> Optional[BaseModel]:
-    """
-    Try to instantiate model with just the attrs and any resolved children
-    """
-    print(element)
-    kwargs = {}
-    kwargs['name'] = element.name.split('/')[-1]
-    for k in element.attrs.keys():
-        try:
-            kwargs[k] = element.attrs[k]
-        except Exception as e:
-            print(f'couldnt load attr: {e}')
-    for key, child in element.items():
-        if child.name in res.completed:
-            kwargs[child.name] = res.completed[child.name]
-
-    kwargs = {k:v for k,v in kwargs.items() if k in model.model_fields.keys()}
-
-    try:
-        instance = model(**kwargs)
-        res.queue.pop(element.name)
-        res.completed[element.name] = instance
-        print('succeeded')
-        return res
-    except Exception as e:
-        print(f'failed: {e}')
-        return res
-
-
-# --------------------------------------------------
-path = '/Users/jonny/Dropbox/lab/p2p_ld/data/nwb/sub-738651046_ses-760693773_probe-769322820_ecephys.nwb'
-
-h5io = HDF5IO(path)
-provider = h5io.make_provider()
-
-h5f = h5py.File(path)
-flat = flatten_hdf(h5f)
-
-flat = prune_empty(flat)
-flat_sorted = dict(sorted(flat.items(), key=sort_flat))
-
-res = ReadQueue(h5f=h5f, queue=flat_sorted.copy())
-
-res = resolve_scalars(res)
-res = resolve_terminal_arrays(res)
-res = attempt_parentless(res, provider)
-
diff --git a/nwb_linkml/src/nwb_linkml/maps/hdmf.py b/nwb_linkml/src/nwb_linkml/maps/hdmf.py
index bc996e2..56e29a8 100644
--- a/nwb_linkml/src/nwb_linkml/maps/hdmf.py
+++ b/nwb_linkml/src/nwb_linkml/maps/hdmf.py
@@ -1,82 +1,36 @@
 """
 Mapping functions for handling HDMF classes like DynamicTables
 """
-import pdb
-import warnings
 from typing import List, Type, Optional, Any
-import ast
-from nwb_linkml.types import DataFrame
+import warnings
+
+
 import h5py
 from pydantic import create_model, BaseModel
-from nwb_linkml.maps import dtype
 import numpy as np
 from nwb_linkml.types.hdf5 import HDF5_Path
 from nwb_linkml.types.ndarray import NDArray, NDArrayProxy
-from nwb_linkml.annotations import get_inner_types
 import dask.array as da
-import nptyping
 
-def model_from_dynamictable(group:h5py.Group, base:Optional[BaseModel] = None) -> Type[DataFrame]:
+
+def model_from_dynamictable(group:h5py.Group, base:Optional[BaseModel] = None) -> Type[BaseModel]:
     """
     Create a pydantic model from a dynamic table
    """
     colnames = group.attrs['colnames']
     types = {}
     for col in colnames:
-        # idxname = col + '_index'
-        # if idxname in group.keys():
-        #     idx = group.get(idxname)[0]
-        #     dset = group.get(col)
-        #     item = dset[idx]
-        # else:
-        #     dset = group.get(col)
-        #     item = dset[0]
-        # # read the first entry to see what we got
-        #
-        # if isinstance(item, bytes):
-        #     item = item.decode('utf-8')
-        # if isinstance(item, str):
-        #     # try to see if this is actually a list or smth encoded as a string
-        #     try:
-        #         item = ast.literal_eval(item)
-        #     except (ValueError, SyntaxError):
-        #         pass
-        # Get a nptypes type for the array
-        #pdb.set_trace()
-
-        # type_ = type(item)
-        # type_ = dtype.np_to_python.get(type_, type_)
-        # if type_ is h5py.h5r.Reference:
-        #     #type_ = HDF5_Path
-        #     type_ = 'String'
-        # elif type_ is np.ndarray:
-        #     item: np.ndarray
-        #     type_ = dtype.flat_to_npytyping[item.dtype.name]
-
-        #if type_ is not np.void:
-        #type_ = NDArray[Any, getattr(nptyping, dtype.flat_to_npytyping[item.dtype.name])]
-
-        #nptype = nptyping.typing_.name_per_dtype[group[col].dtype.type]
         nptype = group[col].dtype.type
         if nptype == np.void:
-            # warnings.warn(f"Cant handle numpy void type for column {col} in {group.name}")
+            warnings.warn(f"Can't handle numpy void type for column {col} in {group.name}")
            continue
 
         type_ = Optional[NDArray[Any, nptype]]
-
-        # FIXME: handling nested column types that appear only in some versions?
+        # FIXME: handling nested column types that appear only in some versions?
         #types[col] = (List[type_ | None], ...)
         types[col] = (type_, None)
-    # if base is None:
-    #     #base = DataFrame
-    #     base = BaseModel
-    # else:
-    #     base = (BaseModel, base)
-    #     #base = (DataFrame, base)
-
-
     model = create_model(group.name.split('/')[-1], **types, __base__=base)
     return model
@@ -120,70 +74,6 @@ def dynamictable_to_model(
         **items)
 
-
-
-def dynamictable_to_df(group:h5py.Group,
-                       model:Optional[Type[DataFrame]]=None,
-                       base:Optional[BaseModel] = None) -> DataFrame:
-    if model is None:
-        model = model_from_dynamictable(group, base)
-
-    items = {}
-    for col, col_type in model.model_fields.items():
-        if col not in group.keys():
-            continue
-        idxname = col + '_index'
-        if idxname in group.keys():
-            idx = group.get(idxname)[:]
-            data = group.get(col)[idx-1]
-        else:
-            data = group.get(col)[:]
-
-        # Handle typing inside of list
-        if isinstance(data[0], bytes):
-            data = data.astype('unicode')
-        if isinstance(data[0], str):
-            # lists and other compound data types can get flattened out to strings when stored
-            # so we try and literal eval and recover them
-            try:
-                eval_type = type(ast.literal_eval(data[0]))
-            except (ValueError, SyntaxError):
-                eval_type = str
-
-            # if we've found one of those, get the data type within it.
-            if eval_type is not str:
-                eval_list = []
-                for item in data.tolist():
-                    try:
-                        eval_list.append(ast.literal_eval(item))
-                    except ValueError:
-                        eval_list.append(None)
-                data = eval_list
-        elif isinstance(data[0], h5py.h5r.Reference):
-            data = [HDF5_Path(group[d].name) for d in data]
-        elif isinstance(data[0], tuple) and any([isinstance(d, h5py.h5r.Reference) for d in data[0]]):
-            # references stored inside a tuple, reference + location.
-            # dereference them!?
-            dset = group.get(col)
-            names = dset.dtype.names
-            if names is not None and names[0] == 'idx_start' and names[1] == 'count':
-                data = dereference_reference_vector(dset, data)
-
-        else:
-            data = data.tolist()
-
-        # After list, check if we need to put this thing inside of
-        # another class, as indicated by the enclosing model
-
-
-
-        items[col] = data
-
-    return model(hdf5_path = group.name,
-                 name = group.name.split('/')[-1],
-                 **items)
-
-
 def dereference_reference_vector(dset: h5py.Dataset, data:Optional[List[Any]]) -> List:
     """
     Given a compound dataset with indices, counts, and object references, dereference to values
diff --git a/nwb_linkml/src/nwb_linkml/types/__init__.py b/nwb_linkml/src/nwb_linkml/types/__init__.py
index b0523c0..801a327 100644
--- a/nwb_linkml/src/nwb_linkml/types/__init__.py
+++ b/nwb_linkml/src/nwb_linkml/types/__init__.py
@@ -1,2 +1 @@
-from nwb_linkml.types.ndarray import NDArray
-from nwb_linkml.types.df import DataFrame
\ No newline at end of file
+from nwb_linkml.types.ndarray import NDArray
\ No newline at end of file
diff --git a/nwb_linkml/src/nwb_linkml/types/df.py b/nwb_linkml/src/nwb_linkml/types/df.py
index df32fb6..836bbbb 100644
--- a/nwb_linkml/src/nwb_linkml/types/df.py
+++ b/nwb_linkml/src/nwb_linkml/types/df.py
@@ -1,8 +1,19 @@
 """
 Pydantic models that behave like pandas dataframes
+
+.. note::
+
+    This module is currently unused, but is kept in place as a stub in case it is worth revisiting in the future.
+    For now it proved too difficult to make lazy loading work with dask arrays per column while still
+    keeping a pandas-like API intact. In the future we should investigate modifying the
+    :func:`dask.dataframe.read_hdf` function to treat individual hdf5 datasets like columns.
+
+    pandas has been removed from the package dependencies for now, as it is not used elsewhere, but the
+    import is left in this module since the module does not make sense without it.
 """
+import ast
 import pdb
-from typing import List, Any, get_origin, get_args, Union, Optional, Dict
+from typing import List, Any, get_origin, get_args, Union, Optional, Dict, Type
 from types import NoneType
 
 import h5py
@@ -16,6 +27,10 @@ from pydantic import (
     model_validator
 )
 
+from nwb_linkml.maps.hdmf import model_from_dynamictable, dereference_reference_vector
+from nwb_linkml.types.hdf5 import HDF5_Path
+
+
 class DataFrame(BaseModel, pd.DataFrame):
     """
     Pydantic model root class that mimics a pandas dataframe.
@@ -116,3 +131,65 @@ class DataFrame(BaseModel, pd.DataFrame):
             for k, v in out.items()
         }
         return nxt(self.__class__(**out))
+
+
+def dynamictable_to_df(group:h5py.Group,
+                       model:Optional[Type[DataFrame]]=None,
+                       base:Optional[BaseModel] = None) -> DataFrame:
+    if model is None:
+        model = model_from_dynamictable(group, base)
+
+    items = {}
+    for col, col_type in model.model_fields.items():
+        if col not in group.keys():
+            continue
+        idxname = col + '_index'
+        if idxname in group.keys():
+            idx = group.get(idxname)[:]
+            data = group.get(col)[idx-1]
+        else:
+            data = group.get(col)[:]
+
+        # Handle typing inside of list
+        if isinstance(data[0], bytes):
+            data = data.astype('unicode')
+        if isinstance(data[0], str):
+            # lists and other compound data types can get flattened out to strings when stored
+            # so we try and literal eval and recover them
+            try:
+                eval_type = type(ast.literal_eval(data[0]))
+            except (ValueError, SyntaxError):
+                eval_type = str
+
+            # if we've found one of those, get the data type within it.
+            if eval_type is not str:
+                eval_list = []
+                for item in data.tolist():
+                    try:
+                        eval_list.append(ast.literal_eval(item))
+                    except ValueError:
+                        eval_list.append(None)
+                data = eval_list
+        elif isinstance(data[0], h5py.h5r.Reference):
+            data = [HDF5_Path(group[d].name) for d in data]
+        elif isinstance(data[0], tuple) and any([isinstance(d, h5py.h5r.Reference) for d in data[0]]):
+            # references stored inside a tuple, reference + location.
+            # dereference them!?
+            dset = group.get(col)
+            names = dset.dtype.names
+            if names is not None and names[0] == 'idx_start' and names[1] == 'count':
+                data = dereference_reference_vector(dset, data)
+
+        else:
+            data = data.tolist()
+
+        # After list, check if we need to put this thing inside of
+        # another class, as indicated by the enclosing model
+
+
+
+        items[col] = data
+
+    return model(hdf5_path = group.name,
+                 name = group.name.split('/')[-1],
+                 **items)
diff --git a/nwb_linkml/tests/test_types/test_df.py b/nwb_linkml/tests/test_types/test_df.py
index d4bef18..07a6c73 100644
--- a/nwb_linkml/tests/test_types/test_df.py
+++ b/nwb_linkml/tests/test_types/test_df.py
@@ -3,18 +3,19 @@ import pytest
 import pandas as pd
 from pydantic import BaseModel, ValidationError
 from typing import List, Union, Optional
-from nwb_linkml.types import DataFrame
 
+@pytest.mark.skip()
 def test_df():
     """
     Dataframe class should behave like both a pydantic model and a dataframe
     """
+    from nwb_linkml.types.df import DataFrame
 
-class MyDf(DataFrame):
-    ints: List[int]
-    strings: List[str]
-    multi: List[int | str]
-    opts: Optional[List[int]] = None
+    class MyDf(DataFrame):
+        ints: List[int]
+        strings: List[str]
+        multi: List[int | str]
+        opts: Optional[List[int]] = None
 
     good_kwargs = {
         'ints': [1,2,3],