Removing DataFrame type from imports, leaving as stub

This commit is contained in:
sneakers-the-rat 2023-10-04 17:59:10 -07:00
parent f682105c1a
commit 9947edfed2
6 changed files with 95 additions and 271 deletions

View file

@@ -40,7 +40,8 @@ intersphinx_mapping = {
    'numpy': ('https://numpy.org/doc/stable/', None),
    'pandas': ('https://pandas.pydata.org/docs/', None),
    'pydantic': ('https://docs.pydantic.dev/latest/', None),
    'h5py': ('https://docs.h5py.org/en/stable/', None)
    'h5py': ('https://docs.h5py.org/en/stable/', None),
    'dask': ('https://docs.dask.org/en/stable/', None)
}

View file

@@ -1,144 +0,0 @@
"""
Just saving a scratch file temporarily where i was trying a different strategy,
rather than doing one big recursive pass through, try and solve subsections
of the tree and then piece them together once you have the others done.
sort of working. I think what i need to do is populate the 'depends'
field more so that at each pass i can work through the items whose dependencies
have been solved from the bottom up.
"""
from typing import List
from nwb_linkml.types.df import DataFrame
class MyDf(DataFrame):
    ints: List[int]

a = MyDf(ints=[1, 2, 3])
from nwb_linkml.io.hdf5 import HDF5IO
import h5py
from typing import NamedTuple, Tuple, Optional, Type
from nwb_linkml.maps.hdf5 import H5SourceItem, FlatH5, ReadQueue, flatten_hdf
from nwb_linkml.providers.schema import SchemaProvider
from rich import print
from pydantic import BaseModel
class Rank(NamedTuple):
    has_depends: bool
    not_leaf: bool
    not_dataset: bool
    has_type: bool

def sort_flat(item: Tuple[str, H5SourceItem]) -> Rank:
    return Rank(
        has_depends=len(item[1].depends) > 0,
        # `not` rather than `~`: `~` on a Python bool yields -1/-2 and breaks the sort
        not_leaf=not item[1].leaf,
        not_dataset=item[1].h5_type != 'dataset',
        has_type='neurodata_type' in item[1].attrs
    )
def prune_empty(flat: FlatH5) -> FlatH5:
    """
    Groups without children or attrs can be removed
    """
    deletes = []
    for k, v in flat.items():
        if v.leaf and v.h5_type == 'group' and len(v.attrs) == 0:
            deletes.append(k)
    for k in deletes:
        del flat[k]
    return flat
def resolve_scalars(res: ReadQueue) -> ReadQueue:
    for path, item in res.queue.copy().items():
        if item.h5_type == 'group':
            continue
        dset = res.h5f.get(path)
        if dset.shape == ():
            # scalar datasets can be read and completed immediately
            res.completed[path] = dset[()]
            res.queue.pop(path)
    return res
def resolve_terminal_arrays(res: ReadQueue) -> ReadQueue:
    """Terminal arrays can just get loaded as a dict"""
    for path, item in res.queue.copy().items():
        if item.h5_type != 'dataset' or not item.leaf or len(item.depends) > 0:
            continue
        h5_object = res.h5f.get(path)
        item_dict = {
            'name': path.split('/')[-1],
            'array': h5_object[:],
            **h5_object.attrs,
        }
        res.completed[path] = item_dict
        res.queue.pop(path)
    return res
def attempt_parentless(res: ReadQueue, provider: SchemaProvider) -> ReadQueue:
    """Try the groups whose parents have no neurodata type (i.e. acquisition)"""
    for path, item in res.queue.copy().items():
        if item.h5_type == 'dataset':
            continue
        group = res.h5f.get(path)
        if 'neurodata_type' in group.parent.attrs.keys() or 'neurodata_type' not in group.attrs.keys():
            continue
        model = provider.get_class(group.attrs['namespace'], group.attrs['neurodata_type'])
        res = naive_instantiation(group, model, res)
    return res
def naive_instantiation(element: h5py.Group | h5py.Dataset, model: Type[BaseModel], res: ReadQueue) -> ReadQueue:
    """
    Try to instantiate the model with just the attrs and any resolved children
    """
    print(element)
    kwargs = {}
    kwargs['name'] = element.name.split('/')[-1]
    for k in element.attrs.keys():
        try:
            kwargs[k] = element.attrs[k]
        except Exception as e:
            print(f"couldn't load attr: {e}")
    for key, child in element.items():
        if child.name in res.completed:
            # key the kwarg by the relative name so it can match a model field
            kwargs[key] = res.completed[child.name]
    kwargs = {k: v for k, v in kwargs.items() if k in model.model_fields.keys()}
    try:
        instance = model(**kwargs)
        res.queue.pop(element.name)
        res.completed[element.name] = instance
        print('succeeded')
        return res
    except Exception as e:
        print(f'failed: {e}')
    return res
# --------------------------------------------------
path = '/Users/jonny/Dropbox/lab/p2p_ld/data/nwb/sub-738651046_ses-760693773_probe-769322820_ecephys.nwb'
h5io = HDF5IO(path)
provider = h5io.make_provider()
h5f = h5py.File(path)
flat = flatten_hdf(h5f)
flat = prune_empty(flat)
flat_sorted = dict(sorted(flat.items(), key=sort_flat))
res = ReadQueue(h5f=h5f, queue=flat_sorted.copy())
res = resolve_scalars(res)
res = resolve_terminal_arrays(res)
res = attempt_parentless(res, provider)
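
The bottom-up strategy the docstring above describes (repeatedly completing items whose dependencies are already resolved) could look something like the following sketch. It is illustrative only: resolve_item is a hypothetical helper standing in for the per-item logic, and only the queue/completed mappings used above are assumed on ReadQueue.

def resolve_bottom_up(res: ReadQueue) -> ReadQueue:
    # keep sweeping the queue, completing anything whose dependencies
    # are all resolved, until a full pass makes no progress
    progress = True
    while progress and res.queue:
        progress = False
        for path, item in list(res.queue.items()):
            if all(dep in res.completed for dep in item.depends):
                res.completed[path] = resolve_item(item)  # hypothetical helper
                res.queue.pop(path)
                progress = True
    return res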

View file

@@ -1,82 +1,36 @@
"""
Mapping functions for handling HDMF classes like DynamicTables
"""
import warnings
from typing import List, Type, Optional, Any
import ast
from nwb_linkml.types import DataFrame
import h5py
from pydantic import create_model, BaseModel
from nwb_linkml.maps import dtype
import numpy as np
from nwb_linkml.types.hdf5 import HDF5_Path
from nwb_linkml.types.ndarray import NDArray, NDArrayProxy
from nwb_linkml.annotations import get_inner_types
import dask.array as da
import nptyping
def model_from_dynamictable(group:h5py.Group, base:Optional[BaseModel] = None) -> Type[DataFrame]:
def model_from_dynamictable(group:h5py.Group, base:Optional[BaseModel] = None) -> Type[BaseModel]:
"""
Create a pydantic model from a dynamic table
"""
colnames = group.attrs['colnames']
types = {}
for col in colnames:
# idxname = col + '_index'
# if idxname in group.keys():
# idx = group.get(idxname)[0]
# dset = group.get(col)
# item = dset[idx]
# else:
# dset = group.get(col)
# item = dset[0]
# # read the first entry to see what we got
#
# if isinstance(item, bytes):
# item = item.decode('utf-8')
# if isinstance(item, str):
# # try to see if this is actually a list or smth encoded as a string
# try:
# item = ast.literal_eval(item)
# except (ValueError, SyntaxError):
# pass
# Get a nptypes type for the array
#pdb.set_trace()
# type_ = type(item)
# type_ = dtype.np_to_python.get(type_, type_)
# if type_ is h5py.h5r.Reference:
# #type_ = HDF5_Path
# type_ = 'String'
# elif type_ is np.ndarray:
# item: np.ndarray
# type_ = dtype.flat_to_npytyping[item.dtype.name]
#if type_ is not np.void:
#type_ = NDArray[Any, getattr(nptyping, dtype.flat_to_npytyping[item.dtype.name])]
#nptype = nptyping.typing_.name_per_dtype[group[col].dtype.type]
nptype = group[col].dtype.type
if nptype == np.void:
# warnings.warn(f"Cant handle numpy void type for column {col} in {group.name}")
warnings.warn(f"Cant handle numpy void type for column {col} in {group.name}")
continue
type_ = Optional[NDArray[Any, nptype]]
# FIXME: handling nested column types that appear only in some versions?
#types[col] = (List[type_ | None], ...)
types[col] = (type_, None)
# if base is None:
# #base = DataFrame
# base = BaseModel
# else:
# base = (BaseModel, base)
# #base = (DataFrame, base)
model = create_model(group.name.split('/')[-1], **types, __base__=base)
return model
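
For reference, a hedged usage sketch (the file path and group name here are hypothetical): given an open NWB file, a model can be generated directly from a DynamicTable group, with each column becoming an Optional NDArray field.

import h5py

with h5py.File('example.nwb', 'r') as h5f:  # hypothetical file
    units = h5f['/units']  # a DynamicTable group, if the file has one
    UnitsModel = model_from_dynamictable(units)
    print(UnitsModel.model_fields.keys())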
@@ -120,70 +74,6 @@ def dynamictable_to_model(
**items)
def dynamictable_to_df(group: h5py.Group,
                       model: Optional[Type[DataFrame]] = None,
                       base: Optional[BaseModel] = None) -> DataFrame:
    if model is None:
        model = model_from_dynamictable(group, base)

    items = {}
    for col, col_type in model.model_fields.items():
        if col not in group.keys():
            continue
        idxname = col + '_index'
        if idxname in group.keys():
            idx = group.get(idxname)[:]
            data = group.get(col)[idx - 1]
        else:
            data = group.get(col)[:]

        # Handle typing inside of list
        if isinstance(data[0], bytes):
            data = data.astype('unicode')
        if isinstance(data[0], str):
            # lists and other compound data types can get flattened out to strings when stored,
            # so we try to literal_eval and recover them
            try:
                eval_type = type(ast.literal_eval(data[0]))
            except (ValueError, SyntaxError):
                eval_type = str

            # if we've found one of those, get the data type within it.
            if eval_type is not str:
                eval_list = []
                for item in data.tolist():
                    try:
                        eval_list.append(ast.literal_eval(item))
                    except ValueError:
                        eval_list.append(None)
                data = eval_list
        elif isinstance(data[0], h5py.h5r.Reference):
            data = [HDF5_Path(group[d].name) for d in data]
        elif isinstance(data[0], tuple) and any([isinstance(d, h5py.h5r.Reference) for d in data[0]]):
            # references stored inside a tuple, reference + location.
            # dereference them!?
            dset = group.get(col)
            names = dset.dtype.names
            if names is not None and names[0] == 'idx_start' and names[1] == 'count':
                data = dereference_reference_vector(dset, data)
            else:
                data = data.tolist()

        # After list, check if we need to put this thing inside of
        # another class, as indicated by the enclosing model
        items[col] = data

    return model(hdf5_path=group.name,
                 name=group.name.split('/')[-1],
                 **items)
def dereference_reference_vector(dset: h5py.Dataset, data: Optional[List[Any]]) -> List:
    """
    Given a compound dataset with indices, counts, and object references, dereference to values
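
A hedged sketch of the compound layout that docstring refers to (field order and names assumed from the idx_start/count check above): each row carries a start index, a count, and an object reference into a target dataset.

# hypothetical reading of a compound region dataset
for idx_start, count, ref in dset[:]:
    target = dset.file[ref]                       # dereference the object reference
    values = target[idx_start:idx_start + count]  # the referenced slice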

View file

@@ -1,2 +1 @@
from nwb_linkml.types.ndarray import NDArray
from nwb_linkml.types.df import DataFrame

View file

@@ -1,8 +1,19 @@
"""
Pydantic models that behave like pandas dataframes
.. note::
This is currently unused but kept in place as a stub in case it is worth revisiting in the future.
It turned out to be too momentarily difficult to make lazy-loading work with dask arrays per column
while still keeping pandas-like API intact. In the future we should investigate modifying the
:func:`dask.dataframe.read_hdf` function to treat individual hdf5 datasets like columns
pandas has been removed from dependencies for now, as it not used elsewhere, but it is
left in this module since it is necessary for it to make sense.
"""
import ast
from typing import List, Any, get_origin, get_args, Union, Optional, Dict
from typing import List, Any, get_origin, get_args, Union, Optional, Dict, Type
from types import NoneType
import h5py
@@ -16,6 +27,10 @@ from pydantic import (
    model_validator
)
from nwb_linkml.maps.hdmf import model_from_dynamictable, dereference_reference_vector
from nwb_linkml.types.hdf5 import HDF5_Path
class DataFrame(BaseModel, pd.DataFrame):
    """
    Pydantic model root class that mimics a pandas dataframe.
@@ -116,3 +131,65 @@ class DataFrame(BaseModel, pd.DataFrame):
            for k, v in out.items()
        }
        return nxt(self.__class__(**out))
def dynamictable_to_df(group: h5py.Group,
                       model: Optional[Type[DataFrame]] = None,
                       base: Optional[BaseModel] = None) -> DataFrame:
    if model is None:
        model = model_from_dynamictable(group, base)

    items = {}
    for col, col_type in model.model_fields.items():
        if col not in group.keys():
            continue
        idxname = col + '_index'
        if idxname in group.keys():
            idx = group.get(idxname)[:]
            data = group.get(col)[idx - 1]
        else:
            data = group.get(col)[:]

        # Handle typing inside of list
        if isinstance(data[0], bytes):
            data = data.astype('unicode')
        if isinstance(data[0], str):
            # lists and other compound data types can get flattened out to strings when stored,
            # so we try to literal_eval and recover them
            try:
                eval_type = type(ast.literal_eval(data[0]))
            except (ValueError, SyntaxError):
                eval_type = str

            # if we've found one of those, get the data type within it.
            if eval_type is not str:
                eval_list = []
                for item in data.tolist():
                    try:
                        eval_list.append(ast.literal_eval(item))
                    except ValueError:
                        eval_list.append(None)
                data = eval_list
        elif isinstance(data[0], h5py.h5r.Reference):
            data = [HDF5_Path(group[d].name) for d in data]
        elif isinstance(data[0], tuple) and any([isinstance(d, h5py.h5r.Reference) for d in data[0]]):
            # references stored inside a tuple, reference + location.
            # dereference them!?
            dset = group.get(col)
            names = dset.dtype.names
            if names is not None and names[0] == 'idx_start' and names[1] == 'count':
                data = dereference_reference_vector(dset, data)
            else:
                data = data.tolist()

        # After list, check if we need to put this thing inside of
        # another class, as indicated by the enclosing model
        items[col] = data

    return model(hdf5_path=group.name,
                 name=group.name.split('/')[-1],
                 **items)
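
For context on the col + '_index' convention handled above: HDMF-style ragged columns are stored as a flat values dataset plus a dataset of cumulative end offsets, so row i spans values[index[i-1]:index[i]]. A toy example (data invented for illustration):

import numpy as np

values = np.array([1, 2, 3, 4, 5, 6])  # flat storage of all rows
index = np.array([2, 3, 6])            # cumulative end offset of each row

rows = [values[(0 if i == 0 else index[i - 1]):index[i]] for i in range(len(index))]
# rows == [array([1, 2]), array([3]), array([4, 5, 6])]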

View file

@@ -3,14 +3,15 @@ import pytest
import pandas as pd
from pydantic import BaseModel, ValidationError
from typing import List, Union, Optional
from nwb_linkml.types import DataFrame
@pytest.mark.skip()
def test_df():
    """
    Dataframe class should behave like both a pydantic model and a dataframe
    """
    from nwb_linkml.types.df import DataFrame

    class MyDf(DataFrame):
        ints: List[int]
        strings: List[str]
        multi: List[int | str]