mirror of
https://github.com/p2p-ld/nwb-linkml.git
synced 2025-01-09 21:54:27 +00:00
Removing DataFrame type from imports, leaving as stub
This commit is contained in:
parent
f682105c1a
commit
9947edfed2
6 changed files with 95 additions and 271 deletions
|
@ -40,7 +40,8 @@ intersphinx_mapping = {
|
|||
'numpy': ('https://numpy.org/doc/stable/', None),
|
||||
'pandas': ('https://pandas.pydata.org/docs/', None),
|
||||
'pydantic': ('https://docs.pydantic.dev/latest/', None),
|
||||
'h5py': ('https://docs.h5py.org/en/stable/', None)
|
||||
'h5py': ('https://docs.h5py.org/en/stable/', None),
|
||||
'dask': ('https://docs.dask.org/en/stable/', None)
|
||||
|
||||
}
|
||||
|
||||
|
|
|
@ -1,144 +0,0 @@
|
|||
"""
|
||||
Just saving a scratch file temporarily where i was trying a different strategy,
|
||||
rather than doing one big recursive pass through, try and solve subsections
|
||||
of the tree and then piece them together once you have the others done.
|
||||
|
||||
sort of working. I think what i need to do is populate the 'depends'
|
||||
field more so that at each pass i can work through the items whose dependencies
|
||||
have been solved from the bottom up.
|
||||
"""
|
||||
|
||||
from typing import List
|
||||
from nwb_linkml.types.df import DataFrame
|
||||
|
||||
class MyDf(DataFrame):
    # the only column declared on this scratch model
    ints: List[int]


# Build one instance at import time as a quick check of the subclass
a = MyDf(**{'ints': [1, 2, 3]})
|
||||
|
||||
|
||||
from nwb_linkml.io.hdf5 import HDF5IO
|
||||
import h5py
|
||||
from typing import NamedTuple, Tuple, Optional
|
||||
from nwb_linkml.io.hdf5 import HDF5IO
|
||||
from nwb_linkml.maps.hdf5 import H5SourceItem, FlatH5, ReadQueue, flatten_hdf
|
||||
from nwb_linkml.providers.schema import SchemaProvider
|
||||
from rich import print
|
||||
from pydantic import BaseModel
|
||||
|
||||
|
||||
# Four-flag record. Being a tuple, instances compare field by field in the
# order declared here, and ``False`` orders before ``True``.
Rank = NamedTuple(
    'Rank',
    [
        ('has_depends', bool),
        ('not_leaf', bool),
        ('not_dataset', bool),
        ('has_type', bool),
    ],
)
|
||||
|
||||
def sort_flat(item:Tuple[str, H5SourceItem]):
    """
    Build a :class:`Rank` for one ``(path, item)`` pair, usable as a ``key``
    function for :func:`sorted`.

    Args:
        item: A ``(path, H5SourceItem)`` pair; only the second element is read.

    Returns:
        A :class:`Rank` whose flags are derived from the item's ``depends``,
        ``leaf``, ``h5_type`` and ``attrs``.
    """
    source = item[1]
    return Rank(
        has_depends=len(source.depends) > 0,
        # Logical ``not`` instead of bitwise ``~``: on a plain bool ``~`` gives
        # -2/-1 rather than a bool and is deprecated since Python 3.12.
        # The relative ordering of leaf / non-leaf items is unchanged.
        not_leaf=not source.leaf,
        not_dataset=source.h5_type != 'dataset',
        has_type='neurodata_type' in source.attrs,
    )
|
||||
|
||||
def prune_empty(flat: FlatH5) -> FlatH5:
    """
    Remove leaf groups that have no attrs.

    ``flat`` is modified in place and the same object is returned.
    """
    # Collect the keys first so the mapping is not resized while iterating it
    empty_paths = [
        key
        for key, entry in flat.items()
        if entry.leaf and entry.h5_type == 'group' and len(entry.attrs) == 0
    ]

    for key in empty_paths:
        del flat[key]

    return flat
|
||||
|
||||
def resolve_scalars(res: ReadQueue) -> ReadQueue:
    """
    Read every queued non-group item whose shape is ``()`` and move it from
    ``res.queue`` to ``res.completed``.

    ``res`` is modified in place and returned.
    """
    # Iterate a copy, since entries are popped from the live queue below
    for path, item in res.queue.copy().items():
        if item.h5_type != 'group':
            dset = res.h5f.get(path)
            if dset.shape == ():
                res.completed[path] = dset[()]
                res.queue.pop(path)
    return res
|
||||
|
||||
def resolve_terminal_arrays(res:ReadQueue) -> ReadQueue:
    """
    Load queued leaf datasets that have no dependencies as plain dicts.

    Each loaded dict holds ``name`` (last path component), ``array`` (the
    dataset contents) and the dataset's attrs. ``res`` is modified in place
    and returned.
    """
    # Iterate a copy, since entries are popped from the live queue below
    for path, item in res.queue.copy().items():
        if item.h5_type == 'dataset' and item.leaf and len(item.depends) == 0:
            h5_object = res.h5f.get(path)
            loaded = {'name': path.rpartition('/')[2], 'array': h5_object[:]}
            # attrs are merged last, so they win over 'name' / 'array' on a clash
            loaded.update(h5_object.attrs)
            res.completed[path] = loaded
            res.queue.pop(path)
    return res
|
||||
|
||||
def attempt_parentless(res:ReadQueue, provider:SchemaProvider) -> ReadQueue:
    """
    Try to instantiate queued groups that have a ``neurodata_type`` attr while
    their parent has none (ie. acquisition).

    The model class is looked up from the group's ``namespace`` and
    ``neurodata_type`` attrs and handed to :func:`naive_instantiation`.
    """
    # Iterate a copy, since naive_instantiation may pop entries from the queue
    for path, item in res.queue.copy().items():
        if item.h5_type == 'dataset':
            continue

        group = res.h5f.get(path)
        if 'neurodata_type' in group.parent.attrs.keys():
            # parent is typed
            continue
        if 'neurodata_type' not in group.attrs.keys():
            # the group itself is untyped, so there is no model to look up
            continue

        namespace = group.attrs['namespace']
        neurodata_type = group.attrs['neurodata_type']
        model = provider.get_class(namespace, neurodata_type)
        res = naive_instantiation(group, model, res)
    return res
|
||||
|
||||
|
||||
|
||||
def naive_instantiation(element: h5py.Group|h5py.Dataset, model:type[BaseModel], res:ReadQueue) -> ReadQueue:
    """
    Try to instantiate model with just the attrs and any resolved children

    On success the element is removed from ``res.queue`` and the instance is
    stored in ``res.completed`` under the element's path. On failure the error
    is printed and ``res`` is left as it was.

    Args:
        element: The HDF5 group or dataset to instantiate.
        model: The pydantic model class to instantiate.
        res: Read queue holding the pending and completed items.

    Returns:
        The same ``res`` object, in both the success and the failure case.
    """
    print(element)
    kwargs = {'name': element.name.split('/')[-1]}
    for k in element.attrs.keys():
        # best effort: an attr that cannot be read is reported and skipped
        try:
            kwargs[k] = element.attrs[k]
        except Exception as e:
            print(f'couldnt load attr: {e}')

    # Only groups have children; a dataset is built from its attrs alone
    if isinstance(element, h5py.Group):
        for key, child in element.items():
            if child.name in res.completed:
                # ``res.completed`` is keyed by full path, but the model field
                # is the child's own name. Keying kwargs by ``child.name``
                # would never match a field and be dropped by the filter below.
                kwargs[key] = res.completed[child.name]

    # Drop anything the model does not declare
    kwargs = {k: v for k, v in kwargs.items() if k in model.model_fields}

    try:
        instance = model(**kwargs)
        res.queue.pop(element.name)
        res.completed[element.name] = instance
        print('succeeded')
    except Exception as e:
        # deliberate best effort: failures are reported, not raised
        print(f'failed: {e}')
    return res
|
||||
|
||||
|
||||
# --------------------------------------------------
|
||||
# Scratch driver: runs once at import time against a local NWB file.
path = '/Users/jonny/Dropbox/lab/p2p_ld/data/nwb/sub-738651046_ses-760693773_probe-769322820_ecephys.nwb'

# Schema provider built from the same file
h5io = HDF5IO(path)
provider = h5io.make_provider()

# Flatten the file, drop empty groups, then order the items with sort_flat
h5f = h5py.File(path)
flat = prune_empty(flatten_hdf(h5f))
flat_sorted = dict(sorted(flat.items(), key=sort_flat))

# Work on a copy of the sorted mapping so ``flat_sorted`` itself is kept intact
res = ReadQueue(h5f=h5f, queue=flat_sorted.copy())

# Resolution passes, in order
for _resolve in (resolve_scalars, resolve_terminal_arrays):
    res = _resolve(res)
res = attempt_parentless(res, provider)
|
||||
|
|
@ -1,82 +1,36 @@
|
|||
"""
|
||||
Mapping functions for handling HDMF classes like DynamicTables
|
||||
"""
|
||||
import pdb
|
||||
import warnings
|
||||
from typing import List, Type, Optional, Any
|
||||
import ast
|
||||
from nwb_linkml.types import DataFrame
|
||||
import warnings
|
||||
|
||||
|
||||
import h5py
|
||||
from pydantic import create_model, BaseModel
|
||||
from nwb_linkml.maps import dtype
|
||||
import numpy as np
|
||||
from nwb_linkml.types.hdf5 import HDF5_Path
|
||||
from nwb_linkml.types.ndarray import NDArray, NDArrayProxy
|
||||
from nwb_linkml.annotations import get_inner_types
|
||||
import dask.array as da
|
||||
import nptyping
|
||||
|
||||
def model_from_dynamictable(group:h5py.Group, base:Optional[Type[BaseModel]] = None) -> Type[BaseModel]:
    """
    Create a pydantic model from a dynamic table

    One field is declared for each name in the group's ``colnames`` attr. Each
    field is an optional :class:`NDArray` typed with the numpy scalar type of
    the column's dataset, and defaults to ``None``. Columns whose scalar type
    is ``np.void`` are skipped with a warning.

    Args:
        group: The HDF5 group holding the table.
        base: Passed to :func:`pydantic.create_model` as ``__base__``.

    Returns:
        A model class named after the last component of ``group.name``.
    """
    types = {}
    for col in group.attrs['colnames']:
        nptype = group[col].dtype.type
        if nptype == np.void:
            warnings.warn(f"Cant handle numpy void type for column {col} in {group.name}")
            continue

        # FIXME: handling nested column types that appear only in some versions?
        types[col] = (Optional[NDArray[Any, nptype]], None)

    return create_model(group.name.split('/')[-1], **types, __base__=base)
|
||||
|
||||
|
@ -120,70 +74,6 @@ def dynamictable_to_model(
|
|||
**items)
|
||||
|
||||
|
||||
|
||||
|
||||
def dynamictable_to_df(group:h5py.Group,
                       model:Optional[Type[DataFrame]]=None,
                       base:Optional[BaseModel] = None) -> DataFrame:
    """
    Read the columns of a dynamic table group into an instance of ``model``.

    Only fields declared on the model that also exist in ``group`` are read.
    String columns whose first entry parses with :func:`ast.literal_eval` are
    evaluated entry by entry; entries that fail to parse become ``None``.
    Object references are replaced by :class:`HDF5_Path` values.

    Args:
        group: The HDF5 group holding the table.
        model: Model class to instantiate. Built with
            :func:`model_from_dynamictable` when ``None``.
        base: Forwarded to :func:`model_from_dynamictable` when ``model`` is ``None``.

    Returns:
        ``model`` instantiated with ``hdf5_path``, ``name`` and the column data.
    """
    if model is None:
        model = model_from_dynamictable(group, base)

    items = {}
    for col in model.model_fields:
        if col not in group.keys():
            continue

        dset = group.get(col)
        idxname = col + '_index'
        if idxname in group.keys():
            # NOTE(review): the ``<col>_index`` values minus one are used to
            # index the column - confirm this is the intended row selection.
            idx = group.get(idxname)[:]
            data = dset[idx-1]
        else:
            data = dset[:]

        if len(data) == 0:
            # Nothing to inspect: ``data[0]`` below would raise IndexError
            items[col] = data.tolist()
            continue

        # Handle typing inside of list
        if isinstance(data[0], bytes):
            data = data.astype('unicode')
        if isinstance(data[0], str):
            # lists and other compound data types can get flattened out to strings when stored
            # so we try and literal eval and recover them
            try:
                eval_type = type(ast.literal_eval(data[0]))
            except (ValueError, SyntaxError):
                eval_type = str

            # if we've found one of those, get the data type within it.
            if eval_type is not str:
                eval_list = []
                for item in data.tolist():
                    try:
                        eval_list.append(ast.literal_eval(item))
                    except (ValueError, SyntaxError):
                        # literal_eval raises SyntaxError as well as ValueError
                        # on malformed text; treat both as unparseable
                        eval_list.append(None)
                data = eval_list
        elif isinstance(data[0], h5py.h5r.Reference):
            data = [HDF5_Path(group[d].name) for d in data]
        elif isinstance(data[0], tuple) and any([isinstance(d, h5py.h5r.Reference) for d in data[0]]):
            # references stored inside a tuple, reference + location.
            # dereference them!?
            names = dset.dtype.names
            if names is not None and names[0] == 'idx_start' and names[1] == 'count':
                data = dereference_reference_vector(dset, data)
        else:
            data = data.tolist()

        # After list, check if we need to put this thing inside of
        # another class, as indicated by the enclosing model

        items[col] = data

    return model(hdf5_path = group.name,
                 name = group.name.split('/')[-1],
                 **items)
|
||||
|
||||
|
||||
def dereference_reference_vector(dset: h5py.Dataset, data:Optional[List[Any]]) -> List:
|
||||
"""
|
||||
Given a compound dataset with indices, counts, and object references, dereference to values
|
||||
|
|
|
@ -1,2 +1 @@
|
|||
from nwb_linkml.types.ndarray import NDArray
|
||||
from nwb_linkml.types.df import DataFrame
|
|
@ -1,8 +1,19 @@
|
|||
"""
|
||||
Pydantic models that behave like pandas dataframes
|
||||
|
||||
.. note::
|
||||
|
||||
This is currently unused but kept in place as a stub in case it is worth revisiting in the future.
|
||||
It turned out to be too difficult, for the moment, to make lazy-loading work with dask arrays per column
|
||||
while still keeping pandas-like API intact. In the future we should investigate modifying the
|
||||
:func:`dask.dataframe.read_hdf` function to treat individual hdf5 datasets like columns
|
||||
|
||||
pandas has been removed from dependencies for now, as it is not used elsewhere, but it is
|
||||
left in this module since it is necessary for it to make sense.
|
||||
"""
|
||||
import ast
|
||||
import pdb
|
||||
from typing import List, Any, get_origin, get_args, Union, Optional, Dict
|
||||
from typing import List, Any, get_origin, get_args, Union, Optional, Dict, Type
|
||||
from types import NoneType
|
||||
|
||||
import h5py
|
||||
|
@ -16,6 +27,10 @@ from pydantic import (
|
|||
model_validator
|
||||
)
|
||||
|
||||
from nwb_linkml.maps.hdmf import model_from_dynamictable, dereference_reference_vector
|
||||
from nwb_linkml.types.hdf5 import HDF5_Path
|
||||
|
||||
|
||||
class DataFrame(BaseModel, pd.DataFrame):
|
||||
"""
|
||||
Pydantic model root class that mimics a pandas dataframe.
|
||||
|
@ -116,3 +131,65 @@ class DataFrame(BaseModel, pd.DataFrame):
|
|||
for k, v in out.items()
|
||||
}
|
||||
return nxt(self.__class__(**out))
|
||||
|
||||
|
||||
def dynamictable_to_df(group:h5py.Group,
                       model:Optional[Type[DataFrame]]=None,
                       base:Optional[BaseModel] = None) -> DataFrame:
    """
    Read the columns of a dynamic table group into an instance of ``model``.

    Only fields declared on the model that also exist in ``group`` are read.
    String columns whose first entry parses with :func:`ast.literal_eval` are
    evaluated entry by entry; entries that fail to parse become ``None``.
    Object references are replaced by :class:`HDF5_Path` values.

    Args:
        group: The HDF5 group holding the table.
        model: Model class to instantiate. Built with
            :func:`model_from_dynamictable` when ``None``.
        base: Forwarded to :func:`model_from_dynamictable` when ``model`` is ``None``.

    Returns:
        ``model`` instantiated with ``hdf5_path``, ``name`` and the column data.
    """
    if model is None:
        model = model_from_dynamictable(group, base)

    items = {}
    for col in model.model_fields:
        if col not in group.keys():
            continue

        dset = group.get(col)
        idxname = col + '_index'
        if idxname in group.keys():
            # NOTE(review): the ``<col>_index`` values minus one are used to
            # index the column - confirm this is the intended row selection.
            idx = group.get(idxname)[:]
            data = dset[idx-1]
        else:
            data = dset[:]

        if len(data) == 0:
            # Nothing to inspect: ``data[0]`` below would raise IndexError
            items[col] = data.tolist()
            continue

        # Handle typing inside of list
        if isinstance(data[0], bytes):
            data = data.astype('unicode')
        if isinstance(data[0], str):
            # lists and other compound data types can get flattened out to strings when stored
            # so we try and literal eval and recover them
            try:
                eval_type = type(ast.literal_eval(data[0]))
            except (ValueError, SyntaxError):
                eval_type = str

            # if we've found one of those, get the data type within it.
            if eval_type is not str:
                eval_list = []
                for item in data.tolist():
                    try:
                        eval_list.append(ast.literal_eval(item))
                    except (ValueError, SyntaxError):
                        # literal_eval raises SyntaxError as well as ValueError
                        # on malformed text; treat both as unparseable
                        eval_list.append(None)
                data = eval_list
        elif isinstance(data[0], h5py.h5r.Reference):
            data = [HDF5_Path(group[d].name) for d in data]
        elif isinstance(data[0], tuple) and any([isinstance(d, h5py.h5r.Reference) for d in data[0]]):
            # references stored inside a tuple, reference + location.
            # dereference them!?
            names = dset.dtype.names
            if names is not None and names[0] == 'idx_start' and names[1] == 'count':
                data = dereference_reference_vector(dset, data)
        else:
            data = data.tolist()

        # After list, check if we need to put this thing inside of
        # another class, as indicated by the enclosing model

        items[col] = data

    return model(hdf5_path = group.name,
                 name = group.name.split('/')[-1],
                 **items)
|
||||
|
|
|
@ -3,18 +3,19 @@ import pytest
|
|||
import pandas as pd
|
||||
from pydantic import BaseModel, ValidationError
|
||||
from typing import List, Union, Optional
|
||||
from nwb_linkml.types import DataFrame
|
||||
|
||||
@pytest.mark.skip()
|
||||
def test_df():
|
||||
"""
|
||||
Dataframe class should behave like both a pydantic model and a dataframe
|
||||
"""
|
||||
from nwb_linkml.types.df import DataFrame
|
||||
|
||||
class MyDf(DataFrame):
|
||||
ints: List[int]
|
||||
strings: List[str]
|
||||
multi: List[int | str]
|
||||
opts: Optional[List[int]] = None
|
||||
class MyDf(DataFrame):
|
||||
ints: List[int]
|
||||
strings: List[str]
|
||||
multi: List[int | str]
|
||||
opts: Optional[List[int]] = None
|
||||
|
||||
good_kwargs = {
|
||||
'ints': [1,2,3],
|
||||
|
|
Loading…
Reference in a new issue