Mirror of https://github.com/p2p-ld/nwb-linkml.git, synced 2025-01-09 21:54:27 +00:00

Removing DataFrame type from imports, leaving as stub

parent f682105c1a
commit 9947edfed2

6 changed files with 95 additions and 271 deletions
@@ -40,7 +40,8 @@ intersphinx_mapping = {
     'numpy': ('https://numpy.org/doc/stable/', None),
     'pandas': ('https://pandas.pydata.org/docs/', None),
     'pydantic': ('https://docs.pydantic.dev/latest/', None),
-    'h5py': ('https://docs.h5py.org/en/stable/', None)
+    'h5py': ('https://docs.h5py.org/en/stable/', None),
+    'dask': ('https://docs.dask.org/en/stable/', None)
 }
@@ -1,144 +0,0 @@
-"""
-Just saving a scratch file temporarily where i was trying a different strategy,
-rather than doing one big recursive pass through, try and solve subsections
-of the tree and then piece them together once you have the others done.
-
-sort of working. I think what i need to do is populate the 'depends'
-field more so that at each pass i can work through the items whose dependencies
-have been solved from the bottom up.
-"""
-
-from typing import List
-from nwb_linkml.types.df import DataFrame
-
-class MyDf(DataFrame):
-    ints: List[int]
-
-a = MyDf(ints=[1,2,3])
-
-
-from nwb_linkml.io.hdf5 import HDF5IO
-import h5py
-from typing import NamedTuple, Tuple, Optional
-from nwb_linkml.io.hdf5 import HDF5IO
-from nwb_linkml.maps.hdf5 import H5SourceItem, FlatH5, ReadQueue, flatten_hdf
-from nwb_linkml.providers.schema import SchemaProvider
-from rich import print
-from pydantic import BaseModel
-
-
-class Rank(NamedTuple):
-    has_depends: bool
-    not_leaf: bool
-    not_dataset: bool
-    has_type: bool
-
-
-def sort_flat(item:Tuple[str, H5SourceItem]):
-    return Rank(
-        has_depends=len(item[1].depends)>0,
-        not_leaf = ~item[1].leaf,
-        not_dataset = item[1].h5_type != 'dataset',
-        has_type = 'neurodata_type' in item[1].attrs
-    )
-
-
-def prune_empty(flat: FlatH5) -> FlatH5:
-    """
-    Groups without children or attrs can be removed
-    """
-    deletes = []
-    for k,v in flat.items():
-        if v.leaf and v.h5_type == 'group' and len(v.attrs) == 0:
-            deletes.append(k)
-
-    for k in deletes:
-        del flat[k]
-
-    return flat
-
-
-def resolve_scalars(res: ReadQueue) -> ReadQueue:
-    for path, item in res.queue.copy().items():
-        if item.h5_type == 'group':
-            continue
-        dset = res.h5f.get(path)
-        if dset.shape == ():
-            res.completed[path] = dset[()]
-            res.queue.pop(path)
-    return res
-
-
-def resolve_terminal_arrays(res:ReadQueue) -> ReadQueue:
-    """Terminal arrays can just get loaded as a dict"""
-    for path, item in res.queue.copy().items():
-        if item.h5_type != 'dataset' or not item.leaf or len(item.depends) > 0:
-            continue
-        h5_object = res.h5f.get(path)
-        item_dict = {
-            'name': path.split('/')[-1],
-            'array': h5_object[:],
-            **h5_object.attrs,
-        }
-        res.completed[path] = item_dict
-        res.queue.pop(path)
-    return res
-
-
-def attempt_parentless(res:ReadQueue, provider:SchemaProvider) -> ReadQueue:
-    """Try the groups whose parents have no neurodata type (ie. acquisition)"""
-    for path, item in res.queue.copy().items():
-        if item.h5_type == 'dataset':
-            continue
-        group = res.h5f.get(path)
-        if 'neurodata_type' in group.parent.attrs.keys() or 'neurodata_type' not in group.attrs.keys():
-            continue
-        model = provider.get_class(group.attrs['namespace'], group.attrs['neurodata_type'])
-        res = naive_instantiation(group, model, res)
-    return res
-
-
-def naive_instantiation(element: h5py.Group|h5py.Dataset, model:BaseModel, res:ReadQueue) -> Optional[BaseModel]:
-    """
-    Try to instantiate model with just the attrs and any resolved children
-    """
-    print(element)
-    kwargs = {}
-    kwargs['name'] = element.name.split('/')[-1]
-    for k in element.attrs.keys():
-        try:
-            kwargs[k] = element.attrs[k]
-        except Exception as e:
-            print(f'couldnt load attr: {e}')
-    for key, child in element.items():
-        if child.name in res.completed:
-            kwargs[child.name] = res.completed[child.name]
-
-    kwargs = {k:v for k,v in kwargs.items() if k in model.model_fields.keys()}
-
-    try:
-        instance = model(**kwargs)
-        res.queue.pop(element.name)
-        res.completed[element.name] = instance
-        print('succeeded')
-        return res
-    except Exception as e:
-        print(f'failed: {e}')
-        return res
-
-
-# --------------------------------------------------
-path = '/Users/jonny/Dropbox/lab/p2p_ld/data/nwb/sub-738651046_ses-760693773_probe-769322820_ecephys.nwb'
-
-h5io = HDF5IO(path)
-provider = h5io.make_provider()
-
-h5f = h5py.File(path)
-flat = flatten_hdf(h5f)
-
-flat = prune_empty(flat)
-flat_sorted = dict(sorted(flat.items(), key=sort_flat))
-
-res = ReadQueue(h5f=h5f, queue=flat_sorted.copy())
-
-res = resolve_scalars(res)
-res = resolve_terminal_arrays(res)
-res = attempt_parentless(res, provider)
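The docstring at the top of the removed scratch file describes the intended strategy: flatten the HDF5 file into a queue, then repeatedly resolve only the items whose dependencies have already been completed, working bottom-up. A minimal sketch of that pass loop under those assumptions (the names below are illustrative, not part of the repo's API):

    def resolve_bottom_up(queue: dict, completed: dict, resolve) -> dict:
        # Repeat passes until nothing new can be resolved; each pass handles only
        # the items whose dependencies are already present in `completed`.
        while queue:
            ready = [path for path, item in queue.items()
                     if all(dep in completed for dep in item.depends)]
            if not ready:
                break  # remaining items have unresolved (or circular) dependencies
            for path in ready:
                completed[path] = resolve(queue.pop(path))
        return completed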
@@ -1,82 +1,36 @@
 """
 Mapping functions for handling HDMF classes like DynamicTables
 """
-import pdb
-import warnings
 from typing import List, Type, Optional, Any
-import ast
+import warnings
-from nwb_linkml.types import DataFrame

 import h5py
 from pydantic import create_model, BaseModel
-from nwb_linkml.maps import dtype
 import numpy as np
 from nwb_linkml.types.hdf5 import HDF5_Path
 from nwb_linkml.types.ndarray import NDArray, NDArrayProxy
-from nwb_linkml.annotations import get_inner_types
 import dask.array as da
-import nptyping

-def model_from_dynamictable(group:h5py.Group, base:Optional[BaseModel] = None) -> Type[DataFrame]:
+def model_from_dynamictable(group:h5py.Group, base:Optional[BaseModel] = None) -> Type[BaseModel]:
     """
     Create a pydantic model from a dynamic table
     """
     colnames = group.attrs['colnames']
     types = {}
     for col in colnames:
-        # idxname = col + '_index'
-        # if idxname in group.keys():
-        #     idx = group.get(idxname)[0]
-        #     dset = group.get(col)
-        #     item = dset[idx]
-        # else:
-        #     dset = group.get(col)
-        #     item = dset[0]
-        # # read the first entry to see what we got
-        #
-        # if isinstance(item, bytes):
-        #     item = item.decode('utf-8')
-        # if isinstance(item, str):
-        #     # try to see if this is actually a list or smth encoded as a string
-        #     try:
-        #         item = ast.literal_eval(item)
-        #     except (ValueError, SyntaxError):
-        #         pass
-
-        # Get a nptypes type for the array
-        #pdb.set_trace()
-
-        # type_ = type(item)
-        # type_ = dtype.np_to_python.get(type_, type_)
-        # if type_ is h5py.h5r.Reference:
-        #     #type_ = HDF5_Path
-        #     type_ = 'String'
-        # elif type_ is np.ndarray:
-        #     item: np.ndarray
-        #     type_ = dtype.flat_to_npytyping[item.dtype.name]
-
-        #if type_ is not np.void:
-        #type_ = NDArray[Any, getattr(nptyping, dtype.flat_to_npytyping[item.dtype.name])]
-
-        #nptype = nptyping.typing_.name_per_dtype[group[col].dtype.type]
         nptype = group[col].dtype.type
         if nptype == np.void:
-            # warnings.warn(f"Cant handle numpy void type for column {col} in {group.name}")
+            warnings.warn(f"Cant handle numpy void type for column {col} in {group.name}")
             continue
         type_ = Optional[NDArray[Any, nptype]]

         # FIXME: handling nested column types that appear only in some versions?
         #types[col] = (List[type_ | None], ...)
         types[col] = (type_, None)

-    # if base is None:
-    #     #base = DataFrame
-    #     base = BaseModel
-    # else:
-    #     base = (BaseModel, base)
-    #     #base = (DataFrame, base)

     model = create_model(group.name.split('/')[-1], **types, __base__=base)
     return model
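For orientation, a hedged sketch of how the revised model_from_dynamictable is typically used: open an NWB file, point it at a DynamicTable group, and get back a pydantic model with one optional array field per column. The file name and group path below are examples only, not part of this diff:

    import h5py
    from nwb_linkml.maps.hdmf import model_from_dynamictable

    with h5py.File("example.nwb", "r") as h5f:
        trials = h5f["intervals/trials"]          # any DynamicTable group
        TrialsModel = model_from_dynamictable(trials)
        print(list(TrialsModel.model_fields))     # column names become model fields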
@@ -120,70 +74,6 @@ def dynamictable_to_model(
         **items)


-def dynamictable_to_df(group:h5py.Group,
-                       model:Optional[Type[DataFrame]]=None,
-                       base:Optional[BaseModel] = None) -> DataFrame:
-    if model is None:
-        model = model_from_dynamictable(group, base)
-
-    items = {}
-    for col, col_type in model.model_fields.items():
-        if col not in group.keys():
-            continue
-        idxname = col + '_index'
-        if idxname in group.keys():
-            idx = group.get(idxname)[:]
-            data = group.get(col)[idx-1]
-        else:
-            data = group.get(col)[:]
-
-        # Handle typing inside of list
-        if isinstance(data[0], bytes):
-            data = data.astype('unicode')
-        if isinstance(data[0], str):
-            # lists and other compound data types can get flattened out to strings when stored
-            # so we try and literal eval and recover them
-            try:
-                eval_type = type(ast.literal_eval(data[0]))
-            except (ValueError, SyntaxError):
-                eval_type = str
-
-            # if we've found one of those, get the data type within it.
-            if eval_type is not str:
-                eval_list = []
-                for item in data.tolist():
-                    try:
-                        eval_list.append(ast.literal_eval(item))
-                    except ValueError:
-                        eval_list.append(None)
-                data = eval_list
-        elif isinstance(data[0], h5py.h5r.Reference):
-            data = [HDF5_Path(group[d].name) for d in data]
-        elif isinstance(data[0], tuple) and any([isinstance(d, h5py.h5r.Reference) for d in data[0]]):
-            # references stored inside a tuple, reference + location.
-            # dereference them!?
-            dset = group.get(col)
-            names = dset.dtype.names
-            if names is not None and names[0] == 'idx_start' and names[1] == 'count':
-                data = dereference_reference_vector(dset, data)
-
-        else:
-            data = data.tolist()
-
-        # After list, check if we need to put this thing inside of
-        # another class, as indicated by the enclosing model
-
-        items[col] = data
-
-    return model(hdf5_path = group.name,
-                 name = group.name.split('/')[-1],
-                 **items)
-
-
 def dereference_reference_vector(dset: h5py.Dataset, data:Optional[List[Any]]) -> List:
     """
     Given a compound dataset with indices, counts, and object references, dereference to values
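The docstring cut off by this hunk describes dereferencing a compound "reference vector": each row holds an index, a count, and an object reference into another dataset, as checked by the idx_start/count test in dynamictable_to_df above. A rough sketch of what such a dereference can look like; the 'target' field name and the function name are assumptions for illustration, not the repo's implementation:

    import h5py

    def deref_region_rows(dset: h5py.Dataset) -> list:
        # Follow each row's object reference, then slice out
        # [idx_start : idx_start + count] from the referenced dataset.
        out = []
        for row in dset:
            target = dset.file[row['target']]     # assumed field holding the object reference
            start, count = int(row['idx_start']), int(row['count'])
            out.append(target[start:start + count])
        return out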
@@ -1,2 +1 @@
 from nwb_linkml.types.ndarray import NDArray
-from nwb_linkml.types.df import DataFrame
@@ -1,8 +1,19 @@
 """
 Pydantic models that behave like pandas dataframes

+.. note::
+
+    This is currently unused but kept in place as a stub in case it is worth revisiting in the future.
+    It turned out to be momentarily too difficult to make lazy-loading work with dask arrays per column
+    while still keeping a pandas-like API intact. In the future we should investigate modifying the
+    :func:`dask.dataframe.read_hdf` function to treat individual hdf5 datasets like columns.
+
+    pandas has been removed from dependencies for now, as it is not used elsewhere, but it is
+    left in this module since it is necessary for the module to make sense.
 """
+import ast
 import pdb
-from typing import List, Any, get_origin, get_args, Union, Optional, Dict
+from typing import List, Any, get_origin, get_args, Union, Optional, Dict, Type
 from types import NoneType

 import h5py
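A minimal sketch of the per-column lazy loading the note describes: wrap each HDF5 dataset in a dask array so nothing is read until it is computed. This is illustrative only, not part of the commit, and the group path in the usage comment is just an example:

    import dask.array as da
    import h5py

    def lazy_columns(group: h5py.Group) -> dict:
        # One dask array per child dataset; data stays on disk until .compute()
        return {
            name: da.from_array(dset, chunks="auto")
            for name, dset in group.items()
            if isinstance(dset, h5py.Dataset)
        }

    # usage: slice a column lazily, then materialize only that slice
    # cols = lazy_columns(h5f["intervals/trials"])
    # first_ten = cols["start_time"][:10].compute()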
@@ -16,6 +27,10 @@ from pydantic import (
     model_validator
 )

+from nwb_linkml.maps.hdmf import model_from_dynamictable, dereference_reference_vector
+from nwb_linkml.types.hdf5 import HDF5_Path
+
+
 class DataFrame(BaseModel, pd.DataFrame):
     """
     Pydantic model root class that mimics a pandas dataframe.
@@ -116,3 +131,65 @@ class DataFrame(BaseModel, pd.DataFrame):
             for k, v in out.items()
         }
         return nxt(self.__class__(**out))


+def dynamictable_to_df(group:h5py.Group,
+                       model:Optional[Type[DataFrame]]=None,
+                       base:Optional[BaseModel] = None) -> DataFrame:
+    if model is None:
+        model = model_from_dynamictable(group, base)
+
+    items = {}
+    for col, col_type in model.model_fields.items():
+        if col not in group.keys():
+            continue
+        idxname = col + '_index'
+        if idxname in group.keys():
+            idx = group.get(idxname)[:]
+            data = group.get(col)[idx-1]
+        else:
+            data = group.get(col)[:]
+
+        # Handle typing inside of list
+        if isinstance(data[0], bytes):
+            data = data.astype('unicode')
+        if isinstance(data[0], str):
+            # lists and other compound data types can get flattened out to strings when stored
+            # so we try and literal eval and recover them
+            try:
+                eval_type = type(ast.literal_eval(data[0]))
+            except (ValueError, SyntaxError):
+                eval_type = str
+
+            # if we've found one of those, get the data type within it.
+            if eval_type is not str:
+                eval_list = []
+                for item in data.tolist():
+                    try:
+                        eval_list.append(ast.literal_eval(item))
+                    except ValueError:
+                        eval_list.append(None)
+                data = eval_list
+        elif isinstance(data[0], h5py.h5r.Reference):
+            data = [HDF5_Path(group[d].name) for d in data]
+        elif isinstance(data[0], tuple) and any([isinstance(d, h5py.h5r.Reference) for d in data[0]]):
+            # references stored inside a tuple, reference + location.
+            # dereference them!?
+            dset = group.get(col)
+            names = dset.dtype.names
+            if names is not None and names[0] == 'idx_start' and names[1] == 'count':
+                data = dereference_reference_vector(dset, data)
+
+        else:
+            data = data.tolist()
+
+        # After list, check if we need to put this thing inside of
+        # another class, as indicated by the enclosing model
+
+        items[col] = data
+
+    return model(hdf5_path = group.name,
+                 name = group.name.split('/')[-1],
+                 **items)
@@ -3,18 +3,19 @@ import pytest
 import pandas as pd
 from pydantic import BaseModel, ValidationError
 from typing import List, Union, Optional
-from nwb_linkml.types import DataFrame

+@pytest.mark.skip()
 def test_df():
     """
     Dataframe class should behave like both a pydantic model and a dataframe
     """
+    from nwb_linkml.types.df import DataFrame
+
     class MyDf(DataFrame):
         ints: List[int]
         strings: List[str]
         multi: List[int | str]
         opts: Optional[List[int]] = None

     good_kwargs = {
         'ints': [1,2,3],