diff --git a/nwb_linkml/src/nwb_linkml/models/pydantic/hdmf_common/v1_1_0/hdmf_common_table.py b/nwb_linkml/src/nwb_linkml/models/pydantic/hdmf_common/v1_1_0/hdmf_common_table.py index e37cea0..f571cc5 100644 --- a/nwb_linkml/src/nwb_linkml/models/pydantic/hdmf_common/v1_1_0/hdmf_common_table.py +++ b/nwb_linkml/src/nwb_linkml/models/pydantic/hdmf_common/v1_1_0/hdmf_common_table.py @@ -148,31 +148,62 @@ class VectorIndexMixin(BaseModel, Generic[T]): kwargs["value"] = value super().__init__(**kwargs) - def _getitem_helper(self, arg: int) -> Union[list, NDArray]: + def _slice(self, arg: int) -> slice: """ Mimicking :func:`hdmf.common.table.VectorIndex.__getitem_helper` """ start = 0 if arg == 0 else self.value[arg - 1] end = self.value[arg] - return self.target.value[slice(start, end)] + return slice(start, end) def __getitem__(self, item: Union[int, slice, Iterable]) -> Any: if self.target is None: return self.value[item] else: if isinstance(item, (int, np.integer)): - return self._getitem_helper(item) + return self.target.value[self._slice(item)] elif isinstance(item, (slice, Iterable)): if isinstance(item, slice): item = range(*item.indices(len(self.value))) - return [self._getitem_helper(i) for i in item] - else: + return [self.target.value[self._slice(i)] for i in item] + else: # pragma: no cover raise AttributeError(f"Could not index with {item}") def __setitem__(self, key: Union[int, slice], value: Any) -> None: - if self._index: - # VectorIndex is the thing that knows how to do the slicing - self._index[key] = value + """ + Set a value on the :attr:`.target` . + + .. note:: + + Even though we correct the indexing logic from HDMF where the + _data_ is the thing that is provided by the API when one accesses + table.data (rather than table.data_index as hdmf does), + we will set to the target here (rather than to the index) + to be consistent. To modify the index, modify `self.value` directly + + """ + if self.target: + if isinstance(key, (int, np.integer)): + self.target.value[self._slice(key)] = value + elif isinstance(key, (slice, Iterable)): + if isinstance(key, slice): + key = range(*key.indices(len(self.value))) + + if isinstance(value, Iterable): + if len(key) != len(value): + raise ValueError( + "Can only assign equal-length iterable to a slice, manually index the" + " ragged values of of the target VectorData object if you need more" + " control" + ) + for i, subval in zip(key, value): + self.target.value[self._slice(i)] = subval + else: + for i in key: + self.target.value[self._slice(i)] = value + else: # pragma: no cover + raise AttributeError(f"Could not index with {key}") + else: self.value[key] = value @@ -203,9 +234,17 @@ class DynamicTableRegionMixin(BaseModel): _index: Optional["VectorIndex"] = None table: "DynamicTableMixin" - value: Optional[NDArray] = None + value: Optional[NDArray[Shape["*"], int]] = None - def __getitem__(self, item: Union[int, slice, Iterable]) -> Any: + @overload + def __getitem__(self, item: int) -> pd.DataFrame: ... + + @overload + def __getitem__(self, item: Union[slice, Iterable]) -> List[pd.DataFrame]: ... + + def __getitem__( + self, item: Union[int, slice, Iterable] + ) -> Union[pd.DataFrame, List[pd.DataFrame]]: """ Use ``value`` to index the table. Works analogously to ``VectorIndex`` despite this being a subclass of ``VectorData`` @@ -220,20 +259,27 @@ class DynamicTableRegionMixin(BaseModel): # so we index table with an array to construct # a list of lists of rows return [self.table[idx] for idx in self._index[item]] - else: + else: # pragma: no cover raise ValueError(f"Dont know how to index with {item}, need an int or a slice") else: if isinstance(item, (int, np.integer)): return self.table[self.value[item]] elif isinstance(item, (slice, Iterable)): + # Return a list of dataframe rows because this is most often used + # as a column in a DynamicTable, so while it would normally be + # ideal to just return the slice as above as a single df, + # we need each row to be separate to fill the column if isinstance(item, slice): item = range(*item.indices(len(self.value))) return [self.table[self.value[i]] for i in item] - else: + else: # pragma: no cover raise ValueError(f"Dont know how to index with {item}, need an int or a slice") def __setitem__(self, key: Union[int, str, slice], value: Any) -> None: - self.table[self.value[key]] = value + # self.table[self.value[key]] = value + raise NotImplementedError( + "Assigning values to tables is not implemented yet!" + ) # pragma: no cover class DynamicTableMixin(BaseModel): @@ -244,9 +290,10 @@ class DynamicTableMixin(BaseModel): but simplifying along the way :) """ - model_config = ConfigDict(extra="allow") + model_config = ConfigDict(extra="allow", validate_assignment=True) __pydantic_extra__: Dict[str, Union["VectorDataMixin", "VectorIndexMixin", "NDArray", list]] NON_COLUMN_FIELDS: ClassVar[tuple[str]] = ( + "id", "name", "colnames", "description", @@ -260,10 +307,6 @@ class DynamicTableMixin(BaseModel): def _columns(self) -> Dict[str, Union[list, "NDArray", "VectorDataMixin"]]: return {k: getattr(self, k) for i, k in enumerate(self.colnames)} - @property - def _columns_list(self) -> List[Union[list, "NDArray", "VectorDataMixin"]]: - return [getattr(self, k) for i, k in enumerate(self.colnames)] - @overload def __getitem__(self, item: str) -> Union[list, "NDArray", "VectorDataMixin"]: ... @@ -312,6 +355,7 @@ class DynamicTableMixin(BaseModel): return self._columns[item] if isinstance(item, (int, slice, np.integer, np.ndarray)): data = self._slice_range(item) + index = self.id[item] elif isinstance(item, tuple): if len(item) != 2: raise ValueError( @@ -329,11 +373,15 @@ class DynamicTableMixin(BaseModel): return self._columns[cols][rows] data = self._slice_range(rows, cols) + index = self.id[rows] else: raise ValueError(f"Unsure how to get item with key {item}") # cast to DF - return pd.DataFrame(data) + if not isinstance(index, Iterable): + index = [index] + index = pd.Index(data=index) + return pd.DataFrame(data, index=index) def _slice_range( self, rows: Union[int, slice, np.ndarray], cols: Optional[Union[str, List[str]]] = None @@ -345,31 +393,40 @@ class DynamicTableMixin(BaseModel): data = {} for k in cols: if isinstance(rows, np.ndarray): + # help wanted - this is probably cr*zy slow val = [self._columns[k][i] for i in rows] else: val = self._columns[k][rows] # scalars need to be wrapped in series for pandas + # do this by the iterability of the rows index not the value because + # we want all lengths from this method to be equal, and if the rows are + # scalar, that means length == 1 if not isinstance(rows, (Iterable, slice)): - val = pd.Series([val]) + val = [val] data[k] = val return data def __setitem__(self, key: str, value: Any) -> None: - raise NotImplementedError("TODO") + raise NotImplementedError("TODO") # pragma: no cover def __setattr__(self, key: str, value: Union[list, "NDArray", "VectorData"]): """ Add a column, appending it to ``colnames`` """ # don't use this while building the model - if not getattr(self, "__pydantic_complete__", False): + if not getattr(self, "__pydantic_complete__", False): # pragma: no cover return super().__setattr__(key, value) if key not in self.model_fields_set and not key.endswith("_index"): self.colnames.append(key) + # we get a recursion error if we setattr without having first added to + # extras if we need it to be there + if key not in self.model_fields and key not in self.__pydantic_extra__: + self.__pydantic_extra__[key] = value + return super().__setattr__(key, value) def __getattr__(self, item: str) -> Any: @@ -396,6 +453,8 @@ class DynamicTableMixin(BaseModel): """ Create ID column if not provided """ + if not isinstance(model, dict): + return model if "id" not in model: lengths = [] for key, val in model.items(): @@ -420,6 +479,8 @@ class DynamicTableMixin(BaseModel): the model dict is ordered after python3.6, so we can use that minus anything in :attr:`.NON_COLUMN_FIELDS` to determine order implied from passage order """ + if not isinstance(model, dict): + return model if "colnames" not in model: colnames = [ k @@ -455,19 +516,21 @@ class DynamicTableMixin(BaseModel): See :meth:`.cast_specified_columns` for handling columns in the class specification """ # if columns are not in the specification, cast to a generic VectorData - for key, val in model.items(): - if key in cls.model_fields: - continue - if not isinstance(val, (VectorData, VectorIndex)): - try: - if key.endswith("_index"): - model[key] = VectorIndex(name=key, description="", value=val) - else: - model[key] = VectorData(name=key, description="", value=val) - except ValidationError as e: - raise ValidationError( - f"field {key} cannot be cast to VectorData from {val}" - ) from e + + if isinstance(model, dict): + for key, val in model.items(): + if key in cls.model_fields: + continue + if not isinstance(val, (VectorData, VectorIndex)): + try: + if key.endswith("_index"): + model[key] = VectorIndex(name=key, description="", value=val) + else: + model[key] = VectorData(name=key, description="", value=val) + except ValidationError as e: # pragma: no cover + raise ValidationError( + f"field {key} cannot be cast to VectorData from {val}" + ) from e return model @model_validator(mode="after") @@ -499,8 +562,8 @@ class DynamicTableMixin(BaseModel): """ Ensure that all columns are equal length """ - lengths = [len(v) for v in self._columns.values()] - assert [length == lengths[0] for length in lengths], ( + lengths = [len(v) for v in self._columns.values()] + [len(self.id)] + assert all([length == lengths[0] for length in lengths]), ( "Columns are not of equal length! " f"Got colnames:\n{self.colnames}\nand lengths: {lengths}" ) @@ -536,15 +599,19 @@ class DynamicTableMixin(BaseModel): ) ) except Exception: - raise e + raise e from None -class AlignedDynamicTableMixin(DynamicTableMixin): +class AlignedDynamicTableMixin(BaseModel): """ Mixin to allow indexing multiple tables that are aligned on a common ID + + A great deal of code duplication because we need to avoid diamond inheritance + and also it's not so easy to copy a pydantic validator method. """ - __pydantic_extra__: Dict[str, "DynamicTableMixin"] + model_config = ConfigDict(extra="allow", validate_assignment=True) + __pydantic_extra__: Dict[str, Union["DynamicTableMixin", "VectorDataMixin", "VectorIndexMixin"]] NON_CATEGORY_FIELDS: ClassVar[tuple[str]] = ( "name", @@ -562,7 +629,7 @@ class AlignedDynamicTableMixin(DynamicTableMixin): return {k: getattr(self, k) for i, k in enumerate(self.categories)} def __getitem__( - self, item: Union[int, str, slice, Tuple[Union[int, slice], str]] + self, item: Union[int, str, slice, NDArray[Shape["*"], int], Tuple[Union[int, slice], str]] ) -> pd.DataFrame: """ Mimic hdmf: @@ -580,25 +647,78 @@ class AlignedDynamicTableMixin(DynamicTableMixin): elif isinstance(item, tuple) and len(item) == 2 and isinstance(item[1], str): # get a slice of a single table return self._categories[item[1]][item[0]] - elif isinstance(item, (int, slice)): + elif isinstance(item, (int, slice, Iterable)): # get a slice of all the tables ids = self.id[item] if not isinstance(ids, Iterable): ids = pd.Series([ids]) ids = pd.DataFrame({"id": ids}) - tables = [ids] + [table[item].reset_index() for table in self._categories.values()] + tables = [ids] + for category_name, category in self._categories.items(): + table = category[item] + if isinstance(table, pd.DataFrame): + table = table.reset_index() + elif isinstance(table, np.ndarray): + table = pd.DataFrame({category_name: [table]}) + elif isinstance(table, Iterable): + table = pd.DataFrame({category_name: table}) + else: + raise ValueError( + f"Don't know how to construct category table for {category_name}" + ) + tables.append(table) + names = [self.name] + self.categories # construct below in case we need to support array indexing in the future else: raise ValueError( f"Dont know how to index with {item}, " - "need an int, string, slice, or tuple[int | slice, str]" + "need an int, string, slice, ndarray, or tuple[int | slice, str]" ) df = pd.concat(tables, axis=1, keys=names) df.set_index((self.name, "id"), drop=True, inplace=True) return df + def __getattr__(self, item: str) -> Any: + """Try and use pandas df attrs if we don't have them""" + try: + return BaseModel.__getattr__(self, item) + except AttributeError as e: + try: + return getattr(self[:], item) + except AttributeError: + raise e from None + + def __len__(self) -> int: + """ + Use the id column to determine length. + + If the id column doesn't represent length accurately, it's a bug + """ + return len(self.id) + + @model_validator(mode="before") + @classmethod + def create_id(cls, model: Dict[str, Any]) -> Dict: + """ + Create ID column if not provided + """ + if "id" not in model: + lengths = [] + for key, val in model.items(): + # don't get lengths of columns with an index + if ( + f"{key}_index" in model + or (isinstance(val, VectorData) and val._index) + or key in cls.NON_CATEGORY_FIELDS + ): + continue + lengths.append(len(val)) + model["id"] = np.arange(np.max(lengths)) + + return model + @model_validator(mode="before") @classmethod def create_categories(cls, model: Dict[str, Any]) -> Dict: @@ -625,6 +745,42 @@ class AlignedDynamicTableMixin(DynamicTableMixin): model["categories"].extend(categories) return model + @model_validator(mode="after") + def resolve_targets(self) -> "DynamicTableMixin": + """ + Ensure that any implicitly indexed columns are linked, and create backlinks + """ + for key, col in self._categories.items(): + if isinstance(col, VectorData): + # find an index + idx = None + for field_name in self.model_fields_set: + if field_name in self.NON_CATEGORY_FIELDS or field_name == key: + continue + # implicit name-based index + field = getattr(self, field_name) + if isinstance(field, VectorIndex) and ( + field_name == f"{key}_index" or field.target is col + ): + idx = field + break + if idx is not None: + col._index = idx + idx.target = col + return self + + @model_validator(mode="after") + def ensure_equal_length_cols(self) -> "DynamicTableMixin": + """ + Ensure that all columns are equal length + """ + lengths = [len(v) for v in self._categories.values()] + [len(self.id)] + assert all([length == lengths[0] for length in lengths]), ( + "Columns are not of equal length! " + f"Got colnames:\n{self.categories}\nand lengths: {lengths}" + ) + return self + linkml_meta = LinkMLMeta( { diff --git a/nwb_linkml/src/nwb_linkml/models/pydantic/hdmf_common/v1_1_2/hdmf_common_table.py b/nwb_linkml/src/nwb_linkml/models/pydantic/hdmf_common/v1_1_2/hdmf_common_table.py index 128e62f..17128ad 100644 --- a/nwb_linkml/src/nwb_linkml/models/pydantic/hdmf_common/v1_1_2/hdmf_common_table.py +++ b/nwb_linkml/src/nwb_linkml/models/pydantic/hdmf_common/v1_1_2/hdmf_common_table.py @@ -148,31 +148,62 @@ class VectorIndexMixin(BaseModel, Generic[T]): kwargs["value"] = value super().__init__(**kwargs) - def _getitem_helper(self, arg: int) -> Union[list, NDArray]: + def _slice(self, arg: int) -> slice: """ Mimicking :func:`hdmf.common.table.VectorIndex.__getitem_helper` """ start = 0 if arg == 0 else self.value[arg - 1] end = self.value[arg] - return self.target.value[slice(start, end)] + return slice(start, end) def __getitem__(self, item: Union[int, slice, Iterable]) -> Any: if self.target is None: return self.value[item] else: if isinstance(item, (int, np.integer)): - return self._getitem_helper(item) + return self.target.value[self._slice(item)] elif isinstance(item, (slice, Iterable)): if isinstance(item, slice): item = range(*item.indices(len(self.value))) - return [self._getitem_helper(i) for i in item] - else: + return [self.target.value[self._slice(i)] for i in item] + else: # pragma: no cover raise AttributeError(f"Could not index with {item}") def __setitem__(self, key: Union[int, slice], value: Any) -> None: - if self._index: - # VectorIndex is the thing that knows how to do the slicing - self._index[key] = value + """ + Set a value on the :attr:`.target` . + + .. note:: + + Even though we correct the indexing logic from HDMF where the + _data_ is the thing that is provided by the API when one accesses + table.data (rather than table.data_index as hdmf does), + we will set to the target here (rather than to the index) + to be consistent. To modify the index, modify `self.value` directly + + """ + if self.target: + if isinstance(key, (int, np.integer)): + self.target.value[self._slice(key)] = value + elif isinstance(key, (slice, Iterable)): + if isinstance(key, slice): + key = range(*key.indices(len(self.value))) + + if isinstance(value, Iterable): + if len(key) != len(value): + raise ValueError( + "Can only assign equal-length iterable to a slice, manually index the" + " ragged values of of the target VectorData object if you need more" + " control" + ) + for i, subval in zip(key, value): + self.target.value[self._slice(i)] = subval + else: + for i in key: + self.target.value[self._slice(i)] = value + else: # pragma: no cover + raise AttributeError(f"Could not index with {key}") + else: self.value[key] = value @@ -203,9 +234,17 @@ class DynamicTableRegionMixin(BaseModel): _index: Optional["VectorIndex"] = None table: "DynamicTableMixin" - value: Optional[NDArray] = None + value: Optional[NDArray[Shape["*"], int]] = None - def __getitem__(self, item: Union[int, slice, Iterable]) -> Any: + @overload + def __getitem__(self, item: int) -> pd.DataFrame: ... + + @overload + def __getitem__(self, item: Union[slice, Iterable]) -> List[pd.DataFrame]: ... + + def __getitem__( + self, item: Union[int, slice, Iterable] + ) -> Union[pd.DataFrame, List[pd.DataFrame]]: """ Use ``value`` to index the table. Works analogously to ``VectorIndex`` despite this being a subclass of ``VectorData`` @@ -220,20 +259,27 @@ class DynamicTableRegionMixin(BaseModel): # so we index table with an array to construct # a list of lists of rows return [self.table[idx] for idx in self._index[item]] - else: + else: # pragma: no cover raise ValueError(f"Dont know how to index with {item}, need an int or a slice") else: if isinstance(item, (int, np.integer)): return self.table[self.value[item]] elif isinstance(item, (slice, Iterable)): + # Return a list of dataframe rows because this is most often used + # as a column in a DynamicTable, so while it would normally be + # ideal to just return the slice as above as a single df, + # we need each row to be separate to fill the column if isinstance(item, slice): item = range(*item.indices(len(self.value))) return [self.table[self.value[i]] for i in item] - else: + else: # pragma: no cover raise ValueError(f"Dont know how to index with {item}, need an int or a slice") def __setitem__(self, key: Union[int, str, slice], value: Any) -> None: - self.table[self.value[key]] = value + # self.table[self.value[key]] = value + raise NotImplementedError( + "Assigning values to tables is not implemented yet!" + ) # pragma: no cover class DynamicTableMixin(BaseModel): @@ -244,9 +290,10 @@ class DynamicTableMixin(BaseModel): but simplifying along the way :) """ - model_config = ConfigDict(extra="allow") + model_config = ConfigDict(extra="allow", validate_assignment=True) __pydantic_extra__: Dict[str, Union["VectorDataMixin", "VectorIndexMixin", "NDArray", list]] NON_COLUMN_FIELDS: ClassVar[tuple[str]] = ( + "id", "name", "colnames", "description", @@ -260,10 +307,6 @@ class DynamicTableMixin(BaseModel): def _columns(self) -> Dict[str, Union[list, "NDArray", "VectorDataMixin"]]: return {k: getattr(self, k) for i, k in enumerate(self.colnames)} - @property - def _columns_list(self) -> List[Union[list, "NDArray", "VectorDataMixin"]]: - return [getattr(self, k) for i, k in enumerate(self.colnames)] - @overload def __getitem__(self, item: str) -> Union[list, "NDArray", "VectorDataMixin"]: ... @@ -312,6 +355,7 @@ class DynamicTableMixin(BaseModel): return self._columns[item] if isinstance(item, (int, slice, np.integer, np.ndarray)): data = self._slice_range(item) + index = self.id[item] elif isinstance(item, tuple): if len(item) != 2: raise ValueError( @@ -329,11 +373,15 @@ class DynamicTableMixin(BaseModel): return self._columns[cols][rows] data = self._slice_range(rows, cols) + index = self.id[rows] else: raise ValueError(f"Unsure how to get item with key {item}") # cast to DF - return pd.DataFrame(data) + if not isinstance(index, Iterable): + index = [index] + index = pd.Index(data=index) + return pd.DataFrame(data, index=index) def _slice_range( self, rows: Union[int, slice, np.ndarray], cols: Optional[Union[str, List[str]]] = None @@ -345,31 +393,40 @@ class DynamicTableMixin(BaseModel): data = {} for k in cols: if isinstance(rows, np.ndarray): + # help wanted - this is probably cr*zy slow val = [self._columns[k][i] for i in rows] else: val = self._columns[k][rows] # scalars need to be wrapped in series for pandas + # do this by the iterability of the rows index not the value because + # we want all lengths from this method to be equal, and if the rows are + # scalar, that means length == 1 if not isinstance(rows, (Iterable, slice)): - val = pd.Series([val]) + val = [val] data[k] = val return data def __setitem__(self, key: str, value: Any) -> None: - raise NotImplementedError("TODO") + raise NotImplementedError("TODO") # pragma: no cover def __setattr__(self, key: str, value: Union[list, "NDArray", "VectorData"]): """ Add a column, appending it to ``colnames`` """ # don't use this while building the model - if not getattr(self, "__pydantic_complete__", False): + if not getattr(self, "__pydantic_complete__", False): # pragma: no cover return super().__setattr__(key, value) if key not in self.model_fields_set and not key.endswith("_index"): self.colnames.append(key) + # we get a recursion error if we setattr without having first added to + # extras if we need it to be there + if key not in self.model_fields and key not in self.__pydantic_extra__: + self.__pydantic_extra__[key] = value + return super().__setattr__(key, value) def __getattr__(self, item: str) -> Any: @@ -396,6 +453,8 @@ class DynamicTableMixin(BaseModel): """ Create ID column if not provided """ + if not isinstance(model, dict): + return model if "id" not in model: lengths = [] for key, val in model.items(): @@ -420,6 +479,8 @@ class DynamicTableMixin(BaseModel): the model dict is ordered after python3.6, so we can use that minus anything in :attr:`.NON_COLUMN_FIELDS` to determine order implied from passage order """ + if not isinstance(model, dict): + return model if "colnames" not in model: colnames = [ k @@ -455,19 +516,21 @@ class DynamicTableMixin(BaseModel): See :meth:`.cast_specified_columns` for handling columns in the class specification """ # if columns are not in the specification, cast to a generic VectorData - for key, val in model.items(): - if key in cls.model_fields: - continue - if not isinstance(val, (VectorData, VectorIndex)): - try: - if key.endswith("_index"): - model[key] = VectorIndex(name=key, description="", value=val) - else: - model[key] = VectorData(name=key, description="", value=val) - except ValidationError as e: - raise ValidationError( - f"field {key} cannot be cast to VectorData from {val}" - ) from e + + if isinstance(model, dict): + for key, val in model.items(): + if key in cls.model_fields: + continue + if not isinstance(val, (VectorData, VectorIndex)): + try: + if key.endswith("_index"): + model[key] = VectorIndex(name=key, description="", value=val) + else: + model[key] = VectorData(name=key, description="", value=val) + except ValidationError as e: # pragma: no cover + raise ValidationError( + f"field {key} cannot be cast to VectorData from {val}" + ) from e return model @model_validator(mode="after") @@ -499,8 +562,8 @@ class DynamicTableMixin(BaseModel): """ Ensure that all columns are equal length """ - lengths = [len(v) for v in self._columns.values()] - assert [length == lengths[0] for length in lengths], ( + lengths = [len(v) for v in self._columns.values()] + [len(self.id)] + assert all([length == lengths[0] for length in lengths]), ( "Columns are not of equal length! " f"Got colnames:\n{self.colnames}\nand lengths: {lengths}" ) @@ -536,15 +599,19 @@ class DynamicTableMixin(BaseModel): ) ) except Exception: - raise e + raise e from None -class AlignedDynamicTableMixin(DynamicTableMixin): +class AlignedDynamicTableMixin(BaseModel): """ Mixin to allow indexing multiple tables that are aligned on a common ID + + A great deal of code duplication because we need to avoid diamond inheritance + and also it's not so easy to copy a pydantic validator method. """ - __pydantic_extra__: Dict[str, "DynamicTableMixin"] + model_config = ConfigDict(extra="allow", validate_assignment=True) + __pydantic_extra__: Dict[str, Union["DynamicTableMixin", "VectorDataMixin", "VectorIndexMixin"]] NON_CATEGORY_FIELDS: ClassVar[tuple[str]] = ( "name", @@ -562,7 +629,7 @@ class AlignedDynamicTableMixin(DynamicTableMixin): return {k: getattr(self, k) for i, k in enumerate(self.categories)} def __getitem__( - self, item: Union[int, str, slice, Tuple[Union[int, slice], str]] + self, item: Union[int, str, slice, NDArray[Shape["*"], int], Tuple[Union[int, slice], str]] ) -> pd.DataFrame: """ Mimic hdmf: @@ -580,25 +647,78 @@ class AlignedDynamicTableMixin(DynamicTableMixin): elif isinstance(item, tuple) and len(item) == 2 and isinstance(item[1], str): # get a slice of a single table return self._categories[item[1]][item[0]] - elif isinstance(item, (int, slice)): + elif isinstance(item, (int, slice, Iterable)): # get a slice of all the tables ids = self.id[item] if not isinstance(ids, Iterable): ids = pd.Series([ids]) ids = pd.DataFrame({"id": ids}) - tables = [ids] + [table[item].reset_index() for table in self._categories.values()] + tables = [ids] + for category_name, category in self._categories.items(): + table = category[item] + if isinstance(table, pd.DataFrame): + table = table.reset_index() + elif isinstance(table, np.ndarray): + table = pd.DataFrame({category_name: [table]}) + elif isinstance(table, Iterable): + table = pd.DataFrame({category_name: table}) + else: + raise ValueError( + f"Don't know how to construct category table for {category_name}" + ) + tables.append(table) + names = [self.name] + self.categories # construct below in case we need to support array indexing in the future else: raise ValueError( f"Dont know how to index with {item}, " - "need an int, string, slice, or tuple[int | slice, str]" + "need an int, string, slice, ndarray, or tuple[int | slice, str]" ) df = pd.concat(tables, axis=1, keys=names) df.set_index((self.name, "id"), drop=True, inplace=True) return df + def __getattr__(self, item: str) -> Any: + """Try and use pandas df attrs if we don't have them""" + try: + return BaseModel.__getattr__(self, item) + except AttributeError as e: + try: + return getattr(self[:], item) + except AttributeError: + raise e from None + + def __len__(self) -> int: + """ + Use the id column to determine length. + + If the id column doesn't represent length accurately, it's a bug + """ + return len(self.id) + + @model_validator(mode="before") + @classmethod + def create_id(cls, model: Dict[str, Any]) -> Dict: + """ + Create ID column if not provided + """ + if "id" not in model: + lengths = [] + for key, val in model.items(): + # don't get lengths of columns with an index + if ( + f"{key}_index" in model + or (isinstance(val, VectorData) and val._index) + or key in cls.NON_CATEGORY_FIELDS + ): + continue + lengths.append(len(val)) + model["id"] = np.arange(np.max(lengths)) + + return model + @model_validator(mode="before") @classmethod def create_categories(cls, model: Dict[str, Any]) -> Dict: @@ -625,6 +745,42 @@ class AlignedDynamicTableMixin(DynamicTableMixin): model["categories"].extend(categories) return model + @model_validator(mode="after") + def resolve_targets(self) -> "DynamicTableMixin": + """ + Ensure that any implicitly indexed columns are linked, and create backlinks + """ + for key, col in self._categories.items(): + if isinstance(col, VectorData): + # find an index + idx = None + for field_name in self.model_fields_set: + if field_name in self.NON_CATEGORY_FIELDS or field_name == key: + continue + # implicit name-based index + field = getattr(self, field_name) + if isinstance(field, VectorIndex) and ( + field_name == f"{key}_index" or field.target is col + ): + idx = field + break + if idx is not None: + col._index = idx + idx.target = col + return self + + @model_validator(mode="after") + def ensure_equal_length_cols(self) -> "DynamicTableMixin": + """ + Ensure that all columns are equal length + """ + lengths = [len(v) for v in self._categories.values()] + [len(self.id)] + assert all([length == lengths[0] for length in lengths]), ( + "Columns are not of equal length! " + f"Got colnames:\n{self.categories}\nand lengths: {lengths}" + ) + return self + linkml_meta = LinkMLMeta( { diff --git a/nwb_linkml/src/nwb_linkml/models/pydantic/hdmf_common/v1_1_3/hdmf_common_table.py b/nwb_linkml/src/nwb_linkml/models/pydantic/hdmf_common/v1_1_3/hdmf_common_table.py index 12b84b1..8b25e7d 100644 --- a/nwb_linkml/src/nwb_linkml/models/pydantic/hdmf_common/v1_1_3/hdmf_common_table.py +++ b/nwb_linkml/src/nwb_linkml/models/pydantic/hdmf_common/v1_1_3/hdmf_common_table.py @@ -148,31 +148,62 @@ class VectorIndexMixin(BaseModel, Generic[T]): kwargs["value"] = value super().__init__(**kwargs) - def _getitem_helper(self, arg: int) -> Union[list, NDArray]: + def _slice(self, arg: int) -> slice: """ Mimicking :func:`hdmf.common.table.VectorIndex.__getitem_helper` """ start = 0 if arg == 0 else self.value[arg - 1] end = self.value[arg] - return self.target.value[slice(start, end)] + return slice(start, end) def __getitem__(self, item: Union[int, slice, Iterable]) -> Any: if self.target is None: return self.value[item] else: if isinstance(item, (int, np.integer)): - return self._getitem_helper(item) + return self.target.value[self._slice(item)] elif isinstance(item, (slice, Iterable)): if isinstance(item, slice): item = range(*item.indices(len(self.value))) - return [self._getitem_helper(i) for i in item] - else: + return [self.target.value[self._slice(i)] for i in item] + else: # pragma: no cover raise AttributeError(f"Could not index with {item}") def __setitem__(self, key: Union[int, slice], value: Any) -> None: - if self._index: - # VectorIndex is the thing that knows how to do the slicing - self._index[key] = value + """ + Set a value on the :attr:`.target` . + + .. note:: + + Even though we correct the indexing logic from HDMF where the + _data_ is the thing that is provided by the API when one accesses + table.data (rather than table.data_index as hdmf does), + we will set to the target here (rather than to the index) + to be consistent. To modify the index, modify `self.value` directly + + """ + if self.target: + if isinstance(key, (int, np.integer)): + self.target.value[self._slice(key)] = value + elif isinstance(key, (slice, Iterable)): + if isinstance(key, slice): + key = range(*key.indices(len(self.value))) + + if isinstance(value, Iterable): + if len(key) != len(value): + raise ValueError( + "Can only assign equal-length iterable to a slice, manually index the" + " ragged values of of the target VectorData object if you need more" + " control" + ) + for i, subval in zip(key, value): + self.target.value[self._slice(i)] = subval + else: + for i in key: + self.target.value[self._slice(i)] = value + else: # pragma: no cover + raise AttributeError(f"Could not index with {key}") + else: self.value[key] = value @@ -203,9 +234,17 @@ class DynamicTableRegionMixin(BaseModel): _index: Optional["VectorIndex"] = None table: "DynamicTableMixin" - value: Optional[NDArray] = None + value: Optional[NDArray[Shape["*"], int]] = None - def __getitem__(self, item: Union[int, slice, Iterable]) -> Any: + @overload + def __getitem__(self, item: int) -> pd.DataFrame: ... + + @overload + def __getitem__(self, item: Union[slice, Iterable]) -> List[pd.DataFrame]: ... + + def __getitem__( + self, item: Union[int, slice, Iterable] + ) -> Union[pd.DataFrame, List[pd.DataFrame]]: """ Use ``value`` to index the table. Works analogously to ``VectorIndex`` despite this being a subclass of ``VectorData`` @@ -220,20 +259,27 @@ class DynamicTableRegionMixin(BaseModel): # so we index table with an array to construct # a list of lists of rows return [self.table[idx] for idx in self._index[item]] - else: + else: # pragma: no cover raise ValueError(f"Dont know how to index with {item}, need an int or a slice") else: if isinstance(item, (int, np.integer)): return self.table[self.value[item]] elif isinstance(item, (slice, Iterable)): + # Return a list of dataframe rows because this is most often used + # as a column in a DynamicTable, so while it would normally be + # ideal to just return the slice as above as a single df, + # we need each row to be separate to fill the column if isinstance(item, slice): item = range(*item.indices(len(self.value))) return [self.table[self.value[i]] for i in item] - else: + else: # pragma: no cover raise ValueError(f"Dont know how to index with {item}, need an int or a slice") def __setitem__(self, key: Union[int, str, slice], value: Any) -> None: - self.table[self.value[key]] = value + # self.table[self.value[key]] = value + raise NotImplementedError( + "Assigning values to tables is not implemented yet!" + ) # pragma: no cover class DynamicTableMixin(BaseModel): @@ -244,9 +290,10 @@ class DynamicTableMixin(BaseModel): but simplifying along the way :) """ - model_config = ConfigDict(extra="allow") + model_config = ConfigDict(extra="allow", validate_assignment=True) __pydantic_extra__: Dict[str, Union["VectorDataMixin", "VectorIndexMixin", "NDArray", list]] NON_COLUMN_FIELDS: ClassVar[tuple[str]] = ( + "id", "name", "colnames", "description", @@ -260,10 +307,6 @@ class DynamicTableMixin(BaseModel): def _columns(self) -> Dict[str, Union[list, "NDArray", "VectorDataMixin"]]: return {k: getattr(self, k) for i, k in enumerate(self.colnames)} - @property - def _columns_list(self) -> List[Union[list, "NDArray", "VectorDataMixin"]]: - return [getattr(self, k) for i, k in enumerate(self.colnames)] - @overload def __getitem__(self, item: str) -> Union[list, "NDArray", "VectorDataMixin"]: ... @@ -312,6 +355,7 @@ class DynamicTableMixin(BaseModel): return self._columns[item] if isinstance(item, (int, slice, np.integer, np.ndarray)): data = self._slice_range(item) + index = self.id[item] elif isinstance(item, tuple): if len(item) != 2: raise ValueError( @@ -329,11 +373,15 @@ class DynamicTableMixin(BaseModel): return self._columns[cols][rows] data = self._slice_range(rows, cols) + index = self.id[rows] else: raise ValueError(f"Unsure how to get item with key {item}") # cast to DF - return pd.DataFrame(data) + if not isinstance(index, Iterable): + index = [index] + index = pd.Index(data=index) + return pd.DataFrame(data, index=index) def _slice_range( self, rows: Union[int, slice, np.ndarray], cols: Optional[Union[str, List[str]]] = None @@ -345,31 +393,40 @@ class DynamicTableMixin(BaseModel): data = {} for k in cols: if isinstance(rows, np.ndarray): + # help wanted - this is probably cr*zy slow val = [self._columns[k][i] for i in rows] else: val = self._columns[k][rows] # scalars need to be wrapped in series for pandas + # do this by the iterability of the rows index not the value because + # we want all lengths from this method to be equal, and if the rows are + # scalar, that means length == 1 if not isinstance(rows, (Iterable, slice)): - val = pd.Series([val]) + val = [val] data[k] = val return data def __setitem__(self, key: str, value: Any) -> None: - raise NotImplementedError("TODO") + raise NotImplementedError("TODO") # pragma: no cover def __setattr__(self, key: str, value: Union[list, "NDArray", "VectorData"]): """ Add a column, appending it to ``colnames`` """ # don't use this while building the model - if not getattr(self, "__pydantic_complete__", False): + if not getattr(self, "__pydantic_complete__", False): # pragma: no cover return super().__setattr__(key, value) if key not in self.model_fields_set and not key.endswith("_index"): self.colnames.append(key) + # we get a recursion error if we setattr without having first added to + # extras if we need it to be there + if key not in self.model_fields and key not in self.__pydantic_extra__: + self.__pydantic_extra__[key] = value + return super().__setattr__(key, value) def __getattr__(self, item: str) -> Any: @@ -396,6 +453,8 @@ class DynamicTableMixin(BaseModel): """ Create ID column if not provided """ + if not isinstance(model, dict): + return model if "id" not in model: lengths = [] for key, val in model.items(): @@ -420,6 +479,8 @@ class DynamicTableMixin(BaseModel): the model dict is ordered after python3.6, so we can use that minus anything in :attr:`.NON_COLUMN_FIELDS` to determine order implied from passage order """ + if not isinstance(model, dict): + return model if "colnames" not in model: colnames = [ k @@ -455,19 +516,21 @@ class DynamicTableMixin(BaseModel): See :meth:`.cast_specified_columns` for handling columns in the class specification """ # if columns are not in the specification, cast to a generic VectorData - for key, val in model.items(): - if key in cls.model_fields: - continue - if not isinstance(val, (VectorData, VectorIndex)): - try: - if key.endswith("_index"): - model[key] = VectorIndex(name=key, description="", value=val) - else: - model[key] = VectorData(name=key, description="", value=val) - except ValidationError as e: - raise ValidationError( - f"field {key} cannot be cast to VectorData from {val}" - ) from e + + if isinstance(model, dict): + for key, val in model.items(): + if key in cls.model_fields: + continue + if not isinstance(val, (VectorData, VectorIndex)): + try: + if key.endswith("_index"): + model[key] = VectorIndex(name=key, description="", value=val) + else: + model[key] = VectorData(name=key, description="", value=val) + except ValidationError as e: # pragma: no cover + raise ValidationError( + f"field {key} cannot be cast to VectorData from {val}" + ) from e return model @model_validator(mode="after") @@ -499,8 +562,8 @@ class DynamicTableMixin(BaseModel): """ Ensure that all columns are equal length """ - lengths = [len(v) for v in self._columns.values()] - assert [length == lengths[0] for length in lengths], ( + lengths = [len(v) for v in self._columns.values()] + [len(self.id)] + assert all([length == lengths[0] for length in lengths]), ( "Columns are not of equal length! " f"Got colnames:\n{self.colnames}\nand lengths: {lengths}" ) @@ -536,15 +599,19 @@ class DynamicTableMixin(BaseModel): ) ) except Exception: - raise e + raise e from None -class AlignedDynamicTableMixin(DynamicTableMixin): +class AlignedDynamicTableMixin(BaseModel): """ Mixin to allow indexing multiple tables that are aligned on a common ID + + A great deal of code duplication because we need to avoid diamond inheritance + and also it's not so easy to copy a pydantic validator method. """ - __pydantic_extra__: Dict[str, "DynamicTableMixin"] + model_config = ConfigDict(extra="allow", validate_assignment=True) + __pydantic_extra__: Dict[str, Union["DynamicTableMixin", "VectorDataMixin", "VectorIndexMixin"]] NON_CATEGORY_FIELDS: ClassVar[tuple[str]] = ( "name", @@ -562,7 +629,7 @@ class AlignedDynamicTableMixin(DynamicTableMixin): return {k: getattr(self, k) for i, k in enumerate(self.categories)} def __getitem__( - self, item: Union[int, str, slice, Tuple[Union[int, slice], str]] + self, item: Union[int, str, slice, NDArray[Shape["*"], int], Tuple[Union[int, slice], str]] ) -> pd.DataFrame: """ Mimic hdmf: @@ -580,25 +647,78 @@ class AlignedDynamicTableMixin(DynamicTableMixin): elif isinstance(item, tuple) and len(item) == 2 and isinstance(item[1], str): # get a slice of a single table return self._categories[item[1]][item[0]] - elif isinstance(item, (int, slice)): + elif isinstance(item, (int, slice, Iterable)): # get a slice of all the tables ids = self.id[item] if not isinstance(ids, Iterable): ids = pd.Series([ids]) ids = pd.DataFrame({"id": ids}) - tables = [ids] + [table[item].reset_index() for table in self._categories.values()] + tables = [ids] + for category_name, category in self._categories.items(): + table = category[item] + if isinstance(table, pd.DataFrame): + table = table.reset_index() + elif isinstance(table, np.ndarray): + table = pd.DataFrame({category_name: [table]}) + elif isinstance(table, Iterable): + table = pd.DataFrame({category_name: table}) + else: + raise ValueError( + f"Don't know how to construct category table for {category_name}" + ) + tables.append(table) + names = [self.name] + self.categories # construct below in case we need to support array indexing in the future else: raise ValueError( f"Dont know how to index with {item}, " - "need an int, string, slice, or tuple[int | slice, str]" + "need an int, string, slice, ndarray, or tuple[int | slice, str]" ) df = pd.concat(tables, axis=1, keys=names) df.set_index((self.name, "id"), drop=True, inplace=True) return df + def __getattr__(self, item: str) -> Any: + """Try and use pandas df attrs if we don't have them""" + try: + return BaseModel.__getattr__(self, item) + except AttributeError as e: + try: + return getattr(self[:], item) + except AttributeError: + raise e from None + + def __len__(self) -> int: + """ + Use the id column to determine length. + + If the id column doesn't represent length accurately, it's a bug + """ + return len(self.id) + + @model_validator(mode="before") + @classmethod + def create_id(cls, model: Dict[str, Any]) -> Dict: + """ + Create ID column if not provided + """ + if "id" not in model: + lengths = [] + for key, val in model.items(): + # don't get lengths of columns with an index + if ( + f"{key}_index" in model + or (isinstance(val, VectorData) and val._index) + or key in cls.NON_CATEGORY_FIELDS + ): + continue + lengths.append(len(val)) + model["id"] = np.arange(np.max(lengths)) + + return model + @model_validator(mode="before") @classmethod def create_categories(cls, model: Dict[str, Any]) -> Dict: @@ -625,6 +745,42 @@ class AlignedDynamicTableMixin(DynamicTableMixin): model["categories"].extend(categories) return model + @model_validator(mode="after") + def resolve_targets(self) -> "DynamicTableMixin": + """ + Ensure that any implicitly indexed columns are linked, and create backlinks + """ + for key, col in self._categories.items(): + if isinstance(col, VectorData): + # find an index + idx = None + for field_name in self.model_fields_set: + if field_name in self.NON_CATEGORY_FIELDS or field_name == key: + continue + # implicit name-based index + field = getattr(self, field_name) + if isinstance(field, VectorIndex) and ( + field_name == f"{key}_index" or field.target is col + ): + idx = field + break + if idx is not None: + col._index = idx + idx.target = col + return self + + @model_validator(mode="after") + def ensure_equal_length_cols(self) -> "DynamicTableMixin": + """ + Ensure that all columns are equal length + """ + lengths = [len(v) for v in self._categories.values()] + [len(self.id)] + assert all([length == lengths[0] for length in lengths]), ( + "Columns are not of equal length! " + f"Got colnames:\n{self.categories}\nand lengths: {lengths}" + ) + return self + linkml_meta = LinkMLMeta( { diff --git a/nwb_linkml/src/nwb_linkml/models/pydantic/hdmf_common/v1_2_0/hdmf_common_table.py b/nwb_linkml/src/nwb_linkml/models/pydantic/hdmf_common/v1_2_0/hdmf_common_table.py index 9c99479..561d242 100644 --- a/nwb_linkml/src/nwb_linkml/models/pydantic/hdmf_common/v1_2_0/hdmf_common_table.py +++ b/nwb_linkml/src/nwb_linkml/models/pydantic/hdmf_common/v1_2_0/hdmf_common_table.py @@ -149,31 +149,62 @@ class VectorIndexMixin(BaseModel, Generic[T]): kwargs["value"] = value super().__init__(**kwargs) - def _getitem_helper(self, arg: int) -> Union[list, NDArray]: + def _slice(self, arg: int) -> slice: """ Mimicking :func:`hdmf.common.table.VectorIndex.__getitem_helper` """ start = 0 if arg == 0 else self.value[arg - 1] end = self.value[arg] - return self.target.value[slice(start, end)] + return slice(start, end) def __getitem__(self, item: Union[int, slice, Iterable]) -> Any: if self.target is None: return self.value[item] else: if isinstance(item, (int, np.integer)): - return self._getitem_helper(item) + return self.target.value[self._slice(item)] elif isinstance(item, (slice, Iterable)): if isinstance(item, slice): item = range(*item.indices(len(self.value))) - return [self._getitem_helper(i) for i in item] - else: + return [self.target.value[self._slice(i)] for i in item] + else: # pragma: no cover raise AttributeError(f"Could not index with {item}") def __setitem__(self, key: Union[int, slice], value: Any) -> None: - if self._index: - # VectorIndex is the thing that knows how to do the slicing - self._index[key] = value + """ + Set a value on the :attr:`.target` . + + .. note:: + + Even though we correct the indexing logic from HDMF where the + _data_ is the thing that is provided by the API when one accesses + table.data (rather than table.data_index as hdmf does), + we will set to the target here (rather than to the index) + to be consistent. To modify the index, modify `self.value` directly + + """ + if self.target: + if isinstance(key, (int, np.integer)): + self.target.value[self._slice(key)] = value + elif isinstance(key, (slice, Iterable)): + if isinstance(key, slice): + key = range(*key.indices(len(self.value))) + + if isinstance(value, Iterable): + if len(key) != len(value): + raise ValueError( + "Can only assign equal-length iterable to a slice, manually index the" + " ragged values of of the target VectorData object if you need more" + " control" + ) + for i, subval in zip(key, value): + self.target.value[self._slice(i)] = subval + else: + for i in key: + self.target.value[self._slice(i)] = value + else: # pragma: no cover + raise AttributeError(f"Could not index with {key}") + else: self.value[key] = value @@ -204,9 +235,17 @@ class DynamicTableRegionMixin(BaseModel): _index: Optional["VectorIndex"] = None table: "DynamicTableMixin" - value: Optional[NDArray] = None + value: Optional[NDArray[Shape["*"], int]] = None - def __getitem__(self, item: Union[int, slice, Iterable]) -> Any: + @overload + def __getitem__(self, item: int) -> pd.DataFrame: ... + + @overload + def __getitem__(self, item: Union[slice, Iterable]) -> List[pd.DataFrame]: ... + + def __getitem__( + self, item: Union[int, slice, Iterable] + ) -> Union[pd.DataFrame, List[pd.DataFrame]]: """ Use ``value`` to index the table. Works analogously to ``VectorIndex`` despite this being a subclass of ``VectorData`` @@ -221,20 +260,27 @@ class DynamicTableRegionMixin(BaseModel): # so we index table with an array to construct # a list of lists of rows return [self.table[idx] for idx in self._index[item]] - else: + else: # pragma: no cover raise ValueError(f"Dont know how to index with {item}, need an int or a slice") else: if isinstance(item, (int, np.integer)): return self.table[self.value[item]] elif isinstance(item, (slice, Iterable)): + # Return a list of dataframe rows because this is most often used + # as a column in a DynamicTable, so while it would normally be + # ideal to just return the slice as above as a single df, + # we need each row to be separate to fill the column if isinstance(item, slice): item = range(*item.indices(len(self.value))) return [self.table[self.value[i]] for i in item] - else: + else: # pragma: no cover raise ValueError(f"Dont know how to index with {item}, need an int or a slice") def __setitem__(self, key: Union[int, str, slice], value: Any) -> None: - self.table[self.value[key]] = value + # self.table[self.value[key]] = value + raise NotImplementedError( + "Assigning values to tables is not implemented yet!" + ) # pragma: no cover class DynamicTableMixin(BaseModel): @@ -245,9 +291,10 @@ class DynamicTableMixin(BaseModel): but simplifying along the way :) """ - model_config = ConfigDict(extra="allow") + model_config = ConfigDict(extra="allow", validate_assignment=True) __pydantic_extra__: Dict[str, Union["VectorDataMixin", "VectorIndexMixin", "NDArray", list]] NON_COLUMN_FIELDS: ClassVar[tuple[str]] = ( + "id", "name", "colnames", "description", @@ -261,10 +308,6 @@ class DynamicTableMixin(BaseModel): def _columns(self) -> Dict[str, Union[list, "NDArray", "VectorDataMixin"]]: return {k: getattr(self, k) for i, k in enumerate(self.colnames)} - @property - def _columns_list(self) -> List[Union[list, "NDArray", "VectorDataMixin"]]: - return [getattr(self, k) for i, k in enumerate(self.colnames)] - @overload def __getitem__(self, item: str) -> Union[list, "NDArray", "VectorDataMixin"]: ... @@ -313,6 +356,7 @@ class DynamicTableMixin(BaseModel): return self._columns[item] if isinstance(item, (int, slice, np.integer, np.ndarray)): data = self._slice_range(item) + index = self.id[item] elif isinstance(item, tuple): if len(item) != 2: raise ValueError( @@ -330,11 +374,15 @@ class DynamicTableMixin(BaseModel): return self._columns[cols][rows] data = self._slice_range(rows, cols) + index = self.id[rows] else: raise ValueError(f"Unsure how to get item with key {item}") # cast to DF - return pd.DataFrame(data) + if not isinstance(index, Iterable): + index = [index] + index = pd.Index(data=index) + return pd.DataFrame(data, index=index) def _slice_range( self, rows: Union[int, slice, np.ndarray], cols: Optional[Union[str, List[str]]] = None @@ -346,31 +394,40 @@ class DynamicTableMixin(BaseModel): data = {} for k in cols: if isinstance(rows, np.ndarray): + # help wanted - this is probably cr*zy slow val = [self._columns[k][i] for i in rows] else: val = self._columns[k][rows] # scalars need to be wrapped in series for pandas + # do this by the iterability of the rows index not the value because + # we want all lengths from this method to be equal, and if the rows are + # scalar, that means length == 1 if not isinstance(rows, (Iterable, slice)): - val = pd.Series([val]) + val = [val] data[k] = val return data def __setitem__(self, key: str, value: Any) -> None: - raise NotImplementedError("TODO") + raise NotImplementedError("TODO") # pragma: no cover def __setattr__(self, key: str, value: Union[list, "NDArray", "VectorData"]): """ Add a column, appending it to ``colnames`` """ # don't use this while building the model - if not getattr(self, "__pydantic_complete__", False): + if not getattr(self, "__pydantic_complete__", False): # pragma: no cover return super().__setattr__(key, value) if key not in self.model_fields_set and not key.endswith("_index"): self.colnames.append(key) + # we get a recursion error if we setattr without having first added to + # extras if we need it to be there + if key not in self.model_fields and key not in self.__pydantic_extra__: + self.__pydantic_extra__[key] = value + return super().__setattr__(key, value) def __getattr__(self, item: str) -> Any: @@ -397,6 +454,8 @@ class DynamicTableMixin(BaseModel): """ Create ID column if not provided """ + if not isinstance(model, dict): + return model if "id" not in model: lengths = [] for key, val in model.items(): @@ -421,6 +480,8 @@ class DynamicTableMixin(BaseModel): the model dict is ordered after python3.6, so we can use that minus anything in :attr:`.NON_COLUMN_FIELDS` to determine order implied from passage order """ + if not isinstance(model, dict): + return model if "colnames" not in model: colnames = [ k @@ -456,19 +517,21 @@ class DynamicTableMixin(BaseModel): See :meth:`.cast_specified_columns` for handling columns in the class specification """ # if columns are not in the specification, cast to a generic VectorData - for key, val in model.items(): - if key in cls.model_fields: - continue - if not isinstance(val, (VectorData, VectorIndex)): - try: - if key.endswith("_index"): - model[key] = VectorIndex(name=key, description="", value=val) - else: - model[key] = VectorData(name=key, description="", value=val) - except ValidationError as e: - raise ValidationError( - f"field {key} cannot be cast to VectorData from {val}" - ) from e + + if isinstance(model, dict): + for key, val in model.items(): + if key in cls.model_fields: + continue + if not isinstance(val, (VectorData, VectorIndex)): + try: + if key.endswith("_index"): + model[key] = VectorIndex(name=key, description="", value=val) + else: + model[key] = VectorData(name=key, description="", value=val) + except ValidationError as e: # pragma: no cover + raise ValidationError( + f"field {key} cannot be cast to VectorData from {val}" + ) from e return model @model_validator(mode="after") @@ -500,8 +563,8 @@ class DynamicTableMixin(BaseModel): """ Ensure that all columns are equal length """ - lengths = [len(v) for v in self._columns.values()] - assert [length == lengths[0] for length in lengths], ( + lengths = [len(v) for v in self._columns.values()] + [len(self.id)] + assert all([length == lengths[0] for length in lengths]), ( "Columns are not of equal length! " f"Got colnames:\n{self.colnames}\nand lengths: {lengths}" ) @@ -537,15 +600,19 @@ class DynamicTableMixin(BaseModel): ) ) except Exception: - raise e + raise e from None -class AlignedDynamicTableMixin(DynamicTableMixin): +class AlignedDynamicTableMixin(BaseModel): """ Mixin to allow indexing multiple tables that are aligned on a common ID + + A great deal of code duplication because we need to avoid diamond inheritance + and also it's not so easy to copy a pydantic validator method. """ - __pydantic_extra__: Dict[str, "DynamicTableMixin"] + model_config = ConfigDict(extra="allow", validate_assignment=True) + __pydantic_extra__: Dict[str, Union["DynamicTableMixin", "VectorDataMixin", "VectorIndexMixin"]] NON_CATEGORY_FIELDS: ClassVar[tuple[str]] = ( "name", @@ -563,7 +630,7 @@ class AlignedDynamicTableMixin(DynamicTableMixin): return {k: getattr(self, k) for i, k in enumerate(self.categories)} def __getitem__( - self, item: Union[int, str, slice, Tuple[Union[int, slice], str]] + self, item: Union[int, str, slice, NDArray[Shape["*"], int], Tuple[Union[int, slice], str]] ) -> pd.DataFrame: """ Mimic hdmf: @@ -581,25 +648,78 @@ class AlignedDynamicTableMixin(DynamicTableMixin): elif isinstance(item, tuple) and len(item) == 2 and isinstance(item[1], str): # get a slice of a single table return self._categories[item[1]][item[0]] - elif isinstance(item, (int, slice)): + elif isinstance(item, (int, slice, Iterable)): # get a slice of all the tables ids = self.id[item] if not isinstance(ids, Iterable): ids = pd.Series([ids]) ids = pd.DataFrame({"id": ids}) - tables = [ids] + [table[item].reset_index() for table in self._categories.values()] + tables = [ids] + for category_name, category in self._categories.items(): + table = category[item] + if isinstance(table, pd.DataFrame): + table = table.reset_index() + elif isinstance(table, np.ndarray): + table = pd.DataFrame({category_name: [table]}) + elif isinstance(table, Iterable): + table = pd.DataFrame({category_name: table}) + else: + raise ValueError( + f"Don't know how to construct category table for {category_name}" + ) + tables.append(table) + names = [self.name] + self.categories # construct below in case we need to support array indexing in the future else: raise ValueError( f"Dont know how to index with {item}, " - "need an int, string, slice, or tuple[int | slice, str]" + "need an int, string, slice, ndarray, or tuple[int | slice, str]" ) df = pd.concat(tables, axis=1, keys=names) df.set_index((self.name, "id"), drop=True, inplace=True) return df + def __getattr__(self, item: str) -> Any: + """Try and use pandas df attrs if we don't have them""" + try: + return BaseModel.__getattr__(self, item) + except AttributeError as e: + try: + return getattr(self[:], item) + except AttributeError: + raise e from None + + def __len__(self) -> int: + """ + Use the id column to determine length. + + If the id column doesn't represent length accurately, it's a bug + """ + return len(self.id) + + @model_validator(mode="before") + @classmethod + def create_id(cls, model: Dict[str, Any]) -> Dict: + """ + Create ID column if not provided + """ + if "id" not in model: + lengths = [] + for key, val in model.items(): + # don't get lengths of columns with an index + if ( + f"{key}_index" in model + or (isinstance(val, VectorData) and val._index) + or key in cls.NON_CATEGORY_FIELDS + ): + continue + lengths.append(len(val)) + model["id"] = np.arange(np.max(lengths)) + + return model + @model_validator(mode="before") @classmethod def create_categories(cls, model: Dict[str, Any]) -> Dict: @@ -626,6 +746,42 @@ class AlignedDynamicTableMixin(DynamicTableMixin): model["categories"].extend(categories) return model + @model_validator(mode="after") + def resolve_targets(self) -> "DynamicTableMixin": + """ + Ensure that any implicitly indexed columns are linked, and create backlinks + """ + for key, col in self._categories.items(): + if isinstance(col, VectorData): + # find an index + idx = None + for field_name in self.model_fields_set: + if field_name in self.NON_CATEGORY_FIELDS or field_name == key: + continue + # implicit name-based index + field = getattr(self, field_name) + if isinstance(field, VectorIndex) and ( + field_name == f"{key}_index" or field.target is col + ): + idx = field + break + if idx is not None: + col._index = idx + idx.target = col + return self + + @model_validator(mode="after") + def ensure_equal_length_cols(self) -> "DynamicTableMixin": + """ + Ensure that all columns are equal length + """ + lengths = [len(v) for v in self._categories.values()] + [len(self.id)] + assert all([length == lengths[0] for length in lengths]), ( + "Columns are not of equal length! " + f"Got colnames:\n{self.categories}\nand lengths: {lengths}" + ) + return self + linkml_meta = LinkMLMeta( { diff --git a/nwb_linkml/src/nwb_linkml/models/pydantic/hdmf_common/v1_2_1/hdmf_common_table.py b/nwb_linkml/src/nwb_linkml/models/pydantic/hdmf_common/v1_2_1/hdmf_common_table.py index b487609..9ff2a6c 100644 --- a/nwb_linkml/src/nwb_linkml/models/pydantic/hdmf_common/v1_2_1/hdmf_common_table.py +++ b/nwb_linkml/src/nwb_linkml/models/pydantic/hdmf_common/v1_2_1/hdmf_common_table.py @@ -149,31 +149,62 @@ class VectorIndexMixin(BaseModel, Generic[T]): kwargs["value"] = value super().__init__(**kwargs) - def _getitem_helper(self, arg: int) -> Union[list, NDArray]: + def _slice(self, arg: int) -> slice: """ Mimicking :func:`hdmf.common.table.VectorIndex.__getitem_helper` """ start = 0 if arg == 0 else self.value[arg - 1] end = self.value[arg] - return self.target.value[slice(start, end)] + return slice(start, end) def __getitem__(self, item: Union[int, slice, Iterable]) -> Any: if self.target is None: return self.value[item] else: if isinstance(item, (int, np.integer)): - return self._getitem_helper(item) + return self.target.value[self._slice(item)] elif isinstance(item, (slice, Iterable)): if isinstance(item, slice): item = range(*item.indices(len(self.value))) - return [self._getitem_helper(i) for i in item] - else: + return [self.target.value[self._slice(i)] for i in item] + else: # pragma: no cover raise AttributeError(f"Could not index with {item}") def __setitem__(self, key: Union[int, slice], value: Any) -> None: - if self._index: - # VectorIndex is the thing that knows how to do the slicing - self._index[key] = value + """ + Set a value on the :attr:`.target` . + + .. note:: + + Even though we correct the indexing logic from HDMF where the + _data_ is the thing that is provided by the API when one accesses + table.data (rather than table.data_index as hdmf does), + we will set to the target here (rather than to the index) + to be consistent. To modify the index, modify `self.value` directly + + """ + if self.target: + if isinstance(key, (int, np.integer)): + self.target.value[self._slice(key)] = value + elif isinstance(key, (slice, Iterable)): + if isinstance(key, slice): + key = range(*key.indices(len(self.value))) + + if isinstance(value, Iterable): + if len(key) != len(value): + raise ValueError( + "Can only assign equal-length iterable to a slice, manually index the" + " ragged values of of the target VectorData object if you need more" + " control" + ) + for i, subval in zip(key, value): + self.target.value[self._slice(i)] = subval + else: + for i in key: + self.target.value[self._slice(i)] = value + else: # pragma: no cover + raise AttributeError(f"Could not index with {key}") + else: self.value[key] = value @@ -204,9 +235,17 @@ class DynamicTableRegionMixin(BaseModel): _index: Optional["VectorIndex"] = None table: "DynamicTableMixin" - value: Optional[NDArray] = None + value: Optional[NDArray[Shape["*"], int]] = None - def __getitem__(self, item: Union[int, slice, Iterable]) -> Any: + @overload + def __getitem__(self, item: int) -> pd.DataFrame: ... + + @overload + def __getitem__(self, item: Union[slice, Iterable]) -> List[pd.DataFrame]: ... + + def __getitem__( + self, item: Union[int, slice, Iterable] + ) -> Union[pd.DataFrame, List[pd.DataFrame]]: """ Use ``value`` to index the table. Works analogously to ``VectorIndex`` despite this being a subclass of ``VectorData`` @@ -221,20 +260,27 @@ class DynamicTableRegionMixin(BaseModel): # so we index table with an array to construct # a list of lists of rows return [self.table[idx] for idx in self._index[item]] - else: + else: # pragma: no cover raise ValueError(f"Dont know how to index with {item}, need an int or a slice") else: if isinstance(item, (int, np.integer)): return self.table[self.value[item]] elif isinstance(item, (slice, Iterable)): + # Return a list of dataframe rows because this is most often used + # as a column in a DynamicTable, so while it would normally be + # ideal to just return the slice as above as a single df, + # we need each row to be separate to fill the column if isinstance(item, slice): item = range(*item.indices(len(self.value))) return [self.table[self.value[i]] for i in item] - else: + else: # pragma: no cover raise ValueError(f"Dont know how to index with {item}, need an int or a slice") def __setitem__(self, key: Union[int, str, slice], value: Any) -> None: - self.table[self.value[key]] = value + # self.table[self.value[key]] = value + raise NotImplementedError( + "Assigning values to tables is not implemented yet!" + ) # pragma: no cover class DynamicTableMixin(BaseModel): @@ -245,9 +291,10 @@ class DynamicTableMixin(BaseModel): but simplifying along the way :) """ - model_config = ConfigDict(extra="allow") + model_config = ConfigDict(extra="allow", validate_assignment=True) __pydantic_extra__: Dict[str, Union["VectorDataMixin", "VectorIndexMixin", "NDArray", list]] NON_COLUMN_FIELDS: ClassVar[tuple[str]] = ( + "id", "name", "colnames", "description", @@ -261,10 +308,6 @@ class DynamicTableMixin(BaseModel): def _columns(self) -> Dict[str, Union[list, "NDArray", "VectorDataMixin"]]: return {k: getattr(self, k) for i, k in enumerate(self.colnames)} - @property - def _columns_list(self) -> List[Union[list, "NDArray", "VectorDataMixin"]]: - return [getattr(self, k) for i, k in enumerate(self.colnames)] - @overload def __getitem__(self, item: str) -> Union[list, "NDArray", "VectorDataMixin"]: ... @@ -313,6 +356,7 @@ class DynamicTableMixin(BaseModel): return self._columns[item] if isinstance(item, (int, slice, np.integer, np.ndarray)): data = self._slice_range(item) + index = self.id[item] elif isinstance(item, tuple): if len(item) != 2: raise ValueError( @@ -330,11 +374,15 @@ class DynamicTableMixin(BaseModel): return self._columns[cols][rows] data = self._slice_range(rows, cols) + index = self.id[rows] else: raise ValueError(f"Unsure how to get item with key {item}") # cast to DF - return pd.DataFrame(data) + if not isinstance(index, Iterable): + index = [index] + index = pd.Index(data=index) + return pd.DataFrame(data, index=index) def _slice_range( self, rows: Union[int, slice, np.ndarray], cols: Optional[Union[str, List[str]]] = None @@ -346,31 +394,40 @@ class DynamicTableMixin(BaseModel): data = {} for k in cols: if isinstance(rows, np.ndarray): + # help wanted - this is probably cr*zy slow val = [self._columns[k][i] for i in rows] else: val = self._columns[k][rows] # scalars need to be wrapped in series for pandas + # do this by the iterability of the rows index not the value because + # we want all lengths from this method to be equal, and if the rows are + # scalar, that means length == 1 if not isinstance(rows, (Iterable, slice)): - val = pd.Series([val]) + val = [val] data[k] = val return data def __setitem__(self, key: str, value: Any) -> None: - raise NotImplementedError("TODO") + raise NotImplementedError("TODO") # pragma: no cover def __setattr__(self, key: str, value: Union[list, "NDArray", "VectorData"]): """ Add a column, appending it to ``colnames`` """ # don't use this while building the model - if not getattr(self, "__pydantic_complete__", False): + if not getattr(self, "__pydantic_complete__", False): # pragma: no cover return super().__setattr__(key, value) if key not in self.model_fields_set and not key.endswith("_index"): self.colnames.append(key) + # we get a recursion error if we setattr without having first added to + # extras if we need it to be there + if key not in self.model_fields and key not in self.__pydantic_extra__: + self.__pydantic_extra__[key] = value + return super().__setattr__(key, value) def __getattr__(self, item: str) -> Any: @@ -397,6 +454,8 @@ class DynamicTableMixin(BaseModel): """ Create ID column if not provided """ + if not isinstance(model, dict): + return model if "id" not in model: lengths = [] for key, val in model.items(): @@ -421,6 +480,8 @@ class DynamicTableMixin(BaseModel): the model dict is ordered after python3.6, so we can use that minus anything in :attr:`.NON_COLUMN_FIELDS` to determine order implied from passage order """ + if not isinstance(model, dict): + return model if "colnames" not in model: colnames = [ k @@ -456,19 +517,21 @@ class DynamicTableMixin(BaseModel): See :meth:`.cast_specified_columns` for handling columns in the class specification """ # if columns are not in the specification, cast to a generic VectorData - for key, val in model.items(): - if key in cls.model_fields: - continue - if not isinstance(val, (VectorData, VectorIndex)): - try: - if key.endswith("_index"): - model[key] = VectorIndex(name=key, description="", value=val) - else: - model[key] = VectorData(name=key, description="", value=val) - except ValidationError as e: - raise ValidationError( - f"field {key} cannot be cast to VectorData from {val}" - ) from e + + if isinstance(model, dict): + for key, val in model.items(): + if key in cls.model_fields: + continue + if not isinstance(val, (VectorData, VectorIndex)): + try: + if key.endswith("_index"): + model[key] = VectorIndex(name=key, description="", value=val) + else: + model[key] = VectorData(name=key, description="", value=val) + except ValidationError as e: # pragma: no cover + raise ValidationError( + f"field {key} cannot be cast to VectorData from {val}" + ) from e return model @model_validator(mode="after") @@ -500,8 +563,8 @@ class DynamicTableMixin(BaseModel): """ Ensure that all columns are equal length """ - lengths = [len(v) for v in self._columns.values()] - assert [length == lengths[0] for length in lengths], ( + lengths = [len(v) for v in self._columns.values()] + [len(self.id)] + assert all([length == lengths[0] for length in lengths]), ( "Columns are not of equal length! " f"Got colnames:\n{self.colnames}\nand lengths: {lengths}" ) @@ -537,15 +600,19 @@ class DynamicTableMixin(BaseModel): ) ) except Exception: - raise e + raise e from None -class AlignedDynamicTableMixin(DynamicTableMixin): +class AlignedDynamicTableMixin(BaseModel): """ Mixin to allow indexing multiple tables that are aligned on a common ID + + A great deal of code duplication because we need to avoid diamond inheritance + and also it's not so easy to copy a pydantic validator method. """ - __pydantic_extra__: Dict[str, "DynamicTableMixin"] + model_config = ConfigDict(extra="allow", validate_assignment=True) + __pydantic_extra__: Dict[str, Union["DynamicTableMixin", "VectorDataMixin", "VectorIndexMixin"]] NON_CATEGORY_FIELDS: ClassVar[tuple[str]] = ( "name", @@ -563,7 +630,7 @@ class AlignedDynamicTableMixin(DynamicTableMixin): return {k: getattr(self, k) for i, k in enumerate(self.categories)} def __getitem__( - self, item: Union[int, str, slice, Tuple[Union[int, slice], str]] + self, item: Union[int, str, slice, NDArray[Shape["*"], int], Tuple[Union[int, slice], str]] ) -> pd.DataFrame: """ Mimic hdmf: @@ -581,25 +648,78 @@ class AlignedDynamicTableMixin(DynamicTableMixin): elif isinstance(item, tuple) and len(item) == 2 and isinstance(item[1], str): # get a slice of a single table return self._categories[item[1]][item[0]] - elif isinstance(item, (int, slice)): + elif isinstance(item, (int, slice, Iterable)): # get a slice of all the tables ids = self.id[item] if not isinstance(ids, Iterable): ids = pd.Series([ids]) ids = pd.DataFrame({"id": ids}) - tables = [ids] + [table[item].reset_index() for table in self._categories.values()] + tables = [ids] + for category_name, category in self._categories.items(): + table = category[item] + if isinstance(table, pd.DataFrame): + table = table.reset_index() + elif isinstance(table, np.ndarray): + table = pd.DataFrame({category_name: [table]}) + elif isinstance(table, Iterable): + table = pd.DataFrame({category_name: table}) + else: + raise ValueError( + f"Don't know how to construct category table for {category_name}" + ) + tables.append(table) + names = [self.name] + self.categories # construct below in case we need to support array indexing in the future else: raise ValueError( f"Dont know how to index with {item}, " - "need an int, string, slice, or tuple[int | slice, str]" + "need an int, string, slice, ndarray, or tuple[int | slice, str]" ) df = pd.concat(tables, axis=1, keys=names) df.set_index((self.name, "id"), drop=True, inplace=True) return df + def __getattr__(self, item: str) -> Any: + """Try and use pandas df attrs if we don't have them""" + try: + return BaseModel.__getattr__(self, item) + except AttributeError as e: + try: + return getattr(self[:], item) + except AttributeError: + raise e from None + + def __len__(self) -> int: + """ + Use the id column to determine length. + + If the id column doesn't represent length accurately, it's a bug + """ + return len(self.id) + + @model_validator(mode="before") + @classmethod + def create_id(cls, model: Dict[str, Any]) -> Dict: + """ + Create ID column if not provided + """ + if "id" not in model: + lengths = [] + for key, val in model.items(): + # don't get lengths of columns with an index + if ( + f"{key}_index" in model + or (isinstance(val, VectorData) and val._index) + or key in cls.NON_CATEGORY_FIELDS + ): + continue + lengths.append(len(val)) + model["id"] = np.arange(np.max(lengths)) + + return model + @model_validator(mode="before") @classmethod def create_categories(cls, model: Dict[str, Any]) -> Dict: @@ -626,6 +746,42 @@ class AlignedDynamicTableMixin(DynamicTableMixin): model["categories"].extend(categories) return model + @model_validator(mode="after") + def resolve_targets(self) -> "DynamicTableMixin": + """ + Ensure that any implicitly indexed columns are linked, and create backlinks + """ + for key, col in self._categories.items(): + if isinstance(col, VectorData): + # find an index + idx = None + for field_name in self.model_fields_set: + if field_name in self.NON_CATEGORY_FIELDS or field_name == key: + continue + # implicit name-based index + field = getattr(self, field_name) + if isinstance(field, VectorIndex) and ( + field_name == f"{key}_index" or field.target is col + ): + idx = field + break + if idx is not None: + col._index = idx + idx.target = col + return self + + @model_validator(mode="after") + def ensure_equal_length_cols(self) -> "DynamicTableMixin": + """ + Ensure that all columns are equal length + """ + lengths = [len(v) for v in self._categories.values()] + [len(self.id)] + assert all([length == lengths[0] for length in lengths]), ( + "Columns are not of equal length! " + f"Got colnames:\n{self.categories}\nand lengths: {lengths}" + ) + return self + linkml_meta = LinkMLMeta( { diff --git a/nwb_linkml/src/nwb_linkml/models/pydantic/hdmf_common/v1_3_0/hdmf_common_table.py b/nwb_linkml/src/nwb_linkml/models/pydantic/hdmf_common/v1_3_0/hdmf_common_table.py index 14214b5..1ecc2ec 100644 --- a/nwb_linkml/src/nwb_linkml/models/pydantic/hdmf_common/v1_3_0/hdmf_common_table.py +++ b/nwb_linkml/src/nwb_linkml/models/pydantic/hdmf_common/v1_3_0/hdmf_common_table.py @@ -149,31 +149,62 @@ class VectorIndexMixin(BaseModel, Generic[T]): kwargs["value"] = value super().__init__(**kwargs) - def _getitem_helper(self, arg: int) -> Union[list, NDArray]: + def _slice(self, arg: int) -> slice: """ Mimicking :func:`hdmf.common.table.VectorIndex.__getitem_helper` """ start = 0 if arg == 0 else self.value[arg - 1] end = self.value[arg] - return self.target.value[slice(start, end)] + return slice(start, end) def __getitem__(self, item: Union[int, slice, Iterable]) -> Any: if self.target is None: return self.value[item] else: if isinstance(item, (int, np.integer)): - return self._getitem_helper(item) + return self.target.value[self._slice(item)] elif isinstance(item, (slice, Iterable)): if isinstance(item, slice): item = range(*item.indices(len(self.value))) - return [self._getitem_helper(i) for i in item] - else: + return [self.target.value[self._slice(i)] for i in item] + else: # pragma: no cover raise AttributeError(f"Could not index with {item}") def __setitem__(self, key: Union[int, slice], value: Any) -> None: - if self._index: - # VectorIndex is the thing that knows how to do the slicing - self._index[key] = value + """ + Set a value on the :attr:`.target` . + + .. note:: + + Even though we correct the indexing logic from HDMF where the + _data_ is the thing that is provided by the API when one accesses + table.data (rather than table.data_index as hdmf does), + we will set to the target here (rather than to the index) + to be consistent. To modify the index, modify `self.value` directly + + """ + if self.target: + if isinstance(key, (int, np.integer)): + self.target.value[self._slice(key)] = value + elif isinstance(key, (slice, Iterable)): + if isinstance(key, slice): + key = range(*key.indices(len(self.value))) + + if isinstance(value, Iterable): + if len(key) != len(value): + raise ValueError( + "Can only assign equal-length iterable to a slice, manually index the" + " ragged values of of the target VectorData object if you need more" + " control" + ) + for i, subval in zip(key, value): + self.target.value[self._slice(i)] = subval + else: + for i in key: + self.target.value[self._slice(i)] = value + else: # pragma: no cover + raise AttributeError(f"Could not index with {key}") + else: self.value[key] = value @@ -204,9 +235,17 @@ class DynamicTableRegionMixin(BaseModel): _index: Optional["VectorIndex"] = None table: "DynamicTableMixin" - value: Optional[NDArray] = None + value: Optional[NDArray[Shape["*"], int]] = None - def __getitem__(self, item: Union[int, slice, Iterable]) -> Any: + @overload + def __getitem__(self, item: int) -> pd.DataFrame: ... + + @overload + def __getitem__(self, item: Union[slice, Iterable]) -> List[pd.DataFrame]: ... + + def __getitem__( + self, item: Union[int, slice, Iterable] + ) -> Union[pd.DataFrame, List[pd.DataFrame]]: """ Use ``value`` to index the table. Works analogously to ``VectorIndex`` despite this being a subclass of ``VectorData`` @@ -221,20 +260,27 @@ class DynamicTableRegionMixin(BaseModel): # so we index table with an array to construct # a list of lists of rows return [self.table[idx] for idx in self._index[item]] - else: + else: # pragma: no cover raise ValueError(f"Dont know how to index with {item}, need an int or a slice") else: if isinstance(item, (int, np.integer)): return self.table[self.value[item]] elif isinstance(item, (slice, Iterable)): + # Return a list of dataframe rows because this is most often used + # as a column in a DynamicTable, so while it would normally be + # ideal to just return the slice as above as a single df, + # we need each row to be separate to fill the column if isinstance(item, slice): item = range(*item.indices(len(self.value))) return [self.table[self.value[i]] for i in item] - else: + else: # pragma: no cover raise ValueError(f"Dont know how to index with {item}, need an int or a slice") def __setitem__(self, key: Union[int, str, slice], value: Any) -> None: - self.table[self.value[key]] = value + # self.table[self.value[key]] = value + raise NotImplementedError( + "Assigning values to tables is not implemented yet!" + ) # pragma: no cover class DynamicTableMixin(BaseModel): @@ -245,9 +291,10 @@ class DynamicTableMixin(BaseModel): but simplifying along the way :) """ - model_config = ConfigDict(extra="allow") + model_config = ConfigDict(extra="allow", validate_assignment=True) __pydantic_extra__: Dict[str, Union["VectorDataMixin", "VectorIndexMixin", "NDArray", list]] NON_COLUMN_FIELDS: ClassVar[tuple[str]] = ( + "id", "name", "colnames", "description", @@ -261,10 +308,6 @@ class DynamicTableMixin(BaseModel): def _columns(self) -> Dict[str, Union[list, "NDArray", "VectorDataMixin"]]: return {k: getattr(self, k) for i, k in enumerate(self.colnames)} - @property - def _columns_list(self) -> List[Union[list, "NDArray", "VectorDataMixin"]]: - return [getattr(self, k) for i, k in enumerate(self.colnames)] - @overload def __getitem__(self, item: str) -> Union[list, "NDArray", "VectorDataMixin"]: ... @@ -313,6 +356,7 @@ class DynamicTableMixin(BaseModel): return self._columns[item] if isinstance(item, (int, slice, np.integer, np.ndarray)): data = self._slice_range(item) + index = self.id[item] elif isinstance(item, tuple): if len(item) != 2: raise ValueError( @@ -330,11 +374,15 @@ class DynamicTableMixin(BaseModel): return self._columns[cols][rows] data = self._slice_range(rows, cols) + index = self.id[rows] else: raise ValueError(f"Unsure how to get item with key {item}") # cast to DF - return pd.DataFrame(data) + if not isinstance(index, Iterable): + index = [index] + index = pd.Index(data=index) + return pd.DataFrame(data, index=index) def _slice_range( self, rows: Union[int, slice, np.ndarray], cols: Optional[Union[str, List[str]]] = None @@ -346,31 +394,40 @@ class DynamicTableMixin(BaseModel): data = {} for k in cols: if isinstance(rows, np.ndarray): + # help wanted - this is probably cr*zy slow val = [self._columns[k][i] for i in rows] else: val = self._columns[k][rows] # scalars need to be wrapped in series for pandas + # do this by the iterability of the rows index not the value because + # we want all lengths from this method to be equal, and if the rows are + # scalar, that means length == 1 if not isinstance(rows, (Iterable, slice)): - val = pd.Series([val]) + val = [val] data[k] = val return data def __setitem__(self, key: str, value: Any) -> None: - raise NotImplementedError("TODO") + raise NotImplementedError("TODO") # pragma: no cover def __setattr__(self, key: str, value: Union[list, "NDArray", "VectorData"]): """ Add a column, appending it to ``colnames`` """ # don't use this while building the model - if not getattr(self, "__pydantic_complete__", False): + if not getattr(self, "__pydantic_complete__", False): # pragma: no cover return super().__setattr__(key, value) if key not in self.model_fields_set and not key.endswith("_index"): self.colnames.append(key) + # we get a recursion error if we setattr without having first added to + # extras if we need it to be there + if key not in self.model_fields and key not in self.__pydantic_extra__: + self.__pydantic_extra__[key] = value + return super().__setattr__(key, value) def __getattr__(self, item: str) -> Any: @@ -397,6 +454,8 @@ class DynamicTableMixin(BaseModel): """ Create ID column if not provided """ + if not isinstance(model, dict): + return model if "id" not in model: lengths = [] for key, val in model.items(): @@ -421,6 +480,8 @@ class DynamicTableMixin(BaseModel): the model dict is ordered after python3.6, so we can use that minus anything in :attr:`.NON_COLUMN_FIELDS` to determine order implied from passage order """ + if not isinstance(model, dict): + return model if "colnames" not in model: colnames = [ k @@ -456,19 +517,21 @@ class DynamicTableMixin(BaseModel): See :meth:`.cast_specified_columns` for handling columns in the class specification """ # if columns are not in the specification, cast to a generic VectorData - for key, val in model.items(): - if key in cls.model_fields: - continue - if not isinstance(val, (VectorData, VectorIndex)): - try: - if key.endswith("_index"): - model[key] = VectorIndex(name=key, description="", value=val) - else: - model[key] = VectorData(name=key, description="", value=val) - except ValidationError as e: - raise ValidationError( - f"field {key} cannot be cast to VectorData from {val}" - ) from e + + if isinstance(model, dict): + for key, val in model.items(): + if key in cls.model_fields: + continue + if not isinstance(val, (VectorData, VectorIndex)): + try: + if key.endswith("_index"): + model[key] = VectorIndex(name=key, description="", value=val) + else: + model[key] = VectorData(name=key, description="", value=val) + except ValidationError as e: # pragma: no cover + raise ValidationError( + f"field {key} cannot be cast to VectorData from {val}" + ) from e return model @model_validator(mode="after") @@ -500,8 +563,8 @@ class DynamicTableMixin(BaseModel): """ Ensure that all columns are equal length """ - lengths = [len(v) for v in self._columns.values()] - assert [length == lengths[0] for length in lengths], ( + lengths = [len(v) for v in self._columns.values()] + [len(self.id)] + assert all([length == lengths[0] for length in lengths]), ( "Columns are not of equal length! " f"Got colnames:\n{self.colnames}\nand lengths: {lengths}" ) @@ -537,15 +600,19 @@ class DynamicTableMixin(BaseModel): ) ) except Exception: - raise e + raise e from None -class AlignedDynamicTableMixin(DynamicTableMixin): +class AlignedDynamicTableMixin(BaseModel): """ Mixin to allow indexing multiple tables that are aligned on a common ID + + A great deal of code duplication because we need to avoid diamond inheritance + and also it's not so easy to copy a pydantic validator method. """ - __pydantic_extra__: Dict[str, "DynamicTableMixin"] + model_config = ConfigDict(extra="allow", validate_assignment=True) + __pydantic_extra__: Dict[str, Union["DynamicTableMixin", "VectorDataMixin", "VectorIndexMixin"]] NON_CATEGORY_FIELDS: ClassVar[tuple[str]] = ( "name", @@ -563,7 +630,7 @@ class AlignedDynamicTableMixin(DynamicTableMixin): return {k: getattr(self, k) for i, k in enumerate(self.categories)} def __getitem__( - self, item: Union[int, str, slice, Tuple[Union[int, slice], str]] + self, item: Union[int, str, slice, NDArray[Shape["*"], int], Tuple[Union[int, slice], str]] ) -> pd.DataFrame: """ Mimic hdmf: @@ -581,25 +648,78 @@ class AlignedDynamicTableMixin(DynamicTableMixin): elif isinstance(item, tuple) and len(item) == 2 and isinstance(item[1], str): # get a slice of a single table return self._categories[item[1]][item[0]] - elif isinstance(item, (int, slice)): + elif isinstance(item, (int, slice, Iterable)): # get a slice of all the tables ids = self.id[item] if not isinstance(ids, Iterable): ids = pd.Series([ids]) ids = pd.DataFrame({"id": ids}) - tables = [ids] + [table[item].reset_index() for table in self._categories.values()] + tables = [ids] + for category_name, category in self._categories.items(): + table = category[item] + if isinstance(table, pd.DataFrame): + table = table.reset_index() + elif isinstance(table, np.ndarray): + table = pd.DataFrame({category_name: [table]}) + elif isinstance(table, Iterable): + table = pd.DataFrame({category_name: table}) + else: + raise ValueError( + f"Don't know how to construct category table for {category_name}" + ) + tables.append(table) + names = [self.name] + self.categories # construct below in case we need to support array indexing in the future else: raise ValueError( f"Dont know how to index with {item}, " - "need an int, string, slice, or tuple[int | slice, str]" + "need an int, string, slice, ndarray, or tuple[int | slice, str]" ) df = pd.concat(tables, axis=1, keys=names) df.set_index((self.name, "id"), drop=True, inplace=True) return df + def __getattr__(self, item: str) -> Any: + """Try and use pandas df attrs if we don't have them""" + try: + return BaseModel.__getattr__(self, item) + except AttributeError as e: + try: + return getattr(self[:], item) + except AttributeError: + raise e from None + + def __len__(self) -> int: + """ + Use the id column to determine length. + + If the id column doesn't represent length accurately, it's a bug + """ + return len(self.id) + + @model_validator(mode="before") + @classmethod + def create_id(cls, model: Dict[str, Any]) -> Dict: + """ + Create ID column if not provided + """ + if "id" not in model: + lengths = [] + for key, val in model.items(): + # don't get lengths of columns with an index + if ( + f"{key}_index" in model + or (isinstance(val, VectorData) and val._index) + or key in cls.NON_CATEGORY_FIELDS + ): + continue + lengths.append(len(val)) + model["id"] = np.arange(np.max(lengths)) + + return model + @model_validator(mode="before") @classmethod def create_categories(cls, model: Dict[str, Any]) -> Dict: @@ -626,6 +746,42 @@ class AlignedDynamicTableMixin(DynamicTableMixin): model["categories"].extend(categories) return model + @model_validator(mode="after") + def resolve_targets(self) -> "DynamicTableMixin": + """ + Ensure that any implicitly indexed columns are linked, and create backlinks + """ + for key, col in self._categories.items(): + if isinstance(col, VectorData): + # find an index + idx = None + for field_name in self.model_fields_set: + if field_name in self.NON_CATEGORY_FIELDS or field_name == key: + continue + # implicit name-based index + field = getattr(self, field_name) + if isinstance(field, VectorIndex) and ( + field_name == f"{key}_index" or field.target is col + ): + idx = field + break + if idx is not None: + col._index = idx + idx.target = col + return self + + @model_validator(mode="after") + def ensure_equal_length_cols(self) -> "DynamicTableMixin": + """ + Ensure that all columns are equal length + """ + lengths = [len(v) for v in self._categories.values()] + [len(self.id)] + assert all([length == lengths[0] for length in lengths]), ( + "Columns are not of equal length! " + f"Got colnames:\n{self.categories}\nand lengths: {lengths}" + ) + return self + linkml_meta = LinkMLMeta( { diff --git a/nwb_linkml/src/nwb_linkml/models/pydantic/hdmf_common/v1_4_0/hdmf_common_table.py b/nwb_linkml/src/nwb_linkml/models/pydantic/hdmf_common/v1_4_0/hdmf_common_table.py index f79660e..d37f163 100644 --- a/nwb_linkml/src/nwb_linkml/models/pydantic/hdmf_common/v1_4_0/hdmf_common_table.py +++ b/nwb_linkml/src/nwb_linkml/models/pydantic/hdmf_common/v1_4_0/hdmf_common_table.py @@ -149,31 +149,62 @@ class VectorIndexMixin(BaseModel, Generic[T]): kwargs["value"] = value super().__init__(**kwargs) - def _getitem_helper(self, arg: int) -> Union[list, NDArray]: + def _slice(self, arg: int) -> slice: """ Mimicking :func:`hdmf.common.table.VectorIndex.__getitem_helper` """ start = 0 if arg == 0 else self.value[arg - 1] end = self.value[arg] - return self.target.value[slice(start, end)] + return slice(start, end) def __getitem__(self, item: Union[int, slice, Iterable]) -> Any: if self.target is None: return self.value[item] else: if isinstance(item, (int, np.integer)): - return self._getitem_helper(item) + return self.target.value[self._slice(item)] elif isinstance(item, (slice, Iterable)): if isinstance(item, slice): item = range(*item.indices(len(self.value))) - return [self._getitem_helper(i) for i in item] - else: + return [self.target.value[self._slice(i)] for i in item] + else: # pragma: no cover raise AttributeError(f"Could not index with {item}") def __setitem__(self, key: Union[int, slice], value: Any) -> None: - if self._index: - # VectorIndex is the thing that knows how to do the slicing - self._index[key] = value + """ + Set a value on the :attr:`.target` . + + .. note:: + + Even though we correct the indexing logic from HDMF where the + _data_ is the thing that is provided by the API when one accesses + table.data (rather than table.data_index as hdmf does), + we will set to the target here (rather than to the index) + to be consistent. To modify the index, modify `self.value` directly + + """ + if self.target: + if isinstance(key, (int, np.integer)): + self.target.value[self._slice(key)] = value + elif isinstance(key, (slice, Iterable)): + if isinstance(key, slice): + key = range(*key.indices(len(self.value))) + + if isinstance(value, Iterable): + if len(key) != len(value): + raise ValueError( + "Can only assign equal-length iterable to a slice, manually index the" + " ragged values of of the target VectorData object if you need more" + " control" + ) + for i, subval in zip(key, value): + self.target.value[self._slice(i)] = subval + else: + for i in key: + self.target.value[self._slice(i)] = value + else: # pragma: no cover + raise AttributeError(f"Could not index with {key}") + else: self.value[key] = value @@ -204,9 +235,17 @@ class DynamicTableRegionMixin(BaseModel): _index: Optional["VectorIndex"] = None table: "DynamicTableMixin" - value: Optional[NDArray] = None + value: Optional[NDArray[Shape["*"], int]] = None - def __getitem__(self, item: Union[int, slice, Iterable]) -> Any: + @overload + def __getitem__(self, item: int) -> pd.DataFrame: ... + + @overload + def __getitem__(self, item: Union[slice, Iterable]) -> List[pd.DataFrame]: ... + + def __getitem__( + self, item: Union[int, slice, Iterable] + ) -> Union[pd.DataFrame, List[pd.DataFrame]]: """ Use ``value`` to index the table. Works analogously to ``VectorIndex`` despite this being a subclass of ``VectorData`` @@ -221,20 +260,27 @@ class DynamicTableRegionMixin(BaseModel): # so we index table with an array to construct # a list of lists of rows return [self.table[idx] for idx in self._index[item]] - else: + else: # pragma: no cover raise ValueError(f"Dont know how to index with {item}, need an int or a slice") else: if isinstance(item, (int, np.integer)): return self.table[self.value[item]] elif isinstance(item, (slice, Iterable)): + # Return a list of dataframe rows because this is most often used + # as a column in a DynamicTable, so while it would normally be + # ideal to just return the slice as above as a single df, + # we need each row to be separate to fill the column if isinstance(item, slice): item = range(*item.indices(len(self.value))) return [self.table[self.value[i]] for i in item] - else: + else: # pragma: no cover raise ValueError(f"Dont know how to index with {item}, need an int or a slice") def __setitem__(self, key: Union[int, str, slice], value: Any) -> None: - self.table[self.value[key]] = value + # self.table[self.value[key]] = value + raise NotImplementedError( + "Assigning values to tables is not implemented yet!" + ) # pragma: no cover class DynamicTableMixin(BaseModel): @@ -245,9 +291,10 @@ class DynamicTableMixin(BaseModel): but simplifying along the way :) """ - model_config = ConfigDict(extra="allow") + model_config = ConfigDict(extra="allow", validate_assignment=True) __pydantic_extra__: Dict[str, Union["VectorDataMixin", "VectorIndexMixin", "NDArray", list]] NON_COLUMN_FIELDS: ClassVar[tuple[str]] = ( + "id", "name", "colnames", "description", @@ -261,10 +308,6 @@ class DynamicTableMixin(BaseModel): def _columns(self) -> Dict[str, Union[list, "NDArray", "VectorDataMixin"]]: return {k: getattr(self, k) for i, k in enumerate(self.colnames)} - @property - def _columns_list(self) -> List[Union[list, "NDArray", "VectorDataMixin"]]: - return [getattr(self, k) for i, k in enumerate(self.colnames)] - @overload def __getitem__(self, item: str) -> Union[list, "NDArray", "VectorDataMixin"]: ... @@ -313,6 +356,7 @@ class DynamicTableMixin(BaseModel): return self._columns[item] if isinstance(item, (int, slice, np.integer, np.ndarray)): data = self._slice_range(item) + index = self.id[item] elif isinstance(item, tuple): if len(item) != 2: raise ValueError( @@ -330,11 +374,15 @@ class DynamicTableMixin(BaseModel): return self._columns[cols][rows] data = self._slice_range(rows, cols) + index = self.id[rows] else: raise ValueError(f"Unsure how to get item with key {item}") # cast to DF - return pd.DataFrame(data) + if not isinstance(index, Iterable): + index = [index] + index = pd.Index(data=index) + return pd.DataFrame(data, index=index) def _slice_range( self, rows: Union[int, slice, np.ndarray], cols: Optional[Union[str, List[str]]] = None @@ -346,31 +394,40 @@ class DynamicTableMixin(BaseModel): data = {} for k in cols: if isinstance(rows, np.ndarray): + # help wanted - this is probably cr*zy slow val = [self._columns[k][i] for i in rows] else: val = self._columns[k][rows] # scalars need to be wrapped in series for pandas + # do this by the iterability of the rows index not the value because + # we want all lengths from this method to be equal, and if the rows are + # scalar, that means length == 1 if not isinstance(rows, (Iterable, slice)): - val = pd.Series([val]) + val = [val] data[k] = val return data def __setitem__(self, key: str, value: Any) -> None: - raise NotImplementedError("TODO") + raise NotImplementedError("TODO") # pragma: no cover def __setattr__(self, key: str, value: Union[list, "NDArray", "VectorData"]): """ Add a column, appending it to ``colnames`` """ # don't use this while building the model - if not getattr(self, "__pydantic_complete__", False): + if not getattr(self, "__pydantic_complete__", False): # pragma: no cover return super().__setattr__(key, value) if key not in self.model_fields_set and not key.endswith("_index"): self.colnames.append(key) + # we get a recursion error if we setattr without having first added to + # extras if we need it to be there + if key not in self.model_fields and key not in self.__pydantic_extra__: + self.__pydantic_extra__[key] = value + return super().__setattr__(key, value) def __getattr__(self, item: str) -> Any: @@ -397,6 +454,8 @@ class DynamicTableMixin(BaseModel): """ Create ID column if not provided """ + if not isinstance(model, dict): + return model if "id" not in model: lengths = [] for key, val in model.items(): @@ -421,6 +480,8 @@ class DynamicTableMixin(BaseModel): the model dict is ordered after python3.6, so we can use that minus anything in :attr:`.NON_COLUMN_FIELDS` to determine order implied from passage order """ + if not isinstance(model, dict): + return model if "colnames" not in model: colnames = [ k @@ -456,19 +517,21 @@ class DynamicTableMixin(BaseModel): See :meth:`.cast_specified_columns` for handling columns in the class specification """ # if columns are not in the specification, cast to a generic VectorData - for key, val in model.items(): - if key in cls.model_fields: - continue - if not isinstance(val, (VectorData, VectorIndex)): - try: - if key.endswith("_index"): - model[key] = VectorIndex(name=key, description="", value=val) - else: - model[key] = VectorData(name=key, description="", value=val) - except ValidationError as e: - raise ValidationError( - f"field {key} cannot be cast to VectorData from {val}" - ) from e + + if isinstance(model, dict): + for key, val in model.items(): + if key in cls.model_fields: + continue + if not isinstance(val, (VectorData, VectorIndex)): + try: + if key.endswith("_index"): + model[key] = VectorIndex(name=key, description="", value=val) + else: + model[key] = VectorData(name=key, description="", value=val) + except ValidationError as e: # pragma: no cover + raise ValidationError( + f"field {key} cannot be cast to VectorData from {val}" + ) from e return model @model_validator(mode="after") @@ -500,8 +563,8 @@ class DynamicTableMixin(BaseModel): """ Ensure that all columns are equal length """ - lengths = [len(v) for v in self._columns.values()] - assert [length == lengths[0] for length in lengths], ( + lengths = [len(v) for v in self._columns.values()] + [len(self.id)] + assert all([length == lengths[0] for length in lengths]), ( "Columns are not of equal length! " f"Got colnames:\n{self.colnames}\nand lengths: {lengths}" ) @@ -537,15 +600,19 @@ class DynamicTableMixin(BaseModel): ) ) except Exception: - raise e + raise e from None -class AlignedDynamicTableMixin(DynamicTableMixin): +class AlignedDynamicTableMixin(BaseModel): """ Mixin to allow indexing multiple tables that are aligned on a common ID + + A great deal of code duplication because we need to avoid diamond inheritance + and also it's not so easy to copy a pydantic validator method. """ - __pydantic_extra__: Dict[str, "DynamicTableMixin"] + model_config = ConfigDict(extra="allow", validate_assignment=True) + __pydantic_extra__: Dict[str, Union["DynamicTableMixin", "VectorDataMixin", "VectorIndexMixin"]] NON_CATEGORY_FIELDS: ClassVar[tuple[str]] = ( "name", @@ -563,7 +630,7 @@ class AlignedDynamicTableMixin(DynamicTableMixin): return {k: getattr(self, k) for i, k in enumerate(self.categories)} def __getitem__( - self, item: Union[int, str, slice, Tuple[Union[int, slice], str]] + self, item: Union[int, str, slice, NDArray[Shape["*"], int], Tuple[Union[int, slice], str]] ) -> pd.DataFrame: """ Mimic hdmf: @@ -581,25 +648,78 @@ class AlignedDynamicTableMixin(DynamicTableMixin): elif isinstance(item, tuple) and len(item) == 2 and isinstance(item[1], str): # get a slice of a single table return self._categories[item[1]][item[0]] - elif isinstance(item, (int, slice)): + elif isinstance(item, (int, slice, Iterable)): # get a slice of all the tables ids = self.id[item] if not isinstance(ids, Iterable): ids = pd.Series([ids]) ids = pd.DataFrame({"id": ids}) - tables = [ids] + [table[item].reset_index() for table in self._categories.values()] + tables = [ids] + for category_name, category in self._categories.items(): + table = category[item] + if isinstance(table, pd.DataFrame): + table = table.reset_index() + elif isinstance(table, np.ndarray): + table = pd.DataFrame({category_name: [table]}) + elif isinstance(table, Iterable): + table = pd.DataFrame({category_name: table}) + else: + raise ValueError( + f"Don't know how to construct category table for {category_name}" + ) + tables.append(table) + names = [self.name] + self.categories # construct below in case we need to support array indexing in the future else: raise ValueError( f"Dont know how to index with {item}, " - "need an int, string, slice, or tuple[int | slice, str]" + "need an int, string, slice, ndarray, or tuple[int | slice, str]" ) df = pd.concat(tables, axis=1, keys=names) df.set_index((self.name, "id"), drop=True, inplace=True) return df + def __getattr__(self, item: str) -> Any: + """Try and use pandas df attrs if we don't have them""" + try: + return BaseModel.__getattr__(self, item) + except AttributeError as e: + try: + return getattr(self[:], item) + except AttributeError: + raise e from None + + def __len__(self) -> int: + """ + Use the id column to determine length. + + If the id column doesn't represent length accurately, it's a bug + """ + return len(self.id) + + @model_validator(mode="before") + @classmethod + def create_id(cls, model: Dict[str, Any]) -> Dict: + """ + Create ID column if not provided + """ + if "id" not in model: + lengths = [] + for key, val in model.items(): + # don't get lengths of columns with an index + if ( + f"{key}_index" in model + or (isinstance(val, VectorData) and val._index) + or key in cls.NON_CATEGORY_FIELDS + ): + continue + lengths.append(len(val)) + model["id"] = np.arange(np.max(lengths)) + + return model + @model_validator(mode="before") @classmethod def create_categories(cls, model: Dict[str, Any]) -> Dict: @@ -626,6 +746,42 @@ class AlignedDynamicTableMixin(DynamicTableMixin): model["categories"].extend(categories) return model + @model_validator(mode="after") + def resolve_targets(self) -> "DynamicTableMixin": + """ + Ensure that any implicitly indexed columns are linked, and create backlinks + """ + for key, col in self._categories.items(): + if isinstance(col, VectorData): + # find an index + idx = None + for field_name in self.model_fields_set: + if field_name in self.NON_CATEGORY_FIELDS or field_name == key: + continue + # implicit name-based index + field = getattr(self, field_name) + if isinstance(field, VectorIndex) and ( + field_name == f"{key}_index" or field.target is col + ): + idx = field + break + if idx is not None: + col._index = idx + idx.target = col + return self + + @model_validator(mode="after") + def ensure_equal_length_cols(self) -> "DynamicTableMixin": + """ + Ensure that all columns are equal length + """ + lengths = [len(v) for v in self._categories.values()] + [len(self.id)] + assert all([length == lengths[0] for length in lengths]), ( + "Columns are not of equal length! " + f"Got colnames:\n{self.categories}\nand lengths: {lengths}" + ) + return self + linkml_meta = LinkMLMeta( { diff --git a/nwb_linkml/src/nwb_linkml/models/pydantic/hdmf_common/v1_5_0/hdmf_common_table.py b/nwb_linkml/src/nwb_linkml/models/pydantic/hdmf_common/v1_5_0/hdmf_common_table.py index 6f6d0b0..c53154c 100644 --- a/nwb_linkml/src/nwb_linkml/models/pydantic/hdmf_common/v1_5_0/hdmf_common_table.py +++ b/nwb_linkml/src/nwb_linkml/models/pydantic/hdmf_common/v1_5_0/hdmf_common_table.py @@ -149,31 +149,62 @@ class VectorIndexMixin(BaseModel, Generic[T]): kwargs["value"] = value super().__init__(**kwargs) - def _getitem_helper(self, arg: int) -> Union[list, NDArray]: + def _slice(self, arg: int) -> slice: """ Mimicking :func:`hdmf.common.table.VectorIndex.__getitem_helper` """ start = 0 if arg == 0 else self.value[arg - 1] end = self.value[arg] - return self.target.value[slice(start, end)] + return slice(start, end) def __getitem__(self, item: Union[int, slice, Iterable]) -> Any: if self.target is None: return self.value[item] else: if isinstance(item, (int, np.integer)): - return self._getitem_helper(item) + return self.target.value[self._slice(item)] elif isinstance(item, (slice, Iterable)): if isinstance(item, slice): item = range(*item.indices(len(self.value))) - return [self._getitem_helper(i) for i in item] - else: + return [self.target.value[self._slice(i)] for i in item] + else: # pragma: no cover raise AttributeError(f"Could not index with {item}") def __setitem__(self, key: Union[int, slice], value: Any) -> None: - if self._index: - # VectorIndex is the thing that knows how to do the slicing - self._index[key] = value + """ + Set a value on the :attr:`.target` . + + .. note:: + + Even though we correct the indexing logic from HDMF where the + _data_ is the thing that is provided by the API when one accesses + table.data (rather than table.data_index as hdmf does), + we will set to the target here (rather than to the index) + to be consistent. To modify the index, modify `self.value` directly + + """ + if self.target: + if isinstance(key, (int, np.integer)): + self.target.value[self._slice(key)] = value + elif isinstance(key, (slice, Iterable)): + if isinstance(key, slice): + key = range(*key.indices(len(self.value))) + + if isinstance(value, Iterable): + if len(key) != len(value): + raise ValueError( + "Can only assign equal-length iterable to a slice, manually index the" + " ragged values of of the target VectorData object if you need more" + " control" + ) + for i, subval in zip(key, value): + self.target.value[self._slice(i)] = subval + else: + for i in key: + self.target.value[self._slice(i)] = value + else: # pragma: no cover + raise AttributeError(f"Could not index with {key}") + else: self.value[key] = value @@ -204,9 +235,17 @@ class DynamicTableRegionMixin(BaseModel): _index: Optional["VectorIndex"] = None table: "DynamicTableMixin" - value: Optional[NDArray] = None + value: Optional[NDArray[Shape["*"], int]] = None - def __getitem__(self, item: Union[int, slice, Iterable]) -> Any: + @overload + def __getitem__(self, item: int) -> pd.DataFrame: ... + + @overload + def __getitem__(self, item: Union[slice, Iterable]) -> List[pd.DataFrame]: ... + + def __getitem__( + self, item: Union[int, slice, Iterable] + ) -> Union[pd.DataFrame, List[pd.DataFrame]]: """ Use ``value`` to index the table. Works analogously to ``VectorIndex`` despite this being a subclass of ``VectorData`` @@ -221,20 +260,27 @@ class DynamicTableRegionMixin(BaseModel): # so we index table with an array to construct # a list of lists of rows return [self.table[idx] for idx in self._index[item]] - else: + else: # pragma: no cover raise ValueError(f"Dont know how to index with {item}, need an int or a slice") else: if isinstance(item, (int, np.integer)): return self.table[self.value[item]] elif isinstance(item, (slice, Iterable)): + # Return a list of dataframe rows because this is most often used + # as a column in a DynamicTable, so while it would normally be + # ideal to just return the slice as above as a single df, + # we need each row to be separate to fill the column if isinstance(item, slice): item = range(*item.indices(len(self.value))) return [self.table[self.value[i]] for i in item] - else: + else: # pragma: no cover raise ValueError(f"Dont know how to index with {item}, need an int or a slice") def __setitem__(self, key: Union[int, str, slice], value: Any) -> None: - self.table[self.value[key]] = value + # self.table[self.value[key]] = value + raise NotImplementedError( + "Assigning values to tables is not implemented yet!" + ) # pragma: no cover class DynamicTableMixin(BaseModel): @@ -245,9 +291,10 @@ class DynamicTableMixin(BaseModel): but simplifying along the way :) """ - model_config = ConfigDict(extra="allow") + model_config = ConfigDict(extra="allow", validate_assignment=True) __pydantic_extra__: Dict[str, Union["VectorDataMixin", "VectorIndexMixin", "NDArray", list]] NON_COLUMN_FIELDS: ClassVar[tuple[str]] = ( + "id", "name", "colnames", "description", @@ -261,10 +308,6 @@ class DynamicTableMixin(BaseModel): def _columns(self) -> Dict[str, Union[list, "NDArray", "VectorDataMixin"]]: return {k: getattr(self, k) for i, k in enumerate(self.colnames)} - @property - def _columns_list(self) -> List[Union[list, "NDArray", "VectorDataMixin"]]: - return [getattr(self, k) for i, k in enumerate(self.colnames)] - @overload def __getitem__(self, item: str) -> Union[list, "NDArray", "VectorDataMixin"]: ... @@ -313,6 +356,7 @@ class DynamicTableMixin(BaseModel): return self._columns[item] if isinstance(item, (int, slice, np.integer, np.ndarray)): data = self._slice_range(item) + index = self.id[item] elif isinstance(item, tuple): if len(item) != 2: raise ValueError( @@ -330,11 +374,15 @@ class DynamicTableMixin(BaseModel): return self._columns[cols][rows] data = self._slice_range(rows, cols) + index = self.id[rows] else: raise ValueError(f"Unsure how to get item with key {item}") # cast to DF - return pd.DataFrame(data) + if not isinstance(index, Iterable): + index = [index] + index = pd.Index(data=index) + return pd.DataFrame(data, index=index) def _slice_range( self, rows: Union[int, slice, np.ndarray], cols: Optional[Union[str, List[str]]] = None @@ -346,31 +394,40 @@ class DynamicTableMixin(BaseModel): data = {} for k in cols: if isinstance(rows, np.ndarray): + # help wanted - this is probably cr*zy slow val = [self._columns[k][i] for i in rows] else: val = self._columns[k][rows] # scalars need to be wrapped in series for pandas + # do this by the iterability of the rows index not the value because + # we want all lengths from this method to be equal, and if the rows are + # scalar, that means length == 1 if not isinstance(rows, (Iterable, slice)): - val = pd.Series([val]) + val = [val] data[k] = val return data def __setitem__(self, key: str, value: Any) -> None: - raise NotImplementedError("TODO") + raise NotImplementedError("TODO") # pragma: no cover def __setattr__(self, key: str, value: Union[list, "NDArray", "VectorData"]): """ Add a column, appending it to ``colnames`` """ # don't use this while building the model - if not getattr(self, "__pydantic_complete__", False): + if not getattr(self, "__pydantic_complete__", False): # pragma: no cover return super().__setattr__(key, value) if key not in self.model_fields_set and not key.endswith("_index"): self.colnames.append(key) + # we get a recursion error if we setattr without having first added to + # extras if we need it to be there + if key not in self.model_fields and key not in self.__pydantic_extra__: + self.__pydantic_extra__[key] = value + return super().__setattr__(key, value) def __getattr__(self, item: str) -> Any: @@ -397,6 +454,8 @@ class DynamicTableMixin(BaseModel): """ Create ID column if not provided """ + if not isinstance(model, dict): + return model if "id" not in model: lengths = [] for key, val in model.items(): @@ -421,6 +480,8 @@ class DynamicTableMixin(BaseModel): the model dict is ordered after python3.6, so we can use that minus anything in :attr:`.NON_COLUMN_FIELDS` to determine order implied from passage order """ + if not isinstance(model, dict): + return model if "colnames" not in model: colnames = [ k @@ -456,19 +517,21 @@ class DynamicTableMixin(BaseModel): See :meth:`.cast_specified_columns` for handling columns in the class specification """ # if columns are not in the specification, cast to a generic VectorData - for key, val in model.items(): - if key in cls.model_fields: - continue - if not isinstance(val, (VectorData, VectorIndex)): - try: - if key.endswith("_index"): - model[key] = VectorIndex(name=key, description="", value=val) - else: - model[key] = VectorData(name=key, description="", value=val) - except ValidationError as e: - raise ValidationError( - f"field {key} cannot be cast to VectorData from {val}" - ) from e + + if isinstance(model, dict): + for key, val in model.items(): + if key in cls.model_fields: + continue + if not isinstance(val, (VectorData, VectorIndex)): + try: + if key.endswith("_index"): + model[key] = VectorIndex(name=key, description="", value=val) + else: + model[key] = VectorData(name=key, description="", value=val) + except ValidationError as e: # pragma: no cover + raise ValidationError( + f"field {key} cannot be cast to VectorData from {val}" + ) from e return model @model_validator(mode="after") @@ -500,8 +563,8 @@ class DynamicTableMixin(BaseModel): """ Ensure that all columns are equal length """ - lengths = [len(v) for v in self._columns.values()] - assert [length == lengths[0] for length in lengths], ( + lengths = [len(v) for v in self._columns.values()] + [len(self.id)] + assert all([length == lengths[0] for length in lengths]), ( "Columns are not of equal length! " f"Got colnames:\n{self.colnames}\nand lengths: {lengths}" ) @@ -537,15 +600,19 @@ class DynamicTableMixin(BaseModel): ) ) except Exception: - raise e + raise e from None -class AlignedDynamicTableMixin(DynamicTableMixin): +class AlignedDynamicTableMixin(BaseModel): """ Mixin to allow indexing multiple tables that are aligned on a common ID + + A great deal of code duplication because we need to avoid diamond inheritance + and also it's not so easy to copy a pydantic validator method. """ - __pydantic_extra__: Dict[str, "DynamicTableMixin"] + model_config = ConfigDict(extra="allow", validate_assignment=True) + __pydantic_extra__: Dict[str, Union["DynamicTableMixin", "VectorDataMixin", "VectorIndexMixin"]] NON_CATEGORY_FIELDS: ClassVar[tuple[str]] = ( "name", @@ -563,7 +630,7 @@ class AlignedDynamicTableMixin(DynamicTableMixin): return {k: getattr(self, k) for i, k in enumerate(self.categories)} def __getitem__( - self, item: Union[int, str, slice, Tuple[Union[int, slice], str]] + self, item: Union[int, str, slice, NDArray[Shape["*"], int], Tuple[Union[int, slice], str]] ) -> pd.DataFrame: """ Mimic hdmf: @@ -581,25 +648,78 @@ class AlignedDynamicTableMixin(DynamicTableMixin): elif isinstance(item, tuple) and len(item) == 2 and isinstance(item[1], str): # get a slice of a single table return self._categories[item[1]][item[0]] - elif isinstance(item, (int, slice)): + elif isinstance(item, (int, slice, Iterable)): # get a slice of all the tables ids = self.id[item] if not isinstance(ids, Iterable): ids = pd.Series([ids]) ids = pd.DataFrame({"id": ids}) - tables = [ids] + [table[item].reset_index() for table in self._categories.values()] + tables = [ids] + for category_name, category in self._categories.items(): + table = category[item] + if isinstance(table, pd.DataFrame): + table = table.reset_index() + elif isinstance(table, np.ndarray): + table = pd.DataFrame({category_name: [table]}) + elif isinstance(table, Iterable): + table = pd.DataFrame({category_name: table}) + else: + raise ValueError( + f"Don't know how to construct category table for {category_name}" + ) + tables.append(table) + names = [self.name] + self.categories # construct below in case we need to support array indexing in the future else: raise ValueError( f"Dont know how to index with {item}, " - "need an int, string, slice, or tuple[int | slice, str]" + "need an int, string, slice, ndarray, or tuple[int | slice, str]" ) df = pd.concat(tables, axis=1, keys=names) df.set_index((self.name, "id"), drop=True, inplace=True) return df + def __getattr__(self, item: str) -> Any: + """Try and use pandas df attrs if we don't have them""" + try: + return BaseModel.__getattr__(self, item) + except AttributeError as e: + try: + return getattr(self[:], item) + except AttributeError: + raise e from None + + def __len__(self) -> int: + """ + Use the id column to determine length. + + If the id column doesn't represent length accurately, it's a bug + """ + return len(self.id) + + @model_validator(mode="before") + @classmethod + def create_id(cls, model: Dict[str, Any]) -> Dict: + """ + Create ID column if not provided + """ + if "id" not in model: + lengths = [] + for key, val in model.items(): + # don't get lengths of columns with an index + if ( + f"{key}_index" in model + or (isinstance(val, VectorData) and val._index) + or key in cls.NON_CATEGORY_FIELDS + ): + continue + lengths.append(len(val)) + model["id"] = np.arange(np.max(lengths)) + + return model + @model_validator(mode="before") @classmethod def create_categories(cls, model: Dict[str, Any]) -> Dict: @@ -626,6 +746,42 @@ class AlignedDynamicTableMixin(DynamicTableMixin): model["categories"].extend(categories) return model + @model_validator(mode="after") + def resolve_targets(self) -> "DynamicTableMixin": + """ + Ensure that any implicitly indexed columns are linked, and create backlinks + """ + for key, col in self._categories.items(): + if isinstance(col, VectorData): + # find an index + idx = None + for field_name in self.model_fields_set: + if field_name in self.NON_CATEGORY_FIELDS or field_name == key: + continue + # implicit name-based index + field = getattr(self, field_name) + if isinstance(field, VectorIndex) and ( + field_name == f"{key}_index" or field.target is col + ): + idx = field + break + if idx is not None: + col._index = idx + idx.target = col + return self + + @model_validator(mode="after") + def ensure_equal_length_cols(self) -> "DynamicTableMixin": + """ + Ensure that all columns are equal length + """ + lengths = [len(v) for v in self._categories.values()] + [len(self.id)] + assert all([length == lengths[0] for length in lengths]), ( + "Columns are not of equal length! " + f"Got colnames:\n{self.categories}\nand lengths: {lengths}" + ) + return self + linkml_meta = LinkMLMeta( { diff --git a/nwb_linkml/src/nwb_linkml/models/pydantic/hdmf_common/v1_5_1/hdmf_common_table.py b/nwb_linkml/src/nwb_linkml/models/pydantic/hdmf_common/v1_5_1/hdmf_common_table.py index 3590de9..39a4eb3 100644 --- a/nwb_linkml/src/nwb_linkml/models/pydantic/hdmf_common/v1_5_1/hdmf_common_table.py +++ b/nwb_linkml/src/nwb_linkml/models/pydantic/hdmf_common/v1_5_1/hdmf_common_table.py @@ -149,31 +149,62 @@ class VectorIndexMixin(BaseModel, Generic[T]): kwargs["value"] = value super().__init__(**kwargs) - def _getitem_helper(self, arg: int) -> Union[list, NDArray]: + def _slice(self, arg: int) -> slice: """ Mimicking :func:`hdmf.common.table.VectorIndex.__getitem_helper` """ start = 0 if arg == 0 else self.value[arg - 1] end = self.value[arg] - return self.target.value[slice(start, end)] + return slice(start, end) def __getitem__(self, item: Union[int, slice, Iterable]) -> Any: if self.target is None: return self.value[item] else: if isinstance(item, (int, np.integer)): - return self._getitem_helper(item) + return self.target.value[self._slice(item)] elif isinstance(item, (slice, Iterable)): if isinstance(item, slice): item = range(*item.indices(len(self.value))) - return [self._getitem_helper(i) for i in item] - else: + return [self.target.value[self._slice(i)] for i in item] + else: # pragma: no cover raise AttributeError(f"Could not index with {item}") def __setitem__(self, key: Union[int, slice], value: Any) -> None: - if self._index: - # VectorIndex is the thing that knows how to do the slicing - self._index[key] = value + """ + Set a value on the :attr:`.target` . + + .. note:: + + Even though we correct the indexing logic from HDMF where the + _data_ is the thing that is provided by the API when one accesses + table.data (rather than table.data_index as hdmf does), + we will set to the target here (rather than to the index) + to be consistent. To modify the index, modify `self.value` directly + + """ + if self.target: + if isinstance(key, (int, np.integer)): + self.target.value[self._slice(key)] = value + elif isinstance(key, (slice, Iterable)): + if isinstance(key, slice): + key = range(*key.indices(len(self.value))) + + if isinstance(value, Iterable): + if len(key) != len(value): + raise ValueError( + "Can only assign equal-length iterable to a slice, manually index the" + " ragged values of of the target VectorData object if you need more" + " control" + ) + for i, subval in zip(key, value): + self.target.value[self._slice(i)] = subval + else: + for i in key: + self.target.value[self._slice(i)] = value + else: # pragma: no cover + raise AttributeError(f"Could not index with {key}") + else: self.value[key] = value @@ -204,9 +235,17 @@ class DynamicTableRegionMixin(BaseModel): _index: Optional["VectorIndex"] = None table: "DynamicTableMixin" - value: Optional[NDArray] = None + value: Optional[NDArray[Shape["*"], int]] = None - def __getitem__(self, item: Union[int, slice, Iterable]) -> Any: + @overload + def __getitem__(self, item: int) -> pd.DataFrame: ... + + @overload + def __getitem__(self, item: Union[slice, Iterable]) -> List[pd.DataFrame]: ... + + def __getitem__( + self, item: Union[int, slice, Iterable] + ) -> Union[pd.DataFrame, List[pd.DataFrame]]: """ Use ``value`` to index the table. Works analogously to ``VectorIndex`` despite this being a subclass of ``VectorData`` @@ -221,20 +260,27 @@ class DynamicTableRegionMixin(BaseModel): # so we index table with an array to construct # a list of lists of rows return [self.table[idx] for idx in self._index[item]] - else: + else: # pragma: no cover raise ValueError(f"Dont know how to index with {item}, need an int or a slice") else: if isinstance(item, (int, np.integer)): return self.table[self.value[item]] elif isinstance(item, (slice, Iterable)): + # Return a list of dataframe rows because this is most often used + # as a column in a DynamicTable, so while it would normally be + # ideal to just return the slice as above as a single df, + # we need each row to be separate to fill the column if isinstance(item, slice): item = range(*item.indices(len(self.value))) return [self.table[self.value[i]] for i in item] - else: + else: # pragma: no cover raise ValueError(f"Dont know how to index with {item}, need an int or a slice") def __setitem__(self, key: Union[int, str, slice], value: Any) -> None: - self.table[self.value[key]] = value + # self.table[self.value[key]] = value + raise NotImplementedError( + "Assigning values to tables is not implemented yet!" + ) # pragma: no cover class DynamicTableMixin(BaseModel): @@ -245,9 +291,10 @@ class DynamicTableMixin(BaseModel): but simplifying along the way :) """ - model_config = ConfigDict(extra="allow") + model_config = ConfigDict(extra="allow", validate_assignment=True) __pydantic_extra__: Dict[str, Union["VectorDataMixin", "VectorIndexMixin", "NDArray", list]] NON_COLUMN_FIELDS: ClassVar[tuple[str]] = ( + "id", "name", "colnames", "description", @@ -261,10 +308,6 @@ class DynamicTableMixin(BaseModel): def _columns(self) -> Dict[str, Union[list, "NDArray", "VectorDataMixin"]]: return {k: getattr(self, k) for i, k in enumerate(self.colnames)} - @property - def _columns_list(self) -> List[Union[list, "NDArray", "VectorDataMixin"]]: - return [getattr(self, k) for i, k in enumerate(self.colnames)] - @overload def __getitem__(self, item: str) -> Union[list, "NDArray", "VectorDataMixin"]: ... @@ -313,6 +356,7 @@ class DynamicTableMixin(BaseModel): return self._columns[item] if isinstance(item, (int, slice, np.integer, np.ndarray)): data = self._slice_range(item) + index = self.id[item] elif isinstance(item, tuple): if len(item) != 2: raise ValueError( @@ -330,11 +374,15 @@ class DynamicTableMixin(BaseModel): return self._columns[cols][rows] data = self._slice_range(rows, cols) + index = self.id[rows] else: raise ValueError(f"Unsure how to get item with key {item}") # cast to DF - return pd.DataFrame(data) + if not isinstance(index, Iterable): + index = [index] + index = pd.Index(data=index) + return pd.DataFrame(data, index=index) def _slice_range( self, rows: Union[int, slice, np.ndarray], cols: Optional[Union[str, List[str]]] = None @@ -346,31 +394,40 @@ class DynamicTableMixin(BaseModel): data = {} for k in cols: if isinstance(rows, np.ndarray): + # help wanted - this is probably cr*zy slow val = [self._columns[k][i] for i in rows] else: val = self._columns[k][rows] # scalars need to be wrapped in series for pandas + # do this by the iterability of the rows index not the value because + # we want all lengths from this method to be equal, and if the rows are + # scalar, that means length == 1 if not isinstance(rows, (Iterable, slice)): - val = pd.Series([val]) + val = [val] data[k] = val return data def __setitem__(self, key: str, value: Any) -> None: - raise NotImplementedError("TODO") + raise NotImplementedError("TODO") # pragma: no cover def __setattr__(self, key: str, value: Union[list, "NDArray", "VectorData"]): """ Add a column, appending it to ``colnames`` """ # don't use this while building the model - if not getattr(self, "__pydantic_complete__", False): + if not getattr(self, "__pydantic_complete__", False): # pragma: no cover return super().__setattr__(key, value) if key not in self.model_fields_set and not key.endswith("_index"): self.colnames.append(key) + # we get a recursion error if we setattr without having first added to + # extras if we need it to be there + if key not in self.model_fields and key not in self.__pydantic_extra__: + self.__pydantic_extra__[key] = value + return super().__setattr__(key, value) def __getattr__(self, item: str) -> Any: @@ -397,6 +454,8 @@ class DynamicTableMixin(BaseModel): """ Create ID column if not provided """ + if not isinstance(model, dict): + return model if "id" not in model: lengths = [] for key, val in model.items(): @@ -421,6 +480,8 @@ class DynamicTableMixin(BaseModel): the model dict is ordered after python3.6, so we can use that minus anything in :attr:`.NON_COLUMN_FIELDS` to determine order implied from passage order """ + if not isinstance(model, dict): + return model if "colnames" not in model: colnames = [ k @@ -456,19 +517,21 @@ class DynamicTableMixin(BaseModel): See :meth:`.cast_specified_columns` for handling columns in the class specification """ # if columns are not in the specification, cast to a generic VectorData - for key, val in model.items(): - if key in cls.model_fields: - continue - if not isinstance(val, (VectorData, VectorIndex)): - try: - if key.endswith("_index"): - model[key] = VectorIndex(name=key, description="", value=val) - else: - model[key] = VectorData(name=key, description="", value=val) - except ValidationError as e: - raise ValidationError( - f"field {key} cannot be cast to VectorData from {val}" - ) from e + + if isinstance(model, dict): + for key, val in model.items(): + if key in cls.model_fields: + continue + if not isinstance(val, (VectorData, VectorIndex)): + try: + if key.endswith("_index"): + model[key] = VectorIndex(name=key, description="", value=val) + else: + model[key] = VectorData(name=key, description="", value=val) + except ValidationError as e: # pragma: no cover + raise ValidationError( + f"field {key} cannot be cast to VectorData from {val}" + ) from e return model @model_validator(mode="after") @@ -500,8 +563,8 @@ class DynamicTableMixin(BaseModel): """ Ensure that all columns are equal length """ - lengths = [len(v) for v in self._columns.values()] - assert [length == lengths[0] for length in lengths], ( + lengths = [len(v) for v in self._columns.values()] + [len(self.id)] + assert all([length == lengths[0] for length in lengths]), ( "Columns are not of equal length! " f"Got colnames:\n{self.colnames}\nand lengths: {lengths}" ) @@ -537,15 +600,19 @@ class DynamicTableMixin(BaseModel): ) ) except Exception: - raise e + raise e from None -class AlignedDynamicTableMixin(DynamicTableMixin): +class AlignedDynamicTableMixin(BaseModel): """ Mixin to allow indexing multiple tables that are aligned on a common ID + + A great deal of code duplication because we need to avoid diamond inheritance + and also it's not so easy to copy a pydantic validator method. """ - __pydantic_extra__: Dict[str, "DynamicTableMixin"] + model_config = ConfigDict(extra="allow", validate_assignment=True) + __pydantic_extra__: Dict[str, Union["DynamicTableMixin", "VectorDataMixin", "VectorIndexMixin"]] NON_CATEGORY_FIELDS: ClassVar[tuple[str]] = ( "name", @@ -563,7 +630,7 @@ class AlignedDynamicTableMixin(DynamicTableMixin): return {k: getattr(self, k) for i, k in enumerate(self.categories)} def __getitem__( - self, item: Union[int, str, slice, Tuple[Union[int, slice], str]] + self, item: Union[int, str, slice, NDArray[Shape["*"], int], Tuple[Union[int, slice], str]] ) -> pd.DataFrame: """ Mimic hdmf: @@ -581,25 +648,78 @@ class AlignedDynamicTableMixin(DynamicTableMixin): elif isinstance(item, tuple) and len(item) == 2 and isinstance(item[1], str): # get a slice of a single table return self._categories[item[1]][item[0]] - elif isinstance(item, (int, slice)): + elif isinstance(item, (int, slice, Iterable)): # get a slice of all the tables ids = self.id[item] if not isinstance(ids, Iterable): ids = pd.Series([ids]) ids = pd.DataFrame({"id": ids}) - tables = [ids] + [table[item].reset_index() for table in self._categories.values()] + tables = [ids] + for category_name, category in self._categories.items(): + table = category[item] + if isinstance(table, pd.DataFrame): + table = table.reset_index() + elif isinstance(table, np.ndarray): + table = pd.DataFrame({category_name: [table]}) + elif isinstance(table, Iterable): + table = pd.DataFrame({category_name: table}) + else: + raise ValueError( + f"Don't know how to construct category table for {category_name}" + ) + tables.append(table) + names = [self.name] + self.categories # construct below in case we need to support array indexing in the future else: raise ValueError( f"Dont know how to index with {item}, " - "need an int, string, slice, or tuple[int | slice, str]" + "need an int, string, slice, ndarray, or tuple[int | slice, str]" ) df = pd.concat(tables, axis=1, keys=names) df.set_index((self.name, "id"), drop=True, inplace=True) return df + def __getattr__(self, item: str) -> Any: + """Try and use pandas df attrs if we don't have them""" + try: + return BaseModel.__getattr__(self, item) + except AttributeError as e: + try: + return getattr(self[:], item) + except AttributeError: + raise e from None + + def __len__(self) -> int: + """ + Use the id column to determine length. + + If the id column doesn't represent length accurately, it's a bug + """ + return len(self.id) + + @model_validator(mode="before") + @classmethod + def create_id(cls, model: Dict[str, Any]) -> Dict: + """ + Create ID column if not provided + """ + if "id" not in model: + lengths = [] + for key, val in model.items(): + # don't get lengths of columns with an index + if ( + f"{key}_index" in model + or (isinstance(val, VectorData) and val._index) + or key in cls.NON_CATEGORY_FIELDS + ): + continue + lengths.append(len(val)) + model["id"] = np.arange(np.max(lengths)) + + return model + @model_validator(mode="before") @classmethod def create_categories(cls, model: Dict[str, Any]) -> Dict: @@ -626,6 +746,42 @@ class AlignedDynamicTableMixin(DynamicTableMixin): model["categories"].extend(categories) return model + @model_validator(mode="after") + def resolve_targets(self) -> "DynamicTableMixin": + """ + Ensure that any implicitly indexed columns are linked, and create backlinks + """ + for key, col in self._categories.items(): + if isinstance(col, VectorData): + # find an index + idx = None + for field_name in self.model_fields_set: + if field_name in self.NON_CATEGORY_FIELDS or field_name == key: + continue + # implicit name-based index + field = getattr(self, field_name) + if isinstance(field, VectorIndex) and ( + field_name == f"{key}_index" or field.target is col + ): + idx = field + break + if idx is not None: + col._index = idx + idx.target = col + return self + + @model_validator(mode="after") + def ensure_equal_length_cols(self) -> "DynamicTableMixin": + """ + Ensure that all columns are equal length + """ + lengths = [len(v) for v in self._categories.values()] + [len(self.id)] + assert all([length == lengths[0] for length in lengths]), ( + "Columns are not of equal length! " + f"Got colnames:\n{self.categories}\nand lengths: {lengths}" + ) + return self + linkml_meta = LinkMLMeta( { diff --git a/nwb_linkml/src/nwb_linkml/models/pydantic/hdmf_common/v1_6_0/hdmf_common_table.py b/nwb_linkml/src/nwb_linkml/models/pydantic/hdmf_common/v1_6_0/hdmf_common_table.py index 07b20e3..da0bc73 100644 --- a/nwb_linkml/src/nwb_linkml/models/pydantic/hdmf_common/v1_6_0/hdmf_common_table.py +++ b/nwb_linkml/src/nwb_linkml/models/pydantic/hdmf_common/v1_6_0/hdmf_common_table.py @@ -149,31 +149,62 @@ class VectorIndexMixin(BaseModel, Generic[T]): kwargs["value"] = value super().__init__(**kwargs) - def _getitem_helper(self, arg: int) -> Union[list, NDArray]: + def _slice(self, arg: int) -> slice: """ Mimicking :func:`hdmf.common.table.VectorIndex.__getitem_helper` """ start = 0 if arg == 0 else self.value[arg - 1] end = self.value[arg] - return self.target.value[slice(start, end)] + return slice(start, end) def __getitem__(self, item: Union[int, slice, Iterable]) -> Any: if self.target is None: return self.value[item] else: if isinstance(item, (int, np.integer)): - return self._getitem_helper(item) + return self.target.value[self._slice(item)] elif isinstance(item, (slice, Iterable)): if isinstance(item, slice): item = range(*item.indices(len(self.value))) - return [self._getitem_helper(i) for i in item] - else: + return [self.target.value[self._slice(i)] for i in item] + else: # pragma: no cover raise AttributeError(f"Could not index with {item}") def __setitem__(self, key: Union[int, slice], value: Any) -> None: - if self._index: - # VectorIndex is the thing that knows how to do the slicing - self._index[key] = value + """ + Set a value on the :attr:`.target` . + + .. note:: + + Even though we correct the indexing logic from HDMF where the + _data_ is the thing that is provided by the API when one accesses + table.data (rather than table.data_index as hdmf does), + we will set to the target here (rather than to the index) + to be consistent. To modify the index, modify `self.value` directly + + """ + if self.target: + if isinstance(key, (int, np.integer)): + self.target.value[self._slice(key)] = value + elif isinstance(key, (slice, Iterable)): + if isinstance(key, slice): + key = range(*key.indices(len(self.value))) + + if isinstance(value, Iterable): + if len(key) != len(value): + raise ValueError( + "Can only assign equal-length iterable to a slice, manually index the" + " ragged values of of the target VectorData object if you need more" + " control" + ) + for i, subval in zip(key, value): + self.target.value[self._slice(i)] = subval + else: + for i in key: + self.target.value[self._slice(i)] = value + else: # pragma: no cover + raise AttributeError(f"Could not index with {key}") + else: self.value[key] = value @@ -204,9 +235,17 @@ class DynamicTableRegionMixin(BaseModel): _index: Optional["VectorIndex"] = None table: "DynamicTableMixin" - value: Optional[NDArray] = None + value: Optional[NDArray[Shape["*"], int]] = None - def __getitem__(self, item: Union[int, slice, Iterable]) -> Any: + @overload + def __getitem__(self, item: int) -> pd.DataFrame: ... + + @overload + def __getitem__(self, item: Union[slice, Iterable]) -> List[pd.DataFrame]: ... + + def __getitem__( + self, item: Union[int, slice, Iterable] + ) -> Union[pd.DataFrame, List[pd.DataFrame]]: """ Use ``value`` to index the table. Works analogously to ``VectorIndex`` despite this being a subclass of ``VectorData`` @@ -221,20 +260,27 @@ class DynamicTableRegionMixin(BaseModel): # so we index table with an array to construct # a list of lists of rows return [self.table[idx] for idx in self._index[item]] - else: + else: # pragma: no cover raise ValueError(f"Dont know how to index with {item}, need an int or a slice") else: if isinstance(item, (int, np.integer)): return self.table[self.value[item]] elif isinstance(item, (slice, Iterable)): + # Return a list of dataframe rows because this is most often used + # as a column in a DynamicTable, so while it would normally be + # ideal to just return the slice as above as a single df, + # we need each row to be separate to fill the column if isinstance(item, slice): item = range(*item.indices(len(self.value))) return [self.table[self.value[i]] for i in item] - else: + else: # pragma: no cover raise ValueError(f"Dont know how to index with {item}, need an int or a slice") def __setitem__(self, key: Union[int, str, slice], value: Any) -> None: - self.table[self.value[key]] = value + # self.table[self.value[key]] = value + raise NotImplementedError( + "Assigning values to tables is not implemented yet!" + ) # pragma: no cover class DynamicTableMixin(BaseModel): @@ -245,9 +291,10 @@ class DynamicTableMixin(BaseModel): but simplifying along the way :) """ - model_config = ConfigDict(extra="allow") + model_config = ConfigDict(extra="allow", validate_assignment=True) __pydantic_extra__: Dict[str, Union["VectorDataMixin", "VectorIndexMixin", "NDArray", list]] NON_COLUMN_FIELDS: ClassVar[tuple[str]] = ( + "id", "name", "colnames", "description", @@ -261,10 +308,6 @@ class DynamicTableMixin(BaseModel): def _columns(self) -> Dict[str, Union[list, "NDArray", "VectorDataMixin"]]: return {k: getattr(self, k) for i, k in enumerate(self.colnames)} - @property - def _columns_list(self) -> List[Union[list, "NDArray", "VectorDataMixin"]]: - return [getattr(self, k) for i, k in enumerate(self.colnames)] - @overload def __getitem__(self, item: str) -> Union[list, "NDArray", "VectorDataMixin"]: ... @@ -313,6 +356,7 @@ class DynamicTableMixin(BaseModel): return self._columns[item] if isinstance(item, (int, slice, np.integer, np.ndarray)): data = self._slice_range(item) + index = self.id[item] elif isinstance(item, tuple): if len(item) != 2: raise ValueError( @@ -330,11 +374,15 @@ class DynamicTableMixin(BaseModel): return self._columns[cols][rows] data = self._slice_range(rows, cols) + index = self.id[rows] else: raise ValueError(f"Unsure how to get item with key {item}") # cast to DF - return pd.DataFrame(data) + if not isinstance(index, Iterable): + index = [index] + index = pd.Index(data=index) + return pd.DataFrame(data, index=index) def _slice_range( self, rows: Union[int, slice, np.ndarray], cols: Optional[Union[str, List[str]]] = None @@ -346,31 +394,40 @@ class DynamicTableMixin(BaseModel): data = {} for k in cols: if isinstance(rows, np.ndarray): + # help wanted - this is probably cr*zy slow val = [self._columns[k][i] for i in rows] else: val = self._columns[k][rows] # scalars need to be wrapped in series for pandas + # do this by the iterability of the rows index not the value because + # we want all lengths from this method to be equal, and if the rows are + # scalar, that means length == 1 if not isinstance(rows, (Iterable, slice)): - val = pd.Series([val]) + val = [val] data[k] = val return data def __setitem__(self, key: str, value: Any) -> None: - raise NotImplementedError("TODO") + raise NotImplementedError("TODO") # pragma: no cover def __setattr__(self, key: str, value: Union[list, "NDArray", "VectorData"]): """ Add a column, appending it to ``colnames`` """ # don't use this while building the model - if not getattr(self, "__pydantic_complete__", False): + if not getattr(self, "__pydantic_complete__", False): # pragma: no cover return super().__setattr__(key, value) if key not in self.model_fields_set and not key.endswith("_index"): self.colnames.append(key) + # we get a recursion error if we setattr without having first added to + # extras if we need it to be there + if key not in self.model_fields and key not in self.__pydantic_extra__: + self.__pydantic_extra__[key] = value + return super().__setattr__(key, value) def __getattr__(self, item: str) -> Any: @@ -397,6 +454,8 @@ class DynamicTableMixin(BaseModel): """ Create ID column if not provided """ + if not isinstance(model, dict): + return model if "id" not in model: lengths = [] for key, val in model.items(): @@ -421,6 +480,8 @@ class DynamicTableMixin(BaseModel): the model dict is ordered after python3.6, so we can use that minus anything in :attr:`.NON_COLUMN_FIELDS` to determine order implied from passage order """ + if not isinstance(model, dict): + return model if "colnames" not in model: colnames = [ k @@ -456,19 +517,21 @@ class DynamicTableMixin(BaseModel): See :meth:`.cast_specified_columns` for handling columns in the class specification """ # if columns are not in the specification, cast to a generic VectorData - for key, val in model.items(): - if key in cls.model_fields: - continue - if not isinstance(val, (VectorData, VectorIndex)): - try: - if key.endswith("_index"): - model[key] = VectorIndex(name=key, description="", value=val) - else: - model[key] = VectorData(name=key, description="", value=val) - except ValidationError as e: - raise ValidationError( - f"field {key} cannot be cast to VectorData from {val}" - ) from e + + if isinstance(model, dict): + for key, val in model.items(): + if key in cls.model_fields: + continue + if not isinstance(val, (VectorData, VectorIndex)): + try: + if key.endswith("_index"): + model[key] = VectorIndex(name=key, description="", value=val) + else: + model[key] = VectorData(name=key, description="", value=val) + except ValidationError as e: # pragma: no cover + raise ValidationError( + f"field {key} cannot be cast to VectorData from {val}" + ) from e return model @model_validator(mode="after") @@ -500,8 +563,8 @@ class DynamicTableMixin(BaseModel): """ Ensure that all columns are equal length """ - lengths = [len(v) for v in self._columns.values()] - assert [length == lengths[0] for length in lengths], ( + lengths = [len(v) for v in self._columns.values()] + [len(self.id)] + assert all([length == lengths[0] for length in lengths]), ( "Columns are not of equal length! " f"Got colnames:\n{self.colnames}\nand lengths: {lengths}" ) @@ -537,15 +600,19 @@ class DynamicTableMixin(BaseModel): ) ) except Exception: - raise e + raise e from None -class AlignedDynamicTableMixin(DynamicTableMixin): +class AlignedDynamicTableMixin(BaseModel): """ Mixin to allow indexing multiple tables that are aligned on a common ID + + A great deal of code duplication because we need to avoid diamond inheritance + and also it's not so easy to copy a pydantic validator method. """ - __pydantic_extra__: Dict[str, "DynamicTableMixin"] + model_config = ConfigDict(extra="allow", validate_assignment=True) + __pydantic_extra__: Dict[str, Union["DynamicTableMixin", "VectorDataMixin", "VectorIndexMixin"]] NON_CATEGORY_FIELDS: ClassVar[tuple[str]] = ( "name", @@ -563,7 +630,7 @@ class AlignedDynamicTableMixin(DynamicTableMixin): return {k: getattr(self, k) for i, k in enumerate(self.categories)} def __getitem__( - self, item: Union[int, str, slice, Tuple[Union[int, slice], str]] + self, item: Union[int, str, slice, NDArray[Shape["*"], int], Tuple[Union[int, slice], str]] ) -> pd.DataFrame: """ Mimic hdmf: @@ -581,25 +648,78 @@ class AlignedDynamicTableMixin(DynamicTableMixin): elif isinstance(item, tuple) and len(item) == 2 and isinstance(item[1], str): # get a slice of a single table return self._categories[item[1]][item[0]] - elif isinstance(item, (int, slice)): + elif isinstance(item, (int, slice, Iterable)): # get a slice of all the tables ids = self.id[item] if not isinstance(ids, Iterable): ids = pd.Series([ids]) ids = pd.DataFrame({"id": ids}) - tables = [ids] + [table[item].reset_index() for table in self._categories.values()] + tables = [ids] + for category_name, category in self._categories.items(): + table = category[item] + if isinstance(table, pd.DataFrame): + table = table.reset_index() + elif isinstance(table, np.ndarray): + table = pd.DataFrame({category_name: [table]}) + elif isinstance(table, Iterable): + table = pd.DataFrame({category_name: table}) + else: + raise ValueError( + f"Don't know how to construct category table for {category_name}" + ) + tables.append(table) + names = [self.name] + self.categories # construct below in case we need to support array indexing in the future else: raise ValueError( f"Dont know how to index with {item}, " - "need an int, string, slice, or tuple[int | slice, str]" + "need an int, string, slice, ndarray, or tuple[int | slice, str]" ) df = pd.concat(tables, axis=1, keys=names) df.set_index((self.name, "id"), drop=True, inplace=True) return df + def __getattr__(self, item: str) -> Any: + """Try and use pandas df attrs if we don't have them""" + try: + return BaseModel.__getattr__(self, item) + except AttributeError as e: + try: + return getattr(self[:], item) + except AttributeError: + raise e from None + + def __len__(self) -> int: + """ + Use the id column to determine length. + + If the id column doesn't represent length accurately, it's a bug + """ + return len(self.id) + + @model_validator(mode="before") + @classmethod + def create_id(cls, model: Dict[str, Any]) -> Dict: + """ + Create ID column if not provided + """ + if "id" not in model: + lengths = [] + for key, val in model.items(): + # don't get lengths of columns with an index + if ( + f"{key}_index" in model + or (isinstance(val, VectorData) and val._index) + or key in cls.NON_CATEGORY_FIELDS + ): + continue + lengths.append(len(val)) + model["id"] = np.arange(np.max(lengths)) + + return model + @model_validator(mode="before") @classmethod def create_categories(cls, model: Dict[str, Any]) -> Dict: @@ -626,6 +746,42 @@ class AlignedDynamicTableMixin(DynamicTableMixin): model["categories"].extend(categories) return model + @model_validator(mode="after") + def resolve_targets(self) -> "DynamicTableMixin": + """ + Ensure that any implicitly indexed columns are linked, and create backlinks + """ + for key, col in self._categories.items(): + if isinstance(col, VectorData): + # find an index + idx = None + for field_name in self.model_fields_set: + if field_name in self.NON_CATEGORY_FIELDS or field_name == key: + continue + # implicit name-based index + field = getattr(self, field_name) + if isinstance(field, VectorIndex) and ( + field_name == f"{key}_index" or field.target is col + ): + idx = field + break + if idx is not None: + col._index = idx + idx.target = col + return self + + @model_validator(mode="after") + def ensure_equal_length_cols(self) -> "DynamicTableMixin": + """ + Ensure that all columns are equal length + """ + lengths = [len(v) for v in self._categories.values()] + [len(self.id)] + assert all([length == lengths[0] for length in lengths]), ( + "Columns are not of equal length! " + f"Got colnames:\n{self.categories}\nand lengths: {lengths}" + ) + return self + linkml_meta = LinkMLMeta( { diff --git a/nwb_linkml/src/nwb_linkml/models/pydantic/hdmf_common/v1_7_0/hdmf_common_table.py b/nwb_linkml/src/nwb_linkml/models/pydantic/hdmf_common/v1_7_0/hdmf_common_table.py index 5e7a82f..627f80d 100644 --- a/nwb_linkml/src/nwb_linkml/models/pydantic/hdmf_common/v1_7_0/hdmf_common_table.py +++ b/nwb_linkml/src/nwb_linkml/models/pydantic/hdmf_common/v1_7_0/hdmf_common_table.py @@ -149,31 +149,62 @@ class VectorIndexMixin(BaseModel, Generic[T]): kwargs["value"] = value super().__init__(**kwargs) - def _getitem_helper(self, arg: int) -> Union[list, NDArray]: + def _slice(self, arg: int) -> slice: """ Mimicking :func:`hdmf.common.table.VectorIndex.__getitem_helper` """ start = 0 if arg == 0 else self.value[arg - 1] end = self.value[arg] - return self.target.value[slice(start, end)] + return slice(start, end) def __getitem__(self, item: Union[int, slice, Iterable]) -> Any: if self.target is None: return self.value[item] else: if isinstance(item, (int, np.integer)): - return self._getitem_helper(item) + return self.target.value[self._slice(item)] elif isinstance(item, (slice, Iterable)): if isinstance(item, slice): item = range(*item.indices(len(self.value))) - return [self._getitem_helper(i) for i in item] - else: + return [self.target.value[self._slice(i)] for i in item] + else: # pragma: no cover raise AttributeError(f"Could not index with {item}") def __setitem__(self, key: Union[int, slice], value: Any) -> None: - if self._index: - # VectorIndex is the thing that knows how to do the slicing - self._index[key] = value + """ + Set a value on the :attr:`.target` . + + .. note:: + + Even though we correct the indexing logic from HDMF where the + _data_ is the thing that is provided by the API when one accesses + table.data (rather than table.data_index as hdmf does), + we will set to the target here (rather than to the index) + to be consistent. To modify the index, modify `self.value` directly + + """ + if self.target: + if isinstance(key, (int, np.integer)): + self.target.value[self._slice(key)] = value + elif isinstance(key, (slice, Iterable)): + if isinstance(key, slice): + key = range(*key.indices(len(self.value))) + + if isinstance(value, Iterable): + if len(key) != len(value): + raise ValueError( + "Can only assign equal-length iterable to a slice, manually index the" + " ragged values of of the target VectorData object if you need more" + " control" + ) + for i, subval in zip(key, value): + self.target.value[self._slice(i)] = subval + else: + for i in key: + self.target.value[self._slice(i)] = value + else: # pragma: no cover + raise AttributeError(f"Could not index with {key}") + else: self.value[key] = value @@ -204,9 +235,17 @@ class DynamicTableRegionMixin(BaseModel): _index: Optional["VectorIndex"] = None table: "DynamicTableMixin" - value: Optional[NDArray] = None + value: Optional[NDArray[Shape["*"], int]] = None - def __getitem__(self, item: Union[int, slice, Iterable]) -> Any: + @overload + def __getitem__(self, item: int) -> pd.DataFrame: ... + + @overload + def __getitem__(self, item: Union[slice, Iterable]) -> List[pd.DataFrame]: ... + + def __getitem__( + self, item: Union[int, slice, Iterable] + ) -> Union[pd.DataFrame, List[pd.DataFrame]]: """ Use ``value`` to index the table. Works analogously to ``VectorIndex`` despite this being a subclass of ``VectorData`` @@ -221,20 +260,27 @@ class DynamicTableRegionMixin(BaseModel): # so we index table with an array to construct # a list of lists of rows return [self.table[idx] for idx in self._index[item]] - else: + else: # pragma: no cover raise ValueError(f"Dont know how to index with {item}, need an int or a slice") else: if isinstance(item, (int, np.integer)): return self.table[self.value[item]] elif isinstance(item, (slice, Iterable)): + # Return a list of dataframe rows because this is most often used + # as a column in a DynamicTable, so while it would normally be + # ideal to just return the slice as above as a single df, + # we need each row to be separate to fill the column if isinstance(item, slice): item = range(*item.indices(len(self.value))) return [self.table[self.value[i]] for i in item] - else: + else: # pragma: no cover raise ValueError(f"Dont know how to index with {item}, need an int or a slice") def __setitem__(self, key: Union[int, str, slice], value: Any) -> None: - self.table[self.value[key]] = value + # self.table[self.value[key]] = value + raise NotImplementedError( + "Assigning values to tables is not implemented yet!" + ) # pragma: no cover class DynamicTableMixin(BaseModel): @@ -245,9 +291,10 @@ class DynamicTableMixin(BaseModel): but simplifying along the way :) """ - model_config = ConfigDict(extra="allow") + model_config = ConfigDict(extra="allow", validate_assignment=True) __pydantic_extra__: Dict[str, Union["VectorDataMixin", "VectorIndexMixin", "NDArray", list]] NON_COLUMN_FIELDS: ClassVar[tuple[str]] = ( + "id", "name", "colnames", "description", @@ -261,10 +308,6 @@ class DynamicTableMixin(BaseModel): def _columns(self) -> Dict[str, Union[list, "NDArray", "VectorDataMixin"]]: return {k: getattr(self, k) for i, k in enumerate(self.colnames)} - @property - def _columns_list(self) -> List[Union[list, "NDArray", "VectorDataMixin"]]: - return [getattr(self, k) for i, k in enumerate(self.colnames)] - @overload def __getitem__(self, item: str) -> Union[list, "NDArray", "VectorDataMixin"]: ... @@ -313,6 +356,7 @@ class DynamicTableMixin(BaseModel): return self._columns[item] if isinstance(item, (int, slice, np.integer, np.ndarray)): data = self._slice_range(item) + index = self.id[item] elif isinstance(item, tuple): if len(item) != 2: raise ValueError( @@ -330,11 +374,15 @@ class DynamicTableMixin(BaseModel): return self._columns[cols][rows] data = self._slice_range(rows, cols) + index = self.id[rows] else: raise ValueError(f"Unsure how to get item with key {item}") # cast to DF - return pd.DataFrame(data) + if not isinstance(index, Iterable): + index = [index] + index = pd.Index(data=index) + return pd.DataFrame(data, index=index) def _slice_range( self, rows: Union[int, slice, np.ndarray], cols: Optional[Union[str, List[str]]] = None @@ -346,31 +394,40 @@ class DynamicTableMixin(BaseModel): data = {} for k in cols: if isinstance(rows, np.ndarray): + # help wanted - this is probably cr*zy slow val = [self._columns[k][i] for i in rows] else: val = self._columns[k][rows] # scalars need to be wrapped in series for pandas + # do this by the iterability of the rows index not the value because + # we want all lengths from this method to be equal, and if the rows are + # scalar, that means length == 1 if not isinstance(rows, (Iterable, slice)): - val = pd.Series([val]) + val = [val] data[k] = val return data def __setitem__(self, key: str, value: Any) -> None: - raise NotImplementedError("TODO") + raise NotImplementedError("TODO") # pragma: no cover def __setattr__(self, key: str, value: Union[list, "NDArray", "VectorData"]): """ Add a column, appending it to ``colnames`` """ # don't use this while building the model - if not getattr(self, "__pydantic_complete__", False): + if not getattr(self, "__pydantic_complete__", False): # pragma: no cover return super().__setattr__(key, value) if key not in self.model_fields_set and not key.endswith("_index"): self.colnames.append(key) + # we get a recursion error if we setattr without having first added to + # extras if we need it to be there + if key not in self.model_fields and key not in self.__pydantic_extra__: + self.__pydantic_extra__[key] = value + return super().__setattr__(key, value) def __getattr__(self, item: str) -> Any: @@ -397,6 +454,8 @@ class DynamicTableMixin(BaseModel): """ Create ID column if not provided """ + if not isinstance(model, dict): + return model if "id" not in model: lengths = [] for key, val in model.items(): @@ -421,6 +480,8 @@ class DynamicTableMixin(BaseModel): the model dict is ordered after python3.6, so we can use that minus anything in :attr:`.NON_COLUMN_FIELDS` to determine order implied from passage order """ + if not isinstance(model, dict): + return model if "colnames" not in model: colnames = [ k @@ -456,19 +517,21 @@ class DynamicTableMixin(BaseModel): See :meth:`.cast_specified_columns` for handling columns in the class specification """ # if columns are not in the specification, cast to a generic VectorData - for key, val in model.items(): - if key in cls.model_fields: - continue - if not isinstance(val, (VectorData, VectorIndex)): - try: - if key.endswith("_index"): - model[key] = VectorIndex(name=key, description="", value=val) - else: - model[key] = VectorData(name=key, description="", value=val) - except ValidationError as e: - raise ValidationError( - f"field {key} cannot be cast to VectorData from {val}" - ) from e + + if isinstance(model, dict): + for key, val in model.items(): + if key in cls.model_fields: + continue + if not isinstance(val, (VectorData, VectorIndex)): + try: + if key.endswith("_index"): + model[key] = VectorIndex(name=key, description="", value=val) + else: + model[key] = VectorData(name=key, description="", value=val) + except ValidationError as e: # pragma: no cover + raise ValidationError( + f"field {key} cannot be cast to VectorData from {val}" + ) from e return model @model_validator(mode="after") @@ -500,8 +563,8 @@ class DynamicTableMixin(BaseModel): """ Ensure that all columns are equal length """ - lengths = [len(v) for v in self._columns.values()] - assert [length == lengths[0] for length in lengths], ( + lengths = [len(v) for v in self._columns.values()] + [len(self.id)] + assert all([length == lengths[0] for length in lengths]), ( "Columns are not of equal length! " f"Got colnames:\n{self.colnames}\nand lengths: {lengths}" ) @@ -537,15 +600,19 @@ class DynamicTableMixin(BaseModel): ) ) except Exception: - raise e + raise e from None -class AlignedDynamicTableMixin(DynamicTableMixin): +class AlignedDynamicTableMixin(BaseModel): """ Mixin to allow indexing multiple tables that are aligned on a common ID + + A great deal of code duplication because we need to avoid diamond inheritance + and also it's not so easy to copy a pydantic validator method. """ - __pydantic_extra__: Dict[str, "DynamicTableMixin"] + model_config = ConfigDict(extra="allow", validate_assignment=True) + __pydantic_extra__: Dict[str, Union["DynamicTableMixin", "VectorDataMixin", "VectorIndexMixin"]] NON_CATEGORY_FIELDS: ClassVar[tuple[str]] = ( "name", @@ -563,7 +630,7 @@ class AlignedDynamicTableMixin(DynamicTableMixin): return {k: getattr(self, k) for i, k in enumerate(self.categories)} def __getitem__( - self, item: Union[int, str, slice, Tuple[Union[int, slice], str]] + self, item: Union[int, str, slice, NDArray[Shape["*"], int], Tuple[Union[int, slice], str]] ) -> pd.DataFrame: """ Mimic hdmf: @@ -581,25 +648,78 @@ class AlignedDynamicTableMixin(DynamicTableMixin): elif isinstance(item, tuple) and len(item) == 2 and isinstance(item[1], str): # get a slice of a single table return self._categories[item[1]][item[0]] - elif isinstance(item, (int, slice)): + elif isinstance(item, (int, slice, Iterable)): # get a slice of all the tables ids = self.id[item] if not isinstance(ids, Iterable): ids = pd.Series([ids]) ids = pd.DataFrame({"id": ids}) - tables = [ids] + [table[item].reset_index() for table in self._categories.values()] + tables = [ids] + for category_name, category in self._categories.items(): + table = category[item] + if isinstance(table, pd.DataFrame): + table = table.reset_index() + elif isinstance(table, np.ndarray): + table = pd.DataFrame({category_name: [table]}) + elif isinstance(table, Iterable): + table = pd.DataFrame({category_name: table}) + else: + raise ValueError( + f"Don't know how to construct category table for {category_name}" + ) + tables.append(table) + names = [self.name] + self.categories # construct below in case we need to support array indexing in the future else: raise ValueError( f"Dont know how to index with {item}, " - "need an int, string, slice, or tuple[int | slice, str]" + "need an int, string, slice, ndarray, or tuple[int | slice, str]" ) df = pd.concat(tables, axis=1, keys=names) df.set_index((self.name, "id"), drop=True, inplace=True) return df + def __getattr__(self, item: str) -> Any: + """Try and use pandas df attrs if we don't have them""" + try: + return BaseModel.__getattr__(self, item) + except AttributeError as e: + try: + return getattr(self[:], item) + except AttributeError: + raise e from None + + def __len__(self) -> int: + """ + Use the id column to determine length. + + If the id column doesn't represent length accurately, it's a bug + """ + return len(self.id) + + @model_validator(mode="before") + @classmethod + def create_id(cls, model: Dict[str, Any]) -> Dict: + """ + Create ID column if not provided + """ + if "id" not in model: + lengths = [] + for key, val in model.items(): + # don't get lengths of columns with an index + if ( + f"{key}_index" in model + or (isinstance(val, VectorData) and val._index) + or key in cls.NON_CATEGORY_FIELDS + ): + continue + lengths.append(len(val)) + model["id"] = np.arange(np.max(lengths)) + + return model + @model_validator(mode="before") @classmethod def create_categories(cls, model: Dict[str, Any]) -> Dict: @@ -626,6 +746,42 @@ class AlignedDynamicTableMixin(DynamicTableMixin): model["categories"].extend(categories) return model + @model_validator(mode="after") + def resolve_targets(self) -> "DynamicTableMixin": + """ + Ensure that any implicitly indexed columns are linked, and create backlinks + """ + for key, col in self._categories.items(): + if isinstance(col, VectorData): + # find an index + idx = None + for field_name in self.model_fields_set: + if field_name in self.NON_CATEGORY_FIELDS or field_name == key: + continue + # implicit name-based index + field = getattr(self, field_name) + if isinstance(field, VectorIndex) and ( + field_name == f"{key}_index" or field.target is col + ): + idx = field + break + if idx is not None: + col._index = idx + idx.target = col + return self + + @model_validator(mode="after") + def ensure_equal_length_cols(self) -> "DynamicTableMixin": + """ + Ensure that all columns are equal length + """ + lengths = [len(v) for v in self._categories.values()] + [len(self.id)] + assert all([length == lengths[0] for length in lengths]), ( + "Columns are not of equal length! " + f"Got colnames:\n{self.categories}\nand lengths: {lengths}" + ) + return self + linkml_meta = LinkMLMeta( { diff --git a/nwb_linkml/src/nwb_linkml/models/pydantic/hdmf_common/v1_8_0/hdmf_common_table.py b/nwb_linkml/src/nwb_linkml/models/pydantic/hdmf_common/v1_8_0/hdmf_common_table.py index 7a6660a..5a29869 100644 --- a/nwb_linkml/src/nwb_linkml/models/pydantic/hdmf_common/v1_8_0/hdmf_common_table.py +++ b/nwb_linkml/src/nwb_linkml/models/pydantic/hdmf_common/v1_8_0/hdmf_common_table.py @@ -149,31 +149,62 @@ class VectorIndexMixin(BaseModel, Generic[T]): kwargs["value"] = value super().__init__(**kwargs) - def _getitem_helper(self, arg: int) -> Union[list, NDArray]: + def _slice(self, arg: int) -> slice: """ Mimicking :func:`hdmf.common.table.VectorIndex.__getitem_helper` """ start = 0 if arg == 0 else self.value[arg - 1] end = self.value[arg] - return self.target.value[slice(start, end)] + return slice(start, end) def __getitem__(self, item: Union[int, slice, Iterable]) -> Any: if self.target is None: return self.value[item] else: if isinstance(item, (int, np.integer)): - return self._getitem_helper(item) + return self.target.value[self._slice(item)] elif isinstance(item, (slice, Iterable)): if isinstance(item, slice): item = range(*item.indices(len(self.value))) - return [self._getitem_helper(i) for i in item] - else: + return [self.target.value[self._slice(i)] for i in item] + else: # pragma: no cover raise AttributeError(f"Could not index with {item}") def __setitem__(self, key: Union[int, slice], value: Any) -> None: - if self._index: - # VectorIndex is the thing that knows how to do the slicing - self._index[key] = value + """ + Set a value on the :attr:`.target` . + + .. note:: + + Even though we correct the indexing logic from HDMF where the + _data_ is the thing that is provided by the API when one accesses + table.data (rather than table.data_index as hdmf does), + we will set to the target here (rather than to the index) + to be consistent. To modify the index, modify `self.value` directly + + """ + if self.target: + if isinstance(key, (int, np.integer)): + self.target.value[self._slice(key)] = value + elif isinstance(key, (slice, Iterable)): + if isinstance(key, slice): + key = range(*key.indices(len(self.value))) + + if isinstance(value, Iterable): + if len(key) != len(value): + raise ValueError( + "Can only assign equal-length iterable to a slice, manually index the" + " ragged values of of the target VectorData object if you need more" + " control" + ) + for i, subval in zip(key, value): + self.target.value[self._slice(i)] = subval + else: + for i in key: + self.target.value[self._slice(i)] = value + else: # pragma: no cover + raise AttributeError(f"Could not index with {key}") + else: self.value[key] = value @@ -204,9 +235,17 @@ class DynamicTableRegionMixin(BaseModel): _index: Optional["VectorIndex"] = None table: "DynamicTableMixin" - value: Optional[NDArray] = None + value: Optional[NDArray[Shape["*"], int]] = None - def __getitem__(self, item: Union[int, slice, Iterable]) -> Any: + @overload + def __getitem__(self, item: int) -> pd.DataFrame: ... + + @overload + def __getitem__(self, item: Union[slice, Iterable]) -> List[pd.DataFrame]: ... + + def __getitem__( + self, item: Union[int, slice, Iterable] + ) -> Union[pd.DataFrame, List[pd.DataFrame]]: """ Use ``value`` to index the table. Works analogously to ``VectorIndex`` despite this being a subclass of ``VectorData`` @@ -221,20 +260,27 @@ class DynamicTableRegionMixin(BaseModel): # so we index table with an array to construct # a list of lists of rows return [self.table[idx] for idx in self._index[item]] - else: + else: # pragma: no cover raise ValueError(f"Dont know how to index with {item}, need an int or a slice") else: if isinstance(item, (int, np.integer)): return self.table[self.value[item]] elif isinstance(item, (slice, Iterable)): + # Return a list of dataframe rows because this is most often used + # as a column in a DynamicTable, so while it would normally be + # ideal to just return the slice as above as a single df, + # we need each row to be separate to fill the column if isinstance(item, slice): item = range(*item.indices(len(self.value))) return [self.table[self.value[i]] for i in item] - else: + else: # pragma: no cover raise ValueError(f"Dont know how to index with {item}, need an int or a slice") def __setitem__(self, key: Union[int, str, slice], value: Any) -> None: - self.table[self.value[key]] = value + # self.table[self.value[key]] = value + raise NotImplementedError( + "Assigning values to tables is not implemented yet!" + ) # pragma: no cover class DynamicTableMixin(BaseModel): @@ -245,9 +291,10 @@ class DynamicTableMixin(BaseModel): but simplifying along the way :) """ - model_config = ConfigDict(extra="allow") + model_config = ConfigDict(extra="allow", validate_assignment=True) __pydantic_extra__: Dict[str, Union["VectorDataMixin", "VectorIndexMixin", "NDArray", list]] NON_COLUMN_FIELDS: ClassVar[tuple[str]] = ( + "id", "name", "colnames", "description", @@ -261,10 +308,6 @@ class DynamicTableMixin(BaseModel): def _columns(self) -> Dict[str, Union[list, "NDArray", "VectorDataMixin"]]: return {k: getattr(self, k) for i, k in enumerate(self.colnames)} - @property - def _columns_list(self) -> List[Union[list, "NDArray", "VectorDataMixin"]]: - return [getattr(self, k) for i, k in enumerate(self.colnames)] - @overload def __getitem__(self, item: str) -> Union[list, "NDArray", "VectorDataMixin"]: ... @@ -313,6 +356,7 @@ class DynamicTableMixin(BaseModel): return self._columns[item] if isinstance(item, (int, slice, np.integer, np.ndarray)): data = self._slice_range(item) + index = self.id[item] elif isinstance(item, tuple): if len(item) != 2: raise ValueError( @@ -330,11 +374,15 @@ class DynamicTableMixin(BaseModel): return self._columns[cols][rows] data = self._slice_range(rows, cols) + index = self.id[rows] else: raise ValueError(f"Unsure how to get item with key {item}") # cast to DF - return pd.DataFrame(data) + if not isinstance(index, Iterable): + index = [index] + index = pd.Index(data=index) + return pd.DataFrame(data, index=index) def _slice_range( self, rows: Union[int, slice, np.ndarray], cols: Optional[Union[str, List[str]]] = None @@ -346,31 +394,40 @@ class DynamicTableMixin(BaseModel): data = {} for k in cols: if isinstance(rows, np.ndarray): + # help wanted - this is probably cr*zy slow val = [self._columns[k][i] for i in rows] else: val = self._columns[k][rows] # scalars need to be wrapped in series for pandas + # do this by the iterability of the rows index not the value because + # we want all lengths from this method to be equal, and if the rows are + # scalar, that means length == 1 if not isinstance(rows, (Iterable, slice)): - val = pd.Series([val]) + val = [val] data[k] = val return data def __setitem__(self, key: str, value: Any) -> None: - raise NotImplementedError("TODO") + raise NotImplementedError("TODO") # pragma: no cover def __setattr__(self, key: str, value: Union[list, "NDArray", "VectorData"]): """ Add a column, appending it to ``colnames`` """ # don't use this while building the model - if not getattr(self, "__pydantic_complete__", False): + if not getattr(self, "__pydantic_complete__", False): # pragma: no cover return super().__setattr__(key, value) if key not in self.model_fields_set and not key.endswith("_index"): self.colnames.append(key) + # we get a recursion error if we setattr without having first added to + # extras if we need it to be there + if key not in self.model_fields and key not in self.__pydantic_extra__: + self.__pydantic_extra__[key] = value + return super().__setattr__(key, value) def __getattr__(self, item: str) -> Any: @@ -397,6 +454,8 @@ class DynamicTableMixin(BaseModel): """ Create ID column if not provided """ + if not isinstance(model, dict): + return model if "id" not in model: lengths = [] for key, val in model.items(): @@ -421,6 +480,8 @@ class DynamicTableMixin(BaseModel): the model dict is ordered after python3.6, so we can use that minus anything in :attr:`.NON_COLUMN_FIELDS` to determine order implied from passage order """ + if not isinstance(model, dict): + return model if "colnames" not in model: colnames = [ k @@ -456,19 +517,21 @@ class DynamicTableMixin(BaseModel): See :meth:`.cast_specified_columns` for handling columns in the class specification """ # if columns are not in the specification, cast to a generic VectorData - for key, val in model.items(): - if key in cls.model_fields: - continue - if not isinstance(val, (VectorData, VectorIndex)): - try: - if key.endswith("_index"): - model[key] = VectorIndex(name=key, description="", value=val) - else: - model[key] = VectorData(name=key, description="", value=val) - except ValidationError as e: - raise ValidationError( - f"field {key} cannot be cast to VectorData from {val}" - ) from e + + if isinstance(model, dict): + for key, val in model.items(): + if key in cls.model_fields: + continue + if not isinstance(val, (VectorData, VectorIndex)): + try: + if key.endswith("_index"): + model[key] = VectorIndex(name=key, description="", value=val) + else: + model[key] = VectorData(name=key, description="", value=val) + except ValidationError as e: # pragma: no cover + raise ValidationError( + f"field {key} cannot be cast to VectorData from {val}" + ) from e return model @model_validator(mode="after") @@ -500,8 +563,8 @@ class DynamicTableMixin(BaseModel): """ Ensure that all columns are equal length """ - lengths = [len(v) for v in self._columns.values()] - assert [length == lengths[0] for length in lengths], ( + lengths = [len(v) for v in self._columns.values()] + [len(self.id)] + assert all([length == lengths[0] for length in lengths]), ( "Columns are not of equal length! " f"Got colnames:\n{self.colnames}\nand lengths: {lengths}" ) @@ -537,15 +600,19 @@ class DynamicTableMixin(BaseModel): ) ) except Exception: - raise e + raise e from None -class AlignedDynamicTableMixin(DynamicTableMixin): +class AlignedDynamicTableMixin(BaseModel): """ Mixin to allow indexing multiple tables that are aligned on a common ID + + A great deal of code duplication because we need to avoid diamond inheritance + and also it's not so easy to copy a pydantic validator method. """ - __pydantic_extra__: Dict[str, "DynamicTableMixin"] + model_config = ConfigDict(extra="allow", validate_assignment=True) + __pydantic_extra__: Dict[str, Union["DynamicTableMixin", "VectorDataMixin", "VectorIndexMixin"]] NON_CATEGORY_FIELDS: ClassVar[tuple[str]] = ( "name", @@ -563,7 +630,7 @@ class AlignedDynamicTableMixin(DynamicTableMixin): return {k: getattr(self, k) for i, k in enumerate(self.categories)} def __getitem__( - self, item: Union[int, str, slice, Tuple[Union[int, slice], str]] + self, item: Union[int, str, slice, NDArray[Shape["*"], int], Tuple[Union[int, slice], str]] ) -> pd.DataFrame: """ Mimic hdmf: @@ -581,25 +648,78 @@ class AlignedDynamicTableMixin(DynamicTableMixin): elif isinstance(item, tuple) and len(item) == 2 and isinstance(item[1], str): # get a slice of a single table return self._categories[item[1]][item[0]] - elif isinstance(item, (int, slice)): + elif isinstance(item, (int, slice, Iterable)): # get a slice of all the tables ids = self.id[item] if not isinstance(ids, Iterable): ids = pd.Series([ids]) ids = pd.DataFrame({"id": ids}) - tables = [ids] + [table[item].reset_index() for table in self._categories.values()] + tables = [ids] + for category_name, category in self._categories.items(): + table = category[item] + if isinstance(table, pd.DataFrame): + table = table.reset_index() + elif isinstance(table, np.ndarray): + table = pd.DataFrame({category_name: [table]}) + elif isinstance(table, Iterable): + table = pd.DataFrame({category_name: table}) + else: + raise ValueError( + f"Don't know how to construct category table for {category_name}" + ) + tables.append(table) + names = [self.name] + self.categories # construct below in case we need to support array indexing in the future else: raise ValueError( f"Dont know how to index with {item}, " - "need an int, string, slice, or tuple[int | slice, str]" + "need an int, string, slice, ndarray, or tuple[int | slice, str]" ) df = pd.concat(tables, axis=1, keys=names) df.set_index((self.name, "id"), drop=True, inplace=True) return df + def __getattr__(self, item: str) -> Any: + """Try and use pandas df attrs if we don't have them""" + try: + return BaseModel.__getattr__(self, item) + except AttributeError as e: + try: + return getattr(self[:], item) + except AttributeError: + raise e from None + + def __len__(self) -> int: + """ + Use the id column to determine length. + + If the id column doesn't represent length accurately, it's a bug + """ + return len(self.id) + + @model_validator(mode="before") + @classmethod + def create_id(cls, model: Dict[str, Any]) -> Dict: + """ + Create ID column if not provided + """ + if "id" not in model: + lengths = [] + for key, val in model.items(): + # don't get lengths of columns with an index + if ( + f"{key}_index" in model + or (isinstance(val, VectorData) and val._index) + or key in cls.NON_CATEGORY_FIELDS + ): + continue + lengths.append(len(val)) + model["id"] = np.arange(np.max(lengths)) + + return model + @model_validator(mode="before") @classmethod def create_categories(cls, model: Dict[str, Any]) -> Dict: @@ -626,6 +746,42 @@ class AlignedDynamicTableMixin(DynamicTableMixin): model["categories"].extend(categories) return model + @model_validator(mode="after") + def resolve_targets(self) -> "DynamicTableMixin": + """ + Ensure that any implicitly indexed columns are linked, and create backlinks + """ + for key, col in self._categories.items(): + if isinstance(col, VectorData): + # find an index + idx = None + for field_name in self.model_fields_set: + if field_name in self.NON_CATEGORY_FIELDS or field_name == key: + continue + # implicit name-based index + field = getattr(self, field_name) + if isinstance(field, VectorIndex) and ( + field_name == f"{key}_index" or field.target is col + ): + idx = field + break + if idx is not None: + col._index = idx + idx.target = col + return self + + @model_validator(mode="after") + def ensure_equal_length_cols(self) -> "DynamicTableMixin": + """ + Ensure that all columns are equal length + """ + lengths = [len(v) for v in self._categories.values()] + [len(self.id)] + assert all([length == lengths[0] for length in lengths]), ( + "Columns are not of equal length! " + f"Got colnames:\n{self.categories}\nand lengths: {lengths}" + ) + return self + linkml_meta = LinkMLMeta( {