From 57fa3d34a276114ccf6fb32c2ea1286444599cfa Mon Sep 17 00:00:00 2001 From: sneakers-the-rat Date: Sat, 23 Sep 2023 00:08:59 -0700 Subject: [PATCH] pandas dataframe mimic --- docs/_notes/todo.md | 3 +- nwb_linkml/poetry.lock | 134 +++++++++++++++++- nwb_linkml/pyproject.toml | 3 +- nwb_linkml/src/nwb_linkml/io/hdf5_scratch.py | 9 ++ nwb_linkml/src/nwb_linkml/types/__init__.py | 3 +- nwb_linkml/src/nwb_linkml/types/df.py | 111 +++++++++++++++ nwb_linkml/tests/test_types/test_df.py | 59 ++++++++ .../{ndarray.py => test_ndarray.py} | 0 8 files changed, 317 insertions(+), 5 deletions(-) create mode 100644 nwb_linkml/src/nwb_linkml/types/df.py create mode 100644 nwb_linkml/tests/test_types/test_df.py rename nwb_linkml/tests/test_types/{ndarray.py => test_ndarray.py} (100%) diff --git a/docs/_notes/todo.md b/docs/_notes/todo.md index fad2cf1..019de15 100644 --- a/docs/_notes/todo.md +++ b/docs/_notes/todo.md @@ -2,4 +2,5 @@ Stuff to keep track of that might have been manually overrided that needs to be fixed pre-release -- Coerce all listlike things into lists if they are passed as single elements! \ No newline at end of file +- Coerce all listlike things into lists if they are passed as single elements! +- Use [fsspec](https://filesystem-spec.readthedocs.io/en/latest/index.html) to interface with DANDI! \ No newline at end of file diff --git a/nwb_linkml/poetry.lock b/nwb_linkml/poetry.lock index cbba7b4..9b624fa 100644 --- a/nwb_linkml/poetry.lock +++ b/nwb_linkml/poetry.lock @@ -1103,6 +1103,47 @@ files = [ {file = "numpy-1.25.2.tar.gz", hash = "sha256:fd608e19c8d7c55021dffd43bfe5492fab8cc105cc8986f813f8c3c048b38760"}, ] +[[package]] +name = "numpy" +version = "1.26.0" +description = "Fundamental package for array computing in Python" +optional = false +python-versions = "<3.13,>=3.9" +files = [ + {file = "numpy-1.26.0-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:f8db2f125746e44dce707dd44d4f4efeea8d7e2b43aace3f8d1f235cfa2733dd"}, + {file = "numpy-1.26.0-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:0621f7daf973d34d18b4e4bafb210bbaf1ef5e0100b5fa750bd9cde84c7ac292"}, + {file = "numpy-1.26.0-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:51be5f8c349fdd1a5568e72713a21f518e7d6707bcf8503b528b88d33b57dc68"}, + {file = "numpy-1.26.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:767254ad364991ccfc4d81b8152912e53e103ec192d1bb4ea6b1f5a7117040be"}, + {file = "numpy-1.26.0-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:436c8e9a4bdeeee84e3e59614d38c3dbd3235838a877af8c211cfcac8a80b8d3"}, + {file = "numpy-1.26.0-cp310-cp310-win32.whl", hash = "sha256:c2e698cb0c6dda9372ea98a0344245ee65bdc1c9dd939cceed6bb91256837896"}, + {file = "numpy-1.26.0-cp310-cp310-win_amd64.whl", hash = "sha256:09aaee96c2cbdea95de76ecb8a586cb687d281c881f5f17bfc0fb7f5890f6b91"}, + {file = "numpy-1.26.0-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:637c58b468a69869258b8ae26f4a4c6ff8abffd4a8334c830ffb63e0feefe99a"}, + {file = "numpy-1.26.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:306545e234503a24fe9ae95ebf84d25cba1fdc27db971aa2d9f1ab6bba19a9dd"}, + {file = "numpy-1.26.0-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:8c6adc33561bd1d46f81131d5352348350fc23df4d742bb246cdfca606ea1208"}, + {file = "numpy-1.26.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:e062aa24638bb5018b7841977c360d2f5917268d125c833a686b7cbabbec496c"}, + {file = "numpy-1.26.0-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:546b7dd7e22f3c6861463bebb000646fa730e55df5ee4a0224408b5694cc6148"}, + {file = "numpy-1.26.0-cp311-cp311-win32.whl", hash = "sha256:c0b45c8b65b79337dee5134d038346d30e109e9e2e9d43464a2970e5c0e93229"}, + {file = "numpy-1.26.0-cp311-cp311-win_amd64.whl", hash = "sha256:eae430ecf5794cb7ae7fa3808740b015aa80747e5266153128ef055975a72b99"}, + {file = "numpy-1.26.0-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:166b36197e9debc4e384e9c652ba60c0bacc216d0fc89e78f973a9760b503388"}, + {file = "numpy-1.26.0-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:f042f66d0b4ae6d48e70e28d487376204d3cbf43b84c03bac57e28dac6151581"}, + {file = "numpy-1.26.0-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:e5e18e5b14a7560d8acf1c596688f4dfd19b4f2945b245a71e5af4ddb7422feb"}, + {file = "numpy-1.26.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:7f6bad22a791226d0a5c7c27a80a20e11cfe09ad5ef9084d4d3fc4a299cca505"}, + {file = "numpy-1.26.0-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:4acc65dd65da28060e206c8f27a573455ed724e6179941edb19f97e58161bb69"}, + {file = "numpy-1.26.0-cp312-cp312-win32.whl", hash = "sha256:bb0d9a1aaf5f1cb7967320e80690a1d7ff69f1d47ebc5a9bea013e3a21faec95"}, + {file = "numpy-1.26.0-cp312-cp312-win_amd64.whl", hash = "sha256:ee84ca3c58fe48b8ddafdeb1db87388dce2c3c3f701bf447b05e4cfcc3679112"}, + {file = "numpy-1.26.0-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:4a873a8180479bc829313e8d9798d5234dfacfc2e8a7ac188418189bb8eafbd2"}, + {file = "numpy-1.26.0-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:914b28d3215e0c721dc75db3ad6d62f51f630cb0c277e6b3bcb39519bed10bd8"}, + {file = "numpy-1.26.0-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:c78a22e95182fb2e7874712433eaa610478a3caf86f28c621708d35fa4fd6e7f"}, + {file = "numpy-1.26.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:86f737708b366c36b76e953c46ba5827d8c27b7a8c9d0f471810728e5a2fe57c"}, + {file = "numpy-1.26.0-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:b44e6a09afc12952a7d2a58ca0a2429ee0d49a4f89d83a0a11052da696440e49"}, + {file = "numpy-1.26.0-cp39-cp39-win32.whl", hash = "sha256:5671338034b820c8d58c81ad1dafc0ed5a00771a82fccc71d6438df00302094b"}, + {file = "numpy-1.26.0-cp39-cp39-win_amd64.whl", hash = "sha256:020cdbee66ed46b671429c7265cf00d8ac91c046901c55684954c3958525dab2"}, + {file = "numpy-1.26.0-pp39-pypy39_pp73-macosx_10_9_x86_64.whl", hash = "sha256:0792824ce2f7ea0c82ed2e4fecc29bb86bee0567a080dacaf2e0a01fe7654369"}, + {file = "numpy-1.26.0-pp39-pypy39_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:7d484292eaeb3e84a51432a94f53578689ffdea3f90e10c8b203a99be5af57d8"}, + {file = "numpy-1.26.0-pp39-pypy39_pp73-win_amd64.whl", hash = "sha256:186ba67fad3c60dbe8a3abff3b67a91351100f2661c8e2a80364ae6279720299"}, + {file = "numpy-1.26.0.tar.gz", hash = "sha256:f93fc78fe8bf15afe2b8d6b6499f1c73953169fad1e9a8dd086cdff3190e7fdf"}, +] + [[package]] name = "nwb-schema-language" version = "0.1.1" @@ -1143,6 +1184,73 @@ files = [ {file = "packaging-23.1.tar.gz", hash = "sha256:a392980d2b6cffa644431898be54b0045151319d1e7ec34f0cfed48767dd334f"}, ] +[[package]] +name = "pandas" +version = "2.1.1" +description = "Powerful data structures for data analysis, time series, and statistics" +optional = false +python-versions = ">=3.9" +files = [ + {file = "pandas-2.1.1-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:58d997dbee0d4b64f3cb881a24f918b5f25dd64ddf31f467bb9b67ae4c63a1e4"}, + {file = "pandas-2.1.1-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:02304e11582c5d090e5a52aec726f31fe3f42895d6bfc1f28738f9b64b6f0614"}, + {file = "pandas-2.1.1-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:ffa8f0966de2c22de408d0e322db2faed6f6e74265aa0856f3824813cf124363"}, + {file = "pandas-2.1.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:c1f84c144dee086fe4f04a472b5cd51e680f061adf75c1ae4fc3a9275560f8f4"}, + {file = "pandas-2.1.1-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:75ce97667d06d69396d72be074f0556698c7f662029322027c226fd7a26965cb"}, + {file = "pandas-2.1.1-cp310-cp310-win_amd64.whl", hash = "sha256:4c3f32fd7c4dccd035f71734df39231ac1a6ff95e8bdab8d891167197b7018d2"}, + {file = "pandas-2.1.1-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:9e2959720b70e106bb1d8b6eadd8ecd7c8e99ccdbe03ee03260877184bb2877d"}, + {file = "pandas-2.1.1-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:25e8474a8eb258e391e30c288eecec565bfed3e026f312b0cbd709a63906b6f8"}, + {file = "pandas-2.1.1-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:b8bd1685556f3374520466998929bade3076aeae77c3e67ada5ed2b90b4de7f0"}, + {file = "pandas-2.1.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:dc3657869c7902810f32bd072f0740487f9e030c1a3ab03e0af093db35a9d14e"}, + {file = "pandas-2.1.1-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:05674536bd477af36aa2effd4ec8f71b92234ce0cc174de34fd21e2ee99adbc2"}, + {file = "pandas-2.1.1-cp311-cp311-win_amd64.whl", hash = "sha256:b407381258a667df49d58a1b637be33e514b07f9285feb27769cedb3ab3d0b3a"}, + {file = "pandas-2.1.1-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:c747793c4e9dcece7bb20156179529898abf505fe32cb40c4052107a3c620b49"}, + {file = "pandas-2.1.1-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:3bcad1e6fb34b727b016775bea407311f7721db87e5b409e6542f4546a4951ea"}, + {file = "pandas-2.1.1-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:f5ec7740f9ccb90aec64edd71434711f58ee0ea7f5ed4ac48be11cfa9abf7317"}, + {file = "pandas-2.1.1-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:29deb61de5a8a93bdd033df328441a79fcf8dd3c12d5ed0b41a395eef9cd76f0"}, + {file = "pandas-2.1.1-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:4f99bebf19b7e03cf80a4e770a3e65eee9dd4e2679039f542d7c1ace7b7b1daa"}, + {file = "pandas-2.1.1-cp312-cp312-win_amd64.whl", hash = "sha256:84e7e910096416adec68075dc87b986ff202920fb8704e6d9c8c9897fe7332d6"}, + {file = "pandas-2.1.1-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:366da7b0e540d1b908886d4feb3d951f2f1e572e655c1160f5fde28ad4abb750"}, + {file = "pandas-2.1.1-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:9e50e72b667415a816ac27dfcfe686dc5a0b02202e06196b943d54c4f9c7693e"}, + {file = "pandas-2.1.1-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:cc1ab6a25da197f03ebe6d8fa17273126120874386b4ac11c1d687df288542dd"}, + {file = "pandas-2.1.1-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:a0dbfea0dd3901ad4ce2306575c54348d98499c95be01b8d885a2737fe4d7a98"}, + {file = "pandas-2.1.1-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:0489b0e6aa3d907e909aef92975edae89b1ee1654db5eafb9be633b0124abe97"}, + {file = "pandas-2.1.1-cp39-cp39-win_amd64.whl", hash = "sha256:4cdb0fab0400c2cb46dafcf1a0fe084c8bb2480a1fa8d81e19d15e12e6d4ded2"}, + {file = "pandas-2.1.1.tar.gz", hash = "sha256:fecb198dc389429be557cde50a2d46da8434a17fe37d7d41ff102e3987fd947b"}, +] + +[package.dependencies] +numpy = [ + {version = ">=1.23.2", markers = "python_version == \"3.11\""}, + {version = ">=1.26.0", markers = "python_version >= \"3.12\""}, +] +python-dateutil = ">=2.8.2" +pytz = ">=2020.1" +tzdata = ">=2022.1" + +[package.extras] +all = ["PyQt5 (>=5.15.6)", "SQLAlchemy (>=1.4.36)", "beautifulsoup4 (>=4.11.1)", "bottleneck (>=1.3.4)", "dataframe-api-compat (>=0.1.7)", "fastparquet (>=0.8.1)", "fsspec (>=2022.05.0)", "gcsfs (>=2022.05.0)", "html5lib (>=1.1)", "hypothesis (>=6.46.1)", "jinja2 (>=3.1.2)", "lxml (>=4.8.0)", "matplotlib (>=3.6.1)", "numba (>=0.55.2)", "numexpr (>=2.8.0)", "odfpy (>=1.4.1)", "openpyxl (>=3.0.10)", "pandas-gbq (>=0.17.5)", "psycopg2 (>=2.9.3)", "pyarrow (>=7.0.0)", "pymysql (>=1.0.2)", "pyreadstat (>=1.1.5)", "pytest (>=7.3.2)", "pytest-asyncio (>=0.17.0)", "pytest-xdist (>=2.2.0)", "pyxlsb (>=1.0.9)", "qtpy (>=2.2.0)", "s3fs (>=2022.05.0)", "scipy (>=1.8.1)", "tables (>=3.7.0)", "tabulate (>=0.8.10)", "xarray (>=2022.03.0)", "xlrd (>=2.0.1)", "xlsxwriter (>=3.0.3)", "zstandard (>=0.17.0)"] +aws = ["s3fs (>=2022.05.0)"] +clipboard = ["PyQt5 (>=5.15.6)", "qtpy (>=2.2.0)"] +compression = ["zstandard (>=0.17.0)"] +computation = ["scipy (>=1.8.1)", "xarray (>=2022.03.0)"] +consortium-standard = ["dataframe-api-compat (>=0.1.7)"] +excel = ["odfpy (>=1.4.1)", "openpyxl (>=3.0.10)", "pyxlsb (>=1.0.9)", "xlrd (>=2.0.1)", "xlsxwriter (>=3.0.3)"] +feather = ["pyarrow (>=7.0.0)"] +fss = ["fsspec (>=2022.05.0)"] +gcp = ["gcsfs (>=2022.05.0)", "pandas-gbq (>=0.17.5)"] +hdf5 = ["tables (>=3.7.0)"] +html = ["beautifulsoup4 (>=4.11.1)", "html5lib (>=1.1)", "lxml (>=4.8.0)"] +mysql = ["SQLAlchemy (>=1.4.36)", "pymysql (>=1.0.2)"] +output-formatting = ["jinja2 (>=3.1.2)", "tabulate (>=0.8.10)"] +parquet = ["pyarrow (>=7.0.0)"] +performance = ["bottleneck (>=1.3.4)", "numba (>=0.55.2)", "numexpr (>=2.8.0)"] +plot = ["matplotlib (>=3.6.1)"] +postgresql = ["SQLAlchemy (>=1.4.36)", "psycopg2 (>=2.9.3)"] +spss = ["pyreadstat (>=1.1.5)"] +sql-other = ["SQLAlchemy (>=1.4.36)"] +test = ["hypothesis (>=6.46.1)", "pytest (>=7.3.2)", "pytest-asyncio (>=0.17.0)", "pytest-xdist (>=2.2.0)"] +xml = ["lxml (>=4.8.0)"] + [[package]] name = "parse" version = "1.19.1" @@ -1626,6 +1734,17 @@ files = [ [package.dependencies] sortedcontainers = "*" +[[package]] +name = "pytz" +version = "2023.3.post1" +description = "World timezone definitions, modern and historical" +optional = false +python-versions = "*" +files = [ + {file = "pytz-2023.3.post1-py2.py3-none-any.whl", hash = "sha256:ce42d816b81b68506614c11e8937d3aa9e41007ceb50bfdcb0749b921bf646c7"}, + {file = "pytz-2023.3.post1.tar.gz", hash = "sha256:7b4fddbeb94a1eba4b557da24f19fdf9db575192544270a9101d8509f9f43d7b"}, +] + [[package]] name = "pyyaml" version = "6.0.1" @@ -2179,6 +2298,17 @@ files = [ {file = "typing_extensions-4.8.0.tar.gz", hash = "sha256:df8e4339e9cb77357558cbdbceca33c303714cf861d1eef15e1070055ae8b7ef"}, ] +[[package]] +name = "tzdata" +version = "2023.3" +description = "Provider of IANA time zone data" +optional = false +python-versions = ">=2" +files = [ + {file = "tzdata-2023.3-py2.py3-none-any.whl", hash = "sha256:7e65763eef3120314099b6939b5546db7adce1e7d6f2e179e3df563c70511eda"}, + {file = "tzdata-2023.3.tar.gz", hash = "sha256:11ef1e08e54acb0d4f95bdb1be05da659673de4acbd21bf9c69e94cc5e907a3a"}, +] + [[package]] name = "uri-template" version = "1.3.0" @@ -2386,5 +2516,5 @@ tests = ["coverage", "coveralls", "pytest", "pytest-cov", "pytest-depends", "pyt [metadata] lock-version = "2.0" -python-versions = "^3.11" -content-hash = "7a4e1c3b66143e4f4e8392238051241f25274ebd597183ef64168055949074f4" +python-versions = ">=3.11,<3.13" +content-hash = "0f2d9fc76cf3788fbdefc6f7b06afb7267c5fe2967970389907a5a9c4864334a" diff --git a/nwb_linkml/pyproject.toml b/nwb_linkml/pyproject.toml index f43701b..1f59cf5 100644 --- a/nwb_linkml/pyproject.toml +++ b/nwb_linkml/pyproject.toml @@ -11,7 +11,7 @@ packages = [ ] [tool.poetry.dependencies] -python = "^3.11" +python = ">=3.11,<3.13" pyyaml = "^6.0" linkml-runtime = "^1.5.6" nwb_schema_language = "^0.1.1" @@ -30,6 +30,7 @@ pytest-cov = {version = "^4.1.0", optional = true} coveralls = {version = "^3.3.1", optional = true} pytest-profiling = {version = "^1.7.0", optional = true} pydantic-settings = "^2.0.3" +pandas = "^2.1.1" [tool.poetry.extras] tests = [ diff --git a/nwb_linkml/src/nwb_linkml/io/hdf5_scratch.py b/nwb_linkml/src/nwb_linkml/io/hdf5_scratch.py index ed6c2d0..5580661 100644 --- a/nwb_linkml/src/nwb_linkml/io/hdf5_scratch.py +++ b/nwb_linkml/src/nwb_linkml/io/hdf5_scratch.py @@ -8,6 +8,15 @@ field more so that at each pass i can work through the items whose dependencies have been solved from the bottom up. """ +from typing import List +from nwb_linkml.types.df import DataFrame + +class MyDf(DataFrame): + ints: List[int] + +a = MyDf(ints=[1,2,3]) + + from nwb_linkml.io.hdf5 import HDF5IO, flatten_hdf import h5py from typing import NamedTuple, Tuple, Optional diff --git a/nwb_linkml/src/nwb_linkml/types/__init__.py b/nwb_linkml/src/nwb_linkml/types/__init__.py index 801a327..b0523c0 100644 --- a/nwb_linkml/src/nwb_linkml/types/__init__.py +++ b/nwb_linkml/src/nwb_linkml/types/__init__.py @@ -1 +1,2 @@ -from nwb_linkml.types.ndarray import NDArray \ No newline at end of file +from nwb_linkml.types.ndarray import NDArray +from nwb_linkml.types.df import DataFrame \ No newline at end of file diff --git a/nwb_linkml/src/nwb_linkml/types/df.py b/nwb_linkml/src/nwb_linkml/types/df.py new file mode 100644 index 0000000..8a5f47a --- /dev/null +++ b/nwb_linkml/src/nwb_linkml/types/df.py @@ -0,0 +1,111 @@ +""" +Pydantic models that behave like pandas dataframes +""" +import pdb +from typing import List, Any, get_origin, get_args, Union, Optional, Dict +from types import NoneType + +import numpy as np +import pandas as pd +from pydantic import ( + BaseModel, + model_serializer, + SerializerFunctionWrapHandler, + ConfigDict, + model_validator +) + +class DataFrame(BaseModel, pd.DataFrame): + """ + Pydantic model root class that mimics a pandas dataframe. + + Notes: + + The synchronization between the underlying lists in the pydantic model + and the derived dataframe is partial, and at the moment unidirectional. + This class is primarily intended for reading from tables stored in + NWB files rather than being able to manipulate them. + + The dataframe IS updated when new values are *assigned* to a field. + + eg.:: + + MyModel.fieldval = [1,2,3] + + But the dataframe is NOT updated when existing values are updated. + + eg.:: + + MyModel.fieldval.append(4) + + In that case you need to call :meth:`.update_df` manually. + + Additionally, if the dataframe is modified, the underlying lists are NOT updated, + but when the model is dumped to a dictionary or serialized, the dataframe IS used, + so changes will be reflected then. + + """ + + _df: pd.DataFrame = None + model_config = ConfigDict(validate_assignment=True) + def __init__(self, **kwargs): + # pdb.set_trace() + super().__init__(**kwargs) + + self._df = self.__make_df() + + + def __make_df(self) -> pd.DataFrame: + # make dict that can handle ragged arrays and NoneTypes + items = {k:v for k,v in self.__dict__.items() if k in self.model_fields} + + df_dict = {k: (pd.Series(v) if isinstance(v, list) else pd.Series([v])) + for k,v in items.items()} + df = pd.DataFrame(df_dict) + # replace Nans with None + df = df.fillna(np.nan).replace([np.nan], [None]) + return df + + def update_df(self): + """ + Update the internal dataframe in the case that the model values are changed + in a way that we can't detect, like appending to one of the lists. + + """ + self._df = self.__make_df() + + def __getattr__(self, item: str): + """ + Mimic pandas dataframe and pydantic model behavior + """ + if item in ('df', '_df'): + return self.__pydantic_private__['_df'] + elif item in self.model_fields.keys(): + return self._df[item] + else: + try: + return object.__getattribute__(self._df, item) + except AttributeError: + return object.__getattribute__(self, item) + @model_validator(mode='after') + def recreate_df(self): + """Remake DF when validating (eg. when updating values on assignment)""" + self.update_df() + + @model_serializer(mode='wrap', when_used='always') + def serialize_model(self, nxt: SerializerFunctionWrapHandler) -> Dict[str, Any]: + """ + We don't handle values that are changed + + """ + if self._df is None: + return nxt(self) + else: + out = self._df.to_dict('list') + # remove Nones + out = { + k: [inner_v for inner_v in v if inner_v is not None] + for k, v in out.items() + } + + return nxt(self.__class__(**out)) diff --git a/nwb_linkml/tests/test_types/test_df.py b/nwb_linkml/tests/test_types/test_df.py new file mode 100644 index 0000000..d4bef18 --- /dev/null +++ b/nwb_linkml/tests/test_types/test_df.py @@ -0,0 +1,59 @@ +import pytest + +import pandas as pd +from pydantic import BaseModel, ValidationError +from typing import List, Union, Optional +from nwb_linkml.types import DataFrame + +def test_df(): + """ + Dataframe class should behave like both a pydantic model and a dataframe + """ + +class MyDf(DataFrame): + ints: List[int] + strings: List[str] + multi: List[int | str] + opts: Optional[List[int]] = None + + good_kwargs = { + 'ints': [1,2,3], + 'strings': ['a','b','c'], + 'multi': [1,2,'a','d'], + 'opts': [] + } + bad_kwargs = { + 'ints': ['a','b','c'], + 'strings': [1,2,3], + 'multi': 'd' + } + df = MyDf(**good_kwargs) + assert isinstance(df, BaseModel) + assert isinstance(df, pd.DataFrame) + with pytest.raises(ValidationError): + bad_df = MyDf(**bad_kwargs) + + # can we do pydantic stuff + assert df.model_dump() == good_kwargs + # these throw when they fail + _ = df.model_dump_json() + _ = df.model_json_schema() + + # can we do pandas stuff + assert df['ints'].sum() == 6 + assert df.loc[2].to_list() == [3, 'c', 'a', None] + # lmao + + # we don't include the model when dumping/doing the schema + assert 'df' not in df.model_json_schema() + assert '_df' not in df.model_json_schema() + + # we update our dataframe when we assign + assert df.ints == good_kwargs['ints'] + assert df['ints'].tolist()[0:3] == good_kwargs['ints'] + df.ints = [1,2,3,4] + assert df.ints == [1,2,3,4] + assert (df['ints'] == pd.Series([1,2,3,4])).all() + + df['ints'] = df['ints']._append(pd.Series(5)) + diff --git a/nwb_linkml/tests/test_types/ndarray.py b/nwb_linkml/tests/test_types/test_ndarray.py similarity index 100% rename from nwb_linkml/tests/test_types/ndarray.py rename to nwb_linkml/tests/test_types/test_ndarray.py