From 318562a519b4b7a309a7bfe3918cfab81b131f2e Mon Sep 17 00:00:00 2001 From: sneakers-the-rat Date: Mon, 27 Nov 2023 23:26:07 -0800 Subject: [PATCH] sqlite --- src/comparison/data/graphdb.md | 29 +++++ src/comparison/data/index.md | 6 +- src/comparison/data/sqlite.md | 159 +++++++++++++++++++++++++++ src/comparison/index.md | 9 ++ src/comparison/ld/index.md | 6 +- src/comparison/p2p/bittorrent.md | 2 +- src/comparison/social/at_protocol.md | 6 + src/conf.py | 4 + src/encryption.md | 2 +- src/index.md | 10 +- src/todo.md | 4 + 11 files changed, 226 insertions(+), 11 deletions(-) create mode 100644 src/comparison/data/graphdb.md create mode 100644 src/comparison/data/sqlite.md create mode 100644 src/todo.md diff --git a/src/comparison/data/graphdb.md b/src/comparison/data/graphdb.md new file mode 100644 index 0000000..dfdbddb --- /dev/null +++ b/src/comparison/data/graphdb.md @@ -0,0 +1,29 @@ +# Graph Databases + +```{index} see: Triple Store; Graph Database + +``` +```{index} Database Engine; Graph Database + +``` + + +({index}`Graph Database`s and {index}`Triple Store`s) + +## Options + +```{table} Graph Databases +:width: 100% + +| DB | SPARQL? | Language | Description | +| -- | ------- | -------- | ----------- | +| [{index}`Oxigraph `](https://github.com/oxigraph/oxigraph) | Y | Rust, Python, JS | "Trying to do SQLite for graph dbs" | +| {index}`Blazegraph ` | | | | +| {index}`GraphDB ` | | | | +| {index}`Jena ` | | | | +| {index}`Virtuoso ` | | | | +``` + +## TODO + +- What in the heck is {index}`SOLID` using? 
\ No newline at end of file diff --git a/src/comparison/data/index.md b/src/comparison/data/index.md index a51e3c7..2755dec 100644 --- a/src/comparison/data/index.md +++ b/src/comparison/data/index.md @@ -7,4 +7,8 @@ datalad dmc eris -``` \ No newline at end of file +graphdb +sqlite +``` + +AND SEE https://github.com/bonfire-networks/pointers for a data model re this thread: https://social.treehouse.systems/@jubilee/110665600584252989 \ No newline at end of file diff --git a/src/comparison/data/sqlite.md b/src/comparison/data/sqlite.md new file mode 100644 index 0000000..0b94adc --- /dev/null +++ b/src/comparison/data/sqlite.md @@ -0,0 +1,159 @@ +# SQLite + +```{index} Database Engine; RDBMS +``` +```{index} RDBMS; SQLite +``` + +We want something like sqlite, but for {index}`Graph Database`s! + +Most of the existing triple stores and graph databases are very heavyweight services that would be impractical for packaging in a portable daemon in the same way that sqlite works. Maybe we can learn from how sqlite works and do something similar for graph databases? + +Questions: + +- How come these things can be faster than idk like a .json file +- How are they architecturally different from a traditional SQL server + +## File Structure + +- Main file +- Rollback Journal - stores additional information to restore in case of a crash. Store a copy of the original DB, write changes directly into DB file. COMMIT occurs when the rollback journal is deleted +- Write-ahead Log - if in [WAL mode](https://www.sqlite.org/wal.html), append updates to WAL file. COMMIT occurs when writing to WAL file (not to main DB). Multiple transactions can be batched. + +### Pages + +Pages are the basic unit of an sqlite file. + +Numeracy: + +- The page size is a power of 2 between 512 and 65536 bytes +- All pages are the same size +- Max `2^32 - 2` pages in a single DB. 
+ + +#### Types + +Each page has a single type: + + +> - The lock-byte page +> - A freelist page +> - A freelist trunk page +> - A freelist leaf page +> - A b-tree page +> - A table b-tree interior page +> - A table b-tree leaf page +> - An index b-tree interior page +> - An index b-tree leaf page +> - A payload overflow page +> - A pointer map page + +##### Lock-byte + +(artifact of windows 95 compatibility) + +##### Freelist + +Linked list of "trunks and leaves" to keep track of unused pages: +- Trunk pages: + - Series of 4-byte integers that take up full page + - First integer is the page number of the next trunk (zero if it's the last page) + - Second integer is number of leaf pointers that follow +- Leaf pages: + - contain nothing! + +##### {index}`B-tree` + +([B-tree wiki page](https://en.wikipedia.org/wiki/B-tree)) + +Two types of b-trees: table and index + +- **Table B-Trees**: + - One table b-tree in the db file for each `rowid` table in the database schema + - 64-bit signed integer key that refers to the `rowid` it implements + - Store all data in leaves (interior pages just point to leaves) + - +- **Index B-Trees**: + - One index b-tree for each index in the schema + - Arbitrary keys + - Store no data. + +Two types of b-tree pages: +- **Interior** +- **Leaf** + +```{todo} +Describe freeblocks +``` + +#### Payload Overflow + +> Define the "payload" of a cell to be the arbitrary length section of the cell. +> - For an index b-tree, the key is always arbitrary in length and hence the payload is the key. +> - There are no arbitrary length elements in the cells of interior table b-tree pages and so those cells have no payload. +> - Table b-tree leaf pages contain arbitrary length content and so for cells on those pages the payload is the content. + +When a payload is bigger than some threshold[^overflowthreshold], store it on a linked list of payload overflow pages. 
The first four bytes of each overflow page are a 4-byte big-endian integer indicating the page number of the next page in the chain, or zero for the final page. + +[^overflowthreshold]: > The overflow thresholds are designed to give a minimum fanout of 4 for index b-trees and to make sure enough of the payload is on the b-tree page that the record header can usually be accessed without consulting an overflow page. In hindsight, the designer of the SQLite b-tree logic realized that these thresholds could have been made much simpler. However, the computations cannot be changed without resulting in an incompatible file format. And the current computations work well, even if they are a little complex. + +#### Pointer Maps + +Backlinks from child to parent nodes in index trees to assist with vacuuming :) + +Each pointermap page provides backlinks for the pages immediately following it. + +Each 5-byte ptrmap entry consists of: + +- 1 byte of page type information (the meaning of the page number that follows depends on the type): + - `1`: A b-tree root page (page number is `0`) + - `2`: Freelist page (page number is `0`) + - `3` / `4`: payload overflow page (`3`, first in chain: page number is the b-tree page holding the cell; `4`, later in chain: page number is the `prior page`) + - `5`: non-root b-tree page (page number is the `parent page`) +- 4 byte big-endian page number + + +### Header + +(Add header info here as the rest of the spec makes it relevant) + +https://www.sqlite.org/fileformat.html#the_database_header + +Useful properties +- Magic header string makes it easy to identify sqlite files +- File change counter & schema cookie - 4-byte integer that increments whenever the db file is unlocked. useful for cache invalidation +- `version-valid-for-number` - stores the version of the software that most recently modified it, and the change counter at that modification. Useful for detecting if certain behaviors like updating the in-header db size are behaving correctly by knowing what version made a given change. 
+ +## Schema + +### Records + +### Tables + +### Indices + +## I/O + +```{todo} +**How does writing and querying an sqlite file actually work???** +``` + +All reads from and writes to the main database file happen at a page boundary. + +All writes are an integer number of pages in size. + +Most reads are also an integer number of pages in size, except opening the database which reads the header (first 100 bytes). + + + + +## See also + +- [Graph Databases](graphdb) + +## References + +- [SQLite File Format](https://www.sqlite.org/fileformat.html) +- [SQLite Quirks](https://www.sqlite.org/quirks.html) - useful for understanding some design decisions +- [Customization and Porting](https://www.sqlite.org/custombuild.html) +- [SQLite Architecture](https://www.sqlite.org/arch.html) \ No newline at end of file diff --git a/src/comparison/index.md b/src/comparison/index.md index 8b5043b..227f910 100644 --- a/src/comparison/index.md +++ b/src/comparison/index.md @@ -15,6 +15,7 @@ data/index ## To be categorized +- [CozoDB](https://docs.cozodb.org/en/latest/releases/v0.6.html#experience-cozodb-the-hybrid-relational-graph-vector-database-the-hippocampus-for-llms) - uh i think this is the database we needed... - Agregore - Arweave - CAN @@ -30,6 +31,14 @@ data/index - chunks stored by nodes close in hash space - Repute.Social - LinkedTrust.us +- https://ganarchy.github.io/ - pull request-less git + +## See also + +- https://gitlab.com/bluesky-community1/decentralized-ecosystem/-/blob/master/README.md +- https://dsocialcommons.org/ +- https://openengiadina.codeberg.page/rdf-cbor/ - RDF/CBOR graph serialization + - https://openengiadina.codeberg.page/rdf-cbor/content-addressable-rdf-v0.1.html ## Points of comparison diff --git a/src/comparison/ld/index.md b/src/comparison/ld/index.md index 722fd27..b6ee2f0 100644 --- a/src/comparison/ld/index.md +++ b/src/comparison/ld/index.md @@ -17,4 +17,8 @@ Linked data was born to be p2p. 
Many of the [initial, lofty visions](https://jon Don't just take my word for it tho: {attribution="A more decentralized vision for Linked Data. Polleres et al. (2020)"} -> So, where does this leave us? We have seen a lot of resources being put into publishing Linked Data, but yet a publicly widely visible “killer app” is still missing. The reason for this, in the opinion and experiences of the authors, lies all to often in the frustrating experiences when trying to actually use Linked Data for building actual applications. Many attempts and projects end up still using a centralized warehousing approach, integrating a handful of data sets directly from their raw data sources, rather than being able to leverage their “lifted” Linked Data versions: the use and benefits of RDF and Linked Data over conventional databases and warehouses technologies, where more trained people are available, remain questionable. {cite}`polleresMoreDecentralizedVision2020` \ No newline at end of file +> So, where does this leave us? We have seen a lot of resources being put into publishing Linked Data, but yet a publicly widely visible “killer app” is still missing. The reason for this, in the opinion and experiences of the authors, lies all to often in the frustrating experiences when trying to actually use Linked Data for building actual applications. Many attempts and projects end up still using a centralized warehousing approach, integrating a handful of data sets directly from their raw data sources, rather than being able to leverage their “lifted” Linked Data versions: the use and benefits of RDF and Linked Data over conventional databases and warehouses technologies, where more trained people are available, remain questionable. 
{cite}`polleresMoreDecentralizedVision2020` + +## TODO + +- https://layeredschemas.org/ \ No newline at end of file diff --git a/src/comparison/p2p/bittorrent.md b/src/comparison/p2p/bittorrent.md index 8d393b6..b9dba72 100644 --- a/src/comparison/p2p/bittorrent.md +++ b/src/comparison/p2p/bittorrent.md @@ -43,7 +43,7 @@ For example, a directory of three random files has a (decoded) `.torrent` file t } ``` -The contents of a torrent file are then uniquely indexed by the `infohash`, which is the hash of the entire (bencoded) `info` dictionary. {key}`Magnet Links ` are an abbreviated form of the `.torrent` file that contain only the info-hash, which allows downloading peers to request and independently verify the rest of the info dictionary and start downloading without a complete `.torrent`. +The contents of a torrent file are then uniquely indexed by the `infohash`, which is the hash of the entire (bencoded) `info` dictionary. {index}`Magnet Links ` are an abbreviated form of the `.torrent` file that contain only the info-hash, which allows downloading peers to request and independently verify the rest of the info dictionary and start downloading without a complete `.torrent`. A generic magnet link looks like: diff --git a/src/comparison/social/at_protocol.md b/src/comparison/social/at_protocol.md index 46fea8b..6a9b852 100644 --- a/src/comparison/social/at_protocol.md +++ b/src/comparison/social/at_protocol.md @@ -13,6 +13,12 @@ Specifically, AT protocol differentiates between *handles* and *identities*, whe That's about it, the rest of the handling of DID's is extremely centralized (see [did:plc](https://atproto.com/specs/did-plc) which requires resolution against a single domain), and the requirement of all posts to be funneled through [Big Graph Services](https://blueskyweb.xyz/blog/5-5-2023-federation-architecture) rather than directly peer to peer is transparently designed to ensure a marketing and advertising layer in between actors in the network. 
+```{note} +Lexicons were based on RDF? + +https://gist.github.com/pfrazee/0c51dc1afceac83d984ebfd555fe6340 +``` + ## Lessons diff --git a/src/conf.py b/src/conf.py index 551cdba..13c8898 100644 --- a/src/conf.py +++ b/src/conf.py @@ -17,6 +17,7 @@ release = '0.1.0' extensions = [ 'sphinx.ext.napoleon', 'sphinx.ext.autodoc', + 'sphinx.ext.todo', 'sphinxcontrib.mermaid', 'sphinxcontrib.bibtex', 'myst_parser', @@ -39,6 +40,9 @@ pygments_dark_style = "github-dark" # ----------- # Extension config +# todo +todo_include_todos = True + # myst myst_heading_anchors = 3 myst_enable_extensions = [ diff --git a/src/encryption.md b/src/encryption.md index 65cd0f7..2176866 100644 --- a/src/encryption.md +++ b/src/encryption.md @@ -3,6 +3,6 @@ How can we make it possible to have a protocol that is "open" when it is intended to, but also protects privacy and consent when we need it to? -# TODO +## TODO - https://en.wikipedia.org/wiki/OMEMO \ No newline at end of file diff --git a/src/index.md b/src/index.md index 1ab8d49..e143c3e 100644 --- a/src/index.md +++ b/src/index.md @@ -6,6 +6,8 @@ This site describes the implementation of the p2p linked data protocol in {cite} ## Document Status +**23-11-27** - Back at it again after some digressions into [chatbridge](https://git.jon-e.net/jonny/chatbridge) and [nwb-linkml](https://github.com/p2p-ld/nwb-linkml/) - gathering more information on storage and interchange formats for databases and triple stores before trying to prop up the first peers sharing graphs of NWB data. Still mostly populating the [Comparison](comparison) section as I take notes and before I restructure these docs. + **23-06-08** - Populating the [Comparison](comparison) section first to refresh myself on other projects, and starting to sketch diagrammatically in [Sketchpad](sketchpad). The rest of the pages are just stubs to keep track of ideas before fleshing them out. 
```{toctree} @@ -59,11 +61,5 @@ sketchpad genindex references +todo ``` - -Indices and tables -================== - -* :ref:`genindex` -* :ref:`modindex` -* :ref:`search` diff --git a/src/todo.md b/src/todo.md new file mode 100644 index 0000000..da28a24 --- /dev/null +++ b/src/todo.md @@ -0,0 +1,4 @@ +# TODO + +```{todolist} +``` \ No newline at end of file