stopped at https://w3c.github.io/rdf-canon/spec/#canon-algo-algo step 3

2024-02-23 23:24:59 -08:00 · 2024-02-23 23:24:59 -08:00 · 1e104800b0
commit 1e104800b0
6 changed files with 184 additions and 0 deletions
--- a/poetry.lock
+++ b/poetry.lock
@ -0,0 +1,66 @@
+# This file is automatically @generated by Poetry 1.6.1 and should not be changed by hand.
+
+[[package]]
+name = "isodate"
+version = "0.6.1"
+description = "An ISO 8601 date/time/duration parser and formatter"
+optional = false
+python-versions = "*"
+files = [
+    {file = "isodate-0.6.1-py2.py3-none-any.whl", hash = "sha256:0751eece944162659049d35f4f549ed815792b38793f07cf73381c1c87cbed96"},
+    {file = "isodate-0.6.1.tar.gz", hash = "sha256:48c5881de7e8b0a0d648cb024c8062dc84e7b840ed81e864c7614fd3c127bde9"},
+]
+
+[package.dependencies]
+six = "*"
+
+[[package]]
+name = "pyparsing"
+version = "3.1.1"
+description = "pyparsing module - Classes and methods to define and execute parsing grammars"
+optional = false
+python-versions = ">=3.6.8"
+files = [
+    {file = "pyparsing-3.1.1-py3-none-any.whl", hash = "sha256:32c7c0b711493c72ff18a981d24f28aaf9c1fb7ed5e9667c9e84e3db623bdbfb"},
+    {file = "pyparsing-3.1.1.tar.gz", hash = "sha256:ede28a1a32462f5a9705e07aea48001a08f7cf81a021585011deba701581a0db"},
+]
+
+[package.extras]
+diagrams = ["jinja2", "railroad-diagrams"]
+
+[[package]]
+name = "rdflib"
+version = "7.0.0"
+description = "RDFLib is a Python library for working with RDF, a simple yet powerful language for representing information."
+optional = false
+python-versions = ">=3.8.1,<4.0.0"
+files = [
+    {file = "rdflib-7.0.0-py3-none-any.whl", hash = "sha256:0438920912a642c866a513de6fe8a0001bd86ef975057d6962c79ce4771687cd"},
+    {file = "rdflib-7.0.0.tar.gz", hash = "sha256:9995eb8569428059b8c1affd26b25eac510d64f5043d9ce8c84e0d0036e995ae"},
+]
+
+[package.dependencies]
+isodate = ">=0.6.0,<0.7.0"
+pyparsing = ">=2.1.0,<4"
+
+[package.extras]
+berkeleydb = ["berkeleydb (>=18.1.0,<19.0.0)"]
+html = ["html5lib (>=1.0,<2.0)"]
+lxml = ["lxml (>=4.3.0,<5.0.0)"]
+networkx = ["networkx (>=2.0.0,<3.0.0)"]
+
+[[package]]
+name = "six"
+version = "1.16.0"
+description = "Python 2 and 3 compatibility utilities"
+optional = false
+python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*"
+files = [
+    {file = "six-1.16.0-py2.py3-none-any.whl", hash = "sha256:8abb2f1d86890a2dfb989f9a77cfcfd3e47c2a354b01111771326f8aa26e0254"},
+    {file = "six-1.16.0.tar.gz", hash = "sha256:1e61c37477a1626458e36f7b1d82aa5c9b094fa4802892072e49de9c60c4c926"},
+]
+
+[metadata]
+lock-version = "2.0"
+python-versions = "^3.11"
+content-hash = "036ed7fb018709708f370a7646048203cf0fa92169d910f113cbe042cba147bf"
--- a/pyproject.toml
+++ b/pyproject.toml
@ -0,0 +1,19 @@
+[tool.poetry]
+name = "rdf-canonize"
+version = "0.1.0"
+description = ""
+authors = ["sneakers-the-rat <sneakers-the-rat@protonmail.com>"]
+license = "GPL-3.0"
+readme = "README.md"
+packages = [
+    { include = "rdf_canonize" }
+]
+
+[tool.poetry.dependencies]
+python = "^3.11"
+rdflib = "^7.0.0"
+
+
+[build-system]
+requires = ["poetry-core"]
+build-backend = "poetry.core.masonry.api"
--- a/rdf_canonize/__init__.py
+++ b/rdf_canonize/__init__.py
--- a/rdf_canonize/canonize.py
+++ b/rdf_canonize/canonize.py
@ -0,0 +1,49 @@
+"""
+Canonicalization algorithms
+
+
+* Initialization. Initialize the state needed for the rest of the algorithm using 4.2 Canonicalization State. Also initialize the canonicalized dataset using the input dataset (which remains immutable) the input blank node identifier map (retaining blank node identifiers from the input if possible, otherwise assigning them arbitrarily); the issued identifiers map from the canonical issuer is added upon completion of the algorithm.
+* Compute first degree hashes. Compute the first degree hash for each blank node in the dataset using 4.6 Hash First Degree Quads.
+* Canonically label unique nodes. Assign canonical identifiers via 4.5 Issue Identifier Algorithm, in Unicode code point order, to each blank node whose first degree hash is unique.
+* Compute N-degree hashes for non-unique nodes. For each repeated first degree hash (proceeding in Unicode code point order), compute the N-degree hash via 4.8 Hash N-Degree Quads of every unlabeled blank node that corresponds to the given repeated hash.
+* Canonically label remaining nodes. In Unicode code point order of the N-degree hashes, issue canonical identifiers to each corresponding blank node using 4.5 Issue Identifier Algorithm. If more than one node produces the same N-degree hash, the order in which these nodes receive a canonical identifier does not matter.
+* Finish. Return the serialized canonical form of the canonicalized dataset. Alternatively, return the canonicalized dataset containing the input blank node identifier map and issued identifiers map.
+
+"""
+from typing import Union
+
+from rdflib import Graph, Dataset, BNode
+
+from rdf_canonize.state import CanonicalizationState
+from rdf_canonize.types import CanonicalizedDataset, QuadType
+
+def hash_first_degree(quad: QuadType):
+    """
+    Placeholder for the "Hash First Degree Quads" step of the algorithm.
+
+    Not implemented yet: ``quad`` is ignored and ``None`` is returned.
+    """
+    return None
+
+class Canonicalizer:
+    """
+    Driver for the canonicalization algorithm described in the module docstring.
+
+    Work in progress: only the initialization step is implemented so far.
+
+    Args:
+        dataset: The input to canonicalize. A :class:`rdflib.Dataset` is used
+            as given; any other :class:`rdflib.Graph` has its triples copied
+            into the default graph of a fresh ``Dataset``.
+
+    Attributes:
+        state: Scratch state (blank-node-to-quads map, hash map, issuer).
+        dataset: The :class:`CanonicalizedDataset` wrapping the input dataset.
+    """
+
+    def __init__(self, dataset: Union[Graph, Dataset]):
+        # ``Dataset`` is itself a subclass of ``Graph``, so the check has to be
+        # "is this NOT a Dataset" - otherwise a real Dataset would be replaced
+        # by an empty one.
+        if not isinstance(dataset, Dataset):
+            ds = Dataset()
+            # Copy the triples; adding a bare triple to a Dataset places it in
+            # the default graph.
+            for triple in dataset:
+                ds.add(triple)
+            dataset = ds
+
+        self.state = CanonicalizationState()
+        # Both maps start empty and are passed explicitly so construction does
+        # not depend on the dataclass providing defaults.
+        self.dataset = CanonicalizedDataset(
+            init=dataset,
+            bnodeid_map={},
+            canonicalid_map={},
+        )
+
+    def init_state(self):
+        """
+        Populate ``state.bnode_quads``: blank node -> quads that mention it.
+
+        Subject, object and graph name are inspected (a predicate cannot be a
+        blank node). A quad is recorded once per blank node, even when the
+        same node occurs in several positions of that quad.
+        """
+        for quad in self.dataset.init.quads():
+            # dict.fromkeys de-duplicates the components while keeping order.
+            for node in dict.fromkeys((quad[0], quad[2], quad[3])):
+                if isinstance(node, BNode):
+                    # BNode is a str subclass and is used directly as the key,
+                    # matching the ``dict[BNode, ...]`` annotation of the map.
+                    self.state.bnode_quads.setdefault(node, []).append(quad)
+
+    def canonicalize(self):
+        """Run the algorithm; currently this only performs initialization."""
+        self.init_state()
+
+
+
--- a/rdf_canonize/state.py
+++ b/rdf_canonize/state.py
@ -0,0 +1,24 @@
+from dataclasses import dataclass, field
+from itertools import count
+
+from rdflib import BNode
+from rdflib.graph import _QuadType as QuadType
+
+from rdf_canonize.types import Hash, BNodeIDMapType, CanonicalIDMapType
+
+
+@dataclass
+class CanonicalIssuer:
+    """
+    Issues canonical identifiers for blank nodes.
+
+    Every instance owns its own counter and its own map of issued
+    identifiers, so separate issuers never influence each other.
+    """
+    # Prefix of the identifiers handed out by this issuer.
+    prefix: str = 'c14n'
+    # A fresh counter per instance; a bare ``count()`` default would be a
+    # single object shared by every issuer ever created.
+    counter: count = field(default_factory=count)
+    # Identifiers issued so far.
+    issued_identifiers: CanonicalIDMapType = field(default_factory=dict)
+
+
+@dataclass
+class CanonicalizationState:
+    """
+    Mutable scratch state used while canonicalizing a dataset.
+
+    Each instance starts with empty maps and its own ``CanonicalIssuer``.
+    """
+    # Blank node -> quads in which that blank node occurs.
+    bnode_quads: dict[BNode, list[QuadType]] = field(default_factory=dict)
+    # Hash -> blank nodes that produced that hash.
+    hash_bnodes: dict[Hash, list[BNode]] = field(default_factory=dict)
+    # A dataclass instance is unhashable, so using one directly as a default
+    # is rejected by ``dataclasses`` on Python 3.11+ (and would be shared
+    # between states otherwise); build a new issuer per state instead.
+    issuer: CanonicalIssuer = field(default_factory=CanonicalIssuer)
+
+
+
--- a/rdf_canonize/types.py
+++ b/rdf_canonize/types.py
@ -0,0 +1,26 @@
+from dataclasses import dataclass, field
+from typing import TypedDict
+
+from rdflib import Dataset, BNode
+from rdflib.graph import _QuadType as QuadType
+
+class Hash(str):
+    """String subtype marking a value as a hash digest."""
+
+class BNodeID(str):
+    """String subtype for a blank node identifier local to this implementation."""
+
+class CanonicalID(str):
+    """String subtype for the canonical identifier assigned to a BNode."""
+
+BNodeIDMapType = dict[BNodeID, BNode]
+CanonicalIDMapType = dict[BNodeID, CanonicalID]
+
+
+@dataclass
+class CanonicalizedDataset:
+    """
+    An input dataset bundled with the identifier maps built for it.
+
+    Both maps default to empty dicts, so only ``init`` has to be supplied.
+    """
+    # The input dataset.
+    init: Dataset
+    # Implementation-local blank node identifier -> BNode.
+    bnodeid_map: BNodeIDMapType = field(default_factory=dict)
+    # Implementation-local blank node identifier -> canonical identifier.
+    canonicalid_map: CanonicalIDMapType = field(default_factory=dict)
+
+