This commit is contained in:
sneakers-the-rat 2024-02-23 23:24:59 -08:00
commit 1e104800b0
No known key found for this signature in database
GPG Key ID: 6DCB96EF1E4D232D
6 changed files with 184 additions and 0 deletions

66
poetry.lock generated Normal file
View File

@ -0,0 +1,66 @@
# This file is automatically @generated by Poetry 1.6.1 and should not be changed by hand.
[[package]]
name = "isodate"
version = "0.6.1"
description = "An ISO 8601 date/time/duration parser and formatter"
optional = false
python-versions = "*"
files = [
{file = "isodate-0.6.1-py2.py3-none-any.whl", hash = "sha256:0751eece944162659049d35f4f549ed815792b38793f07cf73381c1c87cbed96"},
{file = "isodate-0.6.1.tar.gz", hash = "sha256:48c5881de7e8b0a0d648cb024c8062dc84e7b840ed81e864c7614fd3c127bde9"},
]
[package.dependencies]
six = "*"
[[package]]
name = "pyparsing"
version = "3.1.1"
description = "pyparsing module - Classes and methods to define and execute parsing grammars"
optional = false
python-versions = ">=3.6.8"
files = [
{file = "pyparsing-3.1.1-py3-none-any.whl", hash = "sha256:32c7c0b711493c72ff18a981d24f28aaf9c1fb7ed5e9667c9e84e3db623bdbfb"},
{file = "pyparsing-3.1.1.tar.gz", hash = "sha256:ede28a1a32462f5a9705e07aea48001a08f7cf81a021585011deba701581a0db"},
]
[package.extras]
diagrams = ["jinja2", "railroad-diagrams"]
[[package]]
name = "rdflib"
version = "7.0.0"
description = "RDFLib is a Python library for working with RDF, a simple yet powerful language for representing information."
optional = false
python-versions = ">=3.8.1,<4.0.0"
files = [
{file = "rdflib-7.0.0-py3-none-any.whl", hash = "sha256:0438920912a642c866a513de6fe8a0001bd86ef975057d6962c79ce4771687cd"},
{file = "rdflib-7.0.0.tar.gz", hash = "sha256:9995eb8569428059b8c1affd26b25eac510d64f5043d9ce8c84e0d0036e995ae"},
]
[package.dependencies]
isodate = ">=0.6.0,<0.7.0"
pyparsing = ">=2.1.0,<4"
[package.extras]
berkeleydb = ["berkeleydb (>=18.1.0,<19.0.0)"]
html = ["html5lib (>=1.0,<2.0)"]
lxml = ["lxml (>=4.3.0,<5.0.0)"]
networkx = ["networkx (>=2.0.0,<3.0.0)"]
[[package]]
name = "six"
version = "1.16.0"
description = "Python 2 and 3 compatibility utilities"
optional = false
python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*"
files = [
{file = "six-1.16.0-py2.py3-none-any.whl", hash = "sha256:8abb2f1d86890a2dfb989f9a77cfcfd3e47c2a354b01111771326f8aa26e0254"},
{file = "six-1.16.0.tar.gz", hash = "sha256:1e61c37477a1626458e36f7b1d82aa5c9b094fa4802892072e49de9c60c4c926"},
]
[metadata]
lock-version = "2.0"
python-versions = "^3.11"
content-hash = "036ed7fb018709708f370a7646048203cf0fa92169d910f113cbe042cba147bf"

19
pyproject.toml Normal file
View File

@ -0,0 +1,19 @@
[tool.poetry]
name = "rdf-canonize"
version = "0.1.0"
description = ""
authors = ["sneakers-the-rat <sneakers-the-rat@protonmail.com>"]
license = "GPL-3.0"
readme = "README.md"
packages = [
{ include = "rdf_canonize" }
]
[tool.poetry.dependencies]
python = "^3.11"
rdflib = "^7.0.0"
[build-system]
requires = ["poetry-core"]
build-backend = "poetry.core.masonry.api"

0
rdf_canonize/__init__.py Normal file
View File

49
rdf_canonize/canonize.py Normal file
View File

@ -0,0 +1,49 @@
"""
Canonicalization algorithms
* Initialization. Initialize the state needed for the rest of the algorithm using 4.2 Canonicalization State. Also initialize the canonicalized dataset using the input dataset (which remains immutable) and the input blank node identifier map (retaining blank node identifiers from the input if possible, otherwise assigning them arbitrarily); the issued identifiers map from the canonical issuer is added upon completion of the algorithm.
* Compute first degree hashes. Compute the first degree hash for each blank node in the dataset using 4.6 Hash First Degree Quads.
* Canonically label unique nodes. Assign canonical identifiers via 4.5 Issue Identifier Algorithm, in Unicode code point order, to each blank node whose first degree hash is unique.
* Compute N-degree hashes for non-unique nodes. For each repeated first degree hash (proceeding in Unicode code point order), compute the N-degree hash via 4.8 Hash N-Degree Quads of every unlabeled blank node that corresponds to the given repeated hash.
* Canonically label remaining nodes. In Unicode code point order of the N-degree hashes, issue canonical identifiers to each corresponding blank node using 4.5 Issue Identifier Algorithm. If more than one node produces the same N-degree hash, the order in which these nodes receive a canonical identifier does not matter.
* Finish. Return the serialized canonical form of the canonicalized dataset. Alternatively, return the canonicalized dataset containing the input blank node identifier map and issued identifiers map.
"""
from typing import Union
from rdflib import Graph, Dataset, BNode
from rdf_canonize.state import CanonicalizationState
from rdf_canonize.types import CanonicalizedDataset, QuadType
def hash_first_degree(quad: QuadType):
    """Placeholder for the first degree hash computation; not implemented yet.

    Currently does nothing and returns ``None``.

    NOTE(review): presumably meant to implement "4.6 Hash First Degree Quads"
    referenced in the module docstring -- confirm the intended signature, since
    only a single quad is accepted here.
    """
    pass
class Canonicalizer:
    """Drive canonicalization of an rdflib graph or dataset.

    Only the initialization step is implemented so far: the input is wrapped
    in a ``CanonicalizedDataset`` and :meth:`init_state` indexes the quads
    that mention each blank node.
    """

    def __init__(self, dataset: Union[Graph, Dataset]):
        """Set up the canonicalization state for ``dataset``.

        Args:
            dataset: Input to canonicalize. A ``Dataset`` is used as-is; any
                other ``Graph`` is copied into a fresh ``Dataset`` first.
        """
        # ``Dataset`` is itself a ``Graph`` subclass, so check for the more
        # specific type -- otherwise a Dataset input would be re-wrapped too.
        if not isinstance(dataset, Dataset):
            ds = Dataset()
            # Copy the triples into a graph owned by the new dataset's store;
            # merely registering a graph backed by a different store would
            # not make its triples visible through ``ds.quads()``.
            # NOTE(review): a default-constructed Graph has a blank node
            # identifier, so its triples end up in a blank-node-named graph;
            # confirm whether they should go to the default graph instead.
            target = ds.graph(dataset.identifier)
            target += dataset
            dataset = ds
        self.state = CanonicalizationState()
        # Both maps start empty and are passed explicitly so construction
        # works whether or not ``CanonicalizedDataset`` declares defaults.
        self.dataset = CanonicalizedDataset(
            init=dataset,
            bnodeid_map={},
            canonicalid_map={},
        )

    def init_state(self):
        """Index each quad of the input dataset under the blank nodes it uses.

        Fills ``self.state.bnode_quads``, keyed by the ``BNode`` itself (as
        the state's type annotation declares), with the list of quads in
        which that node appears as subject or object.
        """
        bnode_quads = self.state.bnode_quads
        for quad in self.dataset.init.quads():
            # NOTE(review): only subject and object are scanned; blank nodes
            # used as graph names (quad[3]) are not indexed -- confirm.
            for node in (quad[0], quad[2]):
                if isinstance(node, BNode):
                    bnode_quads.setdefault(node, []).append(quad)

    def canonicalize(self):
        """Run the canonicalization; currently only performs initialization."""
        self.init_state()

24
rdf_canonize/state.py Normal file
View File

@ -0,0 +1,24 @@
from dataclasses import dataclass, field
from itertools import count
from rdflib import BNode
from rdflib.graph import _QuadType as QuadType
from rdf_canonize.types import Hash, BNodeIDMapType, CanonicalIDMapType
@dataclass
class CanonicalIssuer:
    """Bookkeeping for issuing canonical blank node identifiers."""

    # Prefix for issued identifiers.
    # NOTE(review): nothing in this module combines it with ``counter`` yet.
    prefix: str = 'c14n'
    # Built per instance: a bare ``count()`` default would be a single
    # iterator shared (and advanced) by every issuer.
    counter: count = field(default_factory=count)
    # Identifiers issued so far; starts empty for each issuer.
    issued_identifiers: CanonicalIDMapType = field(default_factory=dict)
@dataclass
class CanonicalizationState:
    """Mutable state carried through a canonicalization run."""

    # Blank node -> quads in which that blank node appears.
    bnode_quads: dict[BNode, list[QuadType]] = field(default_factory=dict)
    # Hash -> blank nodes that produced that hash.
    hash_bnodes: dict[Hash, list[BNode]] = field(default_factory=dict)
    # Built per instance via default_factory: a ``CanonicalIssuer()`` instance
    # as a class-level default is unhashable, which dataclasses reject with
    # ValueError on Python 3.11+, and it would be shared between states.
    issuer: CanonicalIssuer = field(default_factory=CanonicalIssuer)

26
rdf_canonize/types.py Normal file
View File

@ -0,0 +1,26 @@
from dataclasses import dataclass, field
from typing import TypedDict
from rdflib import Dataset, BNode
from rdflib.graph import _QuadType as QuadType
class Hash(str):
    """Abstract representation for some hash digest.

    A ``str`` subclass that adds no behavior; it gives hash digest strings
    their own type for use in annotations.
    """
class BNodeID(str):
    """Blank node identifier local to the algorithm implementation.

    A ``str`` subclass that adds no behavior; it is the key type of the
    identifier maps declared in this module.
    """
class CanonicalID(str):
    """Canonical ID given to a blank node.

    A ``str`` subclass that adds no behavior; it is the value type of
    ``CanonicalIDMapType``.
    """
# Maps an implementation-local blank node identifier to its rdflib ``BNode``.
BNodeIDMapType = dict[BNodeID, BNode]
# Maps an implementation-local blank node identifier to its canonical ID.
CanonicalIDMapType = dict[BNodeID, CanonicalID]
@dataclass
class CanonicalizedDataset:
    """An input dataset together with its blank node identifier maps."""

    # The input dataset being canonicalized.
    init: Dataset
    # Blank node identifier -> BNode; empty until populated.
    bnodeid_map: BNodeIDMapType = field(default_factory=dict)
    # Blank node identifier -> canonical ID; empty until populated.
    # (The annotation previously misspelled the alias name, which raised
    # NameError as soon as this module was imported.)
    canonicalid_map: CanonicalIDMapType = field(default_factory=dict)