mastodon-ld/masto_ld/patterns/wikilink.py

190 lines
6.5 KiB
Python

import re
from masto_ld.patterns.patterns import Pattern
from dataclasses import dataclass
from typing import Optional, Union, List
import pyparsing as pp
from pprint import pformat
# Cheap presence check: lazily captures whatever sits between ``[[`` and ``]]``.
# Detailed parsing of the captured text is done elsewhere, not by this regex.
WIKILINK = re.compile(
    r'\[\[(.*?)\]\]',
    flags=re.IGNORECASE,
)
"""
Basic structure of wikilink, used to detect presence
"""
class NBack:
    """
    Specifier for an "n-back" link: which messages in a thread a wikilink
    refers to.

    Instances carry three attributes (listed in :attr:`FIELDS`):

    * ``wildcard`` - ``True`` when the ``^*`` form was used, otherwise ``False``.
    * ``start`` - first message of the range as an ``int``, or ``None`` if unset.
    * ``end`` - last message of the range as an ``int``, or ``None`` if unset.

    Equality compares exactly those three attributes. Because ``__eq__`` is
    defined without ``__hash__``, instances are unhashable.
    """

    FIELDS = ('wildcard', 'start', 'end')

    def __init__(self, start: Optional[Union[int, str]] = None,
                 end: Optional[Union[int, str]] = None,
                 wildcard: Optional[Union[str, bool]] = None,
                 one: Optional[str] = None):
        """
        Args:
            start: Start of the range. Anything accepted by ``int()`` (the
                parser passes digit strings), or ``None`` to leave it open.
            end: End of the range, same conventions as ``start``.
            wildcard: Any truthy value selects the wildcard form; ``start``,
                ``end`` and ``one`` are then ignored.
            one: Any truthy value (the parser passes the bare ``"^"``)
                selects the single preceding message, i.e. ``start == end == 1``.

        Raises:
            ValueError: if both bounds are given and ``start > end``, or if a
                bound cannot be converted by ``int()``.
        """
        if wildcard:
            # Wildcard takes precedence over every other argument.
            self.wildcard = True
            self.start = None
            self.end = None
            return

        self.wildcard = False
        if one:
            self.start = 1
            self.end = 1
        else:
            # Parsed values arrive as digit strings, so normalise to int.
            self.start = int(start) if start is not None else None
            self.end = int(end) if end is not None else None

        if (self.start is not None and self.end is not None
                and self.start > self.end):
            raise ValueError(f"Start value must be less than end value, got start:{self.start}, end:{self.end}")

    @classmethod
    def make_parser(cls) -> 'pp.ParserElement':
        """
        Build the pyparsing element that recognises an n-back specifier.

        n-back links immediately follow the ``[[`` and can be one of::

            ^
            ^*
            ^{n,m}
            ^{n,}
            ^{,m}
            ^{m}

        Returns:
            A group with results name ``"nback"`` whose named results
            (``wildcard``, ``start``, ``end`` or ``one``) match the keyword
            arguments of :meth:`__init__`.
        """
        # make elements
        caret = pp.Literal("^")
        lcurly = pp.Literal('{').suppress()
        rcurly = pp.Literal('}').suppress()
        integer = pp.Word(pp.nums)
        comma = pp.Literal(',').suppress()
        nb_range = caret + lcurly

        # combine into matches
        nb_wildcard = caret.suppress() + "*"
        # start or end can be omitted if comma is present
        nb_full = nb_range + pp.Optional(integer("start")) + comma + pp.Optional(integer("end")) + rcurly
        # if no comma present, it's just an end
        nb_end = nb_range + integer("end") + rcurly

        # combine into full nback parser; the bare caret is tried last
        # because it is a prefix of every other alternative
        nback = pp.Group(nb_wildcard('wildcard') | nb_full | nb_end | caret("one")).set_results_name("nback")
        return nback

    def __eq__(self, other: object) -> bool:
        """Field-wise equality; defers to Python for non-``NBack`` operands."""
        if not isinstance(other, NBack):
            # Lets ``nback == None`` evaluate to False instead of raising
            # AttributeError from getattr() on a foreign object.
            return NotImplemented
        return all(getattr(self, f) == getattr(other, f) for f in self.FIELDS)

    def __repr__(self) -> str:
        """Pretty-printed dict of the :attr:`FIELDS` values."""
        return pformat({f: getattr(self, f) for f in self.FIELDS})
class Wikilink(Pattern):
    """
    Pattern for detecting wikilinks!

    This pattern implements an extended wikilink syntax that includes

    * **n-back links** - allows the user to specify messages in threads that are not the initiating message, and
    * **Semantic wikilinks** - specify a triplet subject-predicate-object link

    In each of the following examples, `LINK` is a placeholder for the text of the wikilink to be made.

    # N-Back Links (see :class:`.NBack`)

    For all of these, whitespace in-between the n-back specifier and the link text will be ignored. So
    `[[^LINK]]` and `[[^ LINK]]` are both valid.

    * **Preceding Message** - `[[^LINK]]`
    * **Entire Preceding Thread** - `[[^*LINK]]`
    * **Ranges**

      * **Fully specified** - `[[^{n,m}LINK]]` where `n` and `m` are the start and end of the range to be included, inclusive.
        eg. `[[^{2,5}LINK]]` would specify four messages: the 2nd one above the initiating message through the 5th, and
        `n == 0` indicates the initiating message.
      * **End specified** - `[[^{,m}LINK]]` OR `[[^{m}LINK]]` - include the initiating message and the `m` messages above it.
      * **Start specified** - `[[^{n,}LINK]]` - include all preceding messages in the thread before the `nth` message

    # Semantic Wikilinks

    Semantic wikilinks create a subject, predicate, object triplet. They use `::` as a delimiter
    between terms, and the presence of a `::` indicates that a wikilink is semantic.

    `SUB`, `PRED`, and `OBJ` are placeholders for the parts of
    a triplet in the following examples.

    * **Complete Triplet** - `[[SUB::PRED::OBJ]]` - create a semantic wikilink on the `SUB`ject page that links to the
      `OBJ`ect page with the indicated predicate.
      eg. `[[Paper::Has DOI::https://doi.org/10.xxx/yyyy]]`
    * **Implicit Triplet** - `[[PRED::OBJ]]` after a `[[SUB]]` wikilink has been previously used in the message or thread.
      A subject can also be declared with a complete triplet.

    NOTE(review): the parser built by :meth:`make_parser` only names the `nback`, `link` and `section`
    results and does not split on `::`, so `predicate` and `object` are only populated when passed to
    the constructor directly.
    """

    FIELDS = ('link', 'nback', 'predicate', 'object', 'section')

    def __init__(
            self,
            link: str,
            nback: Optional[Union[NBack, tuple, dict]] = None,
            predicate: Optional[str] = None,
            object: Optional[str] = None,
            section: Optional[str] = None,
            **kwargs):
        """
        Args:
            link: Text of the wikilink.
            nback: n-back specifier. A tuple/list is expanded positionally and a
                dict or pyparsing ``ParseResults`` is expanded by keyword into
                :class:`NBack`; any other value (including an existing
                :class:`NBack` or ``None``) is stored unchanged.
            predicate: Predicate of a semantic wikilink, if any.
            object: Object of a semantic wikilink, if any.
            section: Page section. A ``ParseResults`` is reduced to its
                first token.
            **kwargs: Forwarded to the parent ``Pattern`` initializer.
        """
        super().__init__(**kwargs)
        self.link = link

        # Normalise the accepted nback spellings to an NBack instance.
        if isinstance(nback, (tuple, list)):
            nback = NBack(*nback)
        elif isinstance(nback, dict):
            nback = NBack(**nback)
        elif isinstance(nback, pp.ParseResults):
            nback = NBack(**dict(nback))

        # The section sub-parser yields a one-token ParseResults.
        if isinstance(section, pp.ParseResults):
            section = section[0]

        self.nback = nback
        self.predicate = predicate
        self.object = object
        self.section = section

    @classmethod
    def make_parser(cls) -> 'pp.ParserElement':
        """
        Make the parser to detect wikilinks!

        Returns:
            A pyparsing element matching ``[[`` + optional n-back specifier +
            link text + optional ``#section`` + ``]]``, with the named
            results ``nback``, ``link`` and ``section``.
        """
        # All wikilinks start with [[ and end with ]]
        lbracket = pp.Literal('[[').suppress()
        rbracket = pp.Literal(']]').suppress()

        # nback parser
        nback = NBack.make_parser()

        # main wikilink subject text
        link = pp.Word(pp.printables + " ", excludeChars="#[]{}|")

        # optional page section
        hash_mark = pp.Literal("#").suppress()
        section = hash_mark + link

        # Combine all
        return lbracket + pp.Optional(nback) + link("link") + pp.Optional(section("section")) + rbracket

    @classmethod
    def parse(cls, string: str, return_parsed: bool = False) -> Union[List['Wikilink'], 'pp.ParseResults']:
        """
        Find every wikilink in ``string``.

        Args:
            string: Text to search.
            return_parsed: If ``True``, return the raw pyparsing search
                results instead of constructed instances.

        Returns:
            A list with one instance of ``cls`` per match, or the raw
            ``search_string`` results when ``return_parsed`` is ``True``.
        """
        results = cls.make_parser().search_string(string)
        if return_parsed:
            return results
        # Build via ``cls`` so subclasses get instances of their own type.
        return [cls(**dict(res.items())) for res in results]

    def __eq__(self, other: object) -> bool:
        """Field-wise equality over :attr:`FIELDS`; defers to Python for non-``Wikilink`` operands."""
        if not isinstance(other, Wikilink):
            # Avoids AttributeError from getattr() on a foreign object.
            return NotImplemented
        return all(getattr(self, f) == getattr(other, f) for f in self.FIELDS)

    def __repr__(self) -> str:
        """Pretty-printed dict of the :attr:`FIELDS` values that are not ``None``."""
        return pformat({f: getattr(self, f) for f in self.FIELDS if getattr(self, f) is not None})