# mastodon-ld/masto_ld/patterns/wikilink.py

import re
from masto_ld.patterns.patterns import Pattern
from dataclasses import dataclass
from typing import Optional, Union, List
import pyparsing as pp
from pprint import pformat


# Non-greedy match of the text between a ``[[`` and the next ``]]``; the single
# capture group holds that inner text. ``.`` does not match newlines here (no
# DOTALL flag), so a match cannot span lines.
WIKILINK = re.compile(r'\[\[(.*?)\]\]', re.IGNORECASE)
"""
Basic structure of wikilink, used to detect presence
"""

class NBack:
    """
    N-back specifier of a wikilink: which messages, counted backwards from the
    current one, a link applies to.

    Attributes (also listed in :attr:`FIELDS`, which drives ``__eq__`` and ``__repr__``):

    * ``wildcard`` - ``True`` when the specifier is ``^*``; ``start`` and ``end`` are then ``None``
    * ``start`` - start of the range as an ``int``, or ``None`` if unspecified
    * ``end`` - end of the range as an ``int``, or ``None`` if unspecified
    """
    FIELDS = ('wildcard', 'start', 'end')

    def __init__(self, start:Optional[int]=None, end:Optional[int]=None,
                 wildcard:Optional[Union[str,bool]]=None,
                 one:Optional[str]=None):
        """
        Args:
            start: start of the range; anything accepted by ``int()`` (eg. a parsed digit string)
            end: end of the range; anything accepted by ``int()``
            wildcard: any truthy value (eg. the parsed ``"*"``) marks a wildcard n-back.
                Takes precedence over every other argument.
            one: any truthy value (eg. the parsed bare ``"^"``) sets ``start == end == 1``.
                Takes precedence over ``start`` and ``end``.

        Raises:
            ValueError: if both ``start`` and ``end`` are given and ``start > end``
        """
        self.wildcard = bool(wildcard)
        if self.wildcard:
            # a wildcard has no range - ignore anything else that was passed
            self.start = None
            self.end = None
            return

        if one:
            self.start = 1
            self.end = 1
        else:
            # parsed values arrive as strings, so coerce whatever is present
            self.start = None if start is None else int(start)
            self.end = None if end is None else int(end)

        if self.start is not None and self.end is not None and self.start > self.end:
            raise ValueError(f"Start value must be less than or equal to end value, got start:{self.start}, end:{self.end}")

    @classmethod
    def make_parser(cls) -> 'pp.ParserElement':
        """
        Build the pyparsing element that matches an n-back specifier.

        The element is a group with results name ``nback``; inside it the named
        results are one of ``wildcard``, ``start`` and/or ``end``, or ``one``,
        matching the keyword arguments of ``__init__``.
        """
        # --------------------------------------------------
        # n-back links immediately follow the [[ and can be one of
        # ^
        # ^*
        # ^{n,m}
        # ^{n,}
        # ^{,m}
        # ^{m}

        # make elements
        caret = pp.Literal("^")
        lcurly = pp.Literal('{').suppress()
        rcurly = pp.Literal('}').suppress()
        integer = pp.Word(pp.nums)
        comma = pp.Literal(',').suppress()
        nb_range = caret + lcurly

        # combine into matches
        nb_wildcard = caret.suppress() + "*"
        # start or end can be omitted if comma is present
        nb_full = nb_range + pp.Optional(integer("start")) + comma + pp.Optional(integer("end")) + rcurly
        # if no comma present, it's just an end
        nb_end = nb_range + integer("end") + rcurly

        # combine into full nback parser
        # alternatives are tried in order, so the bare caret must come last -
        # every other form also begins with a caret
        nback = pp.Group(nb_wildcard('wildcard') | nb_full | nb_end | caret("one")).set_results_name("nback")
        return nback

    def __eq__(self, other:object) -> bool:
        """
        Two n-backs are equal when all of :attr:`FIELDS` are equal.

        Comparison with anything that is not an :class:`NBack` (eg. ``None``)
        returns ``NotImplemented`` so that ``==`` evaluates to ``False``
        rather than raising ``AttributeError``.
        """
        if not isinstance(other, NBack):
            return NotImplemented
        return all(getattr(self, f) == getattr(other, f) for f in self.FIELDS)

    def __repr__(self) -> str:
        """Pretty-printed dict of :attr:`FIELDS`"""
        return pformat({f:getattr(self, f) for f in self.FIELDS})

class Wikilink(Pattern):
    """
    Pattern for detecting wikilinks!

    This pattern implements an extended wikilink syntax that includes

    * **n-back links** - allows the user to specify messages in threads that are not the initiating message, and
    * **Semantic wikilinks** - specify a triplet subject-predicate-object link

    In each of the following examples, `LINK` is a placeholder for the text of the wikilink to be made.

    # N-Back Links (see :class:`.NBack`)

    For all of these, whitespace in-between the n-back specifier and the link text will be ignored. So
    `[[^LINK]]` and `[[^ LINK]]` are both valid.

    * **Preceding Message** - `[[^LINK]]`
    * **Entire Preceding Thread** - `[[^*LINK]]`
    * **Ranges**

      * **Fully specified** - `[[^{n,m}LINK]]` where `n` and `m` are the start and end of the range to be included, inclusive.
        eg. `[[^{2,5}LINK]]` would specify four messages: the 2nd one above the initiating message through the 5th, and
        `n == 0` indicates the initiating message.
      * **End specified** - `[[^{,m}LINK]]` OR `[[^{m}LINK]]` - include the initiating message and the `m` messages above it.
      * **Start specified** - `[[^{n,}LINK]]` - include all preceding messages in the thread before the `nth` message

    # Semantic Wikilinks

    Semantic wikilinks create a subject, predicate, object triplet. The subject is the page that the
    link is created on, and the object is the page that it links to.

    Semantic wikilinks use `::` as a delimiter between terms, and a `::` indicates that a wikilink is semantic.

    `SUB`, `PRED`, and `OBJ` are placeholders for the parts of
    a triplet in the following examples.

    * **Complete Triplet** - `[[SUB::PRED::OBJ]]` - create a semantic wikilink on the `SUB`ject page that links to the
      `OBJ`ect page with the indicated predicate.

      eg. `[[Paper::Has DOI::https://doi.org/10.xxx/yyyy]]`

    * **Implicit Triplet** - `[[PRED::OBJ]]` after a `[[SUB]]` wikilink has been previously used in the message or thread.
      A subject can also be declared with a complete triplet.

    .. note::

        The parser built by :meth:`make_parser` only extracts the n-back specifier, the link text and
        an optional ``#section``. It does not split on ``::``, so ``predicate`` and ``object`` are only
        set when they are passed to the constructor directly.
    """
    FIELDS = ('link', 'nback', 'predicate', 'object', 'section')

    def __init__(
            self,
            link: str,
            nback: Optional[Union[NBack, tuple, dict]] = None,
            predicate: Optional[str] = None,
            object: Optional[str] = None,
            section: Optional[str] = None,
            **kwargs):
        """
        Args:
            link: text of the wikilink
            nback: n-back specifier. An :class:`NBack` is stored as-is; a tuple or list is
                unpacked as positional arguments to :class:`NBack`; a dict or a pyparsing
                ``ParseResults`` is unpacked as keyword arguments to :class:`NBack`.
            predicate: predicate of a semantic wikilink
            object: object of a semantic wikilink
            section: page section (the part after ``#``). If given as a ``ParseResults``,
                its first element is used.
            **kwargs: forwarded to ``Pattern.__init__``
        """
        super().__init__(**kwargs)

        # normalize the accepted n-back representations to an NBack instance
        if isinstance(nback, (tuple, list)):
            nback = NBack(*nback)
        elif isinstance(nback, dict):
            nback = NBack(**nback)
        elif isinstance(nback, pp.ParseResults):
            # dict() of a ParseResults keeps only its named results
            nback = NBack(**dict(nback))

        if isinstance(section, pp.ParseResults):
            section = section[0]

        self.link = link
        self.nback = nback
        self.predicate = predicate
        self.object = object
        self.section = section

    @classmethod
    def make_parser(cls) -> pp.ParserElement:
        """
        Make the parser to detect wikilinks!

        Named results of a match are ``nback`` (optional), ``link`` and ``section`` (optional).
        """
        # All wikilinks start with [[ and end with ]]
        lbracket = pp.Literal('[[').suppress()
        rbracket = pp.Literal(']]').suppress()

        #nback parser
        nback = NBack.make_parser()

        # main wikilink subject text
        link = pp.Word(pp.printables+ " ", excludeChars="#[]{}|")

        # optional page section: a `#` followed by text with the same character set as the link
        hash_mark = pp.Literal("#").suppress()
        section = hash_mark + link

        # Combine all
        parser = lbracket + pp.Optional(nback) + link("link") + pp.Optional(section("section")) + rbracket
        return parser

    @classmethod
    def parse(cls, string:str, return_parsed:bool=False) -> Union[List['Wikilink'], pp.ParseResults]:
        """
        Find every wikilink in ``string``.

        Args:
            string: text to search
            return_parsed: if ``True``, return the raw result of ``search_string``
                instead of constructing wikilink objects

        Returns:
            A list with one instance of ``cls`` per match, built from the named results
            of that match - or the raw parse results if ``return_parsed`` is ``True``.
        """
        parser = cls.make_parser()
        results = parser.search_string(string)
        if return_parsed:
            return results
        # use cls rather than Wikilink so that subclasses get instances of themselves
        return [cls(**dict(res.items())) for res in results]

    def __eq__(self, other:object) -> bool:
        """
        Two wikilinks are equal when all of :attr:`FIELDS` are equal.

        Comparison with anything that is not a :class:`Wikilink` returns ``NotImplemented``
        so that ``==`` evaluates to ``False`` rather than raising ``AttributeError``.
        """
        if not isinstance(other, Wikilink):
            return NotImplemented
        return all(getattr(self, f) == getattr(other, f) for f in self.FIELDS)

    def __repr__(self) -> str:
        """Pretty-printed dict of the :attr:`FIELDS` that are not ``None``"""
        return pformat({f:getattr(self, f) for f in self.FIELDS if getattr(self, f) is not None})