From 5eaa0f291b9d1ca2d8a5d578241a99155f8e1e69 Mon Sep 17 00:00:00 2001 From: Thomas Sileo Date: Wed, 5 Oct 2022 20:05:16 +0200 Subject: [PATCH] More Markdown improvements --- app/activitypub.py | 30 +++++++++++++---------- app/source.py | 60 ++++++++++++++++++++++++++++++++-------------- 2 files changed, 59 insertions(+), 31 deletions(-) diff --git a/app/activitypub.py b/app/activitypub.py index febab1c..61f1e8b 100644 --- a/app/activitypub.py +++ b/app/activitypub.py @@ -6,7 +6,6 @@ from typing import Any import httpx from loguru import logger -from markdown import markdown from app import config from app.config import ALSO_KNOWN_AS @@ -14,6 +13,7 @@ from app.config import AP_CONTENT_TYPE # noqa: F401 from app.config import MOVED_TO from app.httpsig import auth from app.key import get_pubkey_as_pem +from app.source import dedup_tags from app.source import hashtagify from app.utils.url import check_url @@ -101,6 +101,19 @@ class VisibilityEnum(str, enum.Enum): _LOCAL_ACTOR_SUMMARY, _LOCAL_ACTOR_TAGS = hashtagify(config.CONFIG.summary) +_LOCAL_ACTOR_METADATA = [] +if config.CONFIG.metadata: + for kv in config.CONFIG.metadata: + kv_value, kv_tags = hashtagify(kv.value) + _LOCAL_ACTOR_METADATA.append( + { + "name": kv.key, + "type": "PropertyValue", + "value": kv_value, + } + ) + _LOCAL_ACTOR_TAGS.extend(kv_tags) + ME = { "@context": AS_EXTENDED_CTX, @@ -113,7 +126,7 @@ ME = { "outbox": config.BASE_URL + "/outbox", "preferredUsername": config.USERNAME, "name": config.CONFIG.name, - "summary": markdown(_LOCAL_ACTOR_SUMMARY, extensions=["mdx_linkify"]), + "summary": _LOCAL_ACTOR_SUMMARY, "endpoints": { # For compat with servers expecting a sharedInbox... "sharedInbox": config.BASE_URL @@ -121,16 +134,7 @@ ME = { }, "url": config.ID + "/", # XXX: the path is important for Mastodon compat "manuallyApprovesFollowers": config.CONFIG.manually_approves_followers, - "attachment": [ - { - "name": kv.key, - "type": "PropertyValue", - "value": markdown(kv.value, extensions=["mdx_linkify", "fenced_code"]), - } - for kv in config.CONFIG.metadata - ] - if config.CONFIG.metadata - else [], + "attachment": _LOCAL_ACTOR_METADATA, "icon": { "mediaType": mimetypes.guess_type(config.CONFIG.icon_url)[0], "type": "Image", @@ -141,7 +145,7 @@ ME = { "owner": config.ID, "publicKeyPem": get_pubkey_as_pem(config.KEY_PATH), }, - "tag": _LOCAL_ACTOR_TAGS, + "tag": dedup_tags(_LOCAL_ACTOR_TAGS), } if ALSO_KNOWN_AS: diff --git a/app/source.py b/app/source.py index 0dea35c..b411b04 100644 --- a/app/source.py +++ b/app/source.py @@ -21,15 +21,16 @@ if typing.TYPE_CHECKING: _FORMATTER = HtmlFormatter(style=CODE_HIGHLIGHTING_THEME) _HASHTAG_REGEX = re.compile(r"(#[\d\w]+)") -_MENTION_REGEX = re.compile(r"@[\d\w_.+-]+@[\d\w-]+\.[\d\w\-.]+") +_MENTION_REGEX = re.compile(r"(@[\d\w_.+-]+@[\d\w-]+\.[\d\w\-.]+)") +_URL_REGEX = re.compile( + "(https?:\\/\\/(?:www\\.)?[-a-zA-Z0-9@:%._\\+~#=]{1,256}\\.[a-zA-Z0-9()]{1,6}\\b(?:[-a-zA-Z0-9()@:%_\\+.~#?&\\/=]*))" # noqa: E501 +) class AutoLink(SpanToken): parse_inner = False precedence = 10 - pattern = re.compile( - "(https?:\\/\\/(?:www\\.)?[-a-zA-Z0-9@:%._\\+~#=]{1,256}\\.[a-zA-Z0-9()]{1,6}\\b(?:[-a-zA-Z0-9()@:%_\\+.~#?&\\/=]*))" # noqa: E501 - ) + pattern = _URL_REGEX def __init__(self, match_obj: re.Match) -> None: self.target = match_obj.group() @@ -38,7 +39,7 @@ class AutoLink(SpanToken): class Mention(SpanToken): parse_inner = False precedence = 10 - pattern = re.compile(r"(@[\d\w_.+-]+@[\d\w-]+\.[\d\w\-.]+)") + pattern = _MENTION_REGEX def __init__(self, match_obj: re.Match) -> None: self.target = match_obj.group() @@ -47,7 +48,7 @@ class Mention(SpanToken): class Hashtag(SpanToken): parse_inner = False precedence = 10 - pattern = re.compile(r"(#[\d\w]+)") + pattern = _HASHTAG_REGEX def __init__(self, match_obj: re.Match) -> None: self.target = match_obj.group() @@ -88,9 +89,13 @@ class CustomRenderer(HTMLRenderer): def render_hashtag(self, token: Hashtag) -> str: tag = token.target[1:] - link = f'' # noqa: E501 + link = f'' # noqa: E501 self.tags.append( - dict(href=f"{BASE_URL}/t/{tag}", name=token.target, type="Hashtag") + dict( + href=f"{BASE_URL}/t/{tag.lower()}", + name=token.target.lower(), + type="Hashtag", + ) ) return link @@ -134,17 +139,22 @@ async def _prefetch_mentioned_actors( return actors -def hashtagify(content: str) -> tuple[str, list[dict[str, str]]]: - # TODO: fix this, switch to mistletoe? +def hashtagify( + content: str, +) -> tuple[str, list[dict[str, str]]]: tags = [] - hashtags = re.findall(_HASHTAG_REGEX, content) - hashtags = sorted(set(hashtags), reverse=True) # unique tags, longest first - for hashtag in hashtags: - tag = hashtag[1:] - link = f'' # noqa: E501 - tags.append(dict(href=f"{BASE_URL}/t/{tag}", name=hashtag, type="Hashtag")) - content = content.replace(hashtag, link) - return content, tags + with CustomRenderer( + mentioned_actors={}, + enable_mentionify=False, + enable_hashtagify=True, + ) as renderer: + rendered_content = renderer.render(Document(content)) + tags.extend(renderer.tags) + + # Handle custom emoji + tags.extend(emoji.tags(content)) + + return rendered_content, tags async def markdownify( @@ -174,3 +184,17 @@ async def markdownify( tags.extend(emoji.tags(content)) return rendered_content, tags, list(mentioned_actors.values()) + + +def dedup_tags(tags: list[dict[str, str]]) -> list[dict[str, str]]: + idx = set() + deduped_tags = [] + for tag in tags: + tag_idx = (tag["type"], tag["name"]) + if tag_idx in idx: + continue + + idx.add(tag_idx) + deduped_tags.append(tag) + + return deduped_tags