From d378e171735e5676f2a9e925be7fc45d8d19fdb4 Mon Sep 17 00:00:00 2001 From: Thomas Sileo Date: Thu, 4 Aug 2022 19:11:14 +0200 Subject: [PATCH] Improve privacy replace --- app/ap_object.py | 18 ++---------------- app/templates.py | 2 ++ app/templates/utils.html | 2 +- app/utils/opengraph.py | 5 +++++ docs/user_guide.md | 3 +++ 5 files changed, 13 insertions(+), 17 deletions(-) diff --git a/app/ap_object.py b/app/ap_object.py index 72fd871..f96b5c6 100644 --- a/app/ap_object.py +++ b/app/ap_object.py @@ -2,7 +2,6 @@ import hashlib from datetime import datetime from functools import cached_property from typing import Any -from urllib.parse import urlparse import pydantic from bs4 import BeautifulSoup # type: ignore @@ -12,8 +11,8 @@ from app import activitypub as ap from app.actor import LOCAL_ACTOR from app.actor import Actor from app.actor import RemoteActor -from app.config import PRIVACY_REPLACE from app.media import proxied_media_url +from app.utils import privacy_replace from app.utils.datetime import now from app.utils.datetime import parse_isoformat @@ -179,20 +178,7 @@ class Object: if self.ap_object.get("mediaType") == "text/markdown": content = markdown(content, extensions=["mdx_linkify"]) - if not PRIVACY_REPLACE: - return content - - soup = BeautifulSoup(content, "html5lib") - links = soup.find_all("a", href=True) - - for link in links: - parsed_href = urlparse(link.attrs["href"]) - if new_netloc := PRIVACY_REPLACE.get( - parsed_href.netloc.removeprefix("www.") - ): - link.attrs["href"] = parsed_href._replace(netloc=new_netloc).geturl() - - return soup.find("body").decode_contents() + return privacy_replace.replace_content(content) @property def summary(self) -> str | None: diff --git a/app/templates.py b/app/templates.py index 2c51bbe..30af066 100644 --- a/app/templates.py +++ b/app/templates.py @@ -32,6 +32,7 @@ from app.config import generate_csrf_token from app.config import session_serializer from app.database import AsyncSession from app.media 
import proxied_media_url +from app.utils import privacy_replace from app.utils.datetime import now from app.utils.highlight import HIGHLIGHT_CSS from app.utils.highlight import highlight @@ -400,3 +401,4 @@ _templates.env.filters["emojify"] = _emojify _templates.env.filters["pluralize"] = _pluralize _templates.env.filters["parse_datetime"] = _parse_datetime _templates.env.filters["poll_item_pct"] = _poll_item_pct +_templates.env.filters["privacy_replace_url"] = privacy_replace.replace_url diff --git a/app/templates/utils.html b/app/templates/utils.html index bfc6331..a2040a5 100644 --- a/app/templates/utils.html +++ b/app/templates/utils.html @@ -317,7 +317,7 @@ {% endif %}
- {{ og_meta.title }} + {{ og_meta.title }} {% if og_meta.site_name %} {{ og_meta.site_name }} {% endif %} diff --git a/app/utils/opengraph.py b/app/utils/opengraph.py index 13b8c35..fccf43b 100644 --- a/app/utils/opengraph.py +++ b/app/utils/opengraph.py @@ -15,6 +15,7 @@ from app.database import AsyncSession from app.models import InboxObject from app.models import OutboxObject from app.utils.url import is_url_valid +from app.utils.url import make_abs class OpenGraphMeta(BaseModel): @@ -46,6 +47,10 @@ def _scrap_og_meta(url: str, html: str) -> OpenGraphMeta | None: if "title" not in raw: return None + for maybe_rel in {"url", "image"}: + if u := raw.get(maybe_rel): + raw[maybe_rel] = make_abs(u, url) + return OpenGraphMeta.parse_obj(raw) diff --git a/docs/user_guide.md b/docs/user_guide.md index 7109119..b129eda 100644 --- a/docs/user_guide.md +++ b/docs/user_guide.md @@ -43,6 +43,9 @@ To do so, just add as these extra config items, this is a sample config that rew domain = "youtube.com" replace_by ="yewtu.be" [[privacy_replace]] +domain = "youtu.be" +replace_by = "yewtu.be" +[[privacy_replace]] domain = "twitter.com" replace_by = "nitter.net" [[privacy_replace]]