From 70cdde5d460b4078bd2bbd81bb27d12b131423ad Mon Sep 17 00:00:00 2001 From: Thomas Sileo Date: Thu, 4 Aug 2022 17:36:21 +0200 Subject: [PATCH] OG metadata fixes/tweaks --- app/boxes.py | 28 ++++++++++++++++------------ app/utils/opengraph.py | 30 ++++++++++++++++++++++-------- 2 files changed, 38 insertions(+), 20 deletions(-) diff --git a/app/boxes.py b/app/boxes.py index d09ea94..8f1f0a8 100644 --- a/app/boxes.py +++ b/app/boxes.py @@ -59,21 +59,21 @@ async def save_outbox_object( source: str | None = None, is_transient: bool = False, ) -> models.OutboxObject: - ra = await RemoteObject.from_raw_object(raw_object) + ro = await RemoteObject.from_raw_object(raw_object) outbox_object = models.OutboxObject( public_id=public_id, - ap_type=ra.ap_type, - ap_id=ra.ap_id, - ap_context=ra.ap_context, - ap_object=ra.ap_object, - visibility=ra.visibility, - og_meta=await opengraph.og_meta_from_note(ra.ap_object), + ap_type=ro.ap_type, + ap_id=ro.ap_id, + ap_context=ro.ap_context, + ap_object=ro.ap_object, + visibility=ro.visibility, + og_meta=await opengraph.og_meta_from_note(db_session, ro), relates_to_inbox_object_id=relates_to_inbox_object_id, relates_to_outbox_object_id=relates_to_outbox_object_id, relates_to_actor_id=relates_to_actor_id, - activity_object_ap_id=ra.activity_object_ap_id, - is_hidden_from_homepage=True if ra.in_reply_to else False, + activity_object_ap_id=ro.activity_object_ap_id, + is_hidden_from_homepage=True if ro.in_reply_to else False, source=source, is_transient=is_transient, ) @@ -429,7 +429,7 @@ async def send_create( # If the note is public, check if we need to send any webmentions if visibility == ap.VisibilityEnum.PUBLIC: - possible_targets = opengraph._urls_from_note(obj) + possible_targets = await opengraph.external_urls(db_session, outbox_object) logger.info(f"webmentions possible targert {possible_targets}") for target in possible_targets: webmention_endpoint = await webmentions.discover_webmention_endpoint(target) @@ -552,7 +552,8 @@ async def send_update( # If the note is public, check if we need to send any webmentions if outbox_object.visibility == ap.VisibilityEnum.PUBLIC: - possible_targets = opengraph._urls_from_note(note) + + possible_targets = await opengraph.external_urls(db_session, outbox_object) logger.info(f"webmentions possible targert {possible_targets}") for target in possible_targets: webmention_endpoint = await webmentions.discover_webmention_endpoint(target) @@ -1209,7 +1210,7 @@ async def _process_note_object( relates_to_inbox_object_id=parent_activity.id, relates_to_outbox_object_id=None, activity_object_ap_id=ro.activity_object_ap_id, - og_meta=await opengraph.og_meta_from_note(ro.ap_object), + og_meta=await opengraph.og_meta_from_note(db_session, ro), # Hide replies from the stream is_hidden_from_stream=not ( (not is_reply and is_from_following) or is_mention or is_local_reply @@ -1614,6 +1615,9 @@ async def save_to_inbox( ap_published_at=announced_object.ap_published_at, ap_object=announced_object.ap_object, visibility=announced_object.visibility, + og_meta=await opengraph.og_meta_from_note( + db_session, announced_object + ), is_hidden_from_stream=True, ) db_session.add(announced_inbox_object) diff --git a/app/utils/opengraph.py b/app/utils/opengraph.py index 563f419..13b8c35 100644 --- a/app/utils/opengraph.py +++ b/app/utils/opengraph.py @@ -7,8 +7,13 @@ import httpx from bs4 import BeautifulSoup # type: ignore from pydantic import BaseModel -from app import activitypub as ap +from app import ap_object from app import config +from app.actor import LOCAL_ACTOR +from app.actor import fetch_actor +from app.database import AsyncSession +from app.models import InboxObject +from app.models import OutboxObject from app.utils.url import is_url_valid @@ -44,17 +49,23 @@ def _scrap_og_meta(url: str, html: str) -> OpenGraphMeta | None: return OpenGraphMeta.parse_obj(raw) -def _urls_from_note(note: ap.RawObject) -> set[str]: - note_host = urlparse(ap.get_id(note["id"]) or "").netloc +async def external_urls( + db_session: AsyncSession, + ro: ap_object.RemoteObject | OutboxObject | InboxObject, +) -> set[str]: + note_host = urlparse(ro.ap_id).netloc tags_hrefs = set() - for tag in note.get("tag", []): + for tag in ro.tags: if tag_href := tag.get("href"): tags_hrefs.add(tag_href) + if tag.get("type") == "Mention" and tag["name"] != LOCAL_ACTOR.handle: + mentioned_actor = await fetch_actor(db_session, tag["href"]) + tags_hrefs.add(mentioned_actor.url) urls = set() - if "content" in note: - soup = BeautifulSoup(note["content"], "html5lib") + if ro.content: + soup = BeautifulSoup(ro.content, "html5lib") for link in soup.find_all("a"): h = link.get("href") ph = urlparse(h) @@ -91,9 +102,12 @@ async def _og_meta_from_url(url: str) -> OpenGraphMeta | None: return _scrap_og_meta(url, resp.text) -async def og_meta_from_note(note: ap.RawObject) -> list[dict[str, Any]]: +async def og_meta_from_note( + db_session: AsyncSession, + ro: ap_object.RemoteObject, +) -> list[dict[str, Any]]: og_meta = [] - urls = _urls_from_note(note) + urls = await external_urls(db_session, ro) for url in urls: try: maybe_og_meta = await _og_meta_from_url(url)