forked from forks/microblog.pub
Fix OG metadata scraping and improve workers
This commit is contained in:
parent
c3eb44add7
commit
793a939046
5 changed files with 37 additions and 7 deletions
|
@ -3,7 +3,6 @@ import traceback
|
||||||
from datetime import datetime
|
from datetime import datetime
|
||||||
from datetime import timedelta
|
from datetime import timedelta
|
||||||
|
|
||||||
import httpx
|
|
||||||
from loguru import logger
|
from loguru import logger
|
||||||
from sqlalchemy import func
|
from sqlalchemy import func
|
||||||
from sqlalchemy import select
|
from sqlalchemy import select
|
||||||
|
@ -108,6 +107,7 @@ async def process_next_incoming_activity(
|
||||||
|
|
||||||
next_activity.tries = next_activity.tries + 1
|
next_activity.tries = next_activity.tries + 1
|
||||||
next_activity.last_try = now()
|
next_activity.last_try = now()
|
||||||
|
await db_session.commit()
|
||||||
|
|
||||||
if next_activity.ap_object and next_activity.sent_by_ap_actor_id:
|
if next_activity.ap_object and next_activity.sent_by_ap_actor_id:
|
||||||
try:
|
try:
|
||||||
|
@ -120,13 +120,16 @@ async def process_next_incoming_activity(
|
||||||
),
|
),
|
||||||
timeout=60,
|
timeout=60,
|
||||||
)
|
)
|
||||||
except httpx.TimeoutException as exc:
|
except asyncio.exceptions.TimeoutError:
|
||||||
url = exc._request.url if exc._request else None
|
logger.error("Activity took too long to process")
|
||||||
logger.error(f"Failed, HTTP timeout when fetching {url}")
|
await db_session.rollback()
|
||||||
|
await db_session.refresh(next_activity)
|
||||||
next_activity.error = traceback.format_exc()
|
next_activity.error = traceback.format_exc()
|
||||||
_set_next_try(next_activity)
|
_set_next_try(next_activity)
|
||||||
except Exception:
|
except Exception:
|
||||||
logger.exception("Failed")
|
logger.exception("Failed")
|
||||||
|
await db_session.rollback()
|
||||||
|
await db_session.refresh(next_activity)
|
||||||
next_activity.error = traceback.format_exc()
|
next_activity.error = traceback.format_exc()
|
||||||
_set_next_try(next_activity)
|
_set_next_try(next_activity)
|
||||||
else:
|
else:
|
||||||
|
|
|
@ -1,12 +1,15 @@
|
||||||
import asyncio
|
import asyncio
|
||||||
import mimetypes
|
import mimetypes
|
||||||
import re
|
import re
|
||||||
|
import signal
|
||||||
|
from concurrent.futures import TimeoutError
|
||||||
from typing import Any
|
from typing import Any
|
||||||
from urllib.parse import urlparse
|
from urllib.parse import urlparse
|
||||||
|
|
||||||
import httpx
|
import httpx
|
||||||
from bs4 import BeautifulSoup # type: ignore
|
from bs4 import BeautifulSoup # type: ignore
|
||||||
from loguru import logger
|
from loguru import logger
|
||||||
|
from pebble import concurrent # type: ignore
|
||||||
from pydantic import BaseModel
|
from pydantic import BaseModel
|
||||||
|
|
||||||
from app import activitypub as ap
|
from app import activitypub as ap
|
||||||
|
@ -29,7 +32,11 @@ class OpenGraphMeta(BaseModel):
|
||||||
site_name: str
|
site_name: str
|
||||||
|
|
||||||
|
|
||||||
|
@concurrent.process(timeout=5)
|
||||||
def _scrap_og_meta(url: str, html: str) -> OpenGraphMeta | None:
|
def _scrap_og_meta(url: str, html: str) -> OpenGraphMeta | None:
|
||||||
|
# Prevent SIGTERM to bubble up to the worker
|
||||||
|
signal.signal(signal.SIGTERM, signal.SIG_IGN)
|
||||||
|
|
||||||
soup = BeautifulSoup(html, "html5lib")
|
soup = BeautifulSoup(html, "html5lib")
|
||||||
ogs = {
|
ogs = {
|
||||||
og.attrs["property"]: og.attrs.get("content")
|
og.attrs["property"]: og.attrs.get("content")
|
||||||
|
@ -58,6 +65,10 @@ def _scrap_og_meta(url: str, html: str) -> OpenGraphMeta | None:
|
||||||
return OpenGraphMeta.parse_obj(raw)
|
return OpenGraphMeta.parse_obj(raw)
|
||||||
|
|
||||||
|
|
||||||
|
def scrap_og_meta(url: str, html: str) -> OpenGraphMeta | None:
|
||||||
|
return _scrap_og_meta(url, html).result()
|
||||||
|
|
||||||
|
|
||||||
async def external_urls(
|
async def external_urls(
|
||||||
db_session: AsyncSession,
|
db_session: AsyncSession,
|
||||||
ro: ap_object.RemoteObject | OutboxObject | InboxObject,
|
ro: ap_object.RemoteObject | OutboxObject | InboxObject,
|
||||||
|
@ -126,7 +137,10 @@ async def _og_meta_from_url(url: str) -> OpenGraphMeta | None:
|
||||||
return None
|
return None
|
||||||
|
|
||||||
try:
|
try:
|
||||||
return _scrap_og_meta(url, resp.text)
|
return scrap_og_meta(url, resp.text)
|
||||||
|
except TimeoutError:
|
||||||
|
logger.info(f"Timed out when scraping OG meta for {url}")
|
||||||
|
return None
|
||||||
except Exception:
|
except Exception:
|
||||||
logger.info(f"Failed to scrap OG meta for {url}")
|
logger.info(f"Failed to scrap OG meta for {url}")
|
||||||
return None
|
return None
|
||||||
|
|
|
@ -69,5 +69,5 @@ class Worker(Generic[T]):
|
||||||
logger.info("stopping loop")
|
logger.info("stopping loop")
|
||||||
|
|
||||||
async def _shutdown(self, sig: signal.Signals) -> None:
|
async def _shutdown(self, sig: signal.Signals) -> None:
|
||||||
logger.info(f"Caught {signal=}")
|
logger.info(f"Caught {sig=}")
|
||||||
self._stop_event.set()
|
self._stop_event.set()
|
||||||
|
|
14
poetry.lock
generated
14
poetry.lock
generated
|
@ -689,6 +689,14 @@ category = "dev"
|
||||||
optional = false
|
optional = false
|
||||||
python-versions = ">=3.7"
|
python-versions = ">=3.7"
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "pebble"
|
||||||
|
version = "5.0.2"
|
||||||
|
description = "Threading and multiprocessing eye-candy."
|
||||||
|
category = "main"
|
||||||
|
optional = false
|
||||||
|
python-versions = ">=3.6"
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "pillow"
|
name = "pillow"
|
||||||
version = "9.3.0"
|
version = "9.3.0"
|
||||||
|
@ -1263,7 +1271,7 @@ dev = ["pytest (>=4.6.2)", "black (>=19.3b0)"]
|
||||||
[metadata]
|
[metadata]
|
||||||
lock-version = "1.1"
|
lock-version = "1.1"
|
||||||
python-versions = "^3.10"
|
python-versions = "^3.10"
|
||||||
content-hash = "89df524a545a19a20440d1872c93151bbf3f68d3b3d20cc50bc9049dd0e6d25f"
|
content-hash = "13a1f5fc3f65c56e753062dca6ab74a50f7270d78a08ebf6297f7b4fa26b5eac"
|
||||||
|
|
||||||
[metadata.files]
|
[metadata.files]
|
||||||
aiosqlite = [
|
aiosqlite = [
|
||||||
|
@ -1871,6 +1879,10 @@ pathspec = [
|
||||||
{file = "pathspec-0.10.1-py3-none-any.whl", hash = "sha256:46846318467efc4556ccfd27816e004270a9eeeeb4d062ce5e6fc7a87c573f93"},
|
{file = "pathspec-0.10.1-py3-none-any.whl", hash = "sha256:46846318467efc4556ccfd27816e004270a9eeeeb4d062ce5e6fc7a87c573f93"},
|
||||||
{file = "pathspec-0.10.1.tar.gz", hash = "sha256:7ace6161b621d31e7902eb6b5ae148d12cfd23f4a249b9ffb6b9fee12084323d"},
|
{file = "pathspec-0.10.1.tar.gz", hash = "sha256:7ace6161b621d31e7902eb6b5ae148d12cfd23f4a249b9ffb6b9fee12084323d"},
|
||||||
]
|
]
|
||||||
|
pebble = [
|
||||||
|
{file = "Pebble-5.0.2-py3-none-any.whl", hash = "sha256:61b2dfd52b1a8c083b4e6cf3e0f1ff2e8a430a6283c53969a7057a1c91bed3cd"},
|
||||||
|
{file = "Pebble-5.0.2.tar.gz", hash = "sha256:9c58c03eaf920c31287444c6fef39dc53baeac9de221ead104f5c9b48e8bd587"},
|
||||||
|
]
|
||||||
pillow = [
|
pillow = [
|
||||||
{file = "Pillow-9.3.0-cp310-cp310-macosx_10_10_x86_64.whl", hash = "sha256:0b7257127d646ff8676ec8a15520013a698d1fdc48bc2a79ba4e53df792526f2"},
|
{file = "Pillow-9.3.0-cp310-cp310-macosx_10_10_x86_64.whl", hash = "sha256:0b7257127d646ff8676ec8a15520013a698d1fdc48bc2a79ba4e53df792526f2"},
|
||||||
{file = "Pillow-9.3.0-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:b90f7616ea170e92820775ed47e136208e04c967271c9ef615b6fbd08d9af0e3"},
|
{file = "Pillow-9.3.0-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:b90f7616ea170e92820775ed47e136208e04c967271c9ef615b6fbd08d9af0e3"},
|
||||||
|
|
|
@ -44,6 +44,7 @@ uvicorn = {extras = ["standard"], version = "^0.18.3"}
|
||||||
Brotli = "^1.0.9"
|
Brotli = "^1.0.9"
|
||||||
greenlet = "^1.1.3"
|
greenlet = "^1.1.3"
|
||||||
mistletoe = "^0.9.0"
|
mistletoe = "^0.9.0"
|
||||||
|
Pebble = "^5.0.2"
|
||||||
|
|
||||||
[tool.poetry.dev-dependencies]
|
[tool.poetry.dev-dependencies]
|
||||||
black = "^22.3.0"
|
black = "^22.3.0"
|
||||||
|
|
Loading…
Reference in a new issue