from urllib.parse import urlparse

from bs4 import BeautifulSoup  # type: ignore
from loguru import logger

from app.config import PRIVACY_REPLACE


def replace_content(content: str) -> str:
    """Rewrite the ``href`` of every ``<a>`` element found in *content*.

    Each ``href`` is passed through :func:`replace_url`.

    Args:
        content: HTML markup to process.

    Returns:
        *content* unchanged when ``PRIVACY_REPLACE`` is falsy or when the
        markup contains no ``<a href=...>`` element. Otherwise the inner
        markup of the parsed ``<body>``, re-serialised from the parse tree
        rather than taken from the original string.
    """
    if not PRIVACY_REPLACE:
        return content

    document = BeautifulSoup(content, "html5lib")
    anchors = list(document.find_all("a", href=True))
    if not anchors:
        # Nothing to rewrite: hand back the caller's string untouched.
        return content

    for anchor in anchors:
        original_href = anchor.attrs["href"]
        anchor.attrs["href"] = replace_url(original_href)

    body = document.find("body")
    return body.decode_contents()


def replace_url(u: str) -> str:
    """Return *u* with its network location swapped per ``PRIVACY_REPLACE``.

    ``PRIVACY_REPLACE`` is queried with ``.get()`` using the URL's hostname
    (minus a leading ``"www."``) as the key; presumably it maps hostnames to
    replacement netlocs -- confirm against ``app.config``.

    Args:
        u: The URL to inspect.

    Returns:
        *u* unchanged when ``PRIVACY_REPLACE`` is falsy, when the URL cannot
        be parsed or has no hostname (a warning is logged in that case), or
        when the lookup yields no truthy replacement. Otherwise the URL
        rebuilt with the whole netloc replaced by the looked-up value.
    """
    if not PRIVACY_REPLACE:
        return u

    try:
        parts = urlparse(u)
        host = parts.hostname
    except Exception:
        # Treat any parsing failure the same way as a URL without hostname.
        host = None

    if not host:
        logger.warning(f"Failed to parse url={u}")
        return u

    replacement = PRIVACY_REPLACE.get(host.removeprefix("www."))
    if not replacement:
        return u

    # The entire netloc is substituted, so any userinfo/port present in the
    # original URL is not carried over.
    return parts._replace(netloc=replacement).geturl()