microblog.pub/app/utils/highlight.py

55 lines
1.8 KiB
Python
Raw Normal View History

2022-08-29 19:42:54 +00:00
import base64
import hashlib
2022-06-22 18:11:22 +00:00
from functools import lru_cache
from bs4 import BeautifulSoup # type: ignore
from pygments import highlight as phighlight # type: ignore
from pygments.formatters import HtmlFormatter # type: ignore
2022-07-12 20:24:15 +00:00
from pygments.lexers import get_lexer_by_name # type: ignore
2022-06-22 18:11:22 +00:00
from pygments.lexers import guess_lexer # type: ignore
from app.config import CODE_HIGHLIGHTING_THEME
# Single Pygments HTML formatter shared by the whole module, styled with the
# theme name taken from the application config.
_FORMATTER = HtmlFormatter(style=CODE_HIGHLIGHTING_THEME)

# Stylesheet matching the markup produced by ``_FORMATTER``.
HIGHLIGHT_CSS = _FORMATTER.get_style_defs()

# Base64 text form of the SHA-256 digest of the stylesheet above.
# NOTE(review): presumably consumed as a CSP style hash -- confirm at call sites.
HIGHLIGHT_CSS_HASH = base64.b64encode(
    hashlib.sha256(HIGHLIGHT_CSS.encode("utf-8")).digest()
).decode("ascii")
2022-06-22 18:11:22 +00:00
@lru_cache(256)
def highlight(html: str) -> str:
    """Syntax-highlight every ``<pre><code>`` block found in *html*.

    The fragment is parsed with html5lib. For each ``<code>`` element whose
    direct parent is a ``<pre>``, that ``<pre>`` is replaced by the markup
    Pygments generates for the code's content. ``<code>`` elements that are
    not direct children of a ``<pre>`` are left untouched. Results are
    memoized per input string (LRU cache, 256 entries).

    The lexer is picked from the first ``language-<name>`` class found on the
    ``<code>`` element; if there is none, or Pygments does not know the name,
    the lexer is guessed from the content.

    Args:
        html: HTML fragment to process.

    Returns:
        The serialized contents of the parsed ``<body>``, with the matching
        code blocks replaced by highlighted markup.
    """
    # Imported locally under an alias: inside this function the ``html``
    # parameter shadows the stdlib module of the same name.
    from html import unescape as unescape_entities

    soup = BeautifulSoup(html, "html5lib")
    for code in soup.find_all("code"):
        pre = code.parent
        if pre.name != "pre":
            continue
        # A <pre> holding several <code> children is swapped out when the
        # first one is handled; replacing the now-detached <pre> again would
        # raise ValueError, so skip the remaining children.
        if pre.parent is None:
            continue

        # Replace <br> tags with line breaks (Mastodon sends code like this)
        markup = (
            code.encode_contents().decode().replace("<br>", "\n").replace("<br/>", "\n")
        )
        # The serialized markup is entity-escaped and Pygments escapes its
        # input again. Decode the entities once so ``->``, ``<`` and ``&`` are
        # not double-escaped in the output (this supersedes the former
        # special case for ``) -&gt; ``).
        code_content = unescape_entities(markup)

        # If this comes from a microblog.pub instance we may have the language
        # in the class name. The class list may be missing or empty
        # (``class=""``), so never index into it blindly.
        language = next(
            (
                css_class.removeprefix("language-")
                for css_class in (code.attrs.get("class") or [])
                if css_class.startswith("language-")
            ),
            None,
        )

        lexer = None
        if language is not None:
            try:
                lexer = get_lexer_by_name(language)
            except Exception:
                # Best effort: an unknown language name must not break
                # rendering, fall back to guessing below.
                lexer = None
        if lexer is None:
            lexer = guess_lexer(code_content)

        # Replace the code with Pygment output
        rendered = BeautifulSoup(
            phighlight(code_content, lexer, _FORMATTER), "html5lib"
        )
        pre.replace_with(rendered.body.next_element)

    return soup.body.encode_contents().decode()