microblog.pub/utils/opengraph.py
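"""Extract external links from a note and fetch their Open Graph metadata."""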

import logging
from typing import Any
from typing import Dict
from typing import Set
from urllib.parse import urlparse

import opengraph
import requests
from bs4 import BeautifulSoup
from little_boxes import activitypub as ap
from little_boxes.errors import NotAnActivityError
from little_boxes.urlutils import check_url
from little_boxes.urlutils import is_url_valid

from .lookup import lookup
logger = logging.getLogger(__name__)


def links_from_note(note: Dict[str, Any]) -> Set[str]:
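    """Return external HTTP(S) links found in the note's HTML content."""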
    note_host = urlparse(ap._get_id(note["id"]) or "").netloc

    links = set()
    if "content" in note:
        soup = BeautifulSoup(note["content"], "html5lib")
        for link in soup.find_all("a"):
            h = link.get("href")
            ph = urlparse(h)
            if (
                ph.scheme in {"http", "https"}
                and ph.netloc != note_host
                and is_url_valid(h)
            ):
                links.add(h)

    # FIXME(tsileo): support summary and name fields
    return links


def fetch_og_metadata(user_agent, links):
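    """Fetch Open Graph metadata for each link, skipping AP objects and non-HTML pages."""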
    res = []
    for l in links:
        check_url(l)

        # Remove any AP objects
        try:
            lookup(l)
            continue
        except NotAnActivityError:
            pass
        except Exception:
            logger.exception(f"skipping {l} because of issues during AP lookup")
            continue

        try:
            h = requests.head(l, headers={"User-Agent": user_agent}, timeout=3)
            h.raise_for_status()
        except requests.HTTPError as http_err:
            logger.debug(f"failed to HEAD {l}, got a {http_err.response.status_code}")
            continue
        except requests.Timeout:
            logger.debug(f"HEAD {l} timed out")
            continue
        # Default to "" so a missing Content-Type header doesn't raise an AttributeError
        if not h.headers.get("content-type", "").startswith("text/html"):
            logger.debug(f"skipping {l} for bad content type")
            continue
        try:
            r = requests.get(l, headers={"User-Agent": user_agent}, timeout=5)
            r.raise_for_status()
        except requests.HTTPError as http_err:
            logger.debug(f"failed to GET {l}, got a {http_err.response.status_code}")
            continue
        except requests.Timeout:
            logger.debug(f"GET {l} timed out")
            continue

        r.encoding = "UTF-8"
        html = r.text
        try:
            data = dict(opengraph.OpenGraph(html=html))
        except Exception:
            logger.exception(f"failed to parse {l}")
            continue

        # Keep track of the fetched URL as some crappy websites use relative URLs everywhere
        data["_input_url"] = l
        u = urlparse(l)

        # If it's a relative URL, build the absolute version
        if "image" in data and data["image"].startswith("/"):
            data["image"] = u._replace(
                path=data["image"], params="", query="", fragment=""
            ).geturl()

        if "url" in data and data["url"].startswith("/"):
            data["url"] = u._replace(
                path=data["url"], params="", query="", fragment=""
            ).geturl()

        if data.get("url"):
            res.append(data)

    return res
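
# A minimal usage sketch (not part of the original module); the note payload and
# user-agent string below are made-up placeholders for illustration only:
#
#     note = {
#         "id": "https://example.com/notes/1",
#         "content": '<p><a href="https://example.org/post">an external link</a></p>',
#     }
#     links = links_from_note(note)
#     og_data = fetch_og_metadata("my-microblog/1.0 (illustrative UA)", links)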