Tweak OG metadata fetching

This commit is contained in:
Thomas Sileo 2019-08-04 23:36:38 +02:00
parent f902868250
commit 16f4af0463

View file

@ -1,5 +1,5 @@
import logging import logging
import urllib from urllib.parse import urlparse
import opengraph import opengraph
import requests import requests
@ -15,18 +15,19 @@ logger = logging.getLogger(__name__)
def links_from_note(note): def links_from_note(note):
tags_href = set() note_host = urlparse(ap._get_id(note["id"]) or "").netloc
for t in note.get("tag", []):
h = t.get("href")
if h:
tags_href.add(h)
links = set() links = set()
if "content" in note: if "content" in note:
soup = BeautifulSoup(note["content"], "html5lib") soup = BeautifulSoup(note["content"], "html5lib")
for link in soup.find_all("a"): for link in soup.find_all("a"):
h = link.get("href") h = link.get("href")
if h.startswith("http") and h not in tags_href and is_url_valid(h): ph = urlparse(h)
if (
ph.scheme in {"http", "https"}
and ph.netloc != note_host
and is_url_valid(h)
):
links.add(h) links.add(h)
# FIXME(tsileo): support summary and name fields # FIXME(tsileo): support summary and name fields
@ -63,7 +64,7 @@ def fetch_og_metadata(user_agent, links):
# Keep track of the fetched URL as some crappy websites use relative URLs everywhere # Keep track of the fetched URL as some crappy websites use relative URLs everywhere
data["_input_url"] = l data["_input_url"] = l
u = urllib.parse.urlparse(l) u = urlparse(l)
# If it's a relative URL, build the absolute version # If it's a relative URL, build the absolute version
if "image" in data and data["image"].startswith("/"): if "image" in data and data["image"].startswith("/"):