mirror of
https://git.sr.ht/~tsileo/microblog.pub
synced 2024-11-15 03:04:28 +00:00
Tweak OG metadata fetching
This commit is contained in:
parent
f902868250
commit
16f4af0463
1 changed file with 9 additions and 8 deletions
|
@@ -1,5 +1,5 @@
|
||||||
import logging
|
import logging
|
||||||
import urllib
|
from urllib.parse import urlparse
|
||||||
|
|
||||||
import opengraph
|
import opengraph
|
||||||
import requests
|
import requests
|
||||||
|
@@ -15,18 +15,19 @@ logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
|
||||||
def links_from_note(note):
|
def links_from_note(note):
|
||||||
tags_href = set()
|
note_host = urlparse(ap._get_id(note["id"]) or "").netloc
|
||||||
for t in note.get("tag", []):
|
|
||||||
h = t.get("href")
|
|
||||||
if h:
|
|
||||||
tags_href.add(h)
|
|
||||||
|
|
||||||
links = set()
|
links = set()
|
||||||
if "content" in note:
|
if "content" in note:
|
||||||
soup = BeautifulSoup(note["content"], "html5lib")
|
soup = BeautifulSoup(note["content"], "html5lib")
|
||||||
for link in soup.find_all("a"):
|
for link in soup.find_all("a"):
|
||||||
h = link.get("href")
|
h = link.get("href")
|
||||||
if h.startswith("http") and h not in tags_href and is_url_valid(h):
|
ph = urlparse(h)
|
||||||
|
if (
|
||||||
|
ph.scheme in {"http", "https"}
|
||||||
|
and ph.netloc != note_host
|
||||||
|
and is_url_valid(h)
|
||||||
|
):
|
||||||
links.add(h)
|
links.add(h)
|
||||||
|
|
||||||
# FIXME(tsileo): support summary and name fields
|
# FIXME(tsileo): support summary and name fields
|
||||||
|
@@ -63,7 +64,7 @@ def fetch_og_metadata(user_agent, links):
|
||||||
|
|
||||||
# Keep track of the fetched URL as some crappy websites use relative URLs everywhere
|
# Keep track of the fetched URL as some crappy websites use relative URLs everywhere
|
||||||
data["_input_url"] = l
|
data["_input_url"] = l
|
||||||
u = urllib.parse.urlparse(l)
|
u = urlparse(l)
|
||||||
|
|
||||||
# If it's a relative URL, build the absolute version
|
# If it's a relative URL, build the absolute version
|
||||||
if "image" in data and data["image"].startswith("/"):
|
if "image" in data and data["image"].startswith("/"):
|
||||||
|
|
Loading…
Reference in a new issue