mirror of
https://git.sr.ht/~tsileo/microblog.pub
synced 2024-12-22 05:04:27 +00:00
Improve Open Graph metadata parsing
This commit is contained in:
parent
5ea22edcb8
commit
b0cb248a23
1 changed files with 22 additions and 8 deletions
|
@ -43,18 +43,32 @@ def fetch_og_metadata(user_agent, links):
|
|||
for l in links:
|
||||
check_url(l)
|
||||
|
||||
# Remove any AP actor from the list
|
||||
# Remove any AP objects
|
||||
try:
|
||||
p = lookup(l)
|
||||
if p.has_type(ap.ACTOR_TYPES):
|
||||
continue
|
||||
lookup(l)
|
||||
continue
|
||||
except NotAnActivityError:
|
||||
pass
|
||||
except Exception:
|
||||
logger.exception(f"skipping {l} because of issues during AP lookup")
|
||||
continue
|
||||
|
||||
r = requests.get(l, headers={"User-Agent": user_agent}, timeout=15)
|
||||
r.raise_for_status()
|
||||
if not r.headers.get("content-type").startswith("text/html"):
|
||||
logger.debug(f"skipping {l}")
|
||||
try:
|
||||
h = requests.head(l, headers={"User-Agent": user_agent}, timeout=3)
|
||||
h.raise_for_status()
|
||||
except requests.HTTPError as http_err:
|
||||
logger.debug(f"failed to HEAD {l}, got a {http_err.response.status_code}")
|
||||
continue
|
||||
|
||||
if not h.headers.get("content-type").startswith("text/html"):
|
||||
logger.debug(f"skipping {l} for bad content type")
|
||||
continue
|
||||
|
||||
try:
|
||||
r = requests.get(l, headers={"User-Agent": user_agent}, timeout=5)
|
||||
r.raise_for_status()
|
||||
except requests.HTTPError as http_err:
|
||||
logger.debug(f"failed to GET {l}, got a {http_err.response.status_code}")
|
||||
continue
|
||||
|
||||
r.encoding = "UTF-8"
|
||||
|
|
Loading…
Reference in a new issue