mirror of
https://git.sr.ht/~tsileo/microblog.pub
synced 2024-11-15 03:04:28 +00:00
Improve Open Graph metadata parsing
This commit is contained in:
parent
5ea22edcb8
commit
b0cb248a23
1 changed file with 22 additions and 8 deletions
|
@@ -43,18 +43,32 @@ def fetch_og_metadata(user_agent, links):
|
||||||
for l in links:
|
for l in links:
|
||||||
check_url(l)
|
check_url(l)
|
||||||
|
|
||||||
# Remove any AP actor from the list
|
# Remove any AP objects
|
||||||
try:
|
try:
|
||||||
p = lookup(l)
|
lookup(l)
|
||||||
if p.has_type(ap.ACTOR_TYPES):
|
|
||||||
continue
|
continue
|
||||||
except NotAnActivityError:
|
except NotAnActivityError:
|
||||||
pass
|
pass
|
||||||
|
except Exception:
|
||||||
|
logger.exception(f"skipping {l} because of issues during AP lookup")
|
||||||
|
continue
|
||||||
|
|
||||||
r = requests.get(l, headers={"User-Agent": user_agent}, timeout=15)
|
try:
|
||||||
|
h = requests.head(l, headers={"User-Agent": user_agent}, timeout=3)
|
||||||
|
h.raise_for_status()
|
||||||
|
except requests.HTTPError as http_err:
|
||||||
|
logger.debug(f"failed to HEAD {l}, got a {http_err.response.status_code}")
|
||||||
|
continue
|
||||||
|
|
||||||
|
if not h.headers.get("content-type").startswith("text/html"):
|
||||||
|
logger.debug(f"skipping {l} for bad content type")
|
||||||
|
continue
|
||||||
|
|
||||||
|
try:
|
||||||
|
r = requests.get(l, headers={"User-Agent": user_agent}, timeout=5)
|
||||||
r.raise_for_status()
|
r.raise_for_status()
|
||||||
if not r.headers.get("content-type").startswith("text/html"):
|
except requests.HTTPError as http_err:
|
||||||
logger.debug(f"skipping {l}")
|
logger.debug(f"failed to GET {l}, got a {http_err.response.status_code}")
|
||||||
continue
|
continue
|
||||||
|
|
||||||
r.encoding = "UTF-8"
|
r.encoding = "UTF-8"
|
||||||
|
|
Loading…
Reference in a new issue