Improve Open Graph metadata parsing

This commit is contained in:
Thomas Sileo 2019-08-06 20:16:06 +02:00
parent 5ea22edcb8
commit b0cb248a23

View file

@@ -43,18 +43,32 @@ def fetch_og_metadata(user_agent, links):
for l in links: for l in links:
check_url(l) check_url(l)
# Remove any AP actor from the list # Remove any AP objects
try: try:
p = lookup(l) lookup(l)
if p.has_type(ap.ACTOR_TYPES): continue
continue
except NotAnActivityError: except NotAnActivityError:
pass pass
except Exception:
logger.exception(f"skipping {l} because of issues during AP lookup")
continue
r = requests.get(l, headers={"User-Agent": user_agent}, timeout=15) try:
r.raise_for_status() h = requests.head(l, headers={"User-Agent": user_agent}, timeout=3)
if not r.headers.get("content-type").startswith("text/html"): h.raise_for_status()
logger.debug(f"skipping {l}") except requests.HTTPError as http_err:
logger.debug(f"failed to HEAD {l}, got a {http_err.response.status_code}")
continue
if not h.headers.get("content-type").startswith("text/html"):
logger.debug(f"skipping {l} for bad content type")
continue
try:
r = requests.get(l, headers={"User-Agent": user_agent}, timeout=5)
r.raise_for_status()
except requests.HTTPError as http_err:
logger.debug(f"failed to GET {l}, got a {http_err.response.status_code}")
continue continue
r.encoding = "UTF-8" r.encoding = "UTF-8"