import logging

import opengraph
import requests
from bs4 import BeautifulSoup
from little_boxes import activitypub as ap
from little_boxes.errors import NotAnActivityError
from little_boxes.urlutils import check_url
from little_boxes.urlutils import is_url_valid

from .lookup import lookup

logger = logging.getLogger(__name__)


def links_from_note(note):
    """Return the set of external links referenced in a note's HTML content.

    Links that come from the note's tags (mentions, hashtags) are excluded.
    """
    # Collect the hrefs attached to the note's tags (mentions/hashtags)
    # so they can be excluded from the result below
    tags_href = set()
    for t in note.get("tag", []):
        h = t.get("href")
        if h:
            tags_href.add(h)

    links = set()
    if "content" in note:
        # An explicit parser avoids bs4's "no parser specified" warning;
        # html.parser ships with the stdlib
        soup = BeautifulSoup(note["content"], "html.parser")
        for link in soup.find_all("a"):
            h = link.get("href")
            # <a> tags without an href attribute yield None
            if h and h.startswith("http") and h not in tags_href and is_url_valid(h):
                links.add(h)

    # FIXME(tsileo): support summary and name fields

    return links
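
# A minimal usage sketch (hypothetical note payload, not from the codebase):
# the content embeds a regular link and a hashtag link; the hashtag's href is
# listed in "tag" and therefore filtered out.
#
#     note = {
#         "content": '<p><a href="https://example.com/post">post</a> '
#                    '<a href="https://example.com/tags/demo">#demo</a></p>',
#         "tag": [{"href": "https://example.com/tags/demo"}],
#     }
#     links_from_note(note)  # -> {"https://example.com/post"}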


def fetch_og_metadata(user_agent, links):
    """Fetch each link and return a list of its parsed OpenGraph metadata."""
    res = []
    for l in links:
        # Ensure the URL is safe to fetch (e.g. not a private/internal address)
        check_url(l)

        # Remove any AP actor from the list
        try:
            p = lookup(l)
            if p.has_type(ap.ACTOR_TYPES):
                continue
        except NotAnActivityError:
            pass

        r = requests.get(l, headers={"User-Agent": user_agent}, timeout=15)
        r.raise_for_status()
        # Default to an empty string so a missing Content-Type header
        # doesn't raise an AttributeError
        if not r.headers.get("content-type", "").startswith("text/html"):
            logger.debug(f"skipping {l}")
            continue

        r.encoding = "UTF-8"
        html = r.text
        try:
            data = dict(opengraph.OpenGraph(html=html))
        except Exception:
            logger.exception(f"failed to parse {l}")
            continue

        if data.get("url"):
            res.append(data)

    return res
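
# A minimal usage sketch (hypothetical values; "MyApp/1.0" is a placeholder
# User-Agent): fetching the OpenGraph metadata for the links found in a note.
#
#     links = links_from_note(note)
#     for og in fetch_og_metadata("MyApp/1.0", links):
#         print(og["url"], og.get("title"))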