microblog.pub/utils/opengraph.py
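"""Extract the external links from a note's HTML content and fetch their Open Graph metadata."""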

import opengraph
import requests
from bs4 import BeautifulSoup
from little_boxes.urlutils import check_url
from little_boxes.urlutils import is_url_valid


def links_from_note(note):
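    """Return the set of external http(s) links found in the note's HTML content,
    skipping hrefs that are already listed in the note's tags."""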
    tags_href = set()
    for t in note.get("tag", []):
        h = t.get("href")
        if h:
            # TODO(tsileo): fetch the URL for Actor profile, type=mention
            tags_href.add(h)

    links = set()
    # Use an explicit parser to avoid bs4's "no parser was explicitly specified" warning
    soup = BeautifulSoup(note["content"], "html.parser")
    for link in soup.find_all("a"):
        h = link.get("href")
        # An <a> tag without an href yields None, so guard before calling startswith()
        if h and h.startswith("http") and h not in tags_href and is_url_valid(h):
            links.add(h)

    return links


def fetch_og_metadata(user_agent, col, remote_id):
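    """Fetch the Open Graph metadata for every link in the activity matching
    `remote_id`, store it under `meta.og_metadata` and return the number of links."""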
    doc = col.find_one({"remote_id": remote_id})
    if not doc:
        raise ValueError(f"activity not found for remote_id={remote_id!r}")

    note = doc["activity"]["object"]
    print(note)
    links = links_from_note(note)
    if not links:
        return 0

    # FIXME(tsileo): set the user agent by giving HTML directly to OpenGraph
    htmls = []
    for l in links:
        check_url(l)
        r = requests.get(l, headers={"User-Agent": user_agent})
        r.raise_for_status()
        htmls.append(r.text)

    links_og_metadata = [dict(opengraph.OpenGraph(html=html)) for html in htmls]
    col.update_one(
        {"remote_id": remote_id}, {"$set": {"meta.og_metadata": links_og_metadata}}
    )
    return len(links)
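
# Usage sketch (an assumption, not part of the original module): `col` is expected to be
# a MongoDB collection (e.g. a pymongo Collection) holding activities keyed by
# "remote_id". The database, collection and id below are hypothetical:
#
#   from pymongo import MongoClient
#   col = MongoClient()["microblog"]["activities"]
#   fetch_og_metadata("microblog.pub", col, "https://remote.example/notes/123")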