def select_video_to_cache(links, preferred_height=360):
    """Pick which video link should be cached from a list of link objects.

    The link whose ``height`` equals ``preferred_height`` wins; otherwise the
    video with the smallest known height is used, and if no candidate
    declares a height the first video link is returned.

    Args:
        links: iterable of link dicts (``href`` plus optional ``mediaType`` /
            ``mimeType`` and ``height`` keys). Entries that are not dicts or
            that lack an ``href`` are ignored.
        preferred_height: the height (in pixels) to look for first.

    Returns:
        A ``{"href": ..., "height": ...}`` dict for the selected video
        (``height`` is ``None`` when the link did not declare one), or
        ``None`` when no video link was found.
    """
    candidates = []
    for link in links or []:
        if not isinstance(link, dict):
            continue
        href = link.get("href")
        if not href:
            continue
        # Accept the standard ActivityStreams ``mediaType`` as well as the
        # ``mimeType`` key; fall back to guessing from the URL itself.
        media_type = link.get("mediaType") or link.get("mimeType") or ""
        if media_type.startswith("video/") or is_video(href):
            # ``height`` is optional on a link, so never index it directly.
            candidates.append({"href": href, "height": link.get("height")})

    if not candidates:
        app.logger.warning(f"failed to select a video from {links!r}")
        return None

    for candidate in candidates:
        if candidate["height"] == preferred_height:
            return candidate

    # No exact match: prefer the smallest video with a known height, so a
    # link without a height is only chosen when nothing better exists.
    sized = [c for c in candidates if c["height"] is not None]
    if sized:
        return min(sized, key=lambda c: c["height"])
    return candidates[0]
@lru_cache(maxsize=2048)
def _is_img(filename):
    """Return True when the guessed MIME type of `filename` is an image type.

    The name is lower-cased before guessing; a name whose type cannot be
    guessed yields False. Results are memoized (up to 2048 distinct names).
    """
    guessed = mimetypes.guess_type(filename.lower())[0]
    return bool(guessed) and guessed.split("/")[0] == "image"


@lru_cache(maxsize=2048)
def is_video(filename):
    """Return True when the guessed MIME type of `filename` is a video type.

    The name is lower-cased before guessing; a name whose type cannot be
    guessed yields False. Results are memoized (up to 2048 distinct names).
    """
    guessed = mimetypes.guess_type(filename.lower())[0]
    return bool(guessed) and guessed.split("/")[0] == "video"
resp.iter_content(): + for chunk in resp.iter_content(chunk_size=2 << 20): if chunk: + print(len(chunk)) f1.write(chunk) buf.seek(0) self.fs.put( diff --git a/utils/template_filters.py b/utils/template_filters.py index 9cb9757..aa10097 100644 --- a/utils/template_filters.py +++ b/utils/template_filters.py @@ -1,5 +1,4 @@ import logging -import mimetypes import urllib from datetime import datetime from datetime import timezone @@ -22,6 +21,7 @@ from config import MEDIA_CACHE from core.activitypub import _answer_key from utils import parse_datetime from utils.media import Kind +from utils.media import _is_img _logger = logging.getLogger(__name__) @@ -312,14 +312,6 @@ def has_actor_type(doc): return False -@lru_cache(512) -def _is_img(filename): - mimetype, _ = mimetypes.guess_type(filename.lower()) - if mimetype and mimetype.split("/")[0] in ["image"]: - return True - return False - - @filters.app_template_filter() def not_only_imgs(attachment): for a in attachment: