From 9a1f78db22a1ac6777167f33d1c38f47e3b5a9a5 Mon Sep 17 00:00:00 2001 From: Alex Vandiver Date: Tue, 10 Sep 2024 18:33:25 +0000 Subject: [PATCH] thumbnail: Support checking for images from streaming sources. We may not always have trivial access to all of the bytes of the uploaded file -- for instance, if the file was uploaded previously, or by some other process. Downloading the entire image in order to check its headers is an inefficient use of time and bandwidth. Adjust `maybe_thumbnail` and dependencies to potentially take a `pyvips.Source` which supports streaming data from S3 or disk. This allows making the ImageAttachment row, if deemed appropriate, based on only a few KB of data, and not the entire image. --- zerver/lib/thumbnail.py | 11 ++++++++--- zerver/lib/upload/__init__.py | 19 +++++++++++++++---- zerver/lib/upload/base.py | 12 ++++++++++++ zerver/lib/upload/local.py | 10 +++++++++- zerver/lib/upload/s3.py | 16 +++++++++++++++- zerver/tests/test_thumbnail.py | 21 ++++++++++++++++++++- zerver/tests/test_upload_local.py | 16 ++++++++++++++++ zerver/tests/test_upload_s3.py | 18 ++++++++++++++++++ 8 files changed, 113 insertions(+), 10 deletions(-) diff --git a/zerver/lib/thumbnail.py b/zerver/lib/thumbnail.py index 0acdf9aaf6..4bba1ee9dd 100644 --- a/zerver/lib/thumbnail.py +++ b/zerver/lib/thumbnail.py @@ -138,7 +138,7 @@ class BadImageError(JsonableError): @contextmanager -def libvips_check_image(image_data: bytes) -> Iterator[pyvips.Image]: +def libvips_check_image(image_data: bytes | pyvips.Source) -> Iterator[pyvips.Image]: # The primary goal of this is to verify that the image is valid, # and raise BadImageError otherwise. The yielded `source_image` # may be ignored, since calling `thumbnail_buffer` is faster than @@ -146,7 +146,10 @@ def libvips_check_image(image_data: bytes) -> Iterator[pyvips.Image]: # cannot make use of shrink-on-load optimizations: # https://www.libvips.org/API/current/libvips-resample.html#vips-thumbnail-image try: - source_image = pyvips.Image.new_from_buffer(image_data, "") + if isinstance(image_data, bytes): + source_image = pyvips.Image.new_from_buffer(image_data, "") + else: + source_image = pyvips.Image.new_from_source(image_data, "", access="sequential") except pyvips.Error: raise BadImageError(_("Could not decode image; did you upload an image file?")) @@ -275,7 +278,9 @@ def missing_thumbnails(image_attachment: ImageAttachment) -> list[ThumbnailForma return needed_thumbnails -def maybe_thumbnail(attachment: AbstractAttachment, content: bytes) -> ImageAttachment | None: +def maybe_thumbnail( + attachment: AbstractAttachment, content: bytes | pyvips.Source +) -> ImageAttachment | None: if attachment.content_type not in THUMBNAIL_ACCEPT_IMAGE_TYPES: # If it doesn't self-report as an image file that we might want # to thumbnail, don't parse the bytes at all. diff --git a/zerver/lib/upload/__init__.py b/zerver/lib/upload/__init__.py index d108c8b428..670ac6483f 100644 --- a/zerver/lib/upload/__init__.py +++ b/zerver/lib/upload/__init__.py @@ -8,6 +8,7 @@ from datetime import datetime from typing import IO, Any, BinaryIO from urllib.parse import unquote, urljoin +import pyvips from django.conf import settings from django.core.files.uploadedfile import UploadedFile from django.db import transaction @@ -26,7 +27,7 @@ from zerver.lib.thumbnail import ( resize_avatar, resize_emoji, ) -from zerver.lib.upload.base import INLINE_MIME_TYPES, ZulipUploadBackend +from zerver.lib.upload.base import INLINE_MIME_TYPES, StreamingSourceWithSize, ZulipUploadBackend from zerver.models import Attachment, Message, Realm, RealmEmoji, ScheduledMessage, UserProfile from zerver.models.users import is_cross_realm_bot_email @@ -48,22 +49,28 @@ def create_attachment( file_name: str, path_id: str, content_type: str, - file_data: bytes, + file_data: bytes | StreamingSourceWithSize, user_profile: UserProfile, realm: Realm, ) -> None: assert (user_profile.realm_id == realm.id) or is_cross_realm_bot_email( user_profile.delivery_email ) + if isinstance(file_data, bytes): + file_size = len(file_data) + file_real_data: bytes | pyvips.Source = file_data + else: + file_size = file_data.size + file_real_data = file_data.source attachment = Attachment.objects.create( file_name=file_name, path_id=path_id, owner=user_profile, realm=realm, - size=len(file_data), + size=file_size, content_type=content_type, ) - maybe_thumbnail(attachment, file_data) + maybe_thumbnail(attachment, file_real_data) from zerver.actions.uploads import notify_attachment_update notify_attachment_update(user_profile, "add", attachment.to_dict()) @@ -194,6 +201,10 @@ def upload_message_attachment_from_request( ) +def attachment_vips_source(path_id: str) -> StreamingSourceWithSize: + return upload_backend.attachment_vips_source(path_id) + + def save_attachment_contents(path_id: str, filehandle: BinaryIO) -> None: return upload_backend.save_attachment_contents(path_id, filehandle) diff --git a/zerver/lib/upload/base.py b/zerver/lib/upload/base.py index c0252f1964..021f19420a 100644 --- a/zerver/lib/upload/base.py +++ b/zerver/lib/upload/base.py @@ -1,8 +1,11 @@ import os from collections.abc import Callable, Iterator +from dataclasses import dataclass from datetime import datetime from typing import IO, Any, BinaryIO +import pyvips + from zerver.models import Realm, UserProfile INLINE_MIME_TYPES = [ @@ -27,6 +30,12 @@ INLINE_MIME_TYPES = [ ] +@dataclass +class StreamingSourceWithSize: + size: int + source: pyvips.Source + + class ZulipUploadBackend: # Message attachment uploads def get_public_upload_root_url(self) -> str: @@ -48,6 +57,9 @@ class ZulipUploadBackend: def save_attachment_contents(self, path_id: str, filehandle: BinaryIO) -> None: raise NotImplementedError + def attachment_vips_source(self, path_id: str) -> StreamingSourceWithSize: + raise NotImplementedError + def delete_message_attachment(self, path_id: str) -> bool: raise NotImplementedError diff --git a/zerver/lib/upload/local.py b/zerver/lib/upload/local.py index df226ac602..a25b320788 100644 --- a/zerver/lib/upload/local.py +++ b/zerver/lib/upload/local.py @@ -7,13 +7,14 @@ from collections.abc import Callable, Iterator from datetime import datetime from typing import IO, Any, BinaryIO, Literal +import pyvips from django.conf import settings from typing_extensions import override from zerver.lib.mime_types import guess_type from zerver.lib.thumbnail import resize_avatar, resize_logo from zerver.lib.timestamp import timestamp_to_datetime -from zerver.lib.upload.base import ZulipUploadBackend +from zerver.lib.upload.base import StreamingSourceWithSize, ZulipUploadBackend from zerver.lib.utils import assert_is_not_none from zerver.models import Realm, RealmEmoji, UserProfile @@ -100,6 +101,13 @@ class LocalUploadBackend(ZulipUploadBackend): def save_attachment_contents(self, path_id: str, filehandle: BinaryIO) -> None: filehandle.write(read_local_file("files", path_id)) + @override + def attachment_vips_source(self, path_id: str) -> StreamingSourceWithSize: + file_path = os.path.join(assert_is_not_none(settings.LOCAL_UPLOADS_DIR), "files", path_id) + assert_is_local_storage_path("files", file_path) + source = pyvips.Source.new_from_file(file_path) + return StreamingSourceWithSize(size=os.path.getsize(file_path), source=source) + @override def delete_message_attachment(self, path_id: str) -> bool: return delete_local_file("files", path_id) diff --git a/zerver/lib/upload/s3.py b/zerver/lib/upload/s3.py index 5cb28a590b..c9e912cce7 100644 --- a/zerver/lib/upload/s3.py +++ b/zerver/lib/upload/s3.py @@ -8,14 +8,17 @@ from urllib.parse import urljoin, urlsplit, urlunsplit import boto3 import botocore +import pyvips from botocore.client import Config +from botocore.response import StreamingBody from django.conf import settings from django.utils.http import content_disposition_header from mypy_boto3_s3.service_resource import Bucket from typing_extensions import override +from zerver.lib.partial import partial from zerver.lib.thumbnail import resize_avatar, resize_logo -from zerver.lib.upload.base import INLINE_MIME_TYPES, ZulipUploadBackend +from zerver.lib.upload.base import INLINE_MIME_TYPES, StreamingSourceWithSize, ZulipUploadBackend from zerver.models import Realm, RealmEmoji, UserProfile # Duration that the signed upload URLs that we redirect to when @@ -236,6 +239,17 @@ class S3UploadBackend(ZulipUploadBackend): for chunk in self.uploads_bucket.Object(path_id).get()["Body"]: filehandle.write(chunk) + @override + def attachment_vips_source(self, path_id: str) -> StreamingSourceWithSize: + metadata = self.uploads_bucket.Object(path_id).get() + + def s3_read(streamingbody: StreamingBody, size: int) -> bytes: + return streamingbody.read(amt=size) + + source: pyvips.Source = pyvips.SourceCustom() + source.on_read(partial(s3_read, metadata["Body"])) + return StreamingSourceWithSize(size=metadata["ContentLength"], source=source) + @override def delete_message_attachment(self, path_id: str) -> bool: return self.delete_file_from_s3(path_id, self.uploads_bucket) diff --git a/zerver/tests/test_thumbnail.py b/zerver/tests/test_thumbnail.py index b972381ace..eadc7d5532 100644 --- a/zerver/tests/test_thumbnail.py +++ b/zerver/tests/test_thumbnail.py @@ -26,7 +26,13 @@ from zerver.lib.thumbnail import ( resize_emoji, split_thumbnail_path, ) -from zerver.lib.upload import all_message_attachments, save_attachment_contents +from zerver.lib.upload import ( + all_message_attachments, + attachment_vips_source, + create_attachment, + save_attachment_contents, + upload_backend, +) from zerver.models import Attachment, ImageAttachment from zerver.views.upload import closest_thumbnail_format from zerver.worker.thumbnail import ensure_thumbnails @@ -562,6 +568,19 @@ class TestStoreThumbnail(ZulipTestCase): with self.thumbnail_formats(still_webp, anim_webp, still_jpeg): self.assertEqual(missing_thumbnails(image_attachment), [anim_webp, still_jpeg]) + def test_maybe_thumbnail_from_stream(self) -> None: + # If we put the file in place directly (e.g. simulating a + # chunked upload), and then use the streaming source to + # create the attachment, we still thumbnail correctly. + hamlet = self.example_user("hamlet") + path_id = upload_backend.generate_message_upload_path(str(hamlet.realm.id), "img.png") + upload_backend.upload_message_attachment( + path_id, "img.png", "image/png", read_test_image_file("img.png"), hamlet + ) + source = attachment_vips_source(path_id) + create_attachment("img.png", path_id, "image/png", source, hamlet, hamlet.realm) + self.assertTrue(ImageAttachment.objects.filter(path_id=path_id).exists()) + class TestThumbnailRetrieval(ZulipTestCase): def test_get_thumbnail(self) -> None: diff --git a/zerver/tests/test_upload_local.py b/zerver/tests/test_upload_local.py index 85cabc57f8..36bd825a5c 100644 --- a/zerver/tests/test_upload_local.py +++ b/zerver/tests/test_upload_local.py @@ -13,6 +13,7 @@ from zerver.lib.test_helpers import get_test_image_file, read_test_image_file from zerver.lib.thumbnail import DEFAULT_EMOJI_SIZE, MEDIUM_AVATAR_SIZE, resize_avatar from zerver.lib.upload import ( all_message_attachments, + attachment_vips_source, delete_export_tarball, delete_message_attachment, delete_message_attachments, @@ -21,6 +22,7 @@ from zerver.lib.upload import ( upload_export_tarball, upload_message_attachment, ) +from zerver.lib.upload.base import StreamingSourceWithSize from zerver.lib.upload.local import write_local_file from zerver.models import Attachment, RealmEmoji from zerver.models.realms import get_realm @@ -52,6 +54,20 @@ class LocalStorageTest(UploadSerializeMixin, ZulipTestCase): save_attachment_contents(path_id, output) self.assertEqual(output.getvalue(), b"zulip!") + def test_attachment_vips_source(self) -> None: + user_profile = self.example_user("hamlet") + url = upload_message_attachment( + "img.png", "image/png", read_test_image_file("img.png"), user_profile + )[0] + path_id = re.sub(r"/user_uploads/", "", url) + + source = attachment_vips_source(path_id) + self.assertIsInstance(source, StreamingSourceWithSize) + self.assertEqual(source.size, len(read_test_image_file("img.png"))) + image = pyvips.Image.new_from_source(source.source, "", access="sequential") + self.assertEqual(128, image.height) + self.assertEqual(128, image.width) + def test_upload_message_attachment_local_cross_realm_path(self) -> None: """ Verifies that the path of a file uploaded by a cross-realm bot to another diff --git a/zerver/tests/test_upload_s3.py b/zerver/tests/test_upload_s3.py index 98a9d7685d..3a33763b4f 100644 --- a/zerver/tests/test_upload_s3.py +++ b/zerver/tests/test_upload_s3.py @@ -30,6 +30,7 @@ from zerver.lib.thumbnail import ( ) from zerver.lib.upload import ( all_message_attachments, + attachment_vips_source, delete_export_tarball, delete_message_attachment, delete_message_attachments, @@ -37,6 +38,7 @@ from zerver.lib.upload import ( upload_export_tarball, upload_message_attachment, ) +from zerver.lib.upload.base import StreamingSourceWithSize from zerver.lib.upload.s3 import S3UploadBackend from zerver.models import Attachment, RealmEmoji, UserProfile from zerver.models.realms import get_realm @@ -75,6 +77,22 @@ class S3Test(ZulipTestCase): save_attachment_contents(path_id, output) self.assertEqual(output.getvalue(), b"zulip!") + @use_s3_backend + def test_attachment_vips_source(self) -> None: + create_s3_buckets(settings.S3_AUTH_UPLOADS_BUCKET) + user_profile = self.example_user("hamlet") + url = upload_message_attachment( + "img.png", "image/png", read_test_image_file("img.png"), user_profile + )[0] + path_id = re.sub(r"/user_uploads/", "", url) + + source = attachment_vips_source(path_id) + self.assertIsInstance(source, StreamingSourceWithSize) + self.assertEqual(source.size, len(read_test_image_file("img.png"))) + image = pyvips.Image.new_from_source(source.source, "", access="sequential") + self.assertEqual(128, image.height) + self.assertEqual(128, image.width) + @use_s3_backend def test_upload_message_attachment_s3_cross_realm_path(self) -> None: """