thumbnail: Support checking for images from streaming sources.

We may not always have trivial access to all of the bytes of the
uploaded file -- for instance, if the file was uploaded previously, or
by some other process.  Downloading the entire image in order to check
its headers is an inefficient use of time and bandwidth.

Adjust `maybe_thumbnail` and dependencies to potentially take a
`pyvips.Source` which supports streaming data from S3 or disk.  This
allows making the ImageAttachment row, if deemed appropriate, based on
only a few KB of data, and not the entire image.
This commit is contained in:
Alex Vandiver 2024-09-10 18:33:25 +00:00 committed by Tim Abbott
parent 758aa36cbe
commit 9a1f78db22
8 changed files with 113 additions and 10 deletions

View File

@ -138,7 +138,7 @@ class BadImageError(JsonableError):
@contextmanager @contextmanager
def libvips_check_image(image_data: bytes) -> Iterator[pyvips.Image]: def libvips_check_image(image_data: bytes | pyvips.Source) -> Iterator[pyvips.Image]:
# The primary goal of this is to verify that the image is valid, # The primary goal of this is to verify that the image is valid,
# and raise BadImageError otherwise. The yielded `source_image` # and raise BadImageError otherwise. The yielded `source_image`
# may be ignored, since calling `thumbnail_buffer` is faster than # may be ignored, since calling `thumbnail_buffer` is faster than
@ -146,7 +146,10 @@ def libvips_check_image(image_data: bytes) -> Iterator[pyvips.Image]:
# cannot make use of shrink-on-load optimizations: # cannot make use of shrink-on-load optimizations:
# https://www.libvips.org/API/current/libvips-resample.html#vips-thumbnail-image # https://www.libvips.org/API/current/libvips-resample.html#vips-thumbnail-image
try: try:
source_image = pyvips.Image.new_from_buffer(image_data, "") if isinstance(image_data, bytes):
source_image = pyvips.Image.new_from_buffer(image_data, "")
else:
source_image = pyvips.Image.new_from_source(image_data, "", access="sequential")
except pyvips.Error: except pyvips.Error:
raise BadImageError(_("Could not decode image; did you upload an image file?")) raise BadImageError(_("Could not decode image; did you upload an image file?"))
@ -275,7 +278,9 @@ def missing_thumbnails(image_attachment: ImageAttachment) -> list[ThumbnailForma
return needed_thumbnails return needed_thumbnails
def maybe_thumbnail(attachment: AbstractAttachment, content: bytes) -> ImageAttachment | None: def maybe_thumbnail(
attachment: AbstractAttachment, content: bytes | pyvips.Source
) -> ImageAttachment | None:
if attachment.content_type not in THUMBNAIL_ACCEPT_IMAGE_TYPES: if attachment.content_type not in THUMBNAIL_ACCEPT_IMAGE_TYPES:
# If it doesn't self-report as an image file that we might want # If it doesn't self-report as an image file that we might want
# to thumbnail, don't parse the bytes at all. # to thumbnail, don't parse the bytes at all.

View File

@ -8,6 +8,7 @@ from datetime import datetime
from typing import IO, Any, BinaryIO from typing import IO, Any, BinaryIO
from urllib.parse import unquote, urljoin from urllib.parse import unquote, urljoin
import pyvips
from django.conf import settings from django.conf import settings
from django.core.files.uploadedfile import UploadedFile from django.core.files.uploadedfile import UploadedFile
from django.db import transaction from django.db import transaction
@ -26,7 +27,7 @@ from zerver.lib.thumbnail import (
resize_avatar, resize_avatar,
resize_emoji, resize_emoji,
) )
from zerver.lib.upload.base import INLINE_MIME_TYPES, ZulipUploadBackend from zerver.lib.upload.base import INLINE_MIME_TYPES, StreamingSourceWithSize, ZulipUploadBackend
from zerver.models import Attachment, Message, Realm, RealmEmoji, ScheduledMessage, UserProfile from zerver.models import Attachment, Message, Realm, RealmEmoji, ScheduledMessage, UserProfile
from zerver.models.users import is_cross_realm_bot_email from zerver.models.users import is_cross_realm_bot_email
@ -48,22 +49,28 @@ def create_attachment(
file_name: str, file_name: str,
path_id: str, path_id: str,
content_type: str, content_type: str,
file_data: bytes, file_data: bytes | StreamingSourceWithSize,
user_profile: UserProfile, user_profile: UserProfile,
realm: Realm, realm: Realm,
) -> None: ) -> None:
assert (user_profile.realm_id == realm.id) or is_cross_realm_bot_email( assert (user_profile.realm_id == realm.id) or is_cross_realm_bot_email(
user_profile.delivery_email user_profile.delivery_email
) )
if isinstance(file_data, bytes):
file_size = len(file_data)
file_real_data: bytes | pyvips.Source = file_data
else:
file_size = file_data.size
file_real_data = file_data.source
attachment = Attachment.objects.create( attachment = Attachment.objects.create(
file_name=file_name, file_name=file_name,
path_id=path_id, path_id=path_id,
owner=user_profile, owner=user_profile,
realm=realm, realm=realm,
size=len(file_data), size=file_size,
content_type=content_type, content_type=content_type,
) )
maybe_thumbnail(attachment, file_data) maybe_thumbnail(attachment, file_real_data)
from zerver.actions.uploads import notify_attachment_update from zerver.actions.uploads import notify_attachment_update
notify_attachment_update(user_profile, "add", attachment.to_dict()) notify_attachment_update(user_profile, "add", attachment.to_dict())
@ -194,6 +201,10 @@ def upload_message_attachment_from_request(
) )
def attachment_vips_source(path_id: str) -> StreamingSourceWithSize:
return upload_backend.attachment_vips_source(path_id)
def save_attachment_contents(path_id: str, filehandle: BinaryIO) -> None: def save_attachment_contents(path_id: str, filehandle: BinaryIO) -> None:
return upload_backend.save_attachment_contents(path_id, filehandle) return upload_backend.save_attachment_contents(path_id, filehandle)

View File

@ -1,8 +1,11 @@
import os import os
from collections.abc import Callable, Iterator from collections.abc import Callable, Iterator
from dataclasses import dataclass
from datetime import datetime from datetime import datetime
from typing import IO, Any, BinaryIO from typing import IO, Any, BinaryIO
import pyvips
from zerver.models import Realm, UserProfile from zerver.models import Realm, UserProfile
INLINE_MIME_TYPES = [ INLINE_MIME_TYPES = [
@ -27,6 +30,12 @@ INLINE_MIME_TYPES = [
] ]
@dataclass
class StreamingSourceWithSize:
size: int
source: pyvips.Source
class ZulipUploadBackend: class ZulipUploadBackend:
# Message attachment uploads # Message attachment uploads
def get_public_upload_root_url(self) -> str: def get_public_upload_root_url(self) -> str:
@ -48,6 +57,9 @@ class ZulipUploadBackend:
def save_attachment_contents(self, path_id: str, filehandle: BinaryIO) -> None: def save_attachment_contents(self, path_id: str, filehandle: BinaryIO) -> None:
raise NotImplementedError raise NotImplementedError
def attachment_vips_source(self, path_id: str) -> StreamingSourceWithSize:
raise NotImplementedError
def delete_message_attachment(self, path_id: str) -> bool: def delete_message_attachment(self, path_id: str) -> bool:
raise NotImplementedError raise NotImplementedError

View File

@ -7,13 +7,14 @@ from collections.abc import Callable, Iterator
from datetime import datetime from datetime import datetime
from typing import IO, Any, BinaryIO, Literal from typing import IO, Any, BinaryIO, Literal
import pyvips
from django.conf import settings from django.conf import settings
from typing_extensions import override from typing_extensions import override
from zerver.lib.mime_types import guess_type from zerver.lib.mime_types import guess_type
from zerver.lib.thumbnail import resize_avatar, resize_logo from zerver.lib.thumbnail import resize_avatar, resize_logo
from zerver.lib.timestamp import timestamp_to_datetime from zerver.lib.timestamp import timestamp_to_datetime
from zerver.lib.upload.base import ZulipUploadBackend from zerver.lib.upload.base import StreamingSourceWithSize, ZulipUploadBackend
from zerver.lib.utils import assert_is_not_none from zerver.lib.utils import assert_is_not_none
from zerver.models import Realm, RealmEmoji, UserProfile from zerver.models import Realm, RealmEmoji, UserProfile
@ -100,6 +101,13 @@ class LocalUploadBackend(ZulipUploadBackend):
def save_attachment_contents(self, path_id: str, filehandle: BinaryIO) -> None: def save_attachment_contents(self, path_id: str, filehandle: BinaryIO) -> None:
filehandle.write(read_local_file("files", path_id)) filehandle.write(read_local_file("files", path_id))
@override
def attachment_vips_source(self, path_id: str) -> StreamingSourceWithSize:
file_path = os.path.join(assert_is_not_none(settings.LOCAL_UPLOADS_DIR), "files", path_id)
assert_is_local_storage_path("files", file_path)
source = pyvips.Source.new_from_file(file_path)
return StreamingSourceWithSize(size=os.path.getsize(file_path), source=source)
@override @override
def delete_message_attachment(self, path_id: str) -> bool: def delete_message_attachment(self, path_id: str) -> bool:
return delete_local_file("files", path_id) return delete_local_file("files", path_id)

View File

@ -8,14 +8,17 @@ from urllib.parse import urljoin, urlsplit, urlunsplit
import boto3 import boto3
import botocore import botocore
import pyvips
from botocore.client import Config from botocore.client import Config
from botocore.response import StreamingBody
from django.conf import settings from django.conf import settings
from django.utils.http import content_disposition_header from django.utils.http import content_disposition_header
from mypy_boto3_s3.service_resource import Bucket from mypy_boto3_s3.service_resource import Bucket
from typing_extensions import override from typing_extensions import override
from zerver.lib.partial import partial
from zerver.lib.thumbnail import resize_avatar, resize_logo from zerver.lib.thumbnail import resize_avatar, resize_logo
from zerver.lib.upload.base import INLINE_MIME_TYPES, ZulipUploadBackend from zerver.lib.upload.base import INLINE_MIME_TYPES, StreamingSourceWithSize, ZulipUploadBackend
from zerver.models import Realm, RealmEmoji, UserProfile from zerver.models import Realm, RealmEmoji, UserProfile
# Duration that the signed upload URLs that we redirect to when # Duration that the signed upload URLs that we redirect to when
@ -236,6 +239,17 @@ class S3UploadBackend(ZulipUploadBackend):
for chunk in self.uploads_bucket.Object(path_id).get()["Body"]: for chunk in self.uploads_bucket.Object(path_id).get()["Body"]:
filehandle.write(chunk) filehandle.write(chunk)
@override
def attachment_vips_source(self, path_id: str) -> StreamingSourceWithSize:
metadata = self.uploads_bucket.Object(path_id).get()
def s3_read(streamingbody: StreamingBody, size: int) -> bytes:
return streamingbody.read(amt=size)
source: pyvips.Source = pyvips.SourceCustom()
source.on_read(partial(s3_read, metadata["Body"]))
return StreamingSourceWithSize(size=metadata["ContentLength"], source=source)
@override @override
def delete_message_attachment(self, path_id: str) -> bool: def delete_message_attachment(self, path_id: str) -> bool:
return self.delete_file_from_s3(path_id, self.uploads_bucket) return self.delete_file_from_s3(path_id, self.uploads_bucket)

View File

@ -26,7 +26,13 @@ from zerver.lib.thumbnail import (
resize_emoji, resize_emoji,
split_thumbnail_path, split_thumbnail_path,
) )
from zerver.lib.upload import all_message_attachments, save_attachment_contents from zerver.lib.upload import (
all_message_attachments,
attachment_vips_source,
create_attachment,
save_attachment_contents,
upload_backend,
)
from zerver.models import Attachment, ImageAttachment from zerver.models import Attachment, ImageAttachment
from zerver.views.upload import closest_thumbnail_format from zerver.views.upload import closest_thumbnail_format
from zerver.worker.thumbnail import ensure_thumbnails from zerver.worker.thumbnail import ensure_thumbnails
@ -562,6 +568,19 @@ class TestStoreThumbnail(ZulipTestCase):
with self.thumbnail_formats(still_webp, anim_webp, still_jpeg): with self.thumbnail_formats(still_webp, anim_webp, still_jpeg):
self.assertEqual(missing_thumbnails(image_attachment), [anim_webp, still_jpeg]) self.assertEqual(missing_thumbnails(image_attachment), [anim_webp, still_jpeg])
def test_maybe_thumbnail_from_stream(self) -> None:
# If we put the file in place directly (e.g. simulating a
# chunked upload), and then use the streaming source to
# create the attachment, we still thumbnail correctly.
hamlet = self.example_user("hamlet")
path_id = upload_backend.generate_message_upload_path(str(hamlet.realm.id), "img.png")
upload_backend.upload_message_attachment(
path_id, "img.png", "image/png", read_test_image_file("img.png"), hamlet
)
source = attachment_vips_source(path_id)
create_attachment("img.png", path_id, "image/png", source, hamlet, hamlet.realm)
self.assertTrue(ImageAttachment.objects.filter(path_id=path_id).exists())
class TestThumbnailRetrieval(ZulipTestCase): class TestThumbnailRetrieval(ZulipTestCase):
def test_get_thumbnail(self) -> None: def test_get_thumbnail(self) -> None:

View File

@ -13,6 +13,7 @@ from zerver.lib.test_helpers import get_test_image_file, read_test_image_file
from zerver.lib.thumbnail import DEFAULT_EMOJI_SIZE, MEDIUM_AVATAR_SIZE, resize_avatar from zerver.lib.thumbnail import DEFAULT_EMOJI_SIZE, MEDIUM_AVATAR_SIZE, resize_avatar
from zerver.lib.upload import ( from zerver.lib.upload import (
all_message_attachments, all_message_attachments,
attachment_vips_source,
delete_export_tarball, delete_export_tarball,
delete_message_attachment, delete_message_attachment,
delete_message_attachments, delete_message_attachments,
@ -21,6 +22,7 @@ from zerver.lib.upload import (
upload_export_tarball, upload_export_tarball,
upload_message_attachment, upload_message_attachment,
) )
from zerver.lib.upload.base import StreamingSourceWithSize
from zerver.lib.upload.local import write_local_file from zerver.lib.upload.local import write_local_file
from zerver.models import Attachment, RealmEmoji from zerver.models import Attachment, RealmEmoji
from zerver.models.realms import get_realm from zerver.models.realms import get_realm
@ -52,6 +54,20 @@ class LocalStorageTest(UploadSerializeMixin, ZulipTestCase):
save_attachment_contents(path_id, output) save_attachment_contents(path_id, output)
self.assertEqual(output.getvalue(), b"zulip!") self.assertEqual(output.getvalue(), b"zulip!")
def test_attachment_vips_source(self) -> None:
user_profile = self.example_user("hamlet")
url = upload_message_attachment(
"img.png", "image/png", read_test_image_file("img.png"), user_profile
)[0]
path_id = re.sub(r"/user_uploads/", "", url)
source = attachment_vips_source(path_id)
self.assertIsInstance(source, StreamingSourceWithSize)
self.assertEqual(source.size, len(read_test_image_file("img.png")))
image = pyvips.Image.new_from_source(source.source, "", access="sequential")
self.assertEqual(128, image.height)
self.assertEqual(128, image.width)
def test_upload_message_attachment_local_cross_realm_path(self) -> None: def test_upload_message_attachment_local_cross_realm_path(self) -> None:
""" """
Verifies that the path of a file uploaded by a cross-realm bot to another Verifies that the path of a file uploaded by a cross-realm bot to another

View File

@ -30,6 +30,7 @@ from zerver.lib.thumbnail import (
) )
from zerver.lib.upload import ( from zerver.lib.upload import (
all_message_attachments, all_message_attachments,
attachment_vips_source,
delete_export_tarball, delete_export_tarball,
delete_message_attachment, delete_message_attachment,
delete_message_attachments, delete_message_attachments,
@ -37,6 +38,7 @@ from zerver.lib.upload import (
upload_export_tarball, upload_export_tarball,
upload_message_attachment, upload_message_attachment,
) )
from zerver.lib.upload.base import StreamingSourceWithSize
from zerver.lib.upload.s3 import S3UploadBackend from zerver.lib.upload.s3 import S3UploadBackend
from zerver.models import Attachment, RealmEmoji, UserProfile from zerver.models import Attachment, RealmEmoji, UserProfile
from zerver.models.realms import get_realm from zerver.models.realms import get_realm
@ -75,6 +77,22 @@ class S3Test(ZulipTestCase):
save_attachment_contents(path_id, output) save_attachment_contents(path_id, output)
self.assertEqual(output.getvalue(), b"zulip!") self.assertEqual(output.getvalue(), b"zulip!")
@use_s3_backend
def test_attachment_vips_source(self) -> None:
create_s3_buckets(settings.S3_AUTH_UPLOADS_BUCKET)
user_profile = self.example_user("hamlet")
url = upload_message_attachment(
"img.png", "image/png", read_test_image_file("img.png"), user_profile
)[0]
path_id = re.sub(r"/user_uploads/", "", url)
source = attachment_vips_source(path_id)
self.assertIsInstance(source, StreamingSourceWithSize)
self.assertEqual(source.size, len(read_test_image_file("img.png")))
image = pyvips.Image.new_from_source(source.source, "", access="sequential")
self.assertEqual(128, image.height)
self.assertEqual(128, image.width)
@use_s3_backend @use_s3_backend
def test_upload_message_attachment_s3_cross_realm_path(self) -> None: def test_upload_message_attachment_s3_cross_realm_path(self) -> None:
""" """