zulip/zerver/worker/thumbnail.py
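
"""Queue worker that generates thumbnails for uploaded image attachments.

For each ImageAttachment handed to the "thumbnail" queue, the worker
renders any missing thumbnail formats with libvips (via pyvips), uploads
them through the attachment storage backend, records their metadata, and
rewrites the rendered content of messages that reference the image.
"""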

import logging
import time
from dataclasses import asdict
from io import BytesIO
from typing import Any

import pyvips
from django.db import transaction
from typing_extensions import override

from zerver.actions.message_edit import do_update_embedded_data
from zerver.lib.mime_types import guess_type
from zerver.lib.thumbnail import (
    MarkdownImageMetadata,
    StoredThumbnailFormat,
    get_default_thumbnail_url,
    get_image_thumbnail_path,
    missing_thumbnails,
    rewrite_thumbnailed_images,
)
from zerver.lib.upload import save_attachment_contents, upload_backend
from zerver.models import ArchivedMessage, ImageAttachment, Message
from zerver.worker.base import QueueProcessingWorker, assign_queue

logger = logging.getLogger(__name__)

@assign_queue("thumbnail")
class ThumbnailWorker(QueueProcessingWorker):
    @override
    def consume(self, event: dict[str, Any]) -> None:
        start = time.time()
        with transaction.atomic(savepoint=False):
            try:
                # This lock prevents us from racing with the on-demand
                # rendering that can be triggered if a request is made
                # directly to a thumbnail URL we have not made yet.
                # This may mean that we generate no thumbnail images
                # once we get the lock.
                row = ImageAttachment.objects.select_for_update().get(id=event["id"])
            except ImageAttachment.DoesNotExist:  # nocoverage
                logger.info("ImageAttachment row %d missing", event["id"])
                return

            uploaded_thumbnails = ensure_thumbnails(row)

        end = time.time()
        logger.info(
            "Processed %d thumbnails (%dms)",
            uploaded_thumbnails,
            (end - start) * 1000,
        )

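# ensure_thumbnails renders and uploads every thumbnail format that
# missing_thumbnails() still reports for this ImageAttachment, returning
# the number of images written.  If the source image turns out to be
# undecodable, the row is deleted and referencing messages are rewritten
# to drop the now-invalid image references.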
def ensure_thumbnails(image_attachment: ImageAttachment) -> int:
    needed_thumbnails = missing_thumbnails(image_attachment)
    if not needed_thumbnails:
        return 0

    written_images = 0
    image_bytes = BytesIO()
    save_attachment_contents(image_attachment.path_id, image_bytes)
    try:
        # TODO: We could save some computational time by using the same
        # bytes if multiple resolutions are larger than the source
        # image.  That is, if the input is 10x10, a 100x100.jpg is
        # going to be the same as a 200x200.jpg, since those set the
        # max dimensions, and we do not scale up.
        for thumbnail_format in needed_thumbnails:
            # This will scale to fit within the given dimensions; it
            # may be smaller on one or more of them.
            logger.info(
                "Resizing to %d x %d, from %d x %d",
                thumbnail_format.max_width,
                thumbnail_format.max_height,
                image_attachment.original_width_px,
                image_attachment.original_height_px,
            )

            load_opts = ""
            if image_attachment.frames > 1:
                # If the original has multiple frames, we want to load
                # only one of them if we're outputting to a static
                # format; otherwise we load them all.
                if thumbnail_format.animated:
                    load_opts = "n=-1"
                else:
                    load_opts = "n=1"
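            # thumbnail_buffer hands load_opts to the underlying libvips
            # loader via option_string, and size=Size.DOWN means the
            # image is only ever shrunk, never scaled up.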
            resized = pyvips.Image.thumbnail_buffer(
                image_bytes.getbuffer(),
                thumbnail_format.max_width,
                height=thumbnail_format.max_height,
                option_string=load_opts,
                size=pyvips.Size.DOWN,
            )
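            # The save target is a pyvips suffix string: the extension
            # selects the saver, and the bracketed opts are passed
            # through to that saver.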
            thumbnailed_bytes = resized.write_to_buffer(
                f".{thumbnail_format.extension}[{thumbnail_format.opts}]"
            )

            content_type = guess_type(f"image.{thumbnail_format.extension}")[0]
            assert content_type is not None
            thumbnail_path = get_image_thumbnail_path(image_attachment, thumbnail_format)
            logger.info("Uploading %d bytes to %s", len(thumbnailed_bytes), thumbnail_path)
            upload_backend.upload_message_attachment(
                thumbnail_path,
                str(thumbnail_format),
                content_type,
                thumbnailed_bytes,
                None,
            )
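            # libvips loads an animated image as its frames stacked
            # vertically, so resized.height is the whole strip;
            # "page-height" gives the height of a single frame.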
            height = resized.get("page-height") if thumbnail_format.animated else resized.height
            image_attachment.thumbnail_metadata.append(
                asdict(
                    StoredThumbnailFormat(
                        extension=thumbnail_format.extension,
                        content_type=content_type,
                        max_width=thumbnail_format.max_width,
                        max_height=thumbnail_format.max_height,
                        animated=thumbnail_format.animated,
                        width=resized.width,
                        height=height,
                        byte_size=len(thumbnailed_bytes),
                    )
                )
            )
            written_images += 1
    except pyvips.Error as e:
        logger.exception(e)
        if written_images == 0 and len(image_attachment.thumbnail_metadata) == 0:
            # We have never thumbnailed this -- it most likely had
            # bad data.  Remove the ImageAttachment row, since it is
            # not valid for thumbnailing.
            update_message_rendered_content(
                image_attachment.realm_id, image_attachment.path_id, None
            )
            image_attachment.delete()
            return 0
        else:  # nocoverage
            # TODO: Clean up any dangling thumbnails we may have
            # produced?  It seems unlikely that we'd fail on one size
            # but not another, but anything's possible.
            pass

    image_attachment.save(update_fields=["thumbnail_metadata"])

    url, is_animated = get_default_thumbnail_url(image_attachment)
    update_message_rendered_content(
        image_attachment.realm_id,
        image_attachment.path_id,
        MarkdownImageMetadata(
            url=url,
            is_animated=is_animated,
            original_width_px=image_attachment.original_width_px,
            original_height_px=image_attachment.original_height_px,
        ),
    )
    return written_images

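# Rewrite the rendered_content of every message (and archived message)
# that links to this attachment, either substituting in the thumbnail
# metadata or, when image_data is None, dropping references to an image
# we could not thumbnail.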
def update_message_rendered_content(
    realm_id: int, path_id: str, image_data: MarkdownImageMetadata | None
) -> None:
    for message_class in [Message, ArchivedMessage]:
        messages_with_image = (
            message_class.objects.filter(  # type: ignore[attr-defined]  # TODO: ?
                realm_id=realm_id, attachment__path_id=path_id
            )
            .select_for_update()
            .order_by("id")
        )
        for message in messages_with_image:
            rendered_content = rewrite_thumbnailed_images(
                message.rendered_content,
                {} if image_data is None else {path_id: image_data},
                {path_id} if image_data is None else set(),
            )[0]
            if rendered_content is None:
                # There were no updates -- for instance, if we re-run
                # ensure_thumbnails on an ImageAttachment we have
                # already run it on once.  Do not bother sending a
                # no-op update to clients.
                continue

            if isinstance(message, Message):
                # Perform a silent update push to the clients
                do_update_embedded_data(message.sender, message, rendered_content)
            else:
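                # Archived messages are not visible to any connected
                # client, so a plain save of the new rendering should
                # be enough.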
                message.rendered_content = rendered_content
                message.save(update_fields=["rendered_content"])