2024-06-20 23:58:27 +02:00
|
|
|
import logging
|
|
|
|
import time
|
|
|
|
from dataclasses import asdict
|
|
|
|
from io import BytesIO
|
|
|
|
from typing import Any
|
|
|
|
|
|
|
|
import pyvips
|
|
|
|
from django.db import transaction
|
|
|
|
from typing_extensions import override
|
|
|
|
|
2024-06-21 21:02:36 +02:00
|
|
|
from zerver.actions.message_edit import do_update_embedded_data
|
2024-06-20 23:58:27 +02:00
|
|
|
from zerver.lib.mime_types import guess_type
|
2024-06-21 21:02:36 +02:00
|
|
|
from zerver.lib.thumbnail import (
|
2024-07-22 23:07:59 +02:00
|
|
|
MarkdownImageMetadata,
|
2024-06-21 21:02:36 +02:00
|
|
|
StoredThumbnailFormat,
|
|
|
|
get_default_thumbnail_url,
|
|
|
|
get_image_thumbnail_path,
|
|
|
|
missing_thumbnails,
|
|
|
|
rewrite_thumbnailed_images,
|
|
|
|
)
|
2024-07-17 03:11:57 +02:00
|
|
|
from zerver.lib.upload import save_attachment_contents, upload_backend
|
2024-10-04 19:24:32 +02:00
|
|
|
from zerver.models import ArchivedMessage, ImageAttachment, Message
|
2024-06-20 23:58:27 +02:00
|
|
|
from zerver.worker.base import QueueProcessingWorker, assign_queue
|
|
|
|
|
|
|
|
# Module-level logger, named after this module via __name__.
logger = logging.getLogger(__name__)
|
|
|
|
|
|
|
|
|
|
|
|
@assign_queue("thumbnail")
class ThumbnailWorker(QueueProcessingWorker):
    """Worker for the "thumbnail" queue.

    Each event carries the ``id`` of an ImageAttachment row; the worker
    locks that row and hands it to ensure_thumbnails.
    """

    @override
    def consume(self, event: dict[str, Any]) -> None:
        """Process one queue event and log how many thumbnails were written.

        Returns early (after logging) when the referenced ImageAttachment
        row no longer exists.
        """
        began_at = time.time()
        with transaction.atomic(savepoint=False):
            # Holding a row lock keeps us from racing with the on-demand
            # rendering that a direct request to a not-yet-generated
            # thumbnail URL can trigger.  As a consequence, by the time
            # we obtain the lock there may be 0 thumbnails left to make.
            locked_attachments = ImageAttachment.objects.select_for_update()
            try:
                attachment = locked_attachments.get(id=event["id"])
            except ImageAttachment.DoesNotExist:  # nocoverage
                logger.info("ImageAttachment row %d missing", event["id"])
                return
            thumbnail_count = ensure_thumbnails(attachment)
        elapsed_ms = (time.time() - began_at) * 1000
        logger.info("Processed %d thumbnails (%dms)", thumbnail_count, elapsed_ms)
|
|
|
|
|
|
|
|
|
|
|
|
def ensure_thumbnails(image_attachment: ImageAttachment) -> int:
    """Generate and upload every thumbnail that image_attachment is missing.

    The set of formats to produce comes from missing_thumbnails().  Each
    produced thumbnail is uploaded, recorded in
    image_attachment.thumbnail_metadata, and counted.  Afterwards the
    metadata is saved and the rendered content of messages referencing the
    upload is rewritten.

    If pyvips fails before anything was written and the row has no stored
    thumbnail metadata, the ImageAttachment row is deleted instead.

    Returns the number of thumbnails written by this call (0 when nothing
    was missing, or when the row was deleted).
    """
    pending_formats = missing_thumbnails(image_attachment)
    if not pending_formats:
        return 0

    written = 0
    source = BytesIO()
    save_attachment_contents(image_attachment.path_id, source)
    try:
        # TODO: We could save some computational time by reusing the same
        # bytes when several resolutions are larger than the source
        # image.  That is, if the input is 10x10, a 100x100.jpg is going
        # to be the same as a 200x200.jpg, since those only set the max
        # dimensions, and we do not scale up.
        for fmt in pending_formats:
            # This scales to fit within the given dimensions; the result
            # may be smaller on one or more of them.
            logger.info(
                "Resizing to %d x %d, from %d x %d",
                fmt.max_width,
                fmt.max_height,
                image_attachment.original_width_px,
                image_attachment.original_height_px,
            )

            # For a multi-frame original, load every frame when the
            # output format is animated, and only one frame when the
            # output is static.
            load_opts = ""
            if image_attachment.frames > 1:
                load_opts = "n=-1" if fmt.animated else "n=1"

            resized = pyvips.Image.thumbnail_buffer(
                source.getbuffer(),
                fmt.max_width,
                height=fmt.max_height,
                option_string=load_opts,
                size=pyvips.Size.DOWN,
            )
            encoded = resized.write_to_buffer(f".{fmt.extension}[{fmt.opts}]")

            content_type = guess_type(f"image.{fmt.extension}")[0]
            assert content_type is not None

            destination = get_image_thumbnail_path(image_attachment, fmt)
            logger.info("Uploading %d bytes to %s", len(encoded), destination)
            upload_backend.upload_message_attachment(
                destination,
                str(fmt),
                content_type,
                encoded,
                None,
            )

            # Animated output reports its height via the "page-height"
            # field rather than the image's overall height attribute.
            if fmt.animated:
                frame_height = resized.get("page-height")
            else:
                frame_height = resized.height

            stored = StoredThumbnailFormat(
                extension=fmt.extension,
                content_type=content_type,
                max_width=fmt.max_width,
                max_height=fmt.max_height,
                animated=fmt.animated,
                width=resized.width,
                height=frame_height,
                byte_size=len(encoded),
            )
            image_attachment.thumbnail_metadata.append(asdict(stored))
            written += 1

    except pyvips.Error as e:
        logger.exception(e)

        if not written and not image_attachment.thumbnail_metadata:
            # We have never thumbnailed this -- it most likely had bad
            # data.  Remove the ImageAttachment row, since it is not
            # valid for thumbnailing.
            update_message_rendered_content(
                image_attachment.realm_id, image_attachment.path_id, None
            )
            image_attachment.delete()
            return 0
        else:  # nocoverage
            # TODO: Clean up any dangling thumbnails we may have
            # produced?  Seems unlikely that we'd fail on one size, but
            # not another, but anything's possible.
            pass

    image_attachment.save(update_fields=["thumbnail_metadata"])

    url, is_animated = get_default_thumbnail_url(image_attachment)
    metadata = MarkdownImageMetadata(
        url=url,
        is_animated=is_animated,
        original_width_px=image_attachment.original_width_px,
        original_height_px=image_attachment.original_height_px,
    )
    update_message_rendered_content(
        image_attachment.realm_id,
        image_attachment.path_id,
        metadata,
    )
    return written
|
2024-06-21 21:02:36 +02:00
|
|
|
|
|
|
|
|
|
|
|
def update_message_rendered_content(
    realm_id: int, path_id: str, image_data: MarkdownImageMetadata | None
) -> None:
    """Rewrite rendered content of messages that reference an uploaded image.

    Both Message and ArchivedMessage rows in realm_id that are attached to
    path_id are visited, in id order.  Each message's rendered content is
    passed through rewrite_thumbnailed_images; when image_data is given it
    is supplied as the metadata for path_id, and when it is None, path_id is
    supplied in the accompanying set instead (presumably marking the image
    as one that will not be thumbnailed -- confirm against
    rewrite_thumbnailed_images).

    Messages whose content is unchanged are skipped.  Live messages are
    updated through do_update_embedded_data; archived messages are saved
    directly.
    """
    for model in (Message, ArchivedMessage):
        attached = model.objects.filter(realm_id=realm_id, attachment__path_id=path_id)
        # Lock only the message rows, not the joined Attachment rows.
        # Message sending holds a lock on the Attachment row (via
        # do_claim_attachments) and then waits for the ImageAttachment
        # lock, which the thumbnailing worker holds while it runs this
        # query; a locking clause without a table list would also try to
        # lock the Attachment row and deadlock.  Nothing here modifies
        # Attachment rows, so no lock on them is needed.
        # See https://www.postgresql.org/docs/current/sql-select.html#SQL-FOR-UPDATE-SHARE
        locked_messages = attached.select_for_update(of=("self",)).order_by("id")

        for message in locked_messages:
            assert message.rendered_content is not None

            metadata_by_path: dict[str, MarkdownImageMetadata]
            paths_without_metadata: set[str]
            if image_data is None:
                metadata_by_path = {}
                paths_without_metadata = {path_id}
            else:
                metadata_by_path = {path_id: image_data}
                paths_without_metadata = set()

            rewritten = rewrite_thumbnailed_images(
                message.rendered_content,
                metadata_by_path,
                paths_without_metadata,
            )[0]
            if rewritten is None:
                # Nothing changed -- for instance, ensure_thumbnails was
                # re-run on an ImageAttachment it had already handled.
                # Do not bother sending a no-op update to clients.
                continue

            if not isinstance(message, Message):
                message.rendered_content = rewritten
                message.save(update_fields=["rendered_content"])
                continue

            # Perform a silent update push to the clients
            do_update_embedded_data(message.sender, message, rewritten)
|