thumbnail: Make thumbnailing work with data import.

We didn't have thumbnailing for images coming from data import and this
commit adds the functionality.

There are a few fundamental issues that the implementation needs to
solve.

1. The images come from an untrusted source and therefore we don't want
   to just pass them through to thumbnailing without checking. For that
   reason, we cannot just import ImageAttachment rows from the export
   data, even for zulip=>zulip imports.
   The right way to process images is to pass them to maybe_thumbail(),
   which runs libvips_check_image() on them to verify we're okay with
   thumbnailing, creates ImageAttachment rows for them and sends them
   to the thumbnailing queue worker. This approach lets us handle both
   zulip=>zulip and 3rd party=>zulip imports in the same way,

2. There is a somewhat circular dependency between the Message,
   Attachment and ImageAttachment import process:

- ImageAttachments would ideally be created after importing
  Attachments, but they need to already exist at the time of Message
  import. Otherwise, the markdown processor doesn't know it has to add
  HTML for image previews to messages that reference images. This would
  mean that messages imported from 3rd party tools don't get image
  previews.
- Attachments only get created after Message import however, due to the
  many-to-many relationship between Message and Attachment.

This is solved by fixing up some data of Attachments pre-emptively, such
as the path_ids. This gives us the necessary information for creating
ImageAttachments before importing Messages.

While we generate ImageAttachment rows synchronously, the actual
thumbnailing job is sent to the queue worker. Theoretically, the worker
could be very backlogged and not process the thumbnails anytime soon.
This is fine - if the app is loaded and tries to display a message with
such a not-yet-generated thumbnail, the code in `serve_file` will
generate the thumbnails synchronously on the fly and the user will see
the image preview displayed normally. See:

1b47134d0d/zerver/views/upload.py (L333-L342)
This commit is contained in:
Mateusz Mandera 2024-10-17 21:20:49 +02:00 committed by Tim Abbott
parent a6b0385229
commit da4443f392
5 changed files with 155 additions and 18 deletions

View File

@ -260,6 +260,9 @@ NON_EXPORTED_TABLES = {
"zerver_submessage",
# Drafts don't need to be exported as they are supposed to be more ephemeral.
"zerver_draft",
# The importer cannot trust ImageAttachment objects anyway and needs to check
# and process images for thumbnailing on its own.
"zerver_imageattachment",
# For any tables listed below here, it's a bug that they are not present in the export.
}

View File

@ -8,6 +8,7 @@ from typing import Any
import bmemcached
import orjson
import pyvips
from bs4 import BeautifulSoup
from django.conf import settings
from django.core.cache import cache
@ -34,7 +35,7 @@ from zerver.lib.push_notifications import sends_notifications_directly
from zerver.lib.remote_server import maybe_enqueue_audit_log_upload
from zerver.lib.server_initialization import create_internal_realm, server_initialized
from zerver.lib.streams import render_stream_description
from zerver.lib.thumbnail import BadImageError
from zerver.lib.thumbnail import THUMBNAIL_ACCEPT_IMAGE_TYPES, BadImageError, maybe_thumbnail
from zerver.lib.timestamp import datetime_to_timestamp
from zerver.lib.upload import ensure_avatar_image, sanitize_name, upload_backend, upload_emoji_image
from zerver.lib.upload.s3 import get_bucket
@ -161,7 +162,16 @@ id_map_to_list: dict[str, dict[int, list[int]]] = {
}
path_maps: dict[str, dict[str, str]] = {
"attachment_path": {},
# Maps original attachment path pre-import to the final, post-import
# attachment path.
"old_attachment_path_to_new_path": {},
# Inverse of old_attachment_path_to_new_path.
"new_attachment_path_to_old_path": {},
# Maps the new (post-import) attachment path to the absolute path to the file
# in the on-disk export data that we're importing.
# Allows code running after this is filled to access file contents
# without needing to go through S3 to get it.
"new_attachment_path_to_local_data_path": {},
}
message_id_to_attachments: dict[str, dict[int, list[str]]] = {
@ -212,13 +222,12 @@ def fix_upload_links(data: TableData, message_table: TableName) -> None:
for message in data[message_table]:
if message["has_attachment"] is True:
for attachment_path in message_id_to_attachments[message_table][message["id"]]:
message["content"] = message["content"].replace(
attachment_path, path_maps["attachment_path"][attachment_path]
)
old_path = path_maps["new_attachment_path_to_old_path"][attachment_path]
message["content"] = message["content"].replace(old_path, attachment_path)
if message["rendered_content"]:
message["rendered_content"] = message["rendered_content"].replace(
attachment_path, path_maps["attachment_path"][attachment_path]
old_path, attachment_path
)
@ -976,7 +985,11 @@ def import_uploads(
relative_path = upload_backend.generate_message_upload_path(
str(record["realm_id"]), sanitize_name(os.path.basename(record["path"]))
)
path_maps["attachment_path"][record["s3_path"]] = relative_path
path_maps["old_attachment_path_to_new_path"][record["s3_path"]] = relative_path
path_maps["new_attachment_path_to_old_path"][relative_path] = record["s3_path"]
path_maps["new_attachment_path_to_local_data_path"][relative_path] = os.path.join(
import_dir, record["path"]
)
if s3_uploads:
key = bucket.Object(relative_path)
@ -1612,6 +1625,22 @@ def do_import_realm(import_dir: Path, subdomain: str, processes: int = 1) -> Rea
with open(attachments_file, "rb") as f:
attachment_data = orjson.loads(f.read())
# We need to import ImageAttachments before messages, as the message rendering logic
# checks for existence of ImageAttachment records to determine if HTML content for image
# preview needs to be added to a message.
# In order for ImageAttachments to be correctly created, we need to know the new path_ids
# and content_types of the attachments.
#
# Begin by fixing up the Attachment data.
fix_attachments_data(attachment_data)
# Now we're ready create ImageAttachment rows and enqueue thumbnailing
# for the images.
# This order ensures that during message import, rendered_content will be generated
# correctly with image previews.
# The important detail here is that we **only** care about having ImageAttachment
# rows ready at the time of message import. Thumbnailing happens in a queue worker
# in a different process, and we don't care about when it'll complete.
create_image_attachments_and_maybe_enqueue_thumbnailing(realm, attachment_data)
map_messages_to_attachments(attachment_data)
# Import zerver_message and zerver_usermessage
@ -1930,10 +1959,6 @@ def import_attachments(data: TableData) -> None:
"scheduledmessage_id",
)
# Update 'path_id' for the attachments
for attachment in data[parent_db_table_name]:
attachment["path_id"] = path_maps["attachment_path"][attachment["path_id"]]
# Next, load the parent rows.
bulk_import_model(data, parent_model)
@ -1960,6 +1985,36 @@ def import_attachments(data: TableData) -> None:
logging.info("Successfully imported M2M table %s", m2m_table_name)
def fix_attachments_data(attachment_data: TableData) -> None:
for attachment in attachment_data["zerver_attachment"]:
attachment["path_id"] = path_maps["old_attachment_path_to_new_path"][attachment["path_id"]]
# In the case of images, content_type needs to be set for thumbnailing.
# Zulip exports set this, but third-party exports may not.
if attachment.get("content_type") is None:
guessed_content_type = guess_type(attachment["path_id"])[0]
if guessed_content_type in THUMBNAIL_ACCEPT_IMAGE_TYPES:
attachment["content_type"] = guessed_content_type
def create_image_attachments_and_maybe_enqueue_thumbnailing(
realm: Realm, attachment_data: TableData
) -> None:
for attachment in attachment_data["zerver_attachment"]:
if attachment["content_type"] not in THUMBNAIL_ACCEPT_IMAGE_TYPES:
continue
path_id = attachment["path_id"]
content_type = attachment["content_type"]
# We don't have to go to S3 to obtain the file. We still have the export
# data on disk and stored the absolute path to it.
local_filename = path_maps["new_attachment_path_to_local_data_path"][path_id]
pyvips_source = pyvips.Source.new_from_file(local_filename)
maybe_thumbnail(pyvips_source, content_type, path_id, realm.id)
continue
def import_analytics_data(realm: Realm, import_dir: Path, crossrealm_user_ids: set[int]) -> None:
analytics_filename = os.path.join(import_dir, "analytics.json")
if not os.path.exists(analytics_filename):

View File

@ -14,7 +14,7 @@ from typing_extensions import override
from zerver.lib.exceptions import ErrorCode, JsonableError
from zerver.lib.queue import queue_event_on_commit
from zerver.models import AbstractAttachment, ImageAttachment
from zerver.models import ImageAttachment
DEFAULT_AVATAR_SIZE = 100
MEDIUM_AVATAR_SIZE = 500
@ -279,9 +279,9 @@ def missing_thumbnails(image_attachment: ImageAttachment) -> list[ThumbnailForma
def maybe_thumbnail(
attachment: AbstractAttachment, content: bytes | pyvips.Source
content: bytes | pyvips.Source, content_type: str | None, path_id: str, realm_id: int
) -> ImageAttachment | None:
if attachment.content_type not in THUMBNAIL_ACCEPT_IMAGE_TYPES:
if content_type not in THUMBNAIL_ACCEPT_IMAGE_TYPES:
# If it doesn't self-report as an image file that we might want
# to thumbnail, don't parse the bytes at all.
return None
@ -301,8 +301,8 @@ def maybe_thumbnail(
(width, height) = (image.width, image.height)
image_row = ImageAttachment.objects.create(
realm_id=attachment.realm_id,
path_id=attachment.path_id,
realm_id=realm_id,
path_id=path_id,
original_width_px=width,
original_height_px=height,
frames=image.get_n_pages(),

View File

@ -70,7 +70,7 @@ def create_attachment(
size=file_size,
content_type=content_type,
)
maybe_thumbnail(attachment, file_real_data)
maybe_thumbnail(file_real_data, content_type, path_id, realm.id)
from zerver.actions.uploads import notify_attachment_update
notify_attachment_update(user_profile, "add", attachment.to_dict())

View File

@ -93,6 +93,7 @@ from zerver.models import (
)
from zerver.models.clients import get_client
from zerver.models.groups import SystemGroups
from zerver.models.messages import ImageAttachment
from zerver.models.presence import PresenceSequence
from zerver.models.realm_audit_logs import AuditLogEventType
from zerver.models.realms import get_realm
@ -335,6 +336,9 @@ class ExportFile(ZulipTestCase):
class RealmImportExportTest(ExportFile):
def create_user_and_login(self, email: str, realm: Realm) -> None:
self.register(email, "test", subdomain=realm.subdomain)
def export_realm(
self,
realm: Realm,
@ -811,6 +815,32 @@ class RealmImportExportTest(ExportFile):
)
self.assertEqual(realm_emoji.name, "hawaii")
# We want to set up some image data to verify image attachment thumbnailing works correctly
# in the import.
# We'll create a new user to use as the sender of the messages with such images,
# so that we can easily find them after importing - by fetching messages sent
# by the thumbnailing_test_user_email account.
thumbnailing_test_user_email = "thumbnailing_test@zulip.com"
self.create_user_and_login(thumbnailing_test_user_email, original_realm)
thumbnailing_test_user = get_user_by_delivery_email(
thumbnailing_test_user_email, original_realm
)
# Send a message with the image. After the import, we'll verify that this message
# and the associated ImageAttachment have been created correctly.
image_path_id = self.upload_and_thumbnail_image("img.png")
self.send_stream_message(
sender=thumbnailing_test_user,
stream_name="Verona",
content=f"An [image](/user_uploads/{image_path_id})",
)
image_attachment = ImageAttachment.objects.get(path_id=image_path_id)
# Malform some ImageAttachment info. These shouldn't get exported (and certainly not imported!)
# anyway, so we can test that this misinformation doesn't make its way into the imported realm.
image_attachment.original_width_px = 9999
image_attachment.original_height_px = 9999
image_attachment.save()
# Deactivate a user to ensure such a case is covered.
do_deactivate_user(self.example_user("aaron"), acting_user=None)
@ -1003,7 +1033,14 @@ class RealmImportExportTest(ExportFile):
self.export_realm(original_realm, export_type=RealmExport.EXPORT_FULL_WITHOUT_CONSENT)
with self.settings(BILLING_ENABLED=False), self.assertLogs(level="INFO"):
with (
self.settings(BILLING_ENABLED=False),
self.assertLogs(level="INFO"),
# With captureOnCommitCallbacks we ensure that tasks delegated to the queue workers
# are executed immediately. We use this to make thumbnailing runs in the import
# process in this test.
self.captureOnCommitCallbacks(execute=True),
):
do_import_realm(get_output_dir(), "test-zulip")
# Make sure our export/import didn't somehow leak info into the
@ -1164,6 +1201,48 @@ class RealmImportExportTest(ExportFile):
Message.objects.filter(realm=imported_realm).count(),
)
# Verify thumbnailing.
imported_thumbnailing_test_user = get_user_by_delivery_email(
thumbnailing_test_user_email, imported_realm
)
imported_messages_with_thumbnail = Message.objects.filter(
sender=imported_thumbnailing_test_user, realm=imported_realm
)
imported_message_with_thumbnail = imported_messages_with_thumbnail.latest("id")
attachment_with_thumbnail = Attachment.objects.get(
owner=imported_thumbnailing_test_user, messages=imported_message_with_thumbnail
)
path_id = attachment_with_thumbnail.path_id
# An ImageAttachment has been created in the import process.
imported_image_attachment = ImageAttachment.objects.get(
path_id=path_id, realm=imported_realm
)
# It figured out the dimensions correctly and didn't inherit the bad data in the
# original ImageAttachment.
self.assertEqual(imported_image_attachment.original_width_px, 128)
self.assertEqual(imported_image_attachment.original_height_px, 128)
# ImageAttachment.thumbnail_metadata contains information about thumbnails that actually
# got generated. By asserting it's not empty, we make sure thumbnailing ran for the image
# and that we didn't merely create the ImageAttachment row in the database.
self.assertNotEqual(len(imported_image_attachment.thumbnail_metadata), 0)
self.assertTrue(imported_image_attachment.thumbnail_metadata[0])
# Content and rendered_content got updated correctly, to point to the correct, new path_id
# and include the HTML for image preview using the thumbnail.
self.assertEqual(
imported_message_with_thumbnail.content, f"An [image](/user_uploads/{path_id})"
)
expected_rendered_preview = (
f'<p>An <a href="/user_uploads/{path_id}">image</a></p>\n'
f'<div class="message_inline_image"><a href="/user_uploads/{path_id}" title="image">'
f'<img data-original-dimensions="128x128" src="/user_uploads/thumbnail/{path_id}/840x560.webp"></a></div>'
)
self.assertEqual(
imported_message_with_thumbnail.rendered_content, expected_rendered_preview
)
def test_import_message_edit_history(self) -> None:
realm = get_realm("zulip")
iago = self.example_user("iago")