diff --git a/zerver/lib/export.py b/zerver/lib/export.py index e8be4e7771..568a856ab2 100644 --- a/zerver/lib/export.py +++ b/zerver/lib/export.py @@ -260,6 +260,9 @@ NON_EXPORTED_TABLES = { "zerver_submessage", # Drafts don't need to be exported as they are supposed to be more ephemeral. "zerver_draft", + # The importer cannot trust ImageAttachment objects anyway and needs to check + # and process images for thumbnailing on its own. + "zerver_imageattachment", # For any tables listed below here, it's a bug that they are not present in the export. } diff --git a/zerver/lib/import_realm.py b/zerver/lib/import_realm.py index d58cbda996..c3ef0a94ec 100644 --- a/zerver/lib/import_realm.py +++ b/zerver/lib/import_realm.py @@ -8,6 +8,7 @@ from typing import Any import bmemcached import orjson +import pyvips from bs4 import BeautifulSoup from django.conf import settings from django.core.cache import cache @@ -34,7 +35,7 @@ from zerver.lib.push_notifications import sends_notifications_directly from zerver.lib.remote_server import maybe_enqueue_audit_log_upload from zerver.lib.server_initialization import create_internal_realm, server_initialized from zerver.lib.streams import render_stream_description -from zerver.lib.thumbnail import BadImageError +from zerver.lib.thumbnail import THUMBNAIL_ACCEPT_IMAGE_TYPES, BadImageError, maybe_thumbnail from zerver.lib.timestamp import datetime_to_timestamp from zerver.lib.upload import ensure_avatar_image, sanitize_name, upload_backend, upload_emoji_image from zerver.lib.upload.s3 import get_bucket @@ -161,7 +162,16 @@ id_map_to_list: dict[str, dict[int, list[int]]] = { } path_maps: dict[str, dict[str, str]] = { - "attachment_path": {}, + # Maps original attachment path pre-import to the final, post-import + # attachment path. + "old_attachment_path_to_new_path": {}, + # Inverse of old_attachment_path_to_new_path. + "new_attachment_path_to_old_path": {}, + # Maps the new (post-import) attachment path to the absolute path to the file + # in the on-disk export data that we're importing. + # Allows code running after this is filled to access file contents + # without needing to go through S3 to get it. + "new_attachment_path_to_local_data_path": {}, } message_id_to_attachments: dict[str, dict[int, list[str]]] = { @@ -212,13 +222,12 @@ def fix_upload_links(data: TableData, message_table: TableName) -> None: for message in data[message_table]: if message["has_attachment"] is True: for attachment_path in message_id_to_attachments[message_table][message["id"]]: - message["content"] = message["content"].replace( - attachment_path, path_maps["attachment_path"][attachment_path] - ) + old_path = path_maps["new_attachment_path_to_old_path"][attachment_path] + message["content"] = message["content"].replace(old_path, attachment_path) if message["rendered_content"]: message["rendered_content"] = message["rendered_content"].replace( - attachment_path, path_maps["attachment_path"][attachment_path] + old_path, attachment_path ) @@ -976,7 +985,11 @@ def import_uploads( relative_path = upload_backend.generate_message_upload_path( str(record["realm_id"]), sanitize_name(os.path.basename(record["path"])) ) - path_maps["attachment_path"][record["s3_path"]] = relative_path + path_maps["old_attachment_path_to_new_path"][record["s3_path"]] = relative_path + path_maps["new_attachment_path_to_old_path"][relative_path] = record["s3_path"] + path_maps["new_attachment_path_to_local_data_path"][relative_path] = os.path.join( + import_dir, record["path"] + ) if s3_uploads: key = bucket.Object(relative_path) @@ -1612,6 +1625,22 @@ def do_import_realm(import_dir: Path, subdomain: str, processes: int = 1) -> Rea with open(attachments_file, "rb") as f: attachment_data = orjson.loads(f.read()) + # We need to import ImageAttachments before messages, as the message rendering logic + # checks for existence of ImageAttachment records to determine if HTML content for image + # preview needs to be added to a message. + # In order for ImageAttachments to be correctly created, we need to know the new path_ids + # and content_types of the attachments. + # + # Begin by fixing up the Attachment data. + fix_attachments_data(attachment_data) + # Now we're ready create ImageAttachment rows and enqueue thumbnailing + # for the images. + # This order ensures that during message import, rendered_content will be generated + # correctly with image previews. + # The important detail here is that we **only** care about having ImageAttachment + # rows ready at the time of message import. Thumbnailing happens in a queue worker + # in a different process, and we don't care about when it'll complete. + create_image_attachments_and_maybe_enqueue_thumbnailing(realm, attachment_data) map_messages_to_attachments(attachment_data) # Import zerver_message and zerver_usermessage @@ -1930,10 +1959,6 @@ def import_attachments(data: TableData) -> None: "scheduledmessage_id", ) - # Update 'path_id' for the attachments - for attachment in data[parent_db_table_name]: - attachment["path_id"] = path_maps["attachment_path"][attachment["path_id"]] - # Next, load the parent rows. bulk_import_model(data, parent_model) @@ -1960,6 +1985,36 @@ def import_attachments(data: TableData) -> None: logging.info("Successfully imported M2M table %s", m2m_table_name) +def fix_attachments_data(attachment_data: TableData) -> None: + for attachment in attachment_data["zerver_attachment"]: + attachment["path_id"] = path_maps["old_attachment_path_to_new_path"][attachment["path_id"]] + + # In the case of images, content_type needs to be set for thumbnailing. + # Zulip exports set this, but third-party exports may not. + if attachment.get("content_type") is None: + guessed_content_type = guess_type(attachment["path_id"])[0] + if guessed_content_type in THUMBNAIL_ACCEPT_IMAGE_TYPES: + attachment["content_type"] = guessed_content_type + + +def create_image_attachments_and_maybe_enqueue_thumbnailing( + realm: Realm, attachment_data: TableData +) -> None: + for attachment in attachment_data["zerver_attachment"]: + if attachment["content_type"] not in THUMBNAIL_ACCEPT_IMAGE_TYPES: + continue + + path_id = attachment["path_id"] + content_type = attachment["content_type"] + + # We don't have to go to S3 to obtain the file. We still have the export + # data on disk and stored the absolute path to it. + local_filename = path_maps["new_attachment_path_to_local_data_path"][path_id] + pyvips_source = pyvips.Source.new_from_file(local_filename) + maybe_thumbnail(pyvips_source, content_type, path_id, realm.id) + continue + + def import_analytics_data(realm: Realm, import_dir: Path, crossrealm_user_ids: set[int]) -> None: analytics_filename = os.path.join(import_dir, "analytics.json") if not os.path.exists(analytics_filename): diff --git a/zerver/lib/thumbnail.py b/zerver/lib/thumbnail.py index 4bba1ee9dd..ce565d5817 100644 --- a/zerver/lib/thumbnail.py +++ b/zerver/lib/thumbnail.py @@ -14,7 +14,7 @@ from typing_extensions import override from zerver.lib.exceptions import ErrorCode, JsonableError from zerver.lib.queue import queue_event_on_commit -from zerver.models import AbstractAttachment, ImageAttachment +from zerver.models import ImageAttachment DEFAULT_AVATAR_SIZE = 100 MEDIUM_AVATAR_SIZE = 500 @@ -279,9 +279,9 @@ def missing_thumbnails(image_attachment: ImageAttachment) -> list[ThumbnailForma def maybe_thumbnail( - attachment: AbstractAttachment, content: bytes | pyvips.Source + content: bytes | pyvips.Source, content_type: str | None, path_id: str, realm_id: int ) -> ImageAttachment | None: - if attachment.content_type not in THUMBNAIL_ACCEPT_IMAGE_TYPES: + if content_type not in THUMBNAIL_ACCEPT_IMAGE_TYPES: # If it doesn't self-report as an image file that we might want # to thumbnail, don't parse the bytes at all. return None @@ -301,8 +301,8 @@ def maybe_thumbnail( (width, height) = (image.width, image.height) image_row = ImageAttachment.objects.create( - realm_id=attachment.realm_id, - path_id=attachment.path_id, + realm_id=realm_id, + path_id=path_id, original_width_px=width, original_height_px=height, frames=image.get_n_pages(), diff --git a/zerver/lib/upload/__init__.py b/zerver/lib/upload/__init__.py index e1e5d83fa5..3557c3f601 100644 --- a/zerver/lib/upload/__init__.py +++ b/zerver/lib/upload/__init__.py @@ -70,7 +70,7 @@ def create_attachment( size=file_size, content_type=content_type, ) - maybe_thumbnail(attachment, file_real_data) + maybe_thumbnail(file_real_data, content_type, path_id, realm.id) from zerver.actions.uploads import notify_attachment_update notify_attachment_update(user_profile, "add", attachment.to_dict()) diff --git a/zerver/tests/test_import_export.py b/zerver/tests/test_import_export.py index 5b38e877e4..cfda844d94 100644 --- a/zerver/tests/test_import_export.py +++ b/zerver/tests/test_import_export.py @@ -93,6 +93,7 @@ from zerver.models import ( ) from zerver.models.clients import get_client from zerver.models.groups import SystemGroups +from zerver.models.messages import ImageAttachment from zerver.models.presence import PresenceSequence from zerver.models.realm_audit_logs import AuditLogEventType from zerver.models.realms import get_realm @@ -335,6 +336,9 @@ class ExportFile(ZulipTestCase): class RealmImportExportTest(ExportFile): + def create_user_and_login(self, email: str, realm: Realm) -> None: + self.register(email, "test", subdomain=realm.subdomain) + def export_realm( self, realm: Realm, @@ -811,6 +815,32 @@ class RealmImportExportTest(ExportFile): ) self.assertEqual(realm_emoji.name, "hawaii") + # We want to set up some image data to verify image attachment thumbnailing works correctly + # in the import. + # We'll create a new user to use as the sender of the messages with such images, + # so that we can easily find them after importing - by fetching messages sent + # by the thumbnailing_test_user_email account. + thumbnailing_test_user_email = "thumbnailing_test@zulip.com" + self.create_user_and_login(thumbnailing_test_user_email, original_realm) + thumbnailing_test_user = get_user_by_delivery_email( + thumbnailing_test_user_email, original_realm + ) + + # Send a message with the image. After the import, we'll verify that this message + # and the associated ImageAttachment have been created correctly. + image_path_id = self.upload_and_thumbnail_image("img.png") + self.send_stream_message( + sender=thumbnailing_test_user, + stream_name="Verona", + content=f"An [image](/user_uploads/{image_path_id})", + ) + image_attachment = ImageAttachment.objects.get(path_id=image_path_id) + # Malform some ImageAttachment info. These shouldn't get exported (and certainly not imported!) + # anyway, so we can test that this misinformation doesn't make its way into the imported realm. + image_attachment.original_width_px = 9999 + image_attachment.original_height_px = 9999 + image_attachment.save() + # Deactivate a user to ensure such a case is covered. do_deactivate_user(self.example_user("aaron"), acting_user=None) @@ -1003,7 +1033,14 @@ class RealmImportExportTest(ExportFile): self.export_realm(original_realm, export_type=RealmExport.EXPORT_FULL_WITHOUT_CONSENT) - with self.settings(BILLING_ENABLED=False), self.assertLogs(level="INFO"): + with ( + self.settings(BILLING_ENABLED=False), + self.assertLogs(level="INFO"), + # With captureOnCommitCallbacks we ensure that tasks delegated to the queue workers + # are executed immediately. We use this to make thumbnailing runs in the import + # process in this test. + self.captureOnCommitCallbacks(execute=True), + ): do_import_realm(get_output_dir(), "test-zulip") # Make sure our export/import didn't somehow leak info into the @@ -1164,6 +1201,48 @@ class RealmImportExportTest(ExportFile): Message.objects.filter(realm=imported_realm).count(), ) + # Verify thumbnailing. + imported_thumbnailing_test_user = get_user_by_delivery_email( + thumbnailing_test_user_email, imported_realm + ) + imported_messages_with_thumbnail = Message.objects.filter( + sender=imported_thumbnailing_test_user, realm=imported_realm + ) + imported_message_with_thumbnail = imported_messages_with_thumbnail.latest("id") + attachment_with_thumbnail = Attachment.objects.get( + owner=imported_thumbnailing_test_user, messages=imported_message_with_thumbnail + ) + + path_id = attachment_with_thumbnail.path_id + # An ImageAttachment has been created in the import process. + imported_image_attachment = ImageAttachment.objects.get( + path_id=path_id, realm=imported_realm + ) + + # It figured out the dimensions correctly and didn't inherit the bad data in the + # original ImageAttachment. + self.assertEqual(imported_image_attachment.original_width_px, 128) + self.assertEqual(imported_image_attachment.original_height_px, 128) + # ImageAttachment.thumbnail_metadata contains information about thumbnails that actually + # got generated. By asserting it's not empty, we make sure thumbnailing ran for the image + # and that we didn't merely create the ImageAttachment row in the database. + self.assertNotEqual(len(imported_image_attachment.thumbnail_metadata), 0) + self.assertTrue(imported_image_attachment.thumbnail_metadata[0]) + + # Content and rendered_content got updated correctly, to point to the correct, new path_id + # and include the HTML for image preview using the thumbnail. + self.assertEqual( + imported_message_with_thumbnail.content, f"An [image](/user_uploads/{path_id})" + ) + expected_rendered_preview = ( + f'

An image

\n' + f'
' + f'
' + ) + self.assertEqual( + imported_message_with_thumbnail.rendered_content, expected_rendered_preview + ) + def test_import_message_edit_history(self) -> None: realm = get_realm("zulip") iago = self.example_user("iago")