migrations: Derive emoji content-type from the bytes.

2024-07-17 16:14:17 +00:00 · 2024-07-17 16:14:17 +00:00 · ab6b6639e6
parent 8271c2b083
commit ab6b6639e6
2 changed files with 45 additions and 20 deletions
--- a/tools/semgrep-py.yml
+++ b/tools/semgrep-py.yml
@@ -54,7 +54,7 @@ rules:
  - id: dont-import-models-in-migrations
    patterns:
-      - pattern-not: from zerver.lib.mime_types import guess_type
+      - pattern-not: from zerver.lib.mime_types import $X
      - pattern-not: from zerver.lib.redis_utils import get_redis_client
      - pattern-not: from zerver.lib.utils import generate_api_key
      - pattern-not: from zerver.models.linkifiers import filter_pattern_validator
--- a/zerver/migrations/0553_copy_emoji_images.py
+++ b/zerver/migrations/0553_copy_emoji_images.py
@@ -7,6 +7,7 @@ from typing import Any
 import boto3
 import botocore
+import magic
 import pyvips
 from botocore.client import Config
 from django.conf import settings
@@ -14,7 +15,7 @@ from django.db import migrations
 from django.db.backends.base.schema import BaseDatabaseSchemaEditor
 from django.db.migrations.state import StateApps
-from zerver.lib.mime_types import guess_extension, guess_type
+from zerver.lib.mime_types import guess_extension
 # From zerver.lib.thumbnail
 DEFAULT_EMOJI_SIZE = 64
@@ -142,31 +143,50 @@ def thumbnail_local_emoji(apps: StateApps) -> None:
        if total_processed % 100 == 0:
            print(f"Processed {total_processed} custom emoji")
+        old_file_name = emoji.file_name
        try:
-            old_file_name = emoji.file_name
+            base_path = os.path.join(
-            content_type = guess_type(old_file_name)[0]
+                settings.LOCAL_AVATARS_DIR, str(emoji.realm_id), "emoji/images"
+            )
+            copy_from_path = f"{base_path}/{old_file_name}.original"
+            if not os.path.exists(copy_from_path) and os.path.exists(
+                f"{base_path}/{old_file_name}"
+            ):
+                # Imports currently don't write ".original" files, so check without that
+                copy_from_path = f"{base_path}/{old_file_name}"
+                if not os.path.exists(copy_from_path):
+                    raise SkipImageError("Failed to read .original file: Does not exist")
+            with open(copy_from_path, "rb") as fh:
+                original_bytes = fh.read()
+            # We used to accept any bytes which pillow could
+            # thumbnail, with any filename, and would use the
+            # guessed-from-filename content-type when serving the
+            # emoji.  Examine the bytes of the image to verify that it
+            # is an image of reasonable type, and then derive the real
+            # filename extension (which we will still use for deriving
+            # content-type at serving time) from that.  This ensures
+            # that the contents are a valid image, and that we put the
+            # right content-type on it when served -- the filename
+            # used for the initial upload becomes completely
+            # irrelevant.
+            content_type = magic.from_buffer(original_bytes[:1024], mime=True)
            if content_type not in VALID_EMOJI_CONTENT_TYPE:
                raise SkipImageError(f"Invalid content-type: {content_type}")
            new_file_name = get_emoji_file_name(content_type, emoji.id)
            if old_file_name == new_file_name:
                continue
-            base_path = os.path.join(
+
-                settings.LOCAL_AVATARS_DIR, str(emoji.realm_id), "emoji/images"
-            )
            print(f"{base_path}/{old_file_name} -> {base_path}/{new_file_name}")
            try:
                if os.path.exists(f"{base_path}/{new_file_name}.original"):
                    os.unlink(f"{base_path}/{new_file_name}.original")
-                from_file = f"{base_path}/{old_file_name}.original"
+                os.link(copy_from_path, f"{base_path}/{new_file_name}.original")
-                if not os.path.exists(from_file) and os.path.exists(f"{base_path}/{old_file_name}"):
-                    # Imports currently don't write ".original" files, so check without that
-                    from_file = f"{base_path}/{old_file_name}"
-                os.link(from_file, f"{base_path}/{new_file_name}.original")
-                with open(f"{base_path}/{new_file_name}.original", "rb") as fh:
-                    original_bytes = fh.read()
            except OSError as e:
-                raise SkipImageError(f"Failed to read original file: {e}")
+                raise SkipImageError(f"Failed to update .original file: {e}")
            animated, still = resize_emoji(original_bytes, new_file_name)
            try:
@@ -230,7 +250,6 @@ def thumbnail_s3(apps: StateApps) -> None:
            try:
                old_data = avatar_bucket.Object(copy_from_path).get()
                original_bytes = old_data["Body"].read()
-                content_type = old_data["ContentType"]
            except botocore.exceptions.ClientError:
                # Imports currently don't write ".original" files, so check without that
                try:
@@ -239,10 +258,16 @@ def thumbnail_s3(apps: StateApps) -> None:
                except botocore.exceptions.ClientError as e:
                    raise SkipImageError(f"Failed to read .original file: {e}")
                original_bytes = old_data["Body"].read()
-                # They also may have uploaded as "application/octet-stream", so guess the
+
-                # content-type from the filename.  If we can't guess, then we'll hit the
+            # We used to accept any bytes which pillow could
-                # SkipImageError case right below this.
+            # thumbnail, with any filename, and would store the
-                content_type = guess_type(old_file_name)[0] or "application/octet-stream"
+            # guessed-from-filename content-type in S3, to be used
+            # when serving the emoji.  Examine the bytes of the image
+            # to verify that it is an image of reasonable type, and
+            # then both store that content-type in S3 (for later
+            # serving), as well as using it to derive the right
+            # filename extension (for clarity).
+            content_type = magic.from_buffer(original_bytes[:1024], mime=True)
            if content_type not in VALID_EMOJI_CONTENT_TYPE:
                raise SkipImageError(f"Invalid content-type: {content_type}")