migrations: Derive emoji content-type from the bytes.

2024-07-17 16:14:17 +00:00 · 2024-07-17 16:14:17 +00:00 · ab6b6639e6
parent 8271c2b083
commit ab6b6639e6
2 changed files with 45 additions and 20 deletions
--- a/tools/semgrep-py.yml
+++ b/tools/semgrep-py.yml
@ -54,7 +54,7 @@ rules:

  - id: dont-import-models-in-migrations
    patterns:
-      - pattern-not: from zerver.lib.mime_types import guess_type
+      - pattern-not: from zerver.lib.mime_types import $X
      - pattern-not: from zerver.lib.redis_utils import get_redis_client
      - pattern-not: from zerver.lib.utils import generate_api_key
      - pattern-not: from zerver.models.linkifiers import filter_pattern_validator
--- a/zerver/migrations/0553_copy_emoji_images.py
+++ b/zerver/migrations/0553_copy_emoji_images.py
@ -7,6 +7,7 @@ from typing import Any

 import boto3
 import botocore
+import magic
 import pyvips
 from botocore.client import Config
 from django.conf import settings
@ -14,7 +15,7 @@ from django.db import migrations
 from django.db.backends.base.schema import BaseDatabaseSchemaEditor
 from django.db.migrations.state import StateApps

-from zerver.lib.mime_types import guess_extension, guess_type
+from zerver.lib.mime_types import guess_extension

 # From zerver.lib.thumbnail
 DEFAULT_EMOJI_SIZE = 64
@ -142,31 +143,50 @@ def thumbnail_local_emoji(apps: StateApps) -> None:
        if total_processed % 100 == 0:
            print(f"Processed {total_processed} custom emoji")

+        old_file_name = emoji.file_name
        try:
-            old_file_name = emoji.file_name
-            content_type = guess_type(old_file_name)[0]
+            base_path = os.path.join(
+                settings.LOCAL_AVATARS_DIR, str(emoji.realm_id), "emoji/images"
+            )
+            copy_from_path = f"{base_path}/{old_file_name}.original"
+            if not os.path.exists(copy_from_path) and os.path.exists(
+                f"{base_path}/{old_file_name}"
+            ):
+                # Imports currently don't write ".original" files, so check without that
+                copy_from_path = f"{base_path}/{old_file_name}"
+                if not os.path.exists(copy_from_path):
+                    raise SkipImageError("Failed to read .original file: Does not exist")
+
+            with open(copy_from_path, "rb") as fh:
+                original_bytes = fh.read()
+
+            # We used to accept any bytes which pillow could
+            # thumbnail, with any filename, and would use the
+            # guessed-from-filename content-type when serving the
+            # emoji.  Examine the bytes of the image to verify that it
+            # is an image of reasonable type, and then derive the real
+            # filename extension (which we will still use for deriving
+            # content-type at serving time) from that.  This ensures
+            # that the contents are a valid image, and that we put the
+            # right content-type on it when served -- the filename
+            # used for the initial upload becomes completely
+            # irrelevant.
+            content_type = magic.from_buffer(original_bytes[:1024], mime=True)
+
            if content_type not in VALID_EMOJI_CONTENT_TYPE:
                raise SkipImageError(f"Invalid content-type: {content_type}")

            new_file_name = get_emoji_file_name(content_type, emoji.id)
            if old_file_name == new_file_name:
                continue
-            base_path = os.path.join(
-                settings.LOCAL_AVATARS_DIR, str(emoji.realm_id), "emoji/images"
-            )
+
            print(f"{base_path}/{old_file_name} -> {base_path}/{new_file_name}")
            try:
                if os.path.exists(f"{base_path}/{new_file_name}.original"):
                    os.unlink(f"{base_path}/{new_file_name}.original")
-                from_file = f"{base_path}/{old_file_name}.original"
-                if not os.path.exists(from_file) and os.path.exists(f"{base_path}/{old_file_name}"):
-                    # Imports currently don't write ".original" files, so check without that
-                    from_file = f"{base_path}/{old_file_name}"
-                os.link(from_file, f"{base_path}/{new_file_name}.original")
-                with open(f"{base_path}/{new_file_name}.original", "rb") as fh:
-                    original_bytes = fh.read()
+                os.link(copy_from_path, f"{base_path}/{new_file_name}.original")
            except OSError as e:
-                raise SkipImageError(f"Failed to read original file: {e}")
+                raise SkipImageError(f"Failed to update .original file: {e}")

            animated, still = resize_emoji(original_bytes, new_file_name)
            try:
@ -230,7 +250,6 @@ def thumbnail_s3(apps: StateApps) -> None:
            try:
                old_data = avatar_bucket.Object(copy_from_path).get()
                original_bytes = old_data["Body"].read()
-                content_type = old_data["ContentType"]
            except botocore.exceptions.ClientError:
                # Imports currently don't write ".original" files, so check without that
                try:
@ -239,10 +258,16 @@ def thumbnail_s3(apps: StateApps) -> None:
                except botocore.exceptions.ClientError as e:
                    raise SkipImageError(f"Failed to read .original file: {e}")
                original_bytes = old_data["Body"].read()
-                # They also may have uploaded as "application/octet-stream", so guess the
-                # content-type from the filename.  If we can't guess, then we'll hit the
-                # SkipImageError case right below this.
-                content_type = guess_type(old_file_name)[0] or "application/octet-stream"
+
+            # We used to accept any bytes which pillow could
+            # thumbnail, with any filename, and would store the
+            # guessed-from-filename content-type in S3, to be used
+            # when serving the emoji.  Examine the bytes of the image
+            # to verify that it is an image of reasonable type, and
+            # then both store that content-type in S3 (for later
+            # serving), as well as using it to derive the right
+            # filename extension (for clarity).
+            content_type = magic.from_buffer(original_bytes[:1024], mime=True)

            if content_type not in VALID_EMOJI_CONTENT_TYPE:
                raise SkipImageError(f"Invalid content-type: {content_type}")