zulip/zerver/migrations/0576_backfill_imageattachme...

import os
from functools import reduce
from operator import or_

import boto3
import pyvips
from botocore.client import Config
from botocore.exceptions import ClientError
from botocore.response import StreamingBody
from django.conf import settings
from django.db import migrations
from django.db.backends.base.schema import BaseDatabaseSchemaEditor
from django.db.migrations.state import StateApps
from django.db.models import Exists, OuterRef, Q

from zerver.lib.partial import partial


def backfill_imageattachment(apps: StateApps, schema_editor: BaseDatabaseSchemaEditor) -> None:
    """Create ImageAttachment rows for existing image-like Attachment rows.

    Attachment rows whose file_name ends in one of IMAGE_EXTENSIONS and
    which have no ImageAttachment with the same path_id are walked in
    batches of 1000, ordered by id.  For each one, the image headers are
    parsed with pyvips -- streamed from S3 when
    settings.LOCAL_UPLOADS_DIR is None, read from local disk otherwise
    -- and an ImageAttachment row is created recording the as-rendered
    dimensions and the frame count, with empty thumbnail_metadata.

    Files which are missing from the storage backend, or which pyvips
    cannot parse, are reported on stdout and skipped.
    """
    ImageAttachment = apps.get_model("zerver", "ImageAttachment")
    Attachment = apps.get_model("zerver", "Attachment")

    if settings.LOCAL_UPLOADS_DIR is None:
        upload_bucket = boto3.resource(
            "s3",
            aws_access_key_id=settings.S3_KEY,
            aws_secret_access_key=settings.S3_SECRET_KEY,
            region_name=settings.S3_REGION,
            endpoint_url=settings.S3_ENDPOINT_URL,
            config=Config(
                signature_version=None,
                s3={"addressing_style": settings.S3_ADDRESSING_STYLE},
            ),
        ).Bucket(settings.S3_AUTH_UPLOADS_BUCKET)

    # Historical attachments do not have a mime_type value, so we used
    # to rely on the file extension.  We replicate that when
    # backfilling.  This is the value from zerver.lib.markdown:
    IMAGE_EXTENSIONS = [".bmp", ".gif", ".jpe", ".jpeg", ".jpg", ".png", ".webp"]

    # OR together one suffix match per extension.
    extension_limits = reduce(
        or_,
        [Q(file_name__endswith=extension) for extension in IMAGE_EXTENSIONS],
        Q(),
    )

    # Read callback handed to pyvips; defined once rather than on every
    # loop iteration.
    def s3_read(streamingbody: StreamingBody, size: int) -> bytes:
        return streamingbody.read(amt=size)

    # Keyset pagination: each batch starts after the highest id seen in
    # the previous batch.  min_id is reset to None before each batch
    # and only set again if the batch yields rows, so an empty batch
    # terminates the loop.
    min_id: int | None = 0
    while True:
        attachments = (
            Attachment.objects.alias(
                has_imageattachment=Exists(
                    ImageAttachment.objects.filter(path_id=OuterRef("path_id"))
                )
            )
            .filter(extension_limits, has_imageattachment=False, id__gt=min_id)
            .order_by("id")
        )[:1000]

        min_id = None
        for attachment in attachments:
            min_id = attachment.id

            # Only set for the S3 backend; closed once the headers have
            # been examined so that the connection is released promptly.
            s3_body: StreamingBody | None = None

            if settings.LOCAL_UPLOADS_DIR is None:
                try:
                    metadata = upload_bucket.Object(attachment.path_id).get()
                except ClientError:
                    print(f"{attachment.path_id}: Missing!")
                    continue
                s3_body = metadata["Body"]

                # We use the streaming body to only pull down as much
                # of the image as we need to examine the headers --
                # generally about 40k
                source: pyvips.Source = pyvips.SourceCustom()
                source.on_read(partial(s3_read, s3_body))
            else:
                attachment_path = os.path.join(settings.LOCAL_UPLOADS_DIR, attachment.path_id)
                if not os.path.exists(attachment_path):
                    print(f"{attachment.path_id}: Missing!")
                    continue
                source = pyvips.Source.new_from_file(attachment_path)

            try:
                image = pyvips.Image.new_from_source(source, "", access="sequential")

                # "original_width_px" and "original_height_px" here are
                # _as rendered_, after applying the orientation
                # information which the image may contain.
                orientation = (
                    image.get("orientation") if "orientation" in image.get_fields() else None
                )
                if orientation is not None and 5 <= orientation <= 8:
                    (width, height) = (image.height, image.width)
                else:
                    (width, height) = (image.width, image.height)
                frames = image.get_n_pages()
            except pyvips.Error:
                # Not everything with an image-like extension is an
                # image pyvips can parse; report it and move on rather
                # than failing the whole migration.
                print(f"{attachment.path_id}: Could not read image headers; skipping")
                continue
            finally:
                # Everything needed has been read from the headers by
                # now; drop the rest of the S3 response.
                if s3_body is not None:
                    s3_body.close()

            ImageAttachment.objects.create(
                realm_id=attachment.realm_id,
                path_id=attachment.path_id,
                original_width_px=width,
                original_height_px=height,
                frames=frames,
                thumbnail_metadata=[],
            )

        if min_id is None:
            break


class Migration(migrations.Migration):
    """Data-only migration which runs backfill_imageattachment."""

    # Do not wrap the whole backfill in a single transaction.
    atomic = False

    # Because this will be backported to 9.x, we only depend on the last migration in 9.x
    dependencies = [
        ("zerver", "0558_realmuserdefault_web_animate_image_previews_and_more"),
    ]

    # Reversing is a no-op: the rows created by the backfill are left in place.
    operations = [
        migrations.RunPython(
            code=backfill_imageattachment,
            reverse_code=migrations.RunPython.noop,
            elidable=True,
        ),
    ]
thumbnail: Backfill ImageAttachment rows.

We previously used the file extension to determine if we should attempt to inline an image. After b42863be4b2c, we rely on the existence of ImageAttachment rows to determine if something is an image which can be viewed inline. This means that messages containing files uploaded before that commit, when (re-)rendered, will be judged as not having inlinable images.

Backfill all of the ImageAttachment rows for image-like file extensions. We are careful to only download the bytes that we need in the image headers, to minimize bandwidth from S3 in the event that the S3 backend is in use. We do _not_ produce thumbnails for the images during this migration; see the subsequent commit.

Because this migration will be backported to 9.x, it is marked as only depending on the last migration in `9.x`, with a subsequent merge migration into the tip of `main`.

2024-08-28 03:44:40 +02:00
			`import os`
			`from functools import reduce`
			`from operator import or_`

			`import boto3`
			`import pyvips`
			`from botocore.client import Config`
			`from botocore.exceptions import ClientError`
			`from botocore.response import StreamingBody`
			`from django.conf import settings`
			`from django.db import migrations`
			`from django.db.backends.base.schema import BaseDatabaseSchemaEditor`
			`from django.db.migrations.state import StateApps`
			`from django.db.models import Exists, OuterRef, Q`

			`from zerver.lib.partial import partial`


			`def backfill_imageattachment(apps: StateApps, schema_editor: BaseDatabaseSchemaEditor) -> None:`
			`ImageAttachment = apps.get_model("zerver", "ImageAttachment")`
			`Attachment = apps.get_model("zerver", "Attachment")`

			`if settings.LOCAL_UPLOADS_DIR is None:`
			`upload_bucket = boto3.resource(`
			`"s3",`
			`aws_access_key_id=settings.S3_KEY,`
			`aws_secret_access_key=settings.S3_SECRET_KEY,`
			`region_name=settings.S3_REGION,`
			`endpoint_url=settings.S3_ENDPOINT_URL,`
			`config=Config(`
			`signature_version=None,`
			`s3={"addressing_style": settings.S3_ADDRESSING_STYLE},`
			`),`
			`).Bucket(settings.S3_AUTH_UPLOADS_BUCKET)`

			`# Historical attachments do not have a mime_type value, so we used`
			`# to rely on the file extension. We replicate that when`
			`# backfilling. This is the value from zerver.lib.markdown:`
			`IMAGE_EXTENSIONS = [".bmp", ".gif", ".jpe", ".jpeg", ".jpg", ".png", ".webp"]`

			`extension_limits = Q()`
			`extension_limits = reduce(`
			`or_,`
			`[Q(file_name__endswith=extension) for extension in IMAGE_EXTENSIONS],`
			`extension_limits,`
			`)`

			`min_id: int \| None = 0`
			`while True:`
			`attachments = (`
			`Attachment.objects.alias(`
			`has_imageattachment=Exists(`
			`ImageAttachment.objects.filter(path_id=OuterRef("path_id"))`
			`)`
			`)`
			`.filter(extension_limits, has_imageattachment=False, id__gt=min_id)`
			`.order_by("id")`
			`)[:1000]`

			`min_id = None`
			`for attachment in attachments:`
			`min_id = attachment.id`

			`if settings.LOCAL_UPLOADS_DIR is None:`
			`try:`
			`metadata = upload_bucket.Object(attachment.path_id).get()`
			`except ClientError:`
			`print(f"{attachment.path_id}: Missing!")`
			`continue`

			`def s3_read(streamingbody: StreamingBody, size: int) -> bytes:`
			`return streamingbody.read(amt=size)`

			`# We use the streaming body to only pull down as much`
			`# of the image as we need to examine the headers --`
			`# generally about 40k`
			`source: pyvips.Source = pyvips.SourceCustom()`
			`source.on_read(partial(s3_read, metadata["Body"]))`
			`else:`
			`attachment_path = os.path.join(settings.LOCAL_UPLOADS_DIR, attachment.path_id)`
			`if not os.path.exists(attachment_path):`
			`print(f"{attachment.path_id}: Missing!")`
			`continue`
			`source = pyvips.Source.new_from_file(attachment_path)`
			`try:`
			`image = pyvips.Image.new_from_source(source, "", access="sequential")`

			`# "original_width_px" and "original_height_px" here are`
			`# _as rendered_, after applying the orientation`
			`# information which the image may contain.`
			`if (`
			`"orientation" in image.get_fields()`
			`and image.get("orientation") >= 5`
			`and image.get("orientation") <= 8`
			`):`
			`(width, height) = (image.height, image.width)`
			`else:`
			`(width, height) = (image.width, image.height)`

			`ImageAttachment.objects.create(`
			`realm_id=attachment.realm_id,`
			`path_id=attachment.path_id,`
			`original_width_px=width,`
			`original_height_px=height,`
			`frames=image.get_n_pages(),`
			`thumbnail_metadata=[],`
			`)`
			`except pyvips.Error:`
			`pass`

			`if min_id is None:`
			`break`


			`class Migration(migrations.Migration):`
			`atomic = False`
			`dependencies = [`
			`# Because this will be backported to 9.x, we only depend on the last migration in 9.x`
			`("zerver", "0558_realmuserdefault_web_animate_image_previews_and_more"),`
			`]`

			`operations = [`
			`migrations.RunPython(`
			`backfill_imageattachment, reverse_code=migrations.RunPython.noop, elidable=True`
			`)`
			`]`