thumbnail: Backfill ImageAttachment rows.

We previously used the file extension to determine if we should
attempt to inline an image.  After b42863be4b, we rely on the
existence of ImageAttachment rows to determine if something is an
image which can be viewed inline.  This means that messages
containing files uploaded before that commit, when (re-)rendered, will
be judged as not having inline'able images.

Backfill all of the ImageAttachment rows for image-like file
extensions.  We are careful to only download the bytes that we need in
the image headers, to minimize bandwidth from S3 in the event that the
S3 backend is in use.  We do _not_ produce thumbnails for the images
during this migration; see the subsequent commit.

Because this migration will be backported to 9.x, it is marked as only
depending on the last migration in `9.x`, with a subsequent merge
migration into the tip of `main`.
This commit is contained in:
Alex Vandiver 2024-08-28 01:44:40 +00:00 committed by Tim Abbott
parent d796deddf8
commit df91cdf333
3 changed files with 137 additions and 0 deletions

View File

@ -54,6 +54,7 @@ rules:
- id: dont-import-models-in-migrations - id: dont-import-models-in-migrations
patterns: patterns:
- pattern-not: from zerver.lib.partial import partial
- pattern-not: from zerver.lib.mime_types import $X - pattern-not: from zerver.lib.mime_types import $X
- pattern-not: from zerver.lib.redis_utils import get_redis_client - pattern-not: from zerver.lib.redis_utils import get_redis_client
- pattern-not: from zerver.lib.utils import generate_api_key - pattern-not: from zerver.lib.utils import generate_api_key

View File

@ -0,0 +1,126 @@
import os
from functools import reduce
from operator import or_
import boto3
import pyvips
from botocore.client import Config
from botocore.exceptions import ClientError
from botocore.response import StreamingBody
from django.conf import settings
from django.db import migrations
from django.db.backends.base.schema import BaseDatabaseSchemaEditor
from django.db.migrations.state import StateApps
from django.db.models import Exists, OuterRef, Q
from zerver.lib.partial import partial
def backfill_imageattachment(apps: StateApps, schema_editor: BaseDatabaseSchemaEditor) -> None:
    """Create missing ImageAttachment rows for image-like Attachment rows.

    Walks, in id order and in batches of 1000, every Attachment whose
    file name ends in one of the known image extensions and which has
    no ImageAttachment with the same path_id.  For each one, the image
    header is parsed with pyvips (streamed from S3 when
    LOCAL_UPLOADS_DIR is unset, otherwise read from local disk) and an
    ImageAttachment is created with the as-rendered dimensions, the
    frame count, and an empty thumbnail list.  No thumbnails are
    generated here.

    Files which are missing from storage, or which pyvips cannot
    parse, are reported on stdout and skipped.
    """
    ImageAttachment = apps.get_model("zerver", "ImageAttachment")
    Attachment = apps.get_model("zerver", "Attachment")

    if settings.LOCAL_UPLOADS_DIR is None:
        upload_bucket = boto3.resource(
            "s3",
            aws_access_key_id=settings.S3_KEY,
            aws_secret_access_key=settings.S3_SECRET_KEY,
            region_name=settings.S3_REGION,
            endpoint_url=settings.S3_ENDPOINT_URL,
            config=Config(
                signature_version=None,
                s3={"addressing_style": settings.S3_ADDRESSING_STYLE},
            ),
        ).Bucket(settings.S3_AUTH_UPLOADS_BUCKET)

    # Read callback for pyvips custom sources; defined once here
    # rather than once per attachment.
    def s3_read(streamingbody: StreamingBody, size: int) -> bytes:
        return streamingbody.read(amt=size)

    # Historical attachments do not have a mime_type value, so we used
    # to rely on the file extension.  We replicate that when
    # backfilling.  This is the value from zerver.lib.markdown:
    IMAGE_EXTENSIONS = [".bmp", ".gif", ".jpe", ".jpeg", ".jpg", ".png", ".webp"]

    # NOTE(review): `endswith` is case-sensitive, so e.g. "PHOTO.JPG"
    # is not selected -- confirm this matches the historical
    # extension check being replicated.
    extension_limits = reduce(
        or_,
        [Q(file_name__endswith=extension) for extension in IMAGE_EXTENSIONS],
        Q(),
    )

    # min_id is the highest Attachment id seen so far; it is reset to
    # None before each batch, so it stays None exactly when a batch
    # comes back empty, which ends the loop.
    min_id: int | None = 0
    while True:
        attachments = (
            Attachment.objects.alias(
                has_imageattachment=Exists(
                    ImageAttachment.objects.filter(path_id=OuterRef("path_id"))
                )
            )
            .filter(extension_limits, has_imageattachment=False, id__gt=min_id)
            .order_by("id")
        )[:1000]

        min_id = None
        for attachment in attachments:
            min_id = attachment.id

            body: StreamingBody | None = None
            if settings.LOCAL_UPLOADS_DIR is None:
                try:
                    metadata = upload_bucket.Object(attachment.path_id).get()
                except ClientError:
                    print(f"{attachment.path_id}: Missing!")
                    continue

                # We use the streaming body to only pull down as much
                # of the image as we need to examine the headers --
                # generally about 40k
                body = metadata["Body"]
                source: pyvips.Source = pyvips.SourceCustom()
                source.on_read(partial(s3_read, body))
            else:
                # NOTE(review): this joins path_id directly onto
                # LOCAL_UPLOADS_DIR -- confirm uploaded files are not
                # stored in a subdirectory of it.
                attachment_path = os.path.join(settings.LOCAL_UPLOADS_DIR, attachment.path_id)
                if not os.path.exists(attachment_path):
                    print(f"{attachment.path_id}: Missing!")
                    continue
                source = pyvips.Source.new_from_file(attachment_path)

            # Only the pyvips calls can raise pyvips.Error, so only
            # they live inside the try; the database write happens
            # after the header has been parsed successfully.
            try:
                image = pyvips.Image.new_from_source(source, "", access="sequential")
                # "original_width_px" and "original_height_px" here are
                # _as rendered_, after applying the orientation
                # information which the image may contain.
                orientation = (
                    image.get("orientation") if "orientation" in image.get_fields() else None
                )
                if orientation is not None and 5 <= orientation <= 8:
                    (width, height) = (image.height, image.width)
                else:
                    (width, height) = (image.width, image.height)
                frames = image.get_n_pages()
            except pyvips.Error:
                # Has an image-like extension, but is not something
                # pyvips can parse; report it rather than skipping
                # silently, and move on.
                print(f"{attachment.path_id}: Not a readable image, skipping")
                continue
            finally:
                # Everything we need has been read from the header;
                # release the S3 connection now instead of leaving the
                # partially-read response open until it is collected.
                if body is not None:
                    body.close()

            ImageAttachment.objects.create(
                realm_id=attachment.realm_id,
                path_id=attachment.path_id,
                original_width_px=width,
                original_height_px=height,
                frames=frames,
                thumbnail_metadata=[],
            )

        if min_id is None:
            break
class Migration(migrations.Migration):
    # Data-only migration: runs backfill_imageattachment going forward
    # and does nothing when reversed.

    # Not wrapped in a single transaction.
    atomic = False

    dependencies = [
        # Because this will be backported to 9.x, we only depend on the last migration in 9.x
        ("zerver", "0558_realmuserdefault_web_animate_image_previews_and_more"),
    ]

    operations = [
        migrations.RunPython(
            backfill_imageattachment,
            reverse_code=migrations.RunPython.noop,
            elidable=True,
        ),
    ]

View File

@ -0,0 +1,10 @@
from django.db import migrations
class Migration(migrations.Migration):
    # Merge migration: it joins the two listed migrations into a
    # single history and performs no operations of its own.
    dependencies = [
        ("zerver", "0575_alter_directmessagegroup_group_size"),
        ("zerver", "0576_backfill_imageattachment"),
    ]

    operations = []