2022-04-14 23:43:26 +02:00
|
|
|
import logging
|
2024-07-28 03:21:33 +02:00
|
|
|
from dataclasses import dataclass
|
2024-07-12 02:30:23 +02:00
|
|
|
from typing import Any
|
2022-04-14 23:43:26 +02:00
|
|
|
|
2023-12-15 20:03:19 +01:00
|
|
|
from zerver.lib.attachments import get_old_unclaimed_attachments, validate_attachment_request
|
2022-04-14 23:43:26 +02:00
|
|
|
from zerver.lib.markdown import MessageRenderingResult
|
2024-07-17 03:11:57 +02:00
|
|
|
from zerver.lib.thumbnail import StoredThumbnailFormat, get_image_thumbnail_path
|
|
|
|
from zerver.lib.upload import claim_attachment, delete_message_attachments
|
2024-06-20 23:58:27 +02:00
|
|
|
from zerver.models import (
|
|
|
|
Attachment,
|
|
|
|
ImageAttachment,
|
|
|
|
Message,
|
|
|
|
ScheduledMessage,
|
|
|
|
Stream,
|
|
|
|
UserProfile,
|
|
|
|
)
|
2024-05-15 19:24:37 +02:00
|
|
|
from zerver.tornado.django_api import send_event_on_commit
|
2022-04-14 23:43:26 +02:00
|
|
|
|
|
|
|
|
2024-07-28 03:21:33 +02:00
|
|
|
@dataclass
class AttachmentChangeResult:
    """Result of reconciling a message's attachments after a content change."""

    # Whether the message has (or previously had) attachments affected by
    # the change; computed by check_attachment_reference_change.
    did_attachment_change: bool
    # Serialized Attachment rows (Attachment.to_dict()) owned by the sender
    # that are no longer referenced by any message after the change.
    detached_attachments: list[dict[str, Any]]
|
|
|
|
|
|
|
|
|
2022-04-14 23:43:26 +02:00
|
|
|
def notify_attachment_update(
    user_profile: UserProfile, op: str, attachment_dict: dict[str, Any]
) -> None:
    """Send an `attachment` event to the owning user's clients.

    The event carries the operation name, the serialized attachment, and
    the realm's current upload-space usage so clients can keep their
    storage meter up to date.
    """
    realm = user_profile.realm
    send_event_on_commit(
        realm,
        {
            "type": "attachment",
            "op": op,
            "attachment": attachment_dict,
            "upload_space_used": realm.currently_used_upload_space_bytes(),
        },
        [user_profile.id],
    )
|
2022-04-14 23:43:26 +02:00
|
|
|
|
|
|
|
|
2023-05-07 20:04:37 +02:00
|
|
|
def do_claim_attachments(
    message: Message | ScheduledMessage, potential_path_ids: list[str]
) -> bool:
    """Mark the uploads referenced in a (scheduled) message as claimed by it.

    Returns True if at least one attachment was successfully claimed.
    Paths the sender lacks permission for are skipped with a warning.
    """
    if not potential_path_ids:
        return False

    # These are loop-invariant; hoist them so we don't re-fetch the
    # Stream row from the database once per attachment.
    user_profile = message.sender
    is_message_realm_public = False
    is_message_web_public = False
    if message.is_stream_message():
        stream = Stream.objects.get(id=message.recipient.type_id)
        is_message_realm_public = stream.is_public()
        is_message_web_public = stream.is_web_public

    claimed = False
    for path_id in potential_path_ids:
        if not validate_attachment_request(user_profile, path_id):
            # Technically, there are 2 cases here:
            # * The user put something in their message that has the form
            #   of an upload URL, but does not actually correspond to a previously
            #   uploaded file. validate_attachment_request will return None.
            # * The user is trying to send a link to a file they don't have permission to
            #   access themselves. validate_attachment_request will return False.
            #
            # Either case is unusual and suggests a UI bug that got
            # the user in this situation, so we log in these cases.
            logging.warning(
                "User %s tried to share upload %s in message %s, but lacks permission",
                user_profile.id,
                path_id,
                message.id,
            )
            continue

        claimed = True
        attachment = claim_attachment(
            path_id, message, is_message_realm_public, is_message_web_public
        )
        if not isinstance(message, ScheduledMessage):
            # attachment update events don't say anything about scheduled messages,
            # so sending an event is pointless.
            notify_attachment_update(user_profile, "update", attachment.to_dict())
    return claimed
|
|
|
|
|
|
|
|
|
2024-07-12 23:26:52 +02:00
|
|
|
# Maximum number of storage paths passed to delete_message_attachments()
# per call, bounding the size of each backing-store delete request.
DELETE_BATCH_SIZE = 1000
|
|
|
|
|
|
|
|
|
2022-04-14 23:43:26 +02:00
|
|
|
def _storage_paths_for(path_id: str) -> list[str]:
    """Return all storage paths for path_id: the original file plus any
    stored thumbnails.  Deletes the ImageAttachment metadata row, if one
    exists, as a side effect."""
    paths = [path_id]
    image_row = ImageAttachment.objects.filter(path_id=path_id).first()
    if image_row:
        for existing_thumbnail in image_row.thumbnail_metadata:
            thumb = StoredThumbnailFormat(**existing_thumbnail)
            paths.append(get_image_thumbnail_path(image_row, thumb))
        image_row.delete()
    return paths


def _flush_deletes(storage_paths: list[str]) -> list[str]:
    """Delete every full batch of DELETE_BATCH_SIZE paths from the backing
    store, returning the not-yet-deleted remainder."""
    while len(storage_paths) >= DELETE_BATCH_SIZE:
        delete_message_attachments(storage_paths[:DELETE_BATCH_SIZE])
        storage_paths = storage_paths[DELETE_BATCH_SIZE:]
    return storage_paths


def do_delete_old_unclaimed_attachments(weeks_ago: int) -> None:
    """Delete unclaimed Attachment and ArchivedAttachment rows older than
    `weeks_ago` weeks, together with their backing files and thumbnails,
    removing files from storage in batches of DELETE_BATCH_SIZE."""
    old_unclaimed_attachments, old_unclaimed_archived_attachments = get_old_unclaimed_attachments(
        weeks_ago
    )

    # An attachment may be removed from Attachments and
    # ArchiveAttachments in the same run; prevent warnings from the
    # backing store by only removing it from there once.
    already_removed = set()
    storage_paths: list[str] = []
    for attachment in old_unclaimed_attachments:
        storage_paths.extend(_storage_paths_for(attachment.path_id))
        already_removed.add(attachment.path_id)
        attachment.delete()
        storage_paths = _flush_deletes(storage_paths)

    for archived_attachment in old_unclaimed_archived_attachments:
        if archived_attachment.path_id not in already_removed:
            storage_paths.extend(_storage_paths_for(archived_attachment.path_id))
        archived_attachment.delete()
        storage_paths = _flush_deletes(storage_paths)

    # Flush any final partial batch.
    if storage_paths:
        delete_message_attachments(storage_paths)
|
2022-04-14 23:43:26 +02:00
|
|
|
|
|
|
|
|
|
|
|
def check_attachment_reference_change(
    message: Message | ScheduledMessage, rendering_result: MessageRenderingResult
) -> AttachmentChangeResult:
    """Reconcile Attachment rows with an unsaved edit of `message`.

    For a message edit that has updated message.* in memory but not yet
    saved it to the database, adjusts Attachment data to correspond to
    the new content, claiming newly-referenced uploads and unlinking ones
    no longer referenced.
    """
    previous_path_ids = {a.path_id for a in message.attachment_set.all()}
    current_path_ids = set(rendering_result.potential_attachment_path_ids)

    # Nothing referenced changed; report whether attachments exist at all.
    if current_path_ids == previous_path_ids:
        return AttachmentChangeResult(bool(previous_path_ids), [])

    removed_path_ids = list(previous_path_ids - current_path_ids)
    if removed_path_ids:
        stale_rows = Attachment.objects.filter(
            path_id__in=removed_path_ids
        ).select_for_update()
        message.attachment_set.remove(*stale_rows)

    # Attachments owned by the sender that now have no referencing message.
    detached_attachments = [
        attachment.to_dict()
        for attachment in Attachment.objects.filter(
            path_id__in=removed_path_ids, messages__isnull=True, owner=message.sender
        )
    ]

    added_path_ids = list(current_path_ids - previous_path_ids)
    if added_path_ids:
        do_claim_attachments(message, added_path_ids)

    return AttachmentChangeResult(message.attachment_set.exists(), detached_attachments)
|