retention: Prevent deletion of partially-archived messages.
Previously, this code:
```python3
old_archived_attachments = ArchivedAttachment.objects.annotate(
has_other_messages=Exists(
Attachment.objects.filter(id=OuterRef("id"))
.exclude(messages=None)
.exclude(scheduled_messages=None)
)
).filter(messages=None, create_time__lt=delta_weeks_ago, has_other_messages=False)
```
...protected from removal any ArchivedAttachment objects where there
was an Attachment which had _both_ a message _and_ a scheduled
message, instead of _either_ a message _or_ a scheduled message.
Since files are removed from disk when the ArchivedAttachment rows are
deleted, this meant that if an upload was referenced in two messages,
and one was deleted, the file was permanently deleted when the
ArchivedMessage and ArchivedAttachment were cleaned up, despite being
still referenced in live Messages and Attachments.
Switch from `.exclude(messages=None).exclude(scheduled_messages=None)`
to `.exclude(messages=None, scheduled_messages=None)` which "OR"s
those conditions appropriately.
Pull the relevant test into its own file, and expand it significantly
to cover this, and other, corner cases.
2023-07-28 20:53:07 +02:00
|
|
|
import os
|
|
|
|
import re
|
|
|
|
from datetime import datetime, timedelta
|
2024-07-12 23:26:52 +02:00
|
|
|
from unittest.mock import patch
|
retention: Prevent deletion of partially-archived messages.
Previously, this code:
```python3
old_archived_attachments = ArchivedAttachment.objects.annotate(
has_other_messages=Exists(
Attachment.objects.filter(id=OuterRef("id"))
.exclude(messages=None)
.exclude(scheduled_messages=None)
)
).filter(messages=None, create_time__lt=delta_weeks_ago, has_other_messages=False)
```
...protected from removal any ArchivedAttachment objects where there
was an Attachment which had _both_ a message _and_ a scheduled
message, instead of _either_ a message _or_ a scheduled message.
Since files are removed from disk when the ArchivedAttachment rows are
deleted, this meant that if an upload was referenced in two messages,
and one was deleted, the file was permanently deleted when the
ArchivedMessage and ArchivedAttachment were cleaned up, despite being
still referenced in live Messages and Attachments.
Switch from `.exclude(messages=None).exclude(scheduled_messages=None)`
to `.exclude(messages=None, scheduled_messages=None)` which "OR"s
those conditions appropriately.
Pull the relevant test into its own file, and expand it significantly
to cover this, and other, corner cases.
2023-07-28 20:53:07 +02:00
|
|
|
|
|
|
|
import time_machine
|
|
|
|
from django.conf import settings
|
|
|
|
from django.utils.timezone import now as timezone_now
|
|
|
|
|
|
|
|
from zerver.actions.message_delete import do_delete_messages
|
|
|
|
from zerver.actions.scheduled_messages import check_schedule_message, delete_scheduled_message
|
|
|
|
from zerver.actions.uploads import do_delete_old_unclaimed_attachments
|
|
|
|
from zerver.lib.retention import clean_archived_data
|
|
|
|
from zerver.lib.test_classes import UploadSerializeMixin, ZulipTestCase
|
2024-06-20 23:58:27 +02:00
|
|
|
from zerver.lib.test_helpers import get_test_image_file
|
2024-08-28 20:47:51 +02:00
|
|
|
from zerver.lib.thumbnail import ThumbnailFormat
|
2023-12-15 04:33:19 +01:00
|
|
|
from zerver.models import ArchivedAttachment, Attachment, Message, UserProfile
|
|
|
|
from zerver.models.clients import get_client
|
retention: Prevent deletion of partially-archived messages.
Previously, this code:
```python3
old_archived_attachments = ArchivedAttachment.objects.annotate(
has_other_messages=Exists(
Attachment.objects.filter(id=OuterRef("id"))
.exclude(messages=None)
.exclude(scheduled_messages=None)
)
).filter(messages=None, create_time__lt=delta_weeks_ago, has_other_messages=False)
```
...protected from removal any ArchivedAttachment objects where there
was an Attachment which had _both_ a message _and_ a scheduled
message, instead of _either_ a message _or_ a scheduled message.
Since files are removed from disk when the ArchivedAttachment rows are
deleted, this meant that if an upload was referenced in two messages,
and one was deleted, the file was permanently deleted when the
ArchivedMessage and ArchivedAttachment were cleaned up, despite being
still referenced in live Messages and Attachments.
Switch from `.exclude(messages=None).exclude(scheduled_messages=None)`
to `.exclude(messages=None, scheduled_messages=None)` which "OR"s
those conditions appropriately.
Pull the relevant test into its own file, and expand it significantly
to cover this, and other, corner cases.
2023-07-28 20:53:07 +02:00
|
|
|
|
|
|
|
|
|
|
|
class UnclaimedAttachmentTest(UploadSerializeMixin, ZulipTestCase):
|
|
|
|
def make_attachment(
    self, filename: str, when: datetime | None = None, uploader: UserProfile | None = None
) -> Attachment:
    """Upload a test fixture through the upload endpoint and return its row.

    Args:
        filename: Fixture name handed to ``get_test_image_file``.
        when: Moment the clock is frozen at for the duration of the upload;
            defaults to two weeks before the current time.
        uploader: User logged in to perform the upload; defaults to the
            "hamlet" example user.

    Returns:
        The ``Attachment`` whose ``path_id`` matches the URL reported by the
        upload endpoint.
    """
    when = timezone_now() - timedelta(weeks=2) if when is None else when
    uploader = self.example_user("hamlet") if uploader is None else uploader
    self.login_user(uploader)

    # Time stands still at `when` while the upload request is made.
    with time_machine.travel(when, tick=False):
        with get_test_image_file(filename) as fixture:
            post_result = self.client_post("/json/user_uploads", {"file": fixture})
            upload_info = self.assert_json_success(post_result)
        # What remains of the URL after removing "/user_uploads/" is used
        # as the path_id to look the row up.
        uploaded_path_id = re.sub(r"/user_uploads/", "", upload_info["url"])
        return Attachment.objects.get(path_id=uploaded_path_id)
|
|
|
|
|
|
|
|
def assert_exists(
    self,
    attachment: Attachment,
    *,
    has_file: bool,
    has_attachment: bool,
    has_archived_attachment: bool,
) -> None:
    """Assert which traces of ``attachment`` are currently present.

    Args:
        attachment: The upload being inspected; only its ``id`` and
            ``path_id`` are read.
        has_file: Whether a regular file is expected at ``path_id`` under
            ``settings.LOCAL_FILES_DIR``.
        has_attachment: Whether an ``Attachment`` row with this id is
            expected to exist.
        has_archived_attachment: Whether an ``ArchivedAttachment`` row with
            this id is expected to exist.
    """
    assert settings.LOCAL_FILES_DIR

    # File existence on disk
    on_disk_path = os.path.join(settings.LOCAL_FILES_DIR, attachment.path_id)
    self.assertEqual(os.path.isfile(on_disk_path), has_file)

    # Attachment row
    live_row_present = Attachment.objects.filter(id=attachment.id).exists()
    self.assertEqual(live_row_present, has_attachment)

    # ArchivedAttachment row
    archived_row_present = ArchivedAttachment.objects.filter(id=attachment.id).exists()
    self.assertEqual(archived_row_present, has_archived_attachment)
|
|
|
|
|
2024-06-20 23:58:27 +02:00
|
|
|
def test_delete_unused_thumbnails(self) -> None:
    # An unclaimed image upload and its thumbnail directory survive a grace
    # period longer than the upload's age, and are both removed by a
    # shorter one.
    assert settings.LOCAL_FILES_DIR
    # NOTE(review): on-commit callbacks are executed here presumably so
    # that thumbnail generation runs before the checks below -- confirm.
    with self.captureOnCommitCallbacks(execute=True):
        image = self.make_attachment("img.png")
    thumbnail_dir = os.path.join(settings.LOCAL_FILES_DIR, "thumbnail", image.path_id)

    self.assert_exists(
        image, has_file=True, has_attachment=True, has_archived_attachment=False
    )

    # The thumbnail directory exists and is not empty
    self.assertTrue(os.path.isdir(thumbnail_dir))
    self.assertGreater(len(os.listdir(thumbnail_dir)), 0)

    # With 3 weeks of grace, the upload and its thumbnails are untouched
    do_delete_old_unclaimed_attachments(3)
    self.assert_exists(
        image, has_file=True, has_attachment=True, has_archived_attachment=False
    )
    self.assertTrue(os.path.isdir(thumbnail_dir))
    self.assertGreater(len(os.listdir(thumbnail_dir)), 0)

    # With 1 week of grace, the Attachment, its file on disk, and the
    # thumbnail directory are all gone
    do_delete_old_unclaimed_attachments(1)
    self.assert_exists(
        image, has_file=False, has_attachment=False, has_archived_attachment=False
    )
    self.assertFalse(os.path.exists(thumbnail_dir))
|
|
|
|
|
retention: Prevent deletion of partially-archived messages.
Previously, this code:
```python3
old_archived_attachments = ArchivedAttachment.objects.annotate(
has_other_messages=Exists(
Attachment.objects.filter(id=OuterRef("id"))
.exclude(messages=None)
.exclude(scheduled_messages=None)
)
).filter(messages=None, create_time__lt=delta_weeks_ago, has_other_messages=False)
```
...protected from removal any ArchivedAttachment objects where there
was an Attachment which had _both_ a message _and_ a scheduled
message, instead of _either_ a message _or_ a scheduled message.
Since files are removed from disk when the ArchivedAttachment rows are
deleted, this meant that if an upload was referenced in two messages,
and one was deleted, the file was permanently deleted when the
ArchivedMessage and ArchivedAttachment were cleaned up, despite being
still referenced in live Messages and Attachments.
Switch from `.exclude(messages=None).exclude(scheduled_messages=None)`
to `.exclude(messages=None, scheduled_messages=None)` which "OR"s
those conditions appropriately.
Pull the relevant test into its own file, and expand it significantly
to cover this, and other, corner cases.
2023-07-28 20:53:07 +02:00
|
|
|
def test_delete_unused_upload(self) -> None:
    # An upload that nothing references is only purged once the grace
    # period is shorter than the upload's age.
    orphan = self.make_attachment("text.txt")
    self.assert_exists(
        orphan, has_file=True, has_attachment=True, has_archived_attachment=False
    )

    # 3 weeks of grace: the upload is left alone
    do_delete_old_unclaimed_attachments(3)
    self.assert_exists(
        orphan, has_file=True, has_attachment=True, has_archived_attachment=False
    )

    # 1 week of grace: both the Attachment row and the file on disk go away
    do_delete_old_unclaimed_attachments(1)
    self.assert_exists(
        orphan, has_file=False, has_attachment=False, has_archived_attachment=False
    )
|
|
|
|
|
|
|
|
def test_delete_used_upload(self) -> None:
    # An upload linked from a sent message is never treated as unclaimed.
    sender = self.example_user("hamlet")
    upload = self.make_attachment("text.txt")

    # Send a message that links to the upload
    self.subscribe(sender, "Denmark")
    content = f"Some files here ...[zulip.txt](http://{sender.realm.host}/user_uploads/{upload.path_id})"
    self.send_stream_message(sender, "Denmark", content, "test")

    # The upload is claimed by that message, so cleanup leaves it in place
    do_delete_old_unclaimed_attachments(1)
    self.assert_exists(
        upload, has_file=True, has_attachment=True, has_archived_attachment=False
    )
|
|
|
|
|
|
|
|
def test_delete_upload_archived_message(self) -> None:
    # Follows an upload whose only referencing message is deleted, then
    # purged from the archive, then cleaned up as unclaimed.
    sender = self.example_user("hamlet")
    upload = self.make_attachment("text.txt")

    # Send a message that links to the upload
    self.subscribe(sender, "Denmark")
    content = f"Some files here ...[zulip.txt](http://{sender.realm.host}/user_uploads/{upload.path_id})"
    sent_id = self.send_stream_message(sender, "Denmark", content, "test")

    # Deleting the message turns the Attachment into an ArchivedAttachment;
    # the file itself stays on disk
    sent_message = Message.objects.get(id=sent_id)
    do_delete_messages(sender.realm, [sent_message], acting_user=None)
    self.assert_exists(
        upload, has_file=True, has_attachment=False, has_archived_attachment=True
    )

    # Unclaimed-attachment cleanup keeps the file, because an
    # ArchivedAttachment still refers to it
    do_delete_old_unclaimed_attachments(1)
    self.assert_exists(
        upload, has_file=True, has_attachment=False, has_archived_attachment=True
    )

    # Purge the ArchivedMessage
    with self.settings(ARCHIVED_DATA_VACUUMING_DELAY_DAYS=0):
        clean_archived_data()

    # What is left is an ArchivedAttachment that nothing claims
    self.assert_exists(
        upload, has_file=True, has_attachment=False, has_archived_attachment=True
    )

    # A further cleanup pass now removes it completely
    do_delete_old_unclaimed_attachments(1)
    self.assert_exists(
        upload, has_file=False, has_attachment=False, has_archived_attachment=False
    )
|
|
|
|
|
|
|
|
def test_delete_one_message(self) -> None:
    # An upload linked from two messages must stay on disk while either
    # message is still live, even after the other one has been deleted
    # and purged from the archive.
    sender = self.example_user("hamlet")
    upload = self.make_attachment("text.txt")

    # Send two messages that both link to the upload
    self.subscribe(sender, "Denmark")
    content = f"Some files here ...[zulip.txt](http://{sender.realm.host}/user_uploads/{upload.path_id})"
    first_id = self.send_stream_message(sender, "Denmark", content, "test")
    second_id = self.send_stream_message(sender, "Denmark", content, "test")

    # Delete the first message; afterwards there is both an Attachment and
    # an ArchivedAttachment, each associated with a message
    first_message = Message.objects.get(id=first_id)
    do_delete_messages(sender.realm, [first_message], acting_user=None)
    self.assert_exists(
        upload, has_file=True, has_attachment=True, has_archived_attachment=True
    )

    # Unclaimed-attachment cleanup keeps the file: the Attachment has a
    # Message and the ArchivedAttachment has an ArchivedMessage
    do_delete_old_unclaimed_attachments(1)
    self.assert_exists(
        upload, has_file=True, has_attachment=True, has_archived_attachment=True
    )

    # Purging the ArchivedMessage touches neither the Attachment nor the
    # ArchivedAttachment
    with self.settings(ARCHIVED_DATA_VACUUMING_DELAY_DAYS=0):
        clean_archived_data()
    self.assert_exists(
        upload, has_file=True, has_attachment=True, has_archived_attachment=True
    )

    # Cleanup is still a no-op, since the existing Attachment protects the
    # ArchivedAttachment
    do_delete_old_unclaimed_attachments(1)
    self.assert_exists(
        upload, has_file=True, has_attachment=True, has_archived_attachment=True
    )

    # Once the remaining message is deleted, only the ArchivedAttachment
    # is left
    second_message = Message.objects.get(id=second_id)
    do_delete_messages(sender.realm, [second_message], acting_user=None)
    self.assert_exists(
        upload, has_file=True, has_attachment=False, has_archived_attachment=True
    )

    # Purging the archived message and then cleaning up unclaimed
    # attachments finally removes everything
    with self.settings(ARCHIVED_DATA_VACUUMING_DELAY_DAYS=0):
        clean_archived_data()
    do_delete_old_unclaimed_attachments(1)
    self.assert_exists(
        upload, has_file=False, has_attachment=False, has_archived_attachment=False
    )
|
|
|
|
|
|
|
|
def test_delete_with_scheduled_messages(self) -> None:
    # An upload referenced only by a scheduled message is protected until
    # that scheduled message is deleted.
    sender = self.example_user("hamlet")
    upload = self.make_attachment("text.txt")

    # Schedule a message a year out that links to the upload
    self.subscribe(sender, "Denmark")
    content = f"Some files here ...[zulip.txt](http://{sender.realm.host}/user_uploads/{upload.path_id})"
    sending_client = get_client("website")
    recipient_ids = [self.get_stream_id("Denmark")]
    deliver_at = timezone_now() + timedelta(days=365)
    scheduled_id = check_schedule_message(
        sender,
        sending_client,
        "stream",
        recipient_ids,
        "Test topic",
        content,
        deliver_at,
        sender.realm,
    )
    self.assert_exists(
        upload, has_file=True, has_attachment=True, has_archived_attachment=False
    )

    # While the ScheduledMessage exists, cleanup does not remove the upload
    do_delete_old_unclaimed_attachments(1)
    self.assert_exists(
        upload, has_file=True, has_attachment=True, has_archived_attachment=False
    )

    # Deleting the ScheduledMessage does not itself remove anything; it
    # just leaves the upload without a referent
    delete_scheduled_message(sender, scheduled_id)
    self.assert_exists(
        upload, has_file=True, has_attachment=True, has_archived_attachment=False
    )

    # With nothing referring to it, the next cleanup removes it
    do_delete_old_unclaimed_attachments(1)
    self.assert_exists(
        upload, has_file=False, has_attachment=False, has_archived_attachment=False
    )
|
|
|
|
|
|
|
|
def test_delete_with_scheduled_message_and_archive(self) -> None:
    # An upload referenced by both a scheduled message and a sent message:
    # the sent message is deleted and purged first, and the scheduled
    # message keeps protecting the upload until it too is deleted.
    sender = self.example_user("hamlet")
    upload = self.make_attachment("text.txt")

    # Schedule one message linking to the upload, and send another now
    self.subscribe(sender, "Denmark")
    content = f"Some files here ...[zulip.txt](http://{sender.realm.host}/user_uploads/{upload.path_id})"
    sending_client = get_client("website")
    recipient_ids = [self.get_stream_id("Denmark")]
    deliver_at = timezone_now() + timedelta(days=365)
    scheduled_id = check_schedule_message(
        sender,
        sending_client,
        "stream",
        recipient_ids,
        "Test topic",
        content,
        deliver_at,
        sender.realm,
    )
    sent_id = self.send_stream_message(sender, "Denmark", content, "test")
    self.assert_exists(
        upload, has_file=True, has_attachment=True, has_archived_attachment=False
    )

    # After deleting the sent message there is an Attachment tied to the
    # scheduled message, plus an ArchivedAttachment tied to the archived
    # message
    sent_message = Message.objects.get(id=sent_id)
    do_delete_messages(sender.realm, [sent_message], acting_user=None)
    self.assert_exists(
        upload, has_file=True, has_attachment=True, has_archived_attachment=True
    )

    # Purging the archived message leaves the ArchivedAttachment dangling,
    # while the Attachment remains protected
    with self.settings(ARCHIVED_DATA_VACUUMING_DELAY_DAYS=0):
        clean_archived_data()
    self.assert_exists(
        upload, has_file=True, has_attachment=True, has_archived_attachment=True
    )

    # Cleanup deletes nothing: the ArchivedAttachment is protected by the
    # Attachment, which is in turn protected by the scheduled message
    do_delete_old_unclaimed_attachments(1)
    self.assert_exists(
        upload, has_file=True, has_attachment=True, has_archived_attachment=True
    )

    # Deleting the ScheduledMessage removes the last referent, but does
    # not by itself remove any rows or the file
    delete_scheduled_message(sender, scheduled_id)
    self.assert_exists(
        upload, has_file=True, has_attachment=True, has_archived_attachment=True
    )

    # With nothing referring to it, the next cleanup removes everything
    do_delete_old_unclaimed_attachments(1)
    self.assert_exists(
        upload, has_file=False, has_attachment=False, has_archived_attachment=False
    )
|
|
|
|
|
|
|
|
def test_delete_with_unscheduled_message_and_archive(self) -> None:
    # Subtly different from the test above: here the scheduled message
    # is deleted first.  That is the only way to end up with an
    # Attachment that has no referents alongside an ArchivedAttachment
    # that still has references; normally the process of archiving
    # prunes Attachments which have no references.
    hamlet = self.example_user("hamlet")
    attachment = self.make_attachment("text.txt")

    # The three states asserted over the course of this test.
    live_only = dict(has_file=True, has_attachment=True, has_archived_attachment=False)
    live_and_archived = dict(has_file=True, has_attachment=True, has_archived_attachment=True)
    fully_removed = dict(has_file=False, has_attachment=False, has_archived_attachment=False)

    # Reference the upload from a message scheduled a year out, and
    # from a message which is sent immediately.
    self.subscribe(hamlet, "Denmark")
    realm = hamlet.realm
    body = f"Some files here ...[zulip.txt](http://{realm.host}/user_uploads/{attachment.path_id})"
    client = get_client("website")
    stream_ids = [self.get_stream_id("Denmark")]
    deliver_at = timezone_now() + timedelta(days=365)
    scheduled_message_id = check_schedule_message(
        hamlet,
        client,
        "stream",
        stream_ids,
        "Test topic",
        body,
        deliver_at,
        realm,
    )
    sent_message_id = self.send_stream_message(hamlet, "Denmark", body, "test")
    self.assert_exists(attachment, **live_only)

    # Delete the sent message, then unschedule the scheduled one, all
    # before the ArchivedMessages are expired.
    sent_message = Message.objects.get(id=sent_message_id)
    do_delete_messages(realm, [sent_message], acting_user=None)
    delete_scheduled_message(hamlet, scheduled_message_id)
    self.assert_exists(attachment, **live_and_archived)

    # Expiring unclaimed attachments removes nothing: the unreferenced
    # Attachment is protected by the ArchivedAttachment, which archived
    # messages still reference.
    do_delete_old_unclaimed_attachments(1)
    self.assert_exists(attachment, **live_and_archived)

    # Expiring the archived messages leaves both the Attachment and the
    # ArchivedAttachment dangling, neither having referents.
    with self.settings(ARCHIVED_DATA_VACUUMING_DELAY_DAYS=0):
        clean_archived_data()
    self.assert_exists(attachment, **live_and_archived)

    # With no referents in either place, it is now a target for removal.
    do_delete_old_unclaimed_attachments(1)
    self.assert_exists(attachment, **fully_removed)
|
2024-07-12 23:26:52 +02:00
|
|
|
|
|
|
|
def test_delete_batch_size(self) -> None:
    # Create 3 image attachments; each one has 2 files because of the
    # thumbnail, for 6 files in total.
    thumbnail_format = ThumbnailFormat("webp", 100, 75, animated=False)
    with self.thumbnail_formats(thumbnail_format), self.captureOnCommitCallbacks(execute=True):
        attachments = [self.make_attachment("img.png") for _ in range(3)]

    with (
        patch("zerver.actions.uploads.DELETE_BATCH_SIZE", 5),
        patch("zerver.actions.uploads.delete_message_attachments") as delete_mock,
    ):
        do_delete_old_unclaimed_attachments(1)

    # All 6 files should be deleted, across two different calls with
    # 5- and 1-element lists.  Since each image attachment is two
    # files, this means that one thumbnail and its original are split
    # across batches.
    self.assertEqual(delete_mock.call_count, 2)
    first_batch, second_batch = (call.args[0] for call in delete_mock.call_args_list)
    self.assert_length(first_batch, 5)
    self.assert_length(second_batch, 1)

    # Between them, the batches must name every original and thumbnail.
    deleted_paths = {*first_batch, *second_batch}
    expected_paths = {
        path
        for attachment in attachments
        for path in (
            attachment.path_id,
            f"thumbnail/{attachment.path_id}/{thumbnail_format!s}",
        )
    }
    self.assertEqual(deleted_paths, expected_paths)
|
2024-07-12 23:26:52 +02:00
|
|
|
|
|
|
|
def test_delete_batch_size_archived(self) -> None:
    # Check that batched deletion covers plain Attachments and dangling
    # ArchivedAttachments together.
    hamlet = self.example_user("hamlet")
    attachments = [self.make_attachment("text.txt") for _ in range(20)]

    # Send a message referencing 10 of the 20 attachments.  Each
    # reference is a complete Markdown link, closing parenthesis
    # included, in the same form the other tests in this file use.
    self.subscribe(hamlet, "Denmark")
    body = "Some files here\n" + "\n".join(
        f"[a](http://{hamlet.realm.host}/user_uploads/{attachment.path_id})"
        for attachment in attachments[:10]
    )
    message_id = self.send_stream_message(hamlet, "Denmark", body, "test")

    # Delete and purge the message, leaving its ArchivedAttachments dangling
    do_delete_messages(hamlet.realm, [Message.objects.get(id=message_id)], acting_user=None)
    with self.settings(ARCHIVED_DATA_VACUUMING_DELAY_DAYS=0):
        clean_archived_data()

    # Removing unclaimed attachments now cleans them all out
    with (
        patch("zerver.actions.uploads.DELETE_BATCH_SIZE", 6),
        patch("zerver.actions.uploads.delete_message_attachments") as delete_mock,
    ):
        do_delete_old_unclaimed_attachments(1)

    # We expect all of the 20 attachments (10 of which are
    # ArchivedAttachments) to be deleted, across four different
    # calls: 6, 6, 6, 2
    self.assertEqual(delete_mock.call_count, 4)
    for call, expected_size in zip(delete_mock.call_args_list, (6, 6, 6, 2)):
        self.assert_length(call.args[0], expected_size)

    deleted_path_ids = {elem for call in delete_mock.call_args_list for elem in call.args[0]}
    self.assertEqual(
        deleted_path_ids,
        {attachment.path_id for attachment in attachments},
    )
|