upload: Batch deleting old attachments.

delete_message_attachments() is faster than calling
delete_message_attachment() one-by-one.
This commit is contained in:
Alex Vandiver 2024-07-12 21:26:52 +00:00 committed by Tim Abbott
parent 502b9a76a9
commit 5f25fae0fa
2 changed files with 75 additions and 3 deletions

View File

@ -3,7 +3,7 @@ from typing import Any
from zerver.lib.attachments import get_old_unclaimed_attachments, validate_attachment_request
from zerver.lib.markdown import MessageRenderingResult
from zerver.lib.upload import claim_attachment, delete_message_attachment
from zerver.lib.upload import claim_attachment, delete_message_attachments
from zerver.models import Attachment, Message, ScheduledMessage, Stream, UserProfile
from zerver.tornado.django_api import send_event_on_commit
@ -62,6 +62,9 @@ def do_claim_attachments(
return claimed
DELETE_BATCH_SIZE = 1000
def do_delete_old_unclaimed_attachments(weeks_ago: int) -> None:
old_unclaimed_attachments, old_unclaimed_archived_attachments = get_old_unclaimed_attachments(
weeks_ago
@ -71,14 +74,23 @@ def do_delete_old_unclaimed_attachments(weeks_ago: int) -> None:
# ArchiveAttachments in the same run; prevent warnings from the
# backing store by only removing it from there once.
already_removed = set()
storage_paths = []
for attachment in old_unclaimed_attachments:
delete_message_attachment(attachment.path_id)
storage_paths.append(attachment.path_id)
already_removed.add(attachment.path_id)
attachment.delete()
if len(storage_paths) >= DELETE_BATCH_SIZE:
delete_message_attachments(storage_paths)
storage_paths = []
for archived_attachment in old_unclaimed_archived_attachments:
if archived_attachment.path_id not in already_removed:
delete_message_attachment(archived_attachment.path_id)
storage_paths.append(archived_attachment.path_id)
archived_attachment.delete()
if len(storage_paths) >= DELETE_BATCH_SIZE:
delete_message_attachments(storage_paths)
storage_paths = []
if storage_paths:
delete_message_attachments(storage_paths)
def check_attachment_reference_change(

View File

@ -2,6 +2,7 @@ import os
import re
from datetime import datetime, timedelta
from io import StringIO
from unittest.mock import patch
import time_machine
from django.conf import settings
@ -337,3 +338,62 @@ class UnclaimedAttachmentTest(UploadSerializeMixin, ZulipTestCase):
self.assert_exists(
attachment, has_file=False, has_attachment=False, has_archived_attachment=False
)
def test_delete_batch_size(self) -> None:
attachments = [self.make_attachment("unused.txt") for _ in range(10)]
with (
patch("zerver.actions.uploads.DELETE_BATCH_SIZE", 6),
patch("zerver.actions.uploads.delete_message_attachments") as delete_mock,
):
do_delete_old_unclaimed_attachments(1)
# We expect all of the 10 attachments to be deleted,
# across two different calls of 6- and 4-element lists
self.assertEqual(delete_mock.call_count, 2)
self.assert_length(delete_mock.call_args_list[0][0][0], 6)
self.assert_length(delete_mock.call_args_list[1][0][0], 4)
self.assertEqual(
set(delete_mock.call_args_list[0][0][0] + delete_mock.call_args_list[1][0][0]),
{attachment.path_id for attachment in attachments},
)
def test_delete_batch_size_archived(self) -> None:
hamlet = self.example_user("hamlet")
attachments = [self.make_attachment("unused.txt") for _ in range(20)]
# Send message referencing 10/20 of those attachments
self.subscribe(hamlet, "Denmark")
body = "Some files here\n" + "\n".join(
f"[a](http://{hamlet.realm.host}/user_uploads/{attachment.path_id}"
for attachment in attachments[:10]
)
message_id = self.send_stream_message(hamlet, "Denmark", body, "test")
# Delete and purge the message, leaving both the ArchivedAttachments dangling
do_delete_messages(hamlet.realm, [Message.objects.get(id=message_id)])
with self.settings(ARCHIVED_DATA_VACUUMING_DELAY_DAYS=0):
clean_archived_data()
# Removing unclaimed attachments now cleans them all out
with (
patch("zerver.actions.uploads.DELETE_BATCH_SIZE", 6),
patch("zerver.actions.uploads.delete_message_attachments") as delete_mock,
):
do_delete_old_unclaimed_attachments(1)
# We expect all of the 20 attachments (10 of which are
# ArchivedAttachments) to be deleted, across four different
# calls: 6, 6, 6, 2
self.assertEqual(delete_mock.call_count, 4)
self.assert_length(delete_mock.call_args_list[0][0][0], 6)
self.assert_length(delete_mock.call_args_list[1][0][0], 6)
self.assert_length(delete_mock.call_args_list[2][0][0], 6)
self.assert_length(delete_mock.call_args_list[3][0][0], 2)
deleted_path_ids = {elem for call in delete_mock.call_args_list for elem in call[0][0]}
self.assertEqual(
deleted_path_ids,
{attachment.path_id for attachment in attachments},
)