retention: Optimize fetching of realms and streams with retention policy.

This commit is contained in:
Mateusz Mandera 2020-05-07 14:24:45 +02:00 committed by Tim Abbott
parent 50d8d61d3c
commit 812ac4714f
2 changed files with 136 additions and 17 deletions

View File

@ -12,7 +12,7 @@ from zerver.models import (Message, UserMessage, ArchivedUserMessage, Realm,
SubMessage, ArchivedSubMessage, Recipient, Stream, ArchiveTransaction,
get_user_including_cross_realm)
from typing import Any, Dict, List, Optional
from typing import Any, Dict, List, Optional, Tuple
import logging
@ -303,15 +303,7 @@ def archive_personal_and_huddle_messages(realm: Realm, chunk_size: int=MESSAGE_B
message_count = move_expired_personal_and_huddle_messages_to_archive(realm, chunk_size)
logger.info("Done. Archived %s messages", message_count)
def archive_stream_messages(realm: Realm, chunk_size: int=MESSAGE_BATCH_SIZE) -> None:
# We don't archive if the stream has message_retention_days set to -1,
# or if neither the stream nor the realm have a retention policy.
query = Stream.objects.select_related("recipient").filter(
realm_id=realm.id).exclude(message_retention_days=-1)
if not realm.message_retention_days:
query = query.exclude(message_retention_days__isnull=True)
streams = list(query)
def archive_stream_messages(realm: Realm, streams: List[Stream], chunk_size: int=MESSAGE_BATCH_SIZE) -> None:
if not streams:
return
@ -337,18 +329,57 @@ def archive_stream_messages(realm: Realm, chunk_size: int=MESSAGE_BATCH_SIZE) ->
def archive_messages(chunk_size: int=MESSAGE_BATCH_SIZE) -> None:
logger.info("Starting the archiving process with chunk_size %s", chunk_size)
# We exclude SYSTEM_BOT_REALM here because the logic for archiving
# private messages and huddles isn't designed to correctly handle
# that realm. In practice, excluding it has no effect, because
# that realm is expected to always have message_retention_days=None.
for realm in Realm.objects.exclude(string_id=settings.SYSTEM_BOT_REALM):
archive_stream_messages(realm, chunk_size)
for realm, streams in get_realms_and_streams_for_archiving():
archive_stream_messages(realm, streams, chunk_size)
if realm.message_retention_days:
archive_personal_and_huddle_messages(realm, chunk_size)
# Messages have been archived for the realm, now we can clean up attachments:
delete_expired_attachments(realm)
def get_realms_and_streams_for_archiving() -> List[Tuple[Realm, List[Stream]]]:
    """
    Build a list of (realm, streams_of_the_realm) tuples, where each realm
    is a Realm on which the archiving functions need to be called, and
    streams_of_the_realm is the list of that realm's streams to pass to
    archive_stream_messages.

    The point of assembling this in bulk is performance: on servers with
    thousands of realms it is important not to run per-realm queries.
    """
    realms_by_id: Dict[int, Realm] = {}
    streams_by_realm_id: Dict[int, List[Stream]] = {}

    # Every realm with a realm-wide retention policy qualifies for archiving,
    # even if it turns out to have no eligible streams.
    for realm in Realm.objects.filter(message_retention_days__isnull=False):
        realms_by_id[realm.id] = realm
        streams_by_realm_id[realm.id] = []

    # Now collect all streams that require archiving.
    # First category: streams in retention-enabled realms that don't have
    # retention explicitly disabled (through the value -1).
    realm_policy_streams = Stream.objects.exclude(message_retention_days=-1) \
        .filter(realm__message_retention_days__isnull=False) \
        .select_related('realm', 'recipient')

    # Second category: streams in realms without a realm-wide retention
    # policy, but with their own stream-specific policy enabled.
    stream_policy_streams = Stream.objects.filter(realm__message_retention_days__isnull=True) \
        .exclude(message_retention_days__isnull=True) \
        .exclude(message_retention_days=-1) \
        .select_related('realm', 'recipient')

    for stream in realm_policy_streams.union(stream_policy_streams):
        realm = stream.realm
        realms_by_id[realm.id] = realm
        # Realms reached only via the second category won't have an entry yet.
        streams_by_realm_id.setdefault(realm.id, []).append(stream)

    return [(realms_by_id[realm_id], streams_by_realm_id[realm_id])
            for realm_id in realms_by_id]
def move_messages_to_archive(message_ids: List[int], chunk_size: int=MESSAGE_BATCH_SIZE) -> None:
query = SQL("""
INSERT INTO zerver_archivedmessage ({dst_fields}, archive_transaction_id)

View File

@ -1,5 +1,5 @@
from datetime import datetime, timedelta
from typing import Any, Dict, List, Optional
from typing import Any, Dict, List, Optional, Tuple
from unittest import mock
from django.conf import settings
@ -18,6 +18,7 @@ from zerver.lib.retention import (
move_messages_to_archive,
restore_all_data_from_archive,
clean_archived_data,
get_realms_and_streams_for_archiving,
)
from zerver.tornado.event_queue import send_event
@ -745,6 +746,93 @@ class TestCleaningArchive(ArchiveMessagesTestingBase):
for message in ArchivedMessage.objects.all():
self.assertEqual(message.archive_transaction_id, remaining_transactions[0].id)
class TestGetRealmAndStreamsForArchiving(ZulipTestCase):
    def fix_ordering_of_result(self, result: List[Tuple[Realm, List[Stream]]]) -> None:
        """
        Helper that puts the structure returned by
        get_realms_and_streams_for_archiving into a consistent order, so
        that two such structures can be compared with assertEqual.
        """
        # Order the (realm, streams) tuples by realm id.
        result.sort(key=lambda entry: entry[0].id)
        # Order each realm's stream list by stream id.
        for _realm, streams in result:
            streams.sort(key=lambda stream: stream.id)

    def simple_get_realms_and_streams_for_archiving(self) -> List[Tuple[Realm, List[Stream]]]:
        """
        A straightforward, unoptimized reference implementation of the
        function under test. Comparing its output with the output of the
        optimized implementation gives additional verification of correctness.
        """
        results: List[Tuple[Realm, List[Stream]]] = []
        for realm in Realm.objects.all():
            if realm.message_retention_days is not None:
                # Realm-wide policy: every stream that doesn't opt out (-1) qualifies.
                eligible = Stream.objects.filter(realm=realm).exclude(message_retention_days=-1)
                results.append((realm, list(eligible)))
            else:
                # No realm-wide policy: only streams with their own policy qualify,
                # and the realm is included only if it has at least one such stream.
                eligible = Stream.objects.filter(realm=realm) \
                    .exclude(message_retention_days__isnull=True) \
                    .exclude(message_retention_days=-1)
                if eligible.exists():
                    results.append((realm, list(eligible)))
        return results

    def test_get_realms_and_streams_for_archiving(self) -> None:
        zulip_realm = get_realm("zulip")
        zulip_realm.message_retention_days = 10
        zulip_realm.save()

        verona = get_stream("Verona", zulip_realm)
        verona.message_retention_days = -1  # Block archiving for this stream
        verona.save()
        denmark = get_stream("Denmark", zulip_realm)
        denmark.message_retention_days = 1
        denmark.save()

        zephyr_realm = get_realm("zephyr")
        zephyr_realm.message_retention_days = None
        zephyr_realm.save()
        self.make_stream("normal stream", realm=zephyr_realm)

        archiving_blocked_zephyr_stream = self.make_stream("no archiving", realm=zephyr_realm)
        archiving_blocked_zephyr_stream.message_retention_days = -1
        archiving_blocked_zephyr_stream.save()

        archiving_enabled_zephyr_stream = self.make_stream("with archiving", realm=zephyr_realm)
        archiving_enabled_zephyr_stream.message_retention_days = 1
        archiving_enabled_zephyr_stream.save()

        Realm.objects.create(string_id="no_archiving", invite_required=False, message_retention_days=None)
        empty_realm_with_archiving = Realm.objects.create(string_id="with_archiving", invite_required=False,
                                                          message_retention_days=1)

        # Construct what get_realms_and_streams_for_archiving should return.
        # One nuisance is that the ordering of the elements in the result is
        # not deterministic, so both the expected and actual structures are
        # normalized with the ordering helper before being compared. This
        # wouldn't be necessary if python had a true "unordered list" data
        # structure; a set doesn't do the job, because it requires elements
        # to be hashable.
        expected_result = [
            (zulip_realm, list(Stream.objects.filter(realm=zulip_realm).exclude(id=verona.id))),
            (zephyr_realm, [archiving_enabled_zephyr_stream]),
            (empty_realm_with_archiving, []),
        ]
        self.fix_ordering_of_result(expected_result)

        reference_result = self.simple_get_realms_and_streams_for_archiving()
        self.fix_ordering_of_result(reference_result)

        result = get_realms_and_streams_for_archiving()
        self.fix_ordering_of_result(result)
        self.assert_length(result, len(expected_result))
        self.assertEqual(result, expected_result)

        self.assert_length(result, len(reference_result))
        self.assertEqual(result, reference_result)
class TestDoDeleteMessages(ZulipTestCase):
def test_do_delete_messages_multiple(self) -> None:
realm = get_realm("zulip")