From 3518d317972ed437a44dcf7d548223e136892a4b Mon Sep 17 00:00:00 2001 From: Alex Vandiver Date: Wed, 30 Aug 2023 19:38:59 +0000 Subject: [PATCH] migrations: Add indexes with realm_id. This is designed to help PostgreSQL have better specificity and locality in its indexes. Subsequent commits will adjust the code to make sure that we use these indexes rather than the `realm_id`-less versions. We do not add a `realm_id` variation to the full-text index, since it is a GIN index; multi-column GIN indexes are not terribly performant, require the `btree_gin` extension for `int` types (which requires superuser privileges on PostgreSQL 12 and earlier), and cannot be consistently added concurrently on running instances. After all indexes have been made, we also run `CREATE STATISTICS` in order to give PostgreSQL the opportunity to realize that recipient and sender are highly correlated with message realm, allowing it to estimate that `(realm_id, recipient_id)` is likely as specific as matching a given `recipient_id`, instead of as likely as matching `realm_id` times matching a `recipient_id`. Finally, those statistics must be filled by `ANALYZE zerver_message`, which is run last. --- .../0472_add_message_realm_id_indexes.py | 102 ++++++++++++++++++ zerver/models.py | 75 +++++++++++++ zerver/tests/test_migrations.py | 2 + 3 files changed, 179 insertions(+) create mode 100644 zerver/migrations/0472_add_message_realm_id_indexes.py diff --git a/zerver/migrations/0472_add_message_realm_id_indexes.py b/zerver/migrations/0472_add_message_realm_id_indexes.py new file mode 100644 index 0000000000..485a26ef49 --- /dev/null +++ b/zerver/migrations/0472_add_message_realm_id_indexes.py @@ -0,0 +1,102 @@ +import django.db.models.functions.text +from django.contrib.postgres.operations import AddIndexConcurrently +from django.db import migrations, models + + +class Migration(migrations.Migration): + atomic = False + + dependencies = [ + ("zerver", "0471_alter_realm_create_multiuse_invite_group"), + ] + + # The non-realm_id-prefixed versions will be removed in the next migration. + operations = [ + AddIndexConcurrently( + model_name="message", + index=models.Index( + models.F("realm_id"), + models.F("recipient_id"), + models.F("id"), + name="zerver_message_realm_recipient_id", + ), + ), + AddIndexConcurrently( + model_name="message", + index=models.Index( + models.F("realm_id"), + models.F("recipient_id"), + models.F("date_sent"), + name="zerver_message_realm_recipient_date_sent", + ), + ), + AddIndexConcurrently( + model_name="message", + index=models.Index( + models.F("realm_id"), + models.F("sender_id"), + models.F("recipient_id"), + name="zerver_message_realm_sender_recipient", + ), + ), + AddIndexConcurrently( + model_name="message", + index=models.Index( + models.F("realm_id"), models.F("date_sent"), name="zerver_message_realm_date_sent" + ), + ), + AddIndexConcurrently( + model_name="message", + index=models.Index( + models.F("realm_id"), + django.db.models.functions.text.Upper("subject"), + models.OrderBy(models.F("id"), descending=True, nulls_last=True), + name="zerver_message_realm_upper_subject", + ), + ), + AddIndexConcurrently( + model_name="message", + index=models.Index( + models.F("realm_id"), + models.F("recipient_id"), + django.db.models.functions.text.Upper("subject"), + models.OrderBy(models.F("id"), descending=True, nulls_last=True), + name="zerver_message_realm_recipient_upper_subject", + ), + ), + AddIndexConcurrently( + model_name="message", + index=models.Index( + models.F("realm_id"), + models.F("recipient_id"), + models.F("subject"), + models.OrderBy(models.F("id"), descending=True, nulls_last=True), + name="zerver_message_realm_recipient_subject", + ), + ), + AddIndexConcurrently( + model_name="message", + index=models.Index( + models.F("realm_id"), + models.OrderBy(models.F("id"), descending=True, nulls_last=True), + name="zerver_message_realm_id", + ), + ), + AddIndexConcurrently( + model_name="scheduledmessage", + index=models.Index( + condition=models.Q(("delivered", False)), + fields=["realm_id", "sender", "delivery_type", "scheduled_timestamp"], + name="zerver_realm_unsent_scheduled_messages_by_user", + ), + ), + migrations.RunSQL( + sql="CREATE STATISTICS IF NOT EXISTS zerver_message_realm_recipient ON realm_id, recipient_id FROM zerver_message", + reverse_sql="DROP STATISTICS IF EXISTS zerver_message_realm_recipient", + ), + migrations.RunSQL( + sql="CREATE STATISTICS IF NOT EXISTS zerver_message_realm_sender ON realm_id, sender_id FROM zerver_message", + reverse_sql="DROP STATISTICS IF EXISTS zerver_message_realm_sender", + ), + migrations.RunSQL("ANALYZE zerver_message"), + ] diff --git a/zerver/models.py b/zerver/models.py index 621aa30298..4045be6a4b 100644 --- a/zerver/models.py +++ b/zerver/models.py @@ -3116,6 +3116,74 @@ class Message(AbstractMessage): F("id").desc(nulls_last=True), name="zerver_message_recipient_subject", ), + # Indexes prefixed with realm_id + models.Index( + # For moving messages between streams or marking + # streams as read. The "id" at the end makes it easy + # to scan the resulting messages in order, and perform + # batching. + "realm_id", + "recipient_id", + "id", + name="zerver_message_realm_recipient_id", + ), + models.Index( + # For generating digest emails and message archiving, + # which both group by stream. + "realm_id", + "recipient_id", + "date_sent", + name="zerver_message_realm_recipient_date_sent", + ), + models.Index( + # For exports, which want to limit both sender and + # receiver. The prefix of this index (realm_id, + # sender_id) can be used for scrubbing users and/or + # deleting users' messages. + "realm_id", + "sender_id", + "recipient_id", + name="zerver_message_realm_sender_recipient", + ), + models.Index( + # For analytics queries + "realm_id", + "date_sent", + name="zerver_message_realm_date_sent", + ), + models.Index( + # For users searching by topic (but not stream), which + # is done case-insensitively + "realm_id", + Upper("subject"), + F("id").desc(nulls_last=True), + name="zerver_message_realm_upper_subject", + ), + models.Index( + # Most stream/topic searches are case-insensitive by + # topic name (e.g. messages_for_topic). The "id" at + # the end makes it easy to scan the resulting messages + # in order, and perform batching. + "realm_id", + "recipient_id", + Upper("subject"), + F("id").desc(nulls_last=True), + name="zerver_message_realm_recipient_upper_subject", + ), + models.Index( + # Only used by already_sent_mirrored_message_id + "realm_id", + "recipient_id", + "subject", + F("id").desc(nulls_last=True), + name="zerver_message_realm_recipient_subject", + ), + models.Index( + # Only used by update_first_visible_message_id + "realm_id", + F("id").desc(nulls_last=True), + name="zerver_message_realm_id", + ), ] @@ -4408,6 +4476,13 @@ class ScheduledMessage(models.Model): delivered=False, ), ), + models.Index( + name="zerver_realm_unsent_scheduled_messages_by_user", + fields=["realm_id", "sender", "delivery_type", "scheduled_timestamp"], + condition=Q( + delivered=False, + ), + ), ] def __str__(self) -> str: diff --git a/zerver/tests/test_migrations.py b/zerver/tests/test_migrations.py index dbe21fad6b..e6057637a5 100644 --- a/zerver/tests/test_migrations.py +++ b/zerver/tests/test_migrations.py @@ -5,6 +5,7 @@ # https://www.caktusgroup.com/blog/2016/02/02/writing-unit-tests-django-migrations/ # to get a tutorial on the framework that inspired this feature. from datetime import datetime, timezone +from unittest import skip import orjson from django.db.migrations.state import StateApps @@ -28,6 +29,7 @@ from zerver.lib.test_helpers import use_db_models # been tested for a migration being merged. +@skip("Cannot be run because there is a non-atomic migration that has been merged after it") class ScheduledEmailData(MigrationsTestCase): migrate_from = "0467_rename_extradata_realmauditlog_extra_data_json" migrate_to = "0468_rename_followup_day_email_templates"