migrations: Add indexes with realm_id.

This is designed to help PostgreSQL have better specificity and
locality in its indexes.  Subsequent commits will adjust the code to
make sure that we use these indexes rather than the `realm_id`-less
versions.

We do not add a `realm_id` variation to the full-text index, since
it is a GIN index; multi-column GIN indexes are not terribly
performant, require the `btree_gin` extension for `int` types (which
requires superuser privileges on PostgreSQL 12 and earlier), and
cannot be consistently added concurrently on running instances.

After all indexes have been made, we also run `CREATE STATISTICS` in
order to give PostgreSQL the opportunity to realize that recipient and
sender are highly correlated with message realm, allowing it to
estimate that `(realm_id, recipient_id)` is likely as specific as
matching a given `recipient_id`, instead of as likely as matching
`realm_id` times matching a `recipient_id`.  Finally, those statistics
must be filled by `ANALYZE zerver_message`, which is run last.
This commit is contained in:
Alex Vandiver 2023-08-30 19:38:59 +00:00 committed by Tim Abbott
parent 067de6f948
commit 3518d31797
3 changed files with 179 additions and 0 deletions

View File

@ -0,0 +1,102 @@
import django.db.models.functions.text
from django.contrib.postgres.operations import AddIndexConcurrently
from django.db import migrations, models
class Migration(migrations.Migration):
atomic = False
dependencies = [
("zerver", "0471_alter_realm_create_multiuse_invite_group"),
]
# The non-realm_id-prefixed versions will be removed in the next migration.
operations = [
AddIndexConcurrently(
model_name="message",
index=models.Index(
models.F("realm_id"),
models.F("recipient_id"),
models.F("id"),
name="zerver_message_realm_recipient_id",
),
),
AddIndexConcurrently(
model_name="message",
index=models.Index(
models.F("realm_id"),
models.F("recipient_id"),
models.F("date_sent"),
name="zerver_message_realm_recipient_date_sent",
),
),
AddIndexConcurrently(
model_name="message",
index=models.Index(
models.F("realm_id"),
models.F("sender_id"),
models.F("recipient_id"),
name="zerver_message_realm_sender_recipient",
),
),
AddIndexConcurrently(
model_name="message",
index=models.Index(
models.F("realm_id"), models.F("date_sent"), name="zerver_message_realm_date_sent"
),
),
AddIndexConcurrently(
model_name="message",
index=models.Index(
models.F("realm_id"),
django.db.models.functions.text.Upper("subject"),
models.OrderBy(models.F("id"), descending=True, nulls_last=True),
name="zerver_message_realm_upper_subject",
),
),
AddIndexConcurrently(
model_name="message",
index=models.Index(
models.F("realm_id"),
models.F("recipient_id"),
django.db.models.functions.text.Upper("subject"),
models.OrderBy(models.F("id"), descending=True, nulls_last=True),
name="zerver_message_realm_recipient_upper_subject",
),
),
AddIndexConcurrently(
model_name="message",
index=models.Index(
models.F("realm_id"),
models.F("recipient_id"),
models.F("subject"),
models.OrderBy(models.F("id"), descending=True, nulls_last=True),
name="zerver_message_realm_recipient_subject",
),
),
AddIndexConcurrently(
model_name="message",
index=models.Index(
models.F("realm_id"),
models.OrderBy(models.F("id"), descending=True, nulls_last=True),
name="zerver_message_realm_id",
),
),
AddIndexConcurrently(
model_name="scheduledmessage",
index=models.Index(
condition=models.Q(("delivered", False)),
fields=["realm_id", "sender", "delivery_type", "scheduled_timestamp"],
name="zerver_realm_unsent_scheduled_messages_by_user",
),
),
migrations.RunSQL(
sql="CREATE STATISTICS IF NOT EXISTS zerver_message_realm_recipient ON realm_id, recipient_id FROM zerver_message",
reverse_sql="DROP STATISTICS IF EXISTS zerver_message_realm_recipient",
),
migrations.RunSQL(
sql="CREATE STATISTICS IF NOT EXISTS zerver_message_realm_sender ON realm_id, sender_id FROM zerver_message",
reverse_sql="DROP STATISTICS IF EXISTS zerver_message_realm_sender",
),
migrations.RunSQL("ANALYZE zerver_message"),
]

View File

@ -3116,6 +3116,74 @@ class Message(AbstractMessage):
F("id").desc(nulls_last=True), F("id").desc(nulls_last=True),
name="zerver_message_recipient_subject", name="zerver_message_recipient_subject",
), ),
# Indexes prefixed with realm_id
models.Index(
# For moving messages between streams or marking
# streams as read. The "id" at the end makes it easy
# to scan the resulting messages in order, and perform
# batching.
"realm_id",
"recipient_id",
"id",
name="zerver_message_realm_recipient_id",
),
models.Index(
# For generating digest emails and message archiving,
# which both group by stream.
"realm_id",
"recipient_id",
"date_sent",
name="zerver_message_realm_recipient_date_sent",
),
models.Index(
# For exports, which want to limit both sender and
# receiver. The prefix of this index (realm_id,
# sender_id) can be used for scrubbing users and/or
# deleting users' messages.
"realm_id",
"sender_id",
"recipient_id",
name="zerver_message_realm_sender_recipient",
),
models.Index(
# For analytics queries
"realm_id",
"date_sent",
name="zerver_message_realm_date_sent",
),
models.Index(
# For users searching by topic (but not stream), which
# is done case-insensitively
"realm_id",
Upper("subject"),
F("id").desc(nulls_last=True),
name="zerver_message_realm_upper_subject",
),
models.Index(
# Most stream/topic searches are case-insensitive by
# topic name (e.g. messages_for_topic). The "id" at
# the end makes it easy to scan the resulting messages
# in order, and perform batching.
"realm_id",
"recipient_id",
Upper("subject"),
F("id").desc(nulls_last=True),
name="zerver_message_realm_recipient_upper_subject",
),
models.Index(
# Only used by already_sent_mirrored_message_id
"realm_id",
"recipient_id",
"subject",
F("id").desc(nulls_last=True),
name="zerver_message_realm_recipient_subject",
),
models.Index(
# Only used by update_first_visible_message_id
"realm_id",
F("id").desc(nulls_last=True),
name="zerver_message_realm_id",
),
] ]
@ -4408,6 +4476,13 @@ class ScheduledMessage(models.Model):
delivered=False, delivered=False,
), ),
), ),
models.Index(
name="zerver_realm_unsent_scheduled_messages_by_user",
fields=["realm_id", "sender", "delivery_type", "scheduled_timestamp"],
condition=Q(
delivered=False,
),
),
] ]
def __str__(self) -> str: def __str__(self) -> str:

View File

@ -5,6 +5,7 @@
# https://www.caktusgroup.com/blog/2016/02/02/writing-unit-tests-django-migrations/ # https://www.caktusgroup.com/blog/2016/02/02/writing-unit-tests-django-migrations/
# to get a tutorial on the framework that inspired this feature. # to get a tutorial on the framework that inspired this feature.
from datetime import datetime, timezone from datetime import datetime, timezone
from unittest import skip
import orjson import orjson
from django.db.migrations.state import StateApps from django.db.migrations.state import StateApps
@ -28,6 +29,7 @@ from zerver.lib.test_helpers import use_db_models
# been tested for a migration being merged. # been tested for a migration being merged.
@skip("Cannot be run because there is a non-atomic migration that has been merged after it")
class ScheduledEmailData(MigrationsTestCase): class ScheduledEmailData(MigrationsTestCase):
migrate_from = "0467_rename_extradata_realmauditlog_extra_data_json" migrate_from = "0467_rename_extradata_realmauditlog_extra_data_json"
migrate_to = "0468_rename_followup_day_email_templates" migrate_to = "0468_rename_followup_day_email_templates"