zilencer: Remove duplicates before adding unique indexes.

The recent #27818 naïvely added unique indexes, despite there being a
large number of existing violations.  This makes the migration
impossible to deploy.

Update the migration to de-duplicate rows, dropping all but the
first-by-id of each unique set.  This is equivalent to what
dd954749be does with `ignore_conflicts`.  We update the migration,
rather than making a new one, as any server which has somehow
successfully applied the migration apparently did not need to
de-duplicate anything.
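
For context on the `ignore_conflicts` equivalence mentioned above: Django's bulk_create(..., ignore_conflicts=True) skips inserting any row that would violate a unique constraint, so the first-inserted row for each unique key is the one that survives. The sketch below is illustrative only; `ExampleCount` and `record_counts` are hypothetical stand-ins, not the actual RemoteRealmCount/RemoteInstallationCount models or code from zilencer.

    # Illustrative sketch only: a hypothetical model showing how a unique
    # constraint plus bulk_create(ignore_conflicts=True) keeps the first row
    # per key -- the same "first-by-id wins" rule this migration applies
    # retroactively to rows that predate the constraint.
    from django.db import models


    class ExampleCount(models.Model):  # hypothetical stand-in, not a zilencer model
        server_id = models.IntegerField()
        property = models.TextField()
        end_time = models.DateTimeField()
        value = models.BigIntegerField()

        class Meta:
            constraints = [
                models.UniqueConstraint(
                    fields=["server_id", "property", "end_time"],
                    name="unique_example_count",
                )
            ]


    def record_counts(rows: "list[ExampleCount]") -> None:
        # Rows whose (server_id, property, end_time) key already exists are
        # silently skipped instead of raising IntegrityError; the previously
        # inserted row is left untouched.
        ExampleCount.objects.bulk_create(rows, ignore_conflicts=True)
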
Alex Vandiver 2023-11-28 20:20:54 +00:00 committed by Tim Abbott
parent 02d5740f0f
commit 98b68d7034
1 changed file with 43 additions and 2 deletions

@@ -1,6 +1,46 @@
# Generated by Django 4.2.7 on 2023-11-21 19:08
from django.db import migrations, models
from django.db.backends.base.schema import BaseDatabaseSchemaEditor
from django.db.migrations.state import StateApps
from django.db.models import Count, Min


def clear_duplicate_counts(apps: StateApps, schema_editor: BaseDatabaseSchemaEditor) -> None:
    """Clean up duplicated RemoteRealmCount and RemoteInstallationCount rows.

    This is the equivalent of analytics' 0015_clear_duplicate_counts
    migration -- but it also has additional duplicates if there are
    multiple servers submitting information with the same UUID.

    We drop the behaviour of rolling up and updating the value to the
    sum, since the active_users_log:is_bot:day field has a subgroup
    (and is thus not affected by the bug), and the few cases for
    `invites_sent::day` seem more likely to be re-submissions of the
    same data, not duplicates to roll up.

    We must do this step before switching the non-unique indexes to be
    unique, as there are currently violations.
    """
    count_tables = dict(
        realm=apps.get_model("zilencer", "RemoteRealmCount"),
        installation=apps.get_model("zilencer", "RemoteInstallationCount"),
    )

    for name, count_table in count_tables.items():
        value = ["realm_id", "server_id", "property", "end_time"]
        if name == "installation":
            value = ["server_id", "property", "end_time"]
        duplicated_rows = (
            count_table.objects.filter(subgroup=None)
            .values(*value)
            .annotate(Count("id"), Min("id"))
            .filter(id__count__gt=1)
        )

        for duplicated_row in duplicated_rows:
            duplicated_row.pop("id__count")
            first_id = duplicated_row.pop("id__min")
            count_table.objects.filter(**duplicated_row, id__gt=first_id).delete()

class Migration(migrations.Migration):
@@ -9,6 +49,7 @@ class Migration(migrations.Migration):
    ]

    operations = [
        migrations.RunPython(clear_duplicate_counts, reverse_code=migrations.RunPython.noop),
        migrations.AddConstraint(
            model_name="remoteinstallationcount",
            constraint=models.UniqueConstraint(