From 98b68d703466fd3565d6172b9260e3f9ebd9b758 Mon Sep 17 00:00:00 2001
From: Alex Vandiver
Date: Tue, 28 Nov 2023 20:20:54 +0000
Subject: [PATCH] zilencer: Remove duplicates before adding unique indexes.
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The recent #27818 naïvely added unique indexes, despite there being a
large number of existing violations.  This makes the migration
impossible to deploy.

Update the migration to de-duplicate rows, dropping all but the
first-by-id of each unique set.  This is equivalent to what
dd954749bea1 does with `ignore_conflicts`.

We update the migration, rather than making a new one, as any server
which has somehow successfully applied the migration apparently did
not need to de-duplicate anything.
---
 ...tallationcount_unique_together_and_more.py | 45 ++++++++++++++++++-
 1 file changed, 43 insertions(+), 2 deletions(-)

diff --git a/zilencer/migrations/0037_alter_remoteinstallationcount_unique_together_and_more.py b/zilencer/migrations/0037_alter_remoteinstallationcount_unique_together_and_more.py
index c67f8ea579..8ef1f79a7b 100644
--- a/zilencer/migrations/0037_alter_remoteinstallationcount_unique_together_and_more.py
+++ b/zilencer/migrations/0037_alter_remoteinstallationcount_unique_together_and_more.py
@@ -1,6 +1,46 @@
-# Generated by Django 4.2.7 on 2023-11-21 19:08
-
 from django.db import migrations, models
+from django.db.backends.base.schema import BaseDatabaseSchemaEditor
+from django.db.migrations.state import StateApps
+from django.db.models import Count, Min
+
+
+def clear_duplicate_counts(apps: StateApps, schema_editor: BaseDatabaseSchemaEditor) -> None:
+    """Clean up duplicated RemoteRealmCount and RemoteInstallationCount rows.
+
+    This is the equivalent of analytics' 0015_clear_duplicate_counts
+    migration -- but it also has additional duplicates if there are
+    multiple servers submitting information with the same UUID.
+
+    We drop the behaviour of rolling up and updating the value to the
+    sum, since the active_users_log:is_bot:day field has a subgroup
+    (and is thus not affected by the bug), and the few cases for
+    `invites_sent::day` seem more likely to be re-submissions of the
+    same data, not duplicates to roll up.
+
+    We must do this step before switching the non-unique indexes to be
+    unique, as there are currently violations.
+
+    """
+    count_tables = dict(
+        realm=apps.get_model("zilencer", "RemoteRealmCount"),
+        installation=apps.get_model("zilencer", "RemoteInstallationCount"),
+    )
+
+    for name, count_table in count_tables.items():
+        value = ["realm_id", "server_id", "property", "end_time"]
+        if name == "installation":
+            value = ["server_id", "property", "end_time"]
+        duplicated_rows = (
+            count_table.objects.filter(subgroup=None)
+            .values(*value)
+            .annotate(Count("id"), Min("id"))
+            .filter(id__count__gt=1)
+        )
+
+        for duplicated_row in duplicated_rows:
+            duplicated_row.pop("id__count")
+            first_id = duplicated_row.pop("id__min")
+            count_table.objects.filter(**duplicated_row, id__gt=first_id).delete()
 
 
 class Migration(migrations.Migration):
@@ -9,6 +49,7 @@ class Migration(migrations.Migration):
     ]
 
     operations = [
+        migrations.RunPython(clear_duplicate_counts, reverse_code=migrations.RunPython.noop),
         migrations.AddConstraint(
             model_name="remoteinstallationcount",
             constraint=models.UniqueConstraint(