From a68d38cc5285617b7bdf6ac30ba64e5569d4c58b Mon Sep 17 00:00:00 2001
From: arpit551
Date: Fri, 31 Jul 2020 02:25:02 +0530
Subject: [PATCH] migrations: Upgrade migrations to remove duplicates in all
 Count tables.

This commit upgrades the 0015_clear_duplicate_counts migration to remove
duplicate counts in StreamCount, UserCount, and InstallationCount as well.

Fixes https://github.com/zulip/docker-zulip/issues/266
---
 .../migrations/0015_clear_duplicate_counts.py | 42 +++++++++++--------
 1 file changed, 24 insertions(+), 18 deletions(-)

diff --git a/analytics/migrations/0015_clear_duplicate_counts.py b/analytics/migrations/0015_clear_duplicate_counts.py
index 9533f21bde..a7ecc3dd87 100644
--- a/analytics/migrations/0015_clear_duplicate_counts.py
+++ b/analytics/migrations/0015_clear_duplicate_counts.py
@@ -10,7 +10,7 @@ def clear_duplicate_counts(apps: StateApps, schema_editor: DatabaseSchemaEditor)
     The backstory is that Django's unique_together indexes do not properly
     handle the subgroup=None corner case (allowing duplicate rows that have a
     subgroup of None), which meant that in race conditions, rather than updating
-    an existing row for the property/realm/time with subgroup=None, Django would
+    an existing row for the property/(realm, stream, user)/time with subgroup=None, Django would
     create a duplicate row.
 
     In the next migration, we'll add a proper constraint to fix this bug, but
@@ -20,26 +20,32 @@ def clear_duplicate_counts(apps: StateApps, schema_editor: DatabaseSchemaEditor)
     this means deleting the extra rows, but for LoggingCountStat objects, we
     need to additionally combine the sums.
     """
-    RealmCount = apps.get_model('analytics', 'RealmCount')
+    count_tables = dict(realm=apps.get_model('analytics', 'RealmCount'),
+                        user=apps.get_model('analytics', 'UserCount'),
+                        stream=apps.get_model('analytics', 'StreamCount'),
+                        installation=apps.get_model('analytics', 'InstallationCount'))
 
-    realm_counts = RealmCount.objects.filter(subgroup=None).values(
-        'realm_id', 'property', 'end_time').annotate(
+    for name, count_table in count_tables.items():
+        value = [name, 'property', 'end_time']
+        if name == 'installation':
+            value = ['property', 'end_time']
+        counts = count_table.objects.filter(subgroup=None).values(*value).annotate(
             Count('id'), Sum('value')).filter(id__count__gt=1)
 
-    for realm_count in realm_counts:
-        realm_count.pop('id__count')
-        total_value = realm_count.pop('value__sum')
-        duplicate_counts = list(RealmCount.objects.filter(**realm_count))
-        first_count = duplicate_counts[0]
-        if realm_count['property'] in ["invites_sent::day", "active_users_log:is_bot:day"]:
-            # For LoggingCountStat objects, the right fix is to combine the totals;
-            # for other CountStat objects, we expect the duplicates to have the same value.
-            # And so all we need to do is delete them.
-            first_count.value = total_value
-            first_count.save()
-        to_cleanup = duplicate_counts[1:]
-        for duplicate_count in to_cleanup:
-            duplicate_count.delete()
+        for count in counts:
+            count.pop('id__count')
+            total_value = count.pop('value__sum')
+            duplicate_counts = list(count_table.objects.filter(**count))
+            first_count = duplicate_counts[0]
+            if count['property'] in ["invites_sent::day", "active_users_log:is_bot:day"]:
+                # For LoggingCountStat objects, the right fix is to combine the totals;
+                # for other CountStat objects, we expect the duplicates to have the same value.
+                # And so all we need to do is delete them.
+                first_count.value = total_value
+                first_count.save()
+            to_cleanup = duplicate_counts[1:]
+            for duplicate_count in to_cleanup:
+                duplicate_count.delete()
 
 
 class Migration(migrations.Migration):
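
A note on the root cause described in the migration's docstring: SQL unique
constraints treat NULLs as distinct from one another, so a unique_together
index covering a nullable subgroup column cannot reject a second row with
subgroup=NULL for the same property/(realm, stream, user)/end_time. The sketch
below is a minimal, hypothetical reproduction using the standard-library
sqlite3 module purely so it is self-contained; the table and column names are
illustrative only, and Zulip itself runs on PostgreSQL, which by default
treats NULLs in unique constraints the same way.

# Hypothetical, self-contained reproduction of the duplicate-row bug: a UNIQUE
# constraint over columns that include a nullable "subgroup" does not reject a
# second row whose subgroup is NULL, because SQL treats NULLs as distinct for
# uniqueness purposes.
import sqlite3

conn = sqlite3.connect(":memory:")
conn.execute("""
    CREATE TABLE realmcount_demo (
        id INTEGER PRIMARY KEY,
        realm_id INTEGER,
        property TEXT,
        subgroup TEXT,
        end_time TEXT,
        value INTEGER,
        UNIQUE (realm_id, property, subgroup, end_time)
    )
""")

row = (1, "invites_sent::day", None, "2020-07-31 00:00:00", 5)
insert = ("INSERT INTO realmcount_demo (realm_id, property, subgroup, end_time, value) "
          "VALUES (?, ?, ?, ?, ?)")
conn.execute(insert, row)
# The second, identical insert also succeeds instead of raising an
# IntegrityError; this is exactly the kind of duplicate row the migration
# above has to clean up.
conn.execute(insert, row)

duplicates = conn.execute(
    "SELECT COUNT(*) FROM realmcount_demo WHERE subgroup IS NULL").fetchone()[0]
print(duplicates)  # 2

This is why the patch collapses the duplicate rows in every Count table first;
the proper constraint covering the subgroup=None case can then be added in the
next migration without conflicts.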