diff --git a/analytics/migrations/0015_clear_duplicate_counts.py b/analytics/migrations/0015_clear_duplicate_counts.py new file mode 100644 index 0000000000..049fc5d53e --- /dev/null +++ b/analytics/migrations/0015_clear_duplicate_counts.py @@ -0,0 +1,53 @@ +# -*- coding: utf-8 -*- +from django.db import migrations +from django.db.backends.postgresql_psycopg2.schema import DatabaseSchemaEditor +from django.db.migrations.state import StateApps +from django.db.models import Count, Sum, Q + +def clear_duplicate_counts(apps: StateApps, schema_editor: DatabaseSchemaEditor) -> None: + """This is a preparatory migration for our Analytics tables. + + The backstory is that Django's unique_together indexes do not properly + handle the subgroup=None corner case (allowing duplicate rows that have a + subgroup of None), which meant that in race conditions, rather than updating + an existing row for the property/realm/time with subgroup=None, Django would + create a duplicate row. + + In the next migration, we'll add a proper constraint to fix this bug, but + we need to fix any existing problematic rows before we can add that constraint. + + We fix this in an appropriate fashion for each type of CountStat object; mainly + this means deleting the extra rows, but for LoggingCountStat objects, we need to + additionally combine the sums. + """ + RealmCount = apps.get_model('analytics', 'RealmCount') + + realm_counts = RealmCount.objects.filter(subgroup=None).values( + 'realm_id', 'property', 'end_time').annotate( + Count('id'), Sum('value')).filter(id__count__gt=1) + + for realm_count in realm_counts: + realm_count.pop('id__count') + total_value = realm_count.pop('value__sum') + duplicate_counts = list(RealmCount.objects.filter(**realm_count)) + first_count = duplicate_counts[0] + if realm_count['property'] in ["invites_sent::day", "active_users_log:is_bot:day"]: + # For LoggingCountStat objects, the right fix is to combine the totals; + # for other CountStat objects, we expect the duplicates to have the same value. + # And so all we need to do is delete them. + first_count.value = total_value + first_count.save() + to_cleanup = duplicate_counts[1:] + for duplicate_count in to_cleanup: + duplicate_count.delete() + +class Migration(migrations.Migration): + + dependencies = [ + ('analytics', '0014_remove_fillstate_last_modified'), + ] + + operations = [ + migrations.RunPython(clear_duplicate_counts, + reverse_code=migrations.RunPython.noop), + ] diff --git a/analytics/migrations/0016_unique_constraint_when_subgroup_null.py b/analytics/migrations/0016_unique_constraint_when_subgroup_null.py new file mode 100644 index 0000000000..726a3269b1 --- /dev/null +++ b/analytics/migrations/0016_unique_constraint_when_subgroup_null.py @@ -0,0 +1,61 @@ +# Generated by Django 2.2.10 on 2020-02-29 19:40 + +from django.db import migrations, models + + +class Migration(migrations.Migration): + + dependencies = [ + ('analytics', '0015_clear_duplicate_counts'), + ] + + operations = [ + migrations.AlterUniqueTogether( + name='installationcount', + unique_together=set(), + ), + migrations.AlterUniqueTogether( + name='realmcount', + unique_together=set(), + ), + migrations.AlterUniqueTogether( + name='streamcount', + unique_together=set(), + ), + migrations.AlterUniqueTogether( + name='usercount', + unique_together=set(), + ), + migrations.AddConstraint( + model_name='installationcount', + constraint=models.UniqueConstraint(condition=models.Q(subgroup__isnull=False), fields=('property', 'subgroup', 'end_time'), name='unique_installation_count'), + ), + migrations.AddConstraint( + model_name='installationcount', + constraint=models.UniqueConstraint(condition=models.Q(subgroup__isnull=True), fields=('property', 'end_time'), name='unique_installation_count_null_subgroup'), + ), + migrations.AddConstraint( + model_name='realmcount', + constraint=models.UniqueConstraint(condition=models.Q(subgroup__isnull=False), fields=('realm', 'property', 'subgroup', 'end_time'), name='unique_realm_count'), + ), + migrations.AddConstraint( + model_name='realmcount', + constraint=models.UniqueConstraint(condition=models.Q(subgroup__isnull=True), fields=('realm', 'property', 'end_time'), name='unique_realm_count_null_subgroup'), + ), + migrations.AddConstraint( + model_name='streamcount', + constraint=models.UniqueConstraint(condition=models.Q(subgroup__isnull=False), fields=('stream', 'property', 'subgroup', 'end_time'), name='unique_stream_count'), + ), + migrations.AddConstraint( + model_name='streamcount', + constraint=models.UniqueConstraint(condition=models.Q(subgroup__isnull=True), fields=('stream', 'property', 'end_time'), name='unique_stream_count_null_subgroup'), + ), + migrations.AddConstraint( + model_name='usercount', + constraint=models.UniqueConstraint(condition=models.Q(subgroup__isnull=False), fields=('user', 'property', 'subgroup', 'end_time'), name='unique_user_count'), + ), + migrations.AddConstraint( + model_name='usercount', + constraint=models.UniqueConstraint(condition=models.Q(subgroup__isnull=True), fields=('user', 'property', 'end_time'), name='unique_user_count_null_subgroup'), + ), + ] diff --git a/analytics/models.py b/analytics/models.py index 142f5df4f4..7d053db805 100644 --- a/analytics/models.py +++ b/analytics/models.py @@ -2,6 +2,7 @@ import datetime from typing import Optional from django.db import models +from django.db.models import Q, UniqueConstraint from zerver.lib.timestamp import floor_to_day from zerver.models import Realm, Stream, UserProfile @@ -47,7 +48,17 @@ class BaseCount(models.Model): class InstallationCount(BaseCount): class Meta: - unique_together = ("property", "subgroup", "end_time") + # Handles invalid duplicate InstallationCount data + constraints = [ + UniqueConstraint( + fields=["property", "subgroup", "end_time"], + condition=Q(subgroup__isnull=False), + name='unique_installation_count'), + UniqueConstraint( + fields=["property", "end_time"], + condition=Q(subgroup__isnull=True), + name='unique_installation_count_null_subgroup') + ] def __str__(self) -> str: return "" % (self.property, self.subgroup, self.value) @@ -56,7 +67,17 @@ class RealmCount(BaseCount): realm = models.ForeignKey(Realm, on_delete=models.CASCADE) class Meta: - unique_together = ("realm", "property", "subgroup", "end_time") + # Handles invalid duplicate RealmCount data + constraints = [ + UniqueConstraint( + fields=["realm", "property", "subgroup", "end_time"], + condition=Q(subgroup__isnull=False), + name='unique_realm_count'), + UniqueConstraint( + fields=["realm", "property", "end_time"], + condition=Q(subgroup__isnull=True), + name='unique_realm_count_null_subgroup') + ] index_together = ["property", "end_time"] def __str__(self) -> str: @@ -67,7 +88,17 @@ class UserCount(BaseCount): realm = models.ForeignKey(Realm, on_delete=models.CASCADE) class Meta: - unique_together = ("user", "property", "subgroup", "end_time") + # Handles invalid duplicate UserCount data + constraints = [ + UniqueConstraint( + fields=["user", "property", "subgroup", "end_time"], + condition=Q(subgroup__isnull=False), + name='unique_user_count'), + UniqueConstraint( + fields=["user", "property", "end_time"], + condition=Q(subgroup__isnull=True), + name='unique_user_count_null_subgroup') + ] # This index dramatically improves the performance of # aggregating from users to realms index_together = ["property", "realm", "end_time"] @@ -80,7 +111,17 @@ class StreamCount(BaseCount): realm = models.ForeignKey(Realm, on_delete=models.CASCADE) class Meta: - unique_together = ("stream", "property", "subgroup", "end_time") + # Handles invalid duplicate StreamCount data + constraints = [ + UniqueConstraint( + fields=["stream", "property", "subgroup", "end_time"], + condition=Q(subgroup__isnull=False), + name='unique_stream_count'), + UniqueConstraint( + fields=["stream", "property", "end_time"], + condition=Q(subgroup__isnull=True), + name='unique_stream_count_null_subgroup') + ] # This index dramatically improves the performance of # aggregating from streams to realms index_together = ["property", "realm", "end_time"] diff --git a/analytics/tests/test_counts.py b/analytics/tests/test_counts.py index 40c93ab78c..2f8be5c973 100644 --- a/analytics/tests/test_counts.py +++ b/analytics/tests/test_counts.py @@ -1359,6 +1359,7 @@ class TestRealmActiveHumans(AnalyticsTestCase): self.create_user(realm=third_realm) RealmCount.objects.all().delete() + InstallationCount.objects.all().delete() for i in [-1, 0, 1]: do_fill_count_stat_at_hour(self.stat, self.TIME_ZERO + i*self.DAY) self.assertTableState(RealmCount, ['value', 'realm', 'end_time'],