analytics: Fix missing unique constraint when subgroup is null.

Replaced unique_together with UniqueConstraint in models that
covered nullable fields as in unique_together database indexes
don't work where subgroup=None. So added conditional unique
index handling invalid duplicate Count data.

Added 0015_clear_duplicate_counts migration to handle existing
data that violates the constraints.

Also corrected a test case in test_counts.py which didn't clear its
state properly and thus was accidentally taking advantage of this
database schema bug.
This commit is contained in:
arpit551 2020-03-01 03:18:15 +05:30 committed by Tim Abbott
parent dcc92de205
commit f299f31340
4 changed files with 160 additions and 4 deletions

View File

@ -0,0 +1,53 @@
# -*- coding: utf-8 -*-
from django.db import migrations
from django.db.backends.postgresql_psycopg2.schema import DatabaseSchemaEditor
from django.db.migrations.state import StateApps
from django.db.models import Count, Sum, Q
def clear_duplicate_counts(apps: StateApps, schema_editor: DatabaseSchemaEditor) -> None:
"""This is a preparatory migration for our Analytics tables.
The backstory is that Django's unique_together indexes do not properly
handle the subgroup=None corner case (allowing duplicate rows that have a
subgroup of None), which meant that in race conditions, rather than updating
an existing row for the property/realm/time with subgroup=None, Django would
create a duplicate row.
In the next migration, we'll add a proper constraint to fix this bug, but
we need to fix any existing problematic rows before we can add that constraint.
We fix this in an appropriate fashion for each type of CountStat object; mainly
this means deleting the extra rows, but for LoggingCountStat objects, we need to
additionally combine the sums.
"""
RealmCount = apps.get_model('analytics', 'RealmCount')
realm_counts = RealmCount.objects.filter(subgroup=None).values(
'realm_id', 'property', 'end_time').annotate(
Count('id'), Sum('value')).filter(id__count__gt=1)
for realm_count in realm_counts:
realm_count.pop('id__count')
total_value = realm_count.pop('value__sum')
duplicate_counts = list(RealmCount.objects.filter(**realm_count))
first_count = duplicate_counts[0]
if realm_count['property'] in ["invites_sent::day", "active_users_log:is_bot:day"]:
# For LoggingCountStat objects, the right fix is to combine the totals;
# for other CountStat objects, we expect the duplicates to have the same value.
# And so all we need to do is delete them.
first_count.value = total_value
first_count.save()
to_cleanup = duplicate_counts[1:]
for duplicate_count in to_cleanup:
duplicate_count.delete()
class Migration(migrations.Migration):
dependencies = [
('analytics', '0014_remove_fillstate_last_modified'),
]
operations = [
migrations.RunPython(clear_duplicate_counts,
reverse_code=migrations.RunPython.noop),
]

View File

@ -0,0 +1,61 @@
# Generated by Django 2.2.10 on 2020-02-29 19:40
from django.db import migrations, models
class Migration(migrations.Migration):
dependencies = [
('analytics', '0015_clear_duplicate_counts'),
]
operations = [
migrations.AlterUniqueTogether(
name='installationcount',
unique_together=set(),
),
migrations.AlterUniqueTogether(
name='realmcount',
unique_together=set(),
),
migrations.AlterUniqueTogether(
name='streamcount',
unique_together=set(),
),
migrations.AlterUniqueTogether(
name='usercount',
unique_together=set(),
),
migrations.AddConstraint(
model_name='installationcount',
constraint=models.UniqueConstraint(condition=models.Q(subgroup__isnull=False), fields=('property', 'subgroup', 'end_time'), name='unique_installation_count'),
),
migrations.AddConstraint(
model_name='installationcount',
constraint=models.UniqueConstraint(condition=models.Q(subgroup__isnull=True), fields=('property', 'end_time'), name='unique_installation_count_null_subgroup'),
),
migrations.AddConstraint(
model_name='realmcount',
constraint=models.UniqueConstraint(condition=models.Q(subgroup__isnull=False), fields=('realm', 'property', 'subgroup', 'end_time'), name='unique_realm_count'),
),
migrations.AddConstraint(
model_name='realmcount',
constraint=models.UniqueConstraint(condition=models.Q(subgroup__isnull=True), fields=('realm', 'property', 'end_time'), name='unique_realm_count_null_subgroup'),
),
migrations.AddConstraint(
model_name='streamcount',
constraint=models.UniqueConstraint(condition=models.Q(subgroup__isnull=False), fields=('stream', 'property', 'subgroup', 'end_time'), name='unique_stream_count'),
),
migrations.AddConstraint(
model_name='streamcount',
constraint=models.UniqueConstraint(condition=models.Q(subgroup__isnull=True), fields=('stream', 'property', 'end_time'), name='unique_stream_count_null_subgroup'),
),
migrations.AddConstraint(
model_name='usercount',
constraint=models.UniqueConstraint(condition=models.Q(subgroup__isnull=False), fields=('user', 'property', 'subgroup', 'end_time'), name='unique_user_count'),
),
migrations.AddConstraint(
model_name='usercount',
constraint=models.UniqueConstraint(condition=models.Q(subgroup__isnull=True), fields=('user', 'property', 'end_time'), name='unique_user_count_null_subgroup'),
),
]

View File

@ -2,6 +2,7 @@ import datetime
from typing import Optional
from django.db import models
from django.db.models import Q, UniqueConstraint
from zerver.lib.timestamp import floor_to_day
from zerver.models import Realm, Stream, UserProfile
@ -47,7 +48,17 @@ class BaseCount(models.Model):
class InstallationCount(BaseCount):
class Meta:
unique_together = ("property", "subgroup", "end_time")
# Handles invalid duplicate InstallationCount data
constraints = [
UniqueConstraint(
fields=["property", "subgroup", "end_time"],
condition=Q(subgroup__isnull=False),
name='unique_installation_count'),
UniqueConstraint(
fields=["property", "end_time"],
condition=Q(subgroup__isnull=True),
name='unique_installation_count_null_subgroup')
]
def __str__(self) -> str:
return "<InstallationCount: %s %s %s>" % (self.property, self.subgroup, self.value)
@ -56,7 +67,17 @@ class RealmCount(BaseCount):
realm = models.ForeignKey(Realm, on_delete=models.CASCADE)
class Meta:
unique_together = ("realm", "property", "subgroup", "end_time")
# Handles invalid duplicate RealmCount data
constraints = [
UniqueConstraint(
fields=["realm", "property", "subgroup", "end_time"],
condition=Q(subgroup__isnull=False),
name='unique_realm_count'),
UniqueConstraint(
fields=["realm", "property", "end_time"],
condition=Q(subgroup__isnull=True),
name='unique_realm_count_null_subgroup')
]
index_together = ["property", "end_time"]
def __str__(self) -> str:
@ -67,7 +88,17 @@ class UserCount(BaseCount):
realm = models.ForeignKey(Realm, on_delete=models.CASCADE)
class Meta:
unique_together = ("user", "property", "subgroup", "end_time")
# Handles invalid duplicate UserCount data
constraints = [
UniqueConstraint(
fields=["user", "property", "subgroup", "end_time"],
condition=Q(subgroup__isnull=False),
name='unique_user_count'),
UniqueConstraint(
fields=["user", "property", "end_time"],
condition=Q(subgroup__isnull=True),
name='unique_user_count_null_subgroup')
]
# This index dramatically improves the performance of
# aggregating from users to realms
index_together = ["property", "realm", "end_time"]
@ -80,7 +111,17 @@ class StreamCount(BaseCount):
realm = models.ForeignKey(Realm, on_delete=models.CASCADE)
class Meta:
unique_together = ("stream", "property", "subgroup", "end_time")
# Handles invalid duplicate StreamCount data
constraints = [
UniqueConstraint(
fields=["stream", "property", "subgroup", "end_time"],
condition=Q(subgroup__isnull=False),
name='unique_stream_count'),
UniqueConstraint(
fields=["stream", "property", "end_time"],
condition=Q(subgroup__isnull=True),
name='unique_stream_count_null_subgroup')
]
# This index dramatically improves the performance of
# aggregating from streams to realms
index_together = ["property", "realm", "end_time"]

View File

@ -1359,6 +1359,7 @@ class TestRealmActiveHumans(AnalyticsTestCase):
self.create_user(realm=third_realm)
RealmCount.objects.all().delete()
InstallationCount.objects.all().delete()
for i in [-1, 0, 1]:
do_fill_count_stat_at_hour(self.stat, self.TIME_ZERO + i*self.DAY)
self.assertTableState(RealmCount, ['value', 'realm', 'end_time'],