2020-02-29 22:48:15 +01:00
|
|
|
from django.db import migrations
|
2020-04-27 07:19:08 +02:00
|
|
|
from django.db.backends.postgresql.schema import DatabaseSchemaEditor
|
2020-02-29 22:48:15 +01:00
|
|
|
from django.db.migrations.state import StateApps
|
2020-03-06 21:08:14 +01:00
|
|
|
from django.db.models import Count, Sum
|
2020-02-29 22:48:15 +01:00
|
|
|
|
2020-06-11 00:54:34 +02:00
|
|
|
|
2020-02-29 22:48:15 +01:00
|
|
|
def clear_duplicate_counts(apps: StateApps, schema_editor: DatabaseSchemaEditor) -> None:
    """Preparatory data migration for the Analytics tables.

    Django's unique_together indexes do not properly handle the
    subgroup=None corner case (allowing duplicate rows that have a
    subgroup of None).  In race conditions, rather than updating an
    existing row for the property/(realm, stream, user)/time with
    subgroup=None, Django would create a duplicate row.

    The next migration adds a proper constraint to fix this bug; this
    one repairs any existing problematic rows first so that constraint
    can be added.  For LoggingCountStat objects the duplicates' values
    must be combined into one surviving row; for every other CountStat
    the duplicates are expected to carry identical values, so the
    extras are simply deleted.
    """
    tables_by_type = dict(
        realm=apps.get_model("analytics", "RealmCount"),
        user=apps.get_model("analytics", "UserCount"),
        stream=apps.get_model("analytics", "StreamCount"),
        installation=apps.get_model("analytics", "InstallationCount"),
    )

    for object_type, count_table in tables_by_type.items():
        # InstallationCount has no owning-object column; the other
        # tables additionally group on their realm/user/stream key.
        if object_type == "installation":
            group_fields = ["property", "end_time"]
        else:
            group_fields = [object_type, "property", "end_time"]

        duplicated_groups = (
            count_table.objects.filter(subgroup=None)
            .values(*group_fields)
            .annotate(Count("id"), Sum("value"))
            .filter(id__count__gt=1)
        )

        for group in duplicated_groups:
            # Strip the aggregates so `group` is a plain filter dict for
            # re-fetching the duplicate rows themselves.
            group.pop("id__count")
            combined_total = group.pop("value__sum")
            rows = list(count_table.objects.filter(**group))
            survivor = rows[0]
            if group["property"] in ["invites_sent::day", "active_users_log:is_bot:day"]:
                # For LoggingCountStat objects, the right fix is to combine
                # the totals; for other CountStat objects, we expect the
                # duplicates to have the same value, so deleting the extras
                # (below) is all that's needed.
                survivor.value = combined_total
                survivor.save()
            for extra_row in rows[1:]:
                extra_row.delete()
|
2020-02-29 22:48:15 +01:00
|
|
|
|
2021-02-12 08:19:30 +01:00
|
|
|
|
2020-02-29 22:48:15 +01:00
|
|
|
class Migration(migrations.Migration):
    """Clean up duplicate subgroup=None analytics rows.

    Runs after 0014 and prepares the count tables for the uniqueness
    constraint introduced by the following migration.  The data fix is
    one-way; reversing is a no-op.
    """

    dependencies = [
        ("analytics", "0014_remove_fillstate_last_modified"),
    ]

    operations = [
        migrations.RunPython(clear_duplicate_counts, reverse_code=migrations.RunPython.noop),
    ]
|