analytics: Rewrite realm_active_humans::day query.

This makes it no longer dependent on active_users_audit:is_bot:day,
which subsequent commits will make a RealmCount, not UserCount, query.
This folds the same behaviour of `active_users_audit` directly into
the query; however, only running over active users, using the index
from the earlier commit, and using the new `DISTINCT ON` formulation
make this a fast query compared to `active_users_audit:is_bot:day` +
the old `realm_active_humans::day`.
This commit is contained in:
Alex Vandiver 2024-06-02 15:16:44 +00:00 committed by Tim Abbott
parent e638ae44a8
commit 195defb031
2 changed files with 52 additions and 55 deletions

View File

@ -783,29 +783,45 @@ def count_realm_active_humans_query(realm: Optional[Realm]) -> QueryFn:
INSERT INTO analytics_realmcount
(realm_id, value, property, subgroup, end_time)
SELECT
usercount1.realm_id, count(*), %(property)s, NULL, %(time_end)s
active_usercount.realm_id, count(*), %(property)s, NULL, %(time_end)s
FROM (
SELECT realm_id, user_id
FROM analytics_usercount
SELECT
realm_id,
user_id
FROM
analytics_usercount
WHERE
property = 'active_users_audit:is_bot:day' AND
subgroup = 'false' AND
property = '15day_actives::day'
{realm_clause}
end_time = %(time_end)s
) usercount1
AND end_time = %(time_end)s
) active_usercount
JOIN zerver_userprofile ON active_usercount.user_id = zerver_userprofile.id
JOIN (
SELECT realm_id, user_id
FROM analytics_usercount
SELECT DISTINCT ON (modified_user_id)
modified_user_id, event_type
FROM
zerver_realmauditlog
WHERE
property = '15day_actives::day' AND
{realm_clause}
end_time = %(time_end)s
) usercount2
ON
usercount1.user_id = usercount2.user_id
GROUP BY usercount1.realm_id
event_type IN ({user_created}, {user_activated}, {user_deactivated}, {user_reactivated})
AND event_time < %(time_end)s
ORDER BY
modified_user_id,
event_time DESC
) last_user_event ON last_user_event.modified_user_id = active_usercount.user_id
WHERE
NOT zerver_userprofile.is_bot
AND event_type IN ({user_created}, {user_activated}, {user_reactivated})
GROUP BY
active_usercount.realm_id
"""
).format(**kwargs, realm_clause=realm_clause)
).format(
**kwargs,
user_created=Literal(RealmAuditLog.USER_CREATED),
user_activated=Literal(RealmAuditLog.USER_ACTIVATED),
user_deactivated=Literal(RealmAuditLog.USER_DEACTIVATED),
user_reactivated=Literal(RealmAuditLog.USER_REACTIVATED),
realm_clause=realm_clause,
)
# Currently unused and untested
@ -951,7 +967,7 @@ def get_count_stats(realm: Optional[Realm] = None) -> Dict[str, CountStat]:
"realm_active_humans::day",
sql_data_collector(RealmCount, count_realm_active_humans_query(realm), None),
CountStat.DAY,
dependencies=["active_users_audit:is_bot:day", "15day_actives::day"],
dependencies=["15day_actives::day"],
),
]

View File

@ -3,7 +3,6 @@ from datetime import datetime, timedelta, timezone
from typing import Any, Dict, Iterator, List, Optional, Tuple, Type
from unittest import mock
import orjson
import time_machine
from django.apps import apps
from django.db import models
@ -2081,18 +2080,6 @@ class TestRealmActiveHumans(AnalyticsTestCase):
self.stat = COUNT_STATS["realm_active_humans::day"]
self.current_property = self.stat.property
def mark_audit_active(self, user: UserProfile, end_time: Optional[datetime] = None) -> None:
if end_time is None:
end_time = self.TIME_ZERO
UserCount.objects.create(
user=user,
realm=user.realm,
property="active_users_audit:is_bot:day",
subgroup=orjson.dumps(user.is_bot).decode(),
end_time=end_time,
value=1,
)
def mark_15day_active(self, user: UserProfile, end_time: Optional[datetime] = None) -> None:
if end_time is None:
end_time = self.TIME_ZERO
@ -2100,38 +2087,35 @@ class TestRealmActiveHumans(AnalyticsTestCase):
user=user, realm=user.realm, property="15day_actives::day", end_time=end_time, value=1
)
def test_basic_boolean_logic(self) -> None:
def test_basic_logic(self) -> None:
user = self.create_user()
self.mark_audit_active(user, end_time=self.TIME_ZERO - self.DAY)
self.mark_15day_active(user, end_time=self.TIME_ZERO)
self.mark_audit_active(user, end_time=self.TIME_ZERO + self.DAY)
self.mark_15day_active(user, end_time=self.TIME_ZERO + self.DAY)
for i in [-1, 0, 1]:
do_fill_count_stat_at_hour(self.stat, self.TIME_ZERO + i * self.DAY)
self.assertTableState(RealmCount, ["value", "end_time"], [[1, self.TIME_ZERO + self.DAY]])
self.assertTableState(
RealmCount, ["value", "end_time"], [[1, self.TIME_ZERO], [1, self.TIME_ZERO + self.DAY]]
)
def test_bots_not_counted(self) -> None:
bot = self.create_user(is_bot=True)
self.mark_audit_active(bot)
self.mark_15day_active(bot)
do_fill_count_stat_at_hour(self.stat, self.TIME_ZERO)
self.assertTableState(RealmCount, [], [])
def test_multiple_users_realms_and_times(self) -> None:
user1 = self.create_user()
user2 = self.create_user()
user1 = self.create_user(date_joined=self.TIME_ZERO - 2 * self.DAY)
user2 = self.create_user(date_joined=self.TIME_ZERO - 2 * self.DAY)
second_realm = do_create_realm(string_id="second", name="second")
user3 = self.create_user(realm=second_realm)
user4 = self.create_user(realm=second_realm)
user5 = self.create_user(realm=second_realm)
user3 = self.create_user(date_joined=self.TIME_ZERO - 2 * self.DAY, realm=second_realm)
user4 = self.create_user(date_joined=self.TIME_ZERO - 2 * self.DAY, realm=second_realm)
user5 = self.create_user(date_joined=self.TIME_ZERO - 2 * self.DAY, realm=second_realm)
for user in [user1, user2, user3, user4, user5]:
self.mark_audit_active(user)
self.mark_15day_active(user)
for user in [user1, user3, user4]:
self.mark_audit_active(user, end_time=self.TIME_ZERO - self.DAY)
self.mark_15day_active(user, end_time=self.TIME_ZERO - self.DAY)
for user in [user1, user2, user3, user4, user5]:
self.mark_15day_active(user)
for i in [-1, 0, 1]:
do_fill_count_stat_at_hour(self.stat, self.TIME_ZERO + i * self.DAY)
@ -2139,17 +2123,14 @@ class TestRealmActiveHumans(AnalyticsTestCase):
RealmCount,
["value", "realm", "end_time"],
[
[2, self.default_realm, self.TIME_ZERO],
[3, second_realm, self.TIME_ZERO],
[1, self.default_realm, self.TIME_ZERO - self.DAY],
[2, second_realm, self.TIME_ZERO - self.DAY],
[2, self.default_realm, self.TIME_ZERO],
[3, second_realm, self.TIME_ZERO],
],
)
# Check that adding spurious entries doesn't make a difference
self.mark_audit_active(user1, end_time=self.TIME_ZERO + self.DAY)
self.mark_15day_active(user2, end_time=self.TIME_ZERO + self.DAY)
self.mark_15day_active(user2, end_time=self.TIME_ZERO - self.DAY)
self.create_user()
third_realm = do_create_realm(string_id="third", name="third")
self.create_user(realm=third_realm)
@ -2162,10 +2143,10 @@ class TestRealmActiveHumans(AnalyticsTestCase):
RealmCount,
["value", "realm", "end_time"],
[
[2, self.default_realm, self.TIME_ZERO],
[3, second_realm, self.TIME_ZERO],
[1, self.default_realm, self.TIME_ZERO - self.DAY],
[2, second_realm, self.TIME_ZERO - self.DAY],
[2, self.default_realm, self.TIME_ZERO],
[3, second_realm, self.TIME_ZERO],
],
)