presence: Improve INSERT conflict handling in do_update_user_presence.

This was discussed in the review of #29999:
https://github.com/zulip/zulip/pull/29999#discussion_r1620818568

The previous way of handling wasn't entirely correct, as unnecessary
events were omitted, with a bad guarantee of even being in the correct
order.

This is an improvement as now the function detects that it ended up
doing nothing and can skip sending an event.

The race condition is hard to make up in an automated test, but we can
hackily simulate it by injecting a side_effect which will create a
conflicting UserPresence row when the function requests a cursor. Aside
of that, the actual race was simulated in manual testing to verify the
expected behavior.
This commit is contained in:
Mateusz Mandera 2024-06-08 21:53:18 +02:00 committed by Tim Abbott
parent 2feee9d756
commit 7ff1227984
2 changed files with 69 additions and 11 deletions

View File

@ -1,3 +1,4 @@
import logging
import time import time
from datetime import datetime, timedelta from datetime import datetime, timedelta
@ -18,6 +19,8 @@ from zerver.models.clients import get_client
from zerver.models.users import active_user_ids from zerver.models.users import active_user_ids
from zerver.tornado.django_api import send_event from zerver.tornado.django_api import send_event
logger = logging.getLogger(__name__)
def send_presence_changed( def send_presence_changed(
user_profile: UserProfile, presence: UserPresence, *, force_send_update: bool = False user_profile: UserProfile, presence: UserPresence, *, force_send_update: bool = False
@ -219,20 +222,15 @@ def do_update_user_presence(
# There's a small possibility of a race where a different process may have # There's a small possibility of a race where a different process may have
# already created a row for this user. Given the extremely close timing # already created a row for this user. Given the extremely close timing
# of these events, there's no clear reason to prefer one over the other, # of these events, there's no clear reason to prefer one over the other,
# so we just go with the most direct approach of DO UPDATE, so that the # so we choose the simplest option of DO NOTHING here and let the other
# last event emitted (the one coming from our process, since we're the slower one) # concurrent transaction win.
# matches the created presence state. # This also allows us to avoid sending a spurious presence update event,
# TODO: Might be worth changing this to DO NOTHING instead with a bit of extra logic # by checking if the row was actually created.
# to skip emitting an event in such a scenario.
query += sql.SQL(""" query += sql.SQL("""
INSERT INTO zerver_userpresence (user_profile_id, last_active_time, last_connected_time, realm_id, last_update_id) INSERT INTO zerver_userpresence (user_profile_id, last_active_time, last_connected_time, realm_id, last_update_id)
VALUES ({user_profile_id}, {last_active_time}, {last_connected_time}, {realm_id}, (SELECT last_update_id FROM new_last_update_id)) VALUES ({user_profile_id}, {last_active_time}, {last_connected_time}, {realm_id}, (SELECT last_update_id FROM new_last_update_id))
ON CONFLICT (user_profile_id) DO UPDATE SET ON CONFLICT (user_profile_id) DO NOTHING
last_active_time = EXCLUDED.last_active_time, """).format(
last_connected_time = EXCLUDED.last_connected_time,
realm_id = EXCLUDED.realm_id,
last_update_id = EXCLUDED.last_update_id;
""").format(
user_profile_id=sql.Literal(user_profile.id), user_profile_id=sql.Literal(user_profile.id),
last_active_time=sql.Literal(presence.last_active_time), last_active_time=sql.Literal(presence.last_active_time),
last_connected_time=sql.Literal(presence.last_connected_time), last_connected_time=sql.Literal(presence.last_connected_time),
@ -256,6 +254,16 @@ def do_update_user_presence(
with connection.cursor() as cursor: with connection.cursor() as cursor:
cursor.execute(query) cursor.execute(query)
if creating:
# Check if the row was actually created or if we
# hit the ON CONFLICT DO NOTHING case.
actually_created = cursor.rowcount > 0
if creating and not actually_created:
# If we ended up doing nothing due to something else creating the row
# in the meantime, then we shouldn't send an event here.
logger.info("UserPresence row already created for %s, returning.", user_profile.id)
return
if force_send_update or ( if force_send_update or (
not user_profile.realm.presence_disabled and (creating or became_online) not user_profile.realm.presence_disabled and (creating or became_online)

View File

@ -4,6 +4,7 @@ from unittest import mock
import time_machine import time_machine
from django.conf import settings from django.conf import settings
from django.db import connection
from django.utils.timezone import now as timezone_now from django.utils.timezone import now as timezone_now
from typing_extensions import override from typing_extensions import override
@ -87,6 +88,55 @@ class UserPresenceModelTests(ZulipTestCase):
self.assert_length(presence_dct, 0) self.assert_length(presence_dct, 0)
self.assertEqual(last_update_id, -1) self.assertEqual(last_update_id, -1)
def test_user_presence_row_creation_simulated_race(self) -> None:
"""
There is a theoretical race condition, where while a UserPresence
row is being created for a user, a concurrent process creates it first,
right before we execute our INSERT. This conflict is handled with
ON CONFLICT DO NOTHING in the SQL query and an early return
if that happens.
"""
user_profile = self.example_user("hamlet")
UserPresence.objects.filter(user_profile=user_profile).delete()
self.login_user(user_profile)
def insert_row_and_return_cursor() -> Any:
# This is the function we will inject into connection.cursor
# to simulate the race condition.
# When the underlying code requests a cursor, we will create
# the UserPresence row for the user, before returning a real
# cursor to the caller. This ensures the caller will hit the
# INSERT conflict when it tries to execute its query.
UserPresence.objects.create(user_profile=user_profile, realm=user_profile.realm)
cursor = connection.cursor()
return cursor
with mock.patch("zerver.actions.presence.connection") as mock_connection, self.assertLogs(
"zerver.actions.presence", level="INFO"
) as mock_logs:
# This is a tricky mock. We need to set things up so that connection.cursor()
# in do_update_user_presence runs our custom code when the caller tries to
# enter the context manager.
# We also need to take care to only affect the connection that exists in
# zerver.actions.presence rather than affecting the entire django.db.connection,
# as that would break code higher up in the stack.
mock_connection.cursor.return_value.__enter__.side_effect = insert_row_and_return_cursor
result = self.client_post("/json/users/me/presence", {"status": "active"})
# The request finished gracefully and the situation was logged:
self.assert_json_success(result)
self.assertEqual(
mock_logs.output,
[
f"INFO:zerver.actions.presence:UserPresence row already created for {user_profile.id}, returning."
],
)
def test_last_update_id_logic(self) -> None: def test_last_update_id_logic(self) -> None:
slim_presence = True slim_presence = True
UserPresence.objects.all().delete() UserPresence.objects.all().delete()