From 7ff12279848bcf9a6acc1ba6a8e24f1566fb5db3 Mon Sep 17 00:00:00 2001 From: Mateusz Mandera Date: Sat, 8 Jun 2024 21:53:18 +0200 Subject: [PATCH] presence: Improve INSERT conflict handling in do_update_user_presence. This was discussed in the review of #29999: https://github.com/zulip/zulip/pull/29999#discussion_r1620818568 The previous way of handling wasn't entirely correct, as unnecessary events were omitted, with a bad guarantee of even being in the correct order. This is an improvement as now the function detects that it ended up doing nothing and can skip sending an event. The race condition is hard to make up in an automated test, but we can hackily simulate it by injecting a side_effect which will create a conflicting UserPresence row when the function requests a cursor. Aside of that, the actual race was simulated in manual testing to verify the expected behavior. --- zerver/actions/presence.py | 30 +++++++++++++-------- zerver/tests/test_presence.py | 50 +++++++++++++++++++++++++++++++++++ 2 files changed, 69 insertions(+), 11 deletions(-) diff --git a/zerver/actions/presence.py b/zerver/actions/presence.py index 44307c3825..3fa56ff248 100644 --- a/zerver/actions/presence.py +++ b/zerver/actions/presence.py @@ -1,3 +1,4 @@ +import logging import time from datetime import datetime, timedelta @@ -18,6 +19,8 @@ from zerver.models.clients import get_client from zerver.models.users import active_user_ids from zerver.tornado.django_api import send_event +logger = logging.getLogger(__name__) + def send_presence_changed( user_profile: UserProfile, presence: UserPresence, *, force_send_update: bool = False @@ -219,20 +222,15 @@ def do_update_user_presence( # There's a small possibility of a race where a different process may have # already created a row for this user. Given the extremely close timing # of these events, there's no clear reason to prefer one over the other, - # so we just go with the most direct approach of DO UPDATE, so that the - # last event emitted (the one coming from our process, since we're the slower one) - # matches the created presence state. - # TODO: Might be worth changing this to DO NOTHING instead with a bit of extra logic - # to skip emitting an event in such a scenario. + # so we choose the simplest option of DO NOTHING here and let the other + # concurrent transaction win. + # This also allows us to avoid sending a spurious presence update event, + # by checking if the row was actually created. query += sql.SQL(""" INSERT INTO zerver_userpresence (user_profile_id, last_active_time, last_connected_time, realm_id, last_update_id) VALUES ({user_profile_id}, {last_active_time}, {last_connected_time}, {realm_id}, (SELECT last_update_id FROM new_last_update_id)) - ON CONFLICT (user_profile_id) DO UPDATE SET - last_active_time = EXCLUDED.last_active_time, - last_connected_time = EXCLUDED.last_connected_time, - realm_id = EXCLUDED.realm_id, - last_update_id = EXCLUDED.last_update_id; - """).format( + ON CONFLICT (user_profile_id) DO NOTHING + """).format( user_profile_id=sql.Literal(user_profile.id), last_active_time=sql.Literal(presence.last_active_time), last_connected_time=sql.Literal(presence.last_connected_time), @@ -256,6 +254,16 @@ def do_update_user_presence( with connection.cursor() as cursor: cursor.execute(query) + if creating: + # Check if the row was actually created or if we + # hit the ON CONFLICT DO NOTHING case. + actually_created = cursor.rowcount > 0 + + if creating and not actually_created: + # If we ended up doing nothing due to something else creating the row + # in the meantime, then we shouldn't send an event here. + logger.info("UserPresence row already created for %s, returning.", user_profile.id) + return if force_send_update or ( not user_profile.realm.presence_disabled and (creating or became_online) diff --git a/zerver/tests/test_presence.py b/zerver/tests/test_presence.py index 8d3761c10f..c57333a89e 100644 --- a/zerver/tests/test_presence.py +++ b/zerver/tests/test_presence.py @@ -4,6 +4,7 @@ from unittest import mock import time_machine from django.conf import settings +from django.db import connection from django.utils.timezone import now as timezone_now from typing_extensions import override @@ -87,6 +88,55 @@ class UserPresenceModelTests(ZulipTestCase): self.assert_length(presence_dct, 0) self.assertEqual(last_update_id, -1) + def test_user_presence_row_creation_simulated_race(self) -> None: + """ + There is a theoretical race condition, where while a UserPresence + row is being created for a user, a concurrent process creates it first, + right before we execute our INSERT. This conflict is handled with + ON CONFLICT DO NOTHING in the SQL query and an early return + if that happens. + """ + + user_profile = self.example_user("hamlet") + + UserPresence.objects.filter(user_profile=user_profile).delete() + + self.login_user(user_profile) + + def insert_row_and_return_cursor() -> Any: + # This is the function we will inject into connection.cursor + # to simulate the race condition. + # When the underlying code requests a cursor, we will create + # the UserPresence row for the user, before returning a real + # cursor to the caller. This ensures the caller will hit the + # INSERT conflict when it tries to execute its query. + UserPresence.objects.create(user_profile=user_profile, realm=user_profile.realm) + + cursor = connection.cursor() + return cursor + + with mock.patch("zerver.actions.presence.connection") as mock_connection, self.assertLogs( + "zerver.actions.presence", level="INFO" + ) as mock_logs: + # This is a tricky mock. We need to set things up so that connection.cursor() + # in do_update_user_presence runs our custom code when the caller tries to + # enter the context manager. + # We also need to take care to only affect the connection that exists in + # zerver.actions.presence rather than affecting the entire django.db.connection, + # as that would break code higher up in the stack. + mock_connection.cursor.return_value.__enter__.side_effect = insert_row_and_return_cursor + + result = self.client_post("/json/users/me/presence", {"status": "active"}) + + # The request finished gracefully and the situation was logged: + self.assert_json_success(result) + self.assertEqual( + mock_logs.output, + [ + f"INFO:zerver.actions.presence:UserPresence row already created for {user_profile.id}, returning." + ], + ) + def test_last_update_id_logic(self) -> None: slim_presence = True UserPresence.objects.all().delete()