zulip/zerver/transaction_tests/test_user_groups.py

import threading
from typing import Any, List, Optional
from unittest import mock

import orjson
from django.db import OperationalError, connections, transaction
from django.http import HttpRequest

from zerver.actions.user_groups import add_subgroups_to_user_group, check_add_user_group
from zerver.lib.exceptions import JsonableError
from zerver.lib.test_classes import ZulipTransactionTestCase
from zerver.lib.test_helpers import HostRequestMock
from zerver.lib.user_groups import access_user_group_by_id
from zerver.models import Realm, UserGroup, UserProfile, get_realm
from zerver.views.user_groups import update_subgroups_of_user_group

BARRIER: Optional[threading.Barrier] = None


def dev_update_subgroups(
    request: HttpRequest,
    user_profile: UserProfile,
    user_group_id: int,
) -> Optional[str]:
    # The test is expected to set up the barrier before accessing this endpoint.
    assert BARRIER is not None
    try:
        with transaction.atomic(), mock.patch(
            "zerver.lib.user_groups.access_user_group_by_id"
        ) as m:

            def wait_after_recursive_query(*args: Any, **kwargs: Any) -> UserGroup:
                # When updating the subgroups, we access the supergroup group
                # only after finishing the recursive query.
                BARRIER.wait()
                return access_user_group_by_id(*args, **kwargs)

            m.side_effect = wait_after_recursive_query

            update_subgroups_of_user_group(request, user_profile, user_group_id=user_group_id)
    except OperationalError as err:
        msg = str(err)
        if "deadlock detected" in msg:
            return "Deadlock detected"
        else:
            assert "could not obtain lock" in msg
            # This error is possible when nowait is set the True, which only
            # applies to the recursive query on the subgroups. Because the
            # recursive query fails, this thread must have not waited on the
            # barrier yet.
            BARRIER.wait()
            return "Busy lock detected"
    except (
        threading.BrokenBarrierError
    ):  # nocoverage # This is only possible when timeout happens or there is a programming error
        raise JsonableError(
            "Broken barrier. The tester should make sure that the exact number of parties have waited on the barrier set by the previous immediate set_sync_after_first_lock call"
        )

    return None


class UserGroupRaceConditionTestCase(ZulipTransactionTestCase):
    created_user_groups: List[UserGroup] = []
    counter = 0
    CHAIN_LENGTH = 3

    def tearDown(self) -> None:
        # Clean up the user groups created to minimize leakage
        with transaction.atomic():
            for group in self.created_user_groups:
                group.delete()
            transaction.on_commit(lambda: self.created_user_groups.clear())

        super().tearDown()

    def create_user_group_chain(self, realm: Realm) -> List[UserGroup]:
        """Build a user groups forming a chain through group-group memberships
        returning a list where each group is the supergroup of its subsequent group.
        """
        groups = [
            check_add_user_group(realm, f"chain #{self.counter + i}", [], acting_user=None)
            for i in range(self.CHAIN_LENGTH)
        ]
        self.counter += self.CHAIN_LENGTH
        self.created_user_groups.extend(groups)
        prev_group = groups[0]
        for group in groups[1:]:
            add_subgroups_to_user_group(prev_group, [group], acting_user=None)
            prev_group = group
        return groups

    def test_lock_subgroups_with_respect_to_supergroup(self) -> None:
        realm = get_realm("zulip")
        self.login("iago")
        iago = self.example_user("iago")

        class RacingThread(threading.Thread):
            def __init__(
                self,
                subgroup_ids: List[int],
                supergroup_id: int,
            ) -> None:
                threading.Thread.__init__(self)
                self.response: Optional[str] = None
                self.subgroup_ids = subgroup_ids
                self.supergroup_id = supergroup_id

            def run(self) -> None:
                try:
                    self.response = dev_update_subgroups(
                        HostRequestMock({"add": orjson.dumps(self.subgroup_ids).decode()}),
                        iago,
                        user_group_id=self.supergroup_id,
                    )
                finally:
                    # Close all thread-local database connections
                    connections.close_all()

        def assert_thread_success_count(
            t1: RacingThread,
            t2: RacingThread,
            *,
            success_count: int,
            error_messsage: str = "",
        ) -> None:
            help_msg = """We access the test endpoint that wraps around the
real subgroup update endpoint by synchronizing them after the acquisition of the
first lock in the critical region. Though unlikely, this test might fail as we
have no control over the scheduler when the barrier timeouts.
""".strip()
            global BARRIER
            BARRIER = threading.Barrier(parties=2, timeout=3)
            t1.start()
            t2.start()

            succeeded = 0
            for t in [t1, t2]:
                t.join()
                response = t.response
                if response is None:
                    succeeded += 1
                    continue

                self.assertEqual(response, error_messsage)
            # Race condition resolution should only allow one thread to succeed
            self.assertEqual(
                succeeded,
                success_count,
                f"Exactly {success_count} thread(s) should succeed.\n{help_msg}",
            )

        foo_chain = self.create_user_group_chain(realm)
        bar_chain = self.create_user_group_chain(realm)
        # These two threads are conflicting because a cycle would be formed if
        # both of them succeed. There is a deadlock in such circular dependency.
        assert_thread_success_count(
            RacingThread(
                subgroup_ids=[foo_chain[0].id],
                supergroup_id=bar_chain[-1].id,
            ),
            RacingThread(
                subgroup_ids=[bar_chain[-1].id],
                supergroup_id=foo_chain[0].id,
            ),
            success_count=1,
            error_messsage="Deadlock detected",
        )

        foo_chain = self.create_user_group_chain(realm)
        bar_chain = self.create_user_group_chain(realm)
        # These two requests would succeed if they didn't race with each other.
        # However, both threads will attempt to grab a lock on overlapping rows
        # when they first do the recursive query for subgroups. In this case, we
        # expect that one of the threads fails due to nowait=True for the
        # .select_for_update() call.
        assert_thread_success_count(
            RacingThread(
                subgroup_ids=[foo_chain[0].id],
                supergroup_id=bar_chain[-1].id,
            ),
            RacingThread(
                subgroup_ids=[foo_chain[1].id],
                supergroup_id=bar_chain[-1].id,
            ),
            success_count=1,
            error_messsage="Busy lock detected",
        )

        foo_chain = self.create_user_group_chain(realm)
        bar_chain = self.create_user_group_chain(realm)
        baz_chain = self.create_user_group_chain(realm)
        # Adding non-conflicting subgroups should succeed.
        assert_thread_success_count(
            RacingThread(
                subgroup_ids=[foo_chain[1].id, foo_chain[2].id, baz_chain[2].id],
                supergroup_id=baz_chain[0].id,
            ),
            RacingThread(
                subgroup_ids=[bar_chain[1].id, bar_chain[2].id],
                supergroup_id=baz_chain[0].id,
            ),
            success_count=2,
        )