per-request caches: Add per_request_cache library.

We have historically cached two types of values on a per-request basis inside of memory: * linkifiers * display recipients Both of these caches were hand-written, and they both actually cache values that are also in memcached, so the per-request cache essentially only saves us from a few memcached hits. I think the linkifier per-request cache is a necessary evil. It's an important part of message rendering, and it's not super easy to structure the code to just get a single value up front and pass it down the stack. I'm not so sure we even need the display recipient per-request cache any more, as we are generally pretty smart now about hydrating recipient data in terms of how the code is organized. But I haven't done thorough research on that hypotheseis. Fortunately, it's not rocket science to just write a glorified memoize decorator and tie it into key places in the code: * middleware * tests (e.g. asserting db counts) * queue processors That's what I did in this commit. This commit definitely reduces the amount of code to maintain. I think it also gets us closer to possibly phasing out this whole technique, but that effort is beyond the scope of this PR. We could add some instrumentation to the decorator to see how often we get a non-trivial number of saved round trips to memcached. Note that when we flush linkifiers, we just use a big hammer and flush the entire per-request cache for linkifiers, since there is only ever one realm in the cache.
2023-07-14 17:46:50 +00:00 · 2023-07-14 17:46:50 +00:00 · 51db22c86c
parent 751b8b5bb5
commit 51db22c86c
11 changed files with 55 additions and 52 deletions
--- a/docs/subsystems/caching.md
+++ b/docs/subsystems/caching.md
@ -240,10 +240,10 @@ We generally try to avoid in-process backend caching in Zulip's Django
 codebase, because every Zulip production installation involves
 multiple servers. We do have a few, however:

- `per_request_display_recipient_cache`: A cache flushed at the start
-  of every request; this simplifies correctly implementing our goal of
-  not repeatedly fetching the "display recipient" (e.g. stream name)
-  for each message in the `GET /messages` codebase.
+- `@return_same_value_during_entire_request`: We use this decorator to
+  cache values in memory during the lifetime of a request. We use this
+  for linkifiers and display recipients. The middleware knows how to
+  flush the relevant in-memory caches at the start of a request.
 - Caches of various data, like the `SourceMap` object, that are
  expensive to construct, not needed for most requests, and don't
  change once a Zulip server has been deployed in production.
--- a/zerver/lib/per_request_cache.py
+++ b/zerver/lib/per_request_cache.py
@ -0,0 +1,32 @@
+from typing import Any, Callable, Dict, TypeVar
+
+ReturnT = TypeVar("ReturnT")
+
+FUNCTION_NAME_TO_PER_REQUEST_RESULT: Dict[str, Dict[int, Any]] = {}
+
+
+def return_same_value_during_entire_request(f: Callable[..., ReturnT]) -> Callable[..., ReturnT]:
+    cache_key = f.__name__
+
+    assert cache_key not in FUNCTION_NAME_TO_PER_REQUEST_RESULT
+    FUNCTION_NAME_TO_PER_REQUEST_RESULT[cache_key] = {}
+
+    def wrapper(key: int, *args: Any) -> ReturnT:
+        if key in FUNCTION_NAME_TO_PER_REQUEST_RESULT[cache_key]:
+            return FUNCTION_NAME_TO_PER_REQUEST_RESULT[cache_key][key]
+
+        result = f(key, *args)
+        FUNCTION_NAME_TO_PER_REQUEST_RESULT[cache_key][key] = result
+        return result
+
+    return wrapper
+
+
+def flush_per_request_cache(cache_key: str) -> None:
+    if cache_key in FUNCTION_NAME_TO_PER_REQUEST_RESULT:
+        FUNCTION_NAME_TO_PER_REQUEST_RESULT[cache_key] = {}
+
+
+def flush_per_request_caches() -> None:
+    for cache_key in FUNCTION_NAME_TO_PER_REQUEST_RESULT:
+        FUNCTION_NAME_TO_PER_REQUEST_RESULT[cache_key] = {}
--- a/zerver/lib/test_classes.py
+++ b/zerver/lib/test_classes.py
@ -55,6 +55,7 @@ from zerver.lib.cache import bounce_key_prefix_for_testing
 from zerver.lib.initial_password import initial_password
 from zerver.lib.message import access_message
 from zerver.lib.notification_data import UserMessageNotificationsData
+from zerver.lib.per_request_cache import flush_per_request_caches
 from zerver.lib.rate_limiter import bounce_redis_key_prefix_for_testing
 from zerver.lib.sessions import get_session_dict_user
 from zerver.lib.soft_deactivation import do_soft_deactivate_users
@ -99,7 +100,6 @@ from zerver.models import (
    UserProfile,
    UserStatus,
    clear_supported_auth_backends_cache,
-    flush_per_request_caches,
    get_realm,
    get_realm_stream,
    get_stream,
--- a/zerver/lib/test_helpers.py
+++ b/zerver/lib/test_helpers.py
@ -46,6 +46,7 @@ from zerver.lib.avatar import avatar_url
 from zerver.lib.cache import get_cache_backend
 from zerver.lib.db import Params, ParamsT, Query, TimeTrackingCursor
 from zerver.lib.integrations import WEBHOOK_INTEGRATIONS
+from zerver.lib.per_request_cache import flush_per_request_caches
 from zerver.lib.rate_limiter import RateLimitedIPAddr, rules
 from zerver.lib.request import RequestNotes
 from zerver.lib.upload.s3 import S3UploadBackend
@ -56,7 +57,6 @@ from zerver.models import (
    Subscription,
    UserMessage,
    UserProfile,
-    flush_per_request_caches,
    get_client,
    get_realm,
    get_stream,
--- a/zerver/middleware.py
+++ b/zerver/middleware.py
@ -32,6 +32,7 @@ from zerver.lib.debug import maybe_tracemalloc_listen
 from zerver.lib.exceptions import ErrorCode, JsonableError, MissingAuthenticationError
 from zerver.lib.html_to_text import get_content_description
 from zerver.lib.markdown import get_markdown_requests, get_markdown_time
+from zerver.lib.per_request_cache import flush_per_request_caches
 from zerver.lib.rate_limiter import RateLimitResult
 from zerver.lib.request import REQ, RequestNotes, has_request_variables
 from zerver.lib.response import (
@ -42,7 +43,7 @@ from zerver.lib.response import (
 )
 from zerver.lib.subdomains import get_subdomain
 from zerver.lib.user_agent import parse_user_agent
-from zerver.models import Realm, flush_per_request_caches, get_realm
+from zerver.models import Realm, get_realm

 ParamT = ParamSpec("ParamT")
 logger = logging.getLogger("zulip.requests")
--- a/zerver/models.py
+++ b/zerver/models.py
@ -4,7 +4,6 @@ import hashlib
 import secrets
 import time
 from collections import defaultdict
-from contextlib import suppress
 from datetime import timedelta
 from email.headerregistry import Address
 from typing import (
@ -83,6 +82,10 @@ from zerver.lib.cache import (
    user_profile_cache_key,
 )
 from zerver.lib.exceptions import JsonableError, RateLimitedError
+from zerver.lib.per_request_cache import (
+    flush_per_request_cache,
+    return_same_value_during_entire_request,
+)
 from zerver.lib.pysa import mark_sanitized
 from zerver.lib.timestamp import datetime_to_timestamp
 from zerver.lib.types import (
@ -183,17 +186,7 @@ def query_for_ids(
    return query


-# Doing 1000 remote cache requests to get_display_recipient is quite slow,
-# so add a local cache as well as the remote cache.
-#
-# This local cache has a lifetime of just a single request; it is
-# cleared inside `flush_per_request_caches` in our middleware.  It
-# could be replaced with smarter bulk-fetching logic that deduplicates
-# queries for the same recipient; this is just a convenient way to
-# write that code.
-per_request_display_recipient_cache: Dict[int, List[UserDisplayRecipient]] = {}
-
-
+@return_same_value_during_entire_request
 def get_display_recipient_by_id(
    recipient_id: int, recipient_type: int, recipient_type_id: Optional[int]
 ) -> List[UserDisplayRecipient]:
@ -205,10 +198,7 @@ def get_display_recipient_by_id(
    # Have to import here, to avoid circular dependency.
    from zerver.lib.display_recipient import get_display_recipient_remote_cache

-    if recipient_id not in per_request_display_recipient_cache:
-        result = get_display_recipient_remote_cache(recipient_id, recipient_type, recipient_type_id)
-        per_request_display_recipient_cache[recipient_id] = result
-    return per_request_display_recipient_cache[recipient_id]
+    return get_display_recipient_remote_cache(recipient_id, recipient_type, recipient_type_id)


 def get_display_recipient(recipient: "Recipient") -> List[UserDisplayRecipient]:
@ -1310,22 +1300,9 @@ def get_linkifiers_cache_key(realm_id: int) -> str:
    return f"{cache.KEY_PREFIX}:all_linkifiers_for_realm:{realm_id}"


-# We have a per-process cache to avoid doing 1000 remote cache queries during page load
-per_request_linkifiers_cache: Dict[int, List[LinkifierDict]] = {}
-
-
-def realm_in_local_linkifiers_cache(realm_id: int) -> bool:
-    return realm_id in per_request_linkifiers_cache
-
-
-def linkifiers_for_realm(realm_id: int) -> List[LinkifierDict]:
-    if not realm_in_local_linkifiers_cache(realm_id):
-        per_request_linkifiers_cache[realm_id] = linkifiers_for_realm_remote_cache(realm_id)
-    return per_request_linkifiers_cache[realm_id]
-
-
+@return_same_value_during_entire_request
@cache_with_key(get_linkifiers_cache_key, timeout=3600 * 24 * 7)
-def linkifiers_for_realm_remote_cache(realm_id: int) -> List[LinkifierDict]:
+def linkifiers_for_realm(realm_id: int) -> List[LinkifierDict]:
    return [
        LinkifierDict(
            pattern=linkifier.pattern,
@ -1339,21 +1316,13 @@ def linkifiers_for_realm_remote_cache(realm_id: int) -> List[LinkifierDict]:
 def flush_linkifiers(*, instance: RealmFilter, **kwargs: object) -> None:
    realm_id = instance.realm_id
    cache_delete(get_linkifiers_cache_key(realm_id))
-    with suppress(KeyError):
-        per_request_linkifiers_cache.pop(realm_id)
+    flush_per_request_cache("linkifiers_for_realm")


 post_save.connect(flush_linkifiers, sender=RealmFilter)
 post_delete.connect(flush_linkifiers, sender=RealmFilter)


-def flush_per_request_caches() -> None:
-    global per_request_display_recipient_cache
-    per_request_display_recipient_cache = {}
-    global per_request_linkifiers_cache
-    per_request_linkifiers_cache = {}
-
-
 class RealmPlayground(models.Model):
    """Server side storage model to store playground information needed by our
    'view code in playground' feature in code blocks.
--- a/zerver/tests/test_markdown.py
+++ b/zerver/tests/test_markdown.py
@ -54,6 +54,7 @@ from zerver.lib.mention import (
    topic_wildcards,
 )
 from zerver.lib.message import render_markdown
+from zerver.lib.per_request_cache import flush_per_request_caches
 from zerver.lib.test_classes import ZulipTestCase
 from zerver.lib.tex import render_tex
 from zerver.models import (
@ -63,7 +64,6 @@ from zerver.models import (
    UserGroup,
    UserMessage,
    UserProfile,
-    flush_per_request_caches,
    get_client,
    get_realm,
    get_stream,
--- a/zerver/tests/test_message_dict.py
+++ b/zerver/tests/test_message_dict.py
@ -6,6 +6,7 @@ from django.utils.timezone import now as timezone_now
 from zerver.lib.cache import cache_delete, to_dict_cache_key_id
 from zerver.lib.markdown import version as markdown_version
 from zerver.lib.message import MessageDict, messages_for_ids, sew_messages_and_reactions
+from zerver.lib.per_request_cache import flush_per_request_caches
 from zerver.lib.test_classes import ZulipTestCase
 from zerver.lib.test_helpers import make_client
 from zerver.lib.topic import TOPIC_LINKS
@ -18,7 +19,6 @@ from zerver.models import (
    Recipient,
    Stream,
    UserProfile,
-    flush_per_request_caches,
    get_display_recipient,
    get_realm,
    get_stream,
--- a/zerver/tests/test_message_send.py
+++ b/zerver/tests/test_message_send.py
@ -33,6 +33,7 @@ from zerver.actions.users import do_change_can_forge_sender, do_deactivate_user
 from zerver.lib.addressee import Addressee
 from zerver.lib.exceptions import JsonableError
 from zerver.lib.message import MessageDict, get_raw_unread_data, get_recent_private_conversations
+from zerver.lib.per_request_cache import flush_per_request_caches
 from zerver.lib.streams import create_stream_if_needed
 from zerver.lib.test_classes import ZulipTestCase
 from zerver.lib.test_helpers import (
@ -55,7 +56,6 @@ from zerver.models import (
    UserGroup,
    UserMessage,
    UserProfile,
-    flush_per_request_caches,
    get_or_create_huddle,
    get_realm,
    get_stream,
--- a/zerver/views/development/cache.py
+++ b/zerver/views/development/cache.py
@ -5,8 +5,9 @@ from django.views.decorators.csrf import csrf_exempt

 from zerver.decorator import require_post
 from zerver.lib.cache import get_cache_backend
+from zerver.lib.per_request_cache import flush_per_request_caches
 from zerver.lib.response import json_success
-from zerver.models import clear_client_cache, flush_per_request_caches
+from zerver.models import clear_client_cache

 ZULIP_PATH = os.path.join(os.path.dirname(os.path.abspath(__file__)), "../../../")

--- a/zerver/worker/queue_processors.py
+++ b/zerver/worker/queue_processors.py
@ -68,6 +68,7 @@ from zerver.lib.email_notifications import MissedMessageData, handle_missedmessa
 from zerver.lib.exceptions import RateLimitedError
 from zerver.lib.export import export_realm_wrapper
 from zerver.lib.outgoing_webhook import do_rest_call, get_outgoing_webhook_service_handler
+from zerver.lib.per_request_cache import flush_per_request_caches
 from zerver.lib.push_notifications import (
    clear_push_device_tokens,
    handle_push_notification,
@ -99,7 +100,6 @@ from zerver.models import (
    UserMessage,
    UserProfile,
    filter_to_valid_prereg_users,
-    flush_per_request_caches,
    get_bot_services,
    get_client,
    get_system_bot,