zulip/zerver/lib/cache.py

# See https://zulip.readthedocs.io/en/latest/subsystems/caching.html for docs
from functools import wraps

from django.utils.lru_cache import lru_cache
from django.core.cache import cache as djcache
from django.core.cache import caches
from django.conf import settings
from django.db.models import Q
from django.core.cache.backends.base import BaseCache
from django.http import HttpRequest
from django import template

from typing import cast, Any, Callable, Dict, Iterable, List, Optional, Union, Set, TypeVar, Tuple

from zerver.lib.utils import statsd, statsd_key, make_safe_digest
import time
import base64
import random
import sys
import os
import hashlib

if False:
    # These modules have to be imported for type annotations but
    # they cannot be imported at runtime due to cyclic dependency.
    from zerver.models import UserProfile, Realm, Message

# Generic return type: lets the decorators below declare that the wrapped
# callable returns the same type as the function being decorated.
ReturnT = TypeVar('ReturnT')  # Useful for matching return types via Callable[..., ReturnT]

class NotFoundInCache(Exception):
    """Raised when a lookup finds no entry for the requested cache key."""


# Module-level accumulators for timing remote cache operations.  They are
# written by remote_cache_stats_start() and remote_cache_stats_finish().
remote_cache_time_start = 0.0  # time.time() value recorded by the latest stats_start call
remote_cache_total_time = 0.0  # running sum of elapsed seconds over finished operations
remote_cache_total_requests = 0  # number of operations finished so far

def get_remote_cache_time() -> float:
    """Return the accumulated time, in seconds, spent in remote cache operations."""
    total_seconds = remote_cache_total_time
    return total_seconds

def get_remote_cache_requests() -> int:
    """Return how many remote cache operations have been counted so far."""
    request_count = remote_cache_total_requests
    return request_count

def remote_cache_stats_start() -> None:
    """Record the current time as the start of a remote cache operation."""
    global remote_cache_time_start
    started_at = time.time()
    remote_cache_time_start = started_at

def remote_cache_stats_finish() -> None:
    """Count one finished remote cache operation and add its elapsed time.

    The elapsed time is measured from the timestamp last stored by
    remote_cache_stats_start().
    """
    global remote_cache_total_requests, remote_cache_total_time
    remote_cache_total_requests = remote_cache_total_requests + 1
    elapsed = time.time() - remote_cache_time_start
    remote_cache_total_time = remote_cache_total_time + elapsed

def get_or_create_key_prefix() -> str:
    """Return the prefix to prepend to every remote cache key.

    Test configurations get a fixed prefix.  Otherwise the prefix is kept
    in the file `var/remote_cache_prefix` under settings.DEPLOY_ROOT: the
    first caller creates the file with a random prefix, later callers read
    it back.  Exits the process if no valid prefix can be read.
    """
    if settings.CASPER_TESTS:
        # This sets the prefix for the benefit of the Casper tests.
        #
        # Having a fixed key is OK since we don't support running
        # multiple copies of the casper tests at the same time anyway.
        return 'casper_tests:'
    if settings.TEST_SUITE:
        # The Python tests overwrite KEY_PREFIX on each test, but use
        # this codepath as well, just to save running the more complex
        # code below for reading the normal key prefix.
        return 'django_tests_unused:'

    var_dir = os.path.join(settings.DEPLOY_ROOT, "var")
    # directory `var` should exist in production
    os.makedirs(var_dir, exist_ok=True)
    filename = os.path.join(var_dir, "remote_cache_prefix")

    try:
        # O_EXCL makes creation fail if the file is already there, so only
        # one caller ever writes the prefix.
        fd = os.open(filename, os.O_CREAT | os.O_EXCL | os.O_RDWR, 0o444)
        seed = str(random.getrandbits(256)).encode('utf-8')
        digest = hashlib.sha256(seed).digest()
        prefix = base64.b16encode(digest)[:32].decode('utf-8').lower() + ':'
        # Leaving the `with` block closes the underlying descriptor too.
        with os.fdopen(fd, 'w') as f:
            f.write(prefix + "\n")
    except OSError:
        # The file already exists.  Its writer may not have finished yet,
        # so retry until a full-length line (32 hex chars plus ':') shows up.
        prefix = ''
        for _attempt in range(9):
            with open(filename, 'r') as f:
                candidate = f.readline()[:-1]
            if len(candidate) == 33:
                prefix = candidate
                break
            time.sleep(0.5)

    if not prefix:
        print("Could not read remote cache key prefix file")
        sys.exit(1)

    return prefix

# Prefix prepended to every key by the cache_* helpers in this module.
# Computed once at import time; bounce_key_prefix_for_testing() replaces it.
KEY_PREFIX = get_or_create_key_prefix()  # type: str

def bounce_key_prefix_for_testing(test_name: str) -> None:
    """Replace KEY_PREFIX with a digest derived from `test_name` and this process id."""
    global KEY_PREFIX
    raw_prefix = ':'.join([test_name, str(os.getpid()), ''])
    # Store the SHA-1 hex digest rather than the raw string to keep keys
    # short: memcached limits how long a key may be.
    KEY_PREFIX = hashlib.sha1(raw_prefix.encode('utf-8')).hexdigest()

def get_cache_backend(cache_name: Optional[str]) -> BaseCache:
    """Return the Django cache named `cache_name`, or the default cache for None."""
    return djcache if cache_name is None else caches[cache_name]

def get_cache_with_key(
        keyfunc: Callable[..., str],
        cache_name: Optional[str]=None
) -> Callable[[Callable[..., ReturnT]], Callable[..., ReturnT]]:
    """Decorator which only reads from the cache; the decorated function is never called.

    The cache key is computed by `keyfunc` from the call's arguments, in
    the same way as for `cache_with_key`.  Since a cached value may itself
    be None, a missing key is reported by raising NotFoundInCache rather
    than through the return value.
    """
    def decorator(func: Callable[..., ReturnT]) -> Callable[..., ReturnT]:
        @wraps(func)
        def func_with_caching(*args: Any, **kwargs: Any) -> ReturnT:
            key = keyfunc(*args, **kwargs)
            val = cache_get(key, cache_name=cache_name)
            # A hit is indexed with [0] to unwrap the stored value; None
            # from cache_get is treated as "no entry for this key".
            if val is not None:
                return val[0]
            raise NotFoundInCache()

        return func_with_caching

    return decorator

def cache_with_key(
        keyfunc: Callable[..., str], cache_name: Optional[str]=None,
        timeout: Optional[int]=None, with_statsd_key: Optional[str]=None
) -> Callable[[Callable[..., ReturnT]], Callable[..., ReturnT]]:
    """Decorator which applies Django caching to a function.

       `keyfunc` receives the decorated function's arguments and must
       return the cache key.  You are responsible for avoiding
       collisions with other uses of this decorator or other uses of
       caching.

       A hit/miss counter is reported to statsd on every call, under
       `with_statsd_key` if given, else under statsd_key(key)."""

    def decorator(func: Callable[..., ReturnT]) -> Callable[..., ReturnT]:
        @wraps(func)
        def func_with_caching(*args: Any, **kwargs: Any) -> ReturnT:
            key = keyfunc(*args, **kwargs)
            cached = cache_get(key, cache_name=cache_name)

            extra = ".dbcache" if cache_name == 'database' else ""
            metric_key = statsd_key(key) if with_statsd_key is None else with_statsd_key
            status = "miss" if cached is None else "hit"
            statsd.incr("cache%s.%s.%s" % (extra, metric_key, status))

            if cached is None:
                # Nothing cached: compute the value and store it.
                result = func(*args, **kwargs)
                cache_set(key, result, cache_name=cache_name, timeout=timeout)
                return result

            # Values are singleton tuples so that we can distinguish
            # a result of None from a missing key.
            return cached[0]

        return func_with_caching

    return decorator

def cache_set(key: str, val: Any, cache_name: Optional[str]=None, timeout: Optional[int]=None) -> None:
    """Store `val` under the prefixed `key`, wrapped in a 1-tuple, and record timing stats."""
    remote_cache_stats_start()
    get_cache_backend(cache_name).set(KEY_PREFIX + key, (val,), timeout=timeout)
    remote_cache_stats_finish()

def cache_get(key: str, cache_name: Optional[str]=None) -> Any:
    """Return what the backend holds for the prefixed `key`, recording timing stats."""
    remote_cache_stats_start()
    fetched = get_cache_backend(cache_name).get(KEY_PREFIX + key)
    remote_cache_stats_finish()
    return fetched

def cache_get_many(keys: List[str], cache_name: Optional[str]=None) -> Dict[str, Any]:
    """Fetch several keys in one backend call.

    KEY_PREFIX is added before the lookup and stripped from the keys of
    the returned dict, so callers only ever see unprefixed keys.
    """
    prefixed_keys = [KEY_PREFIX + key for key in keys]
    remote_cache_stats_start()
    backend = get_cache_backend(cache_name)
    found = backend.get_many(prefixed_keys)
    remote_cache_stats_finish()
    prefix_len = len(KEY_PREFIX)
    return {prefixed[prefix_len:]: value for prefixed, value in found.items()}

def cache_set_many(items: Dict[str, Any], cache_name: Optional[str]=None,
                   timeout: Optional[int]=None) -> None:
    """Store every key/value pair of `items` in one backend call.

    Keys get KEY_PREFIX prepended; values are passed through untouched,
    and the caller's dict is not modified.
    """
    prefixed_items = {KEY_PREFIX + key: value for key, value in items.items()}
    remote_cache_stats_start()
    backend = get_cache_backend(cache_name)
    backend.set_many(prefixed_items, timeout=timeout)
    remote_cache_stats_finish()

def cache_delete(key: str, cache_name: Optional[str]=None) -> None:
    """Delete the prefixed `key` from the selected cache, recording timing stats."""
    remote_cache_stats_start()
    backend = get_cache_backend(cache_name)
    backend.delete(KEY_PREFIX + key)
    remote_cache_stats_finish()

def cache_delete_many(items: Iterable[str], cache_name: Optional[str]=None) -> None:
    """Delete all of the given keys (after prefixing) in one backend call."""
    remote_cache_stats_start()
    backend = get_cache_backend(cache_name)
    # The prefixed keys are produced lazily as the backend consumes them.
    backend.delete_many(KEY_PREFIX + key for key in items)
    remote_cache_stats_finish()

# Type variables used by generic_bulk_cached_fetch and its helpers
ObjKT = TypeVar('ObjKT')  # type of an object id (the keys of the returned dict)
ItemT = TypeVar('ItemT')  # type of an item as handed back to the caller
CompressedItemT = TypeVar('CompressedItemT')  # type of an item in the form stored in the cache

def default_extractor(obj: CompressedItemT) -> ItemT:
    """Default `extractor`: return the cached value unchanged."""
    item = obj
    return item  # type: ignore # Need a type assert that ItemT=CompressedItemT

def default_setter(obj: ItemT) -> CompressedItemT:
    """Default `setter`: return the item unchanged for storing in the cache."""
    stored = obj
    return stored  # type: ignore # Need a type assert that ItemT=CompressedItemT

def default_id_fetcher(obj: ItemT) -> ObjKT:
    """Default `id_fetcher`: use the object's `id` attribute as its id."""
    object_id = obj.id  # type: ignore # Need ItemT/CompressedItemT typevars to be a Django protocol
    return object_id

def default_cache_transformer(obj: ItemT) -> ItemT:
    """Default `cache_transformer`: cache the object itself."""
    unchanged = obj
    return unchanged

# Required Arguments are as follows:
# * object_ids: The list of object ids to look up
# * cache_key_function: object_id => cache key
# * query_function: [object_ids] => [objects from database]
# Optional keyword arguments:
# * setter: Function to call before storing items to cache (e.g. compression)
# * extractor: Function to call on items returned from cache
#   (e.g. decompression).  Should be the inverse of the setter
#   function.
# * id_fetcher: Function mapping an object from database => object_id
#   (in case we're using a key more complex than obj.id)
# * cache_transformer: Function mapping an object from database =>
#   value for cache (in case the values that we're caching are some
#   function of the objects, not the objects themselves)
def generic_bulk_cached_fetch(
        cache_key_function: Callable[[ObjKT], str],
        query_function: Callable[[List[ObjKT]], Iterable[Any]],
        object_ids: Iterable[ObjKT],
        extractor: Callable[[CompressedItemT], ItemT] = default_extractor,
        setter: Callable[[ItemT], CompressedItemT] = default_setter,
        id_fetcher: Callable[[ItemT], ObjKT] = default_id_fetcher,
        cache_transformer: Callable[[ItemT], ItemT] = default_cache_transformer
) -> Dict[ObjKT, ItemT]:
    """Fetch many objects by id, reading from the cache where possible.

    Ids with no cache entry are passed to `query_function`; each object
    it returns is run through `cache_transformer`, written to the cache
    (after `setter`) and included in the result.  Ids that are found
    neither in the cache nor by `query_function` are absent from the
    returned dict, which maps object id => item.
    """
    # object_ids is walked several times below.  Materialize it once so
    # that one-shot iterables such as generators are handled correctly
    # instead of appearing empty on the second pass.
    object_id_list = list(object_ids)

    cache_keys = {}  # type: Dict[ObjKT, str]
    for object_id in object_id_list:
        cache_keys[object_id] = cache_key_function(object_id)
    cached_objects_compressed = cache_get_many([cache_keys[object_id]
                                                for object_id in object_id_list])  # type: Dict[str, Tuple[CompressedItemT]]
    cached_objects = {}  # type: Dict[str, ItemT]
    for (key, val) in cached_objects_compressed.items():
        # Cached values are 1-tuples; unwrap before extracting.
        cached_objects[key] = extractor(val[0])
    needed_ids = [object_id for object_id in object_id_list if
                  cache_keys[object_id] not in cached_objects]
    db_objects = query_function(needed_ids)

    items_for_remote_cache = {}  # type: Dict[str, Tuple[CompressedItemT]]
    for obj in db_objects:
        key = cache_keys[id_fetcher(obj)]
        item = cache_transformer(obj)
        items_for_remote_cache[key] = (setter(item),)
        cached_objects[key] = item
    if items_for_remote_cache:
        cache_set_many(items_for_remote_cache)
    return {object_id: cached_objects[cache_keys[object_id]]
            for object_id in object_id_list
            if cache_keys[object_id] in cached_objects}

def cache(func: Callable[..., ReturnT]) -> Callable[..., ReturnT]:
    """Decorator which applies Django caching to a function.

       The cache key is built from the function's source filename, its
       name, and the repr() of the arguments it is called with."""

    func_uniqifier = '-'.join([func.__code__.co_filename, func.__name__])

    @wraps(func)
    def keyfunc(*args: Any, **kwargs: Any) -> str:
        raw_key = func_uniqifier + repr((args, kwargs))
        # Django complains about spaces because memcached rejects them
        escaped_dashes = raw_key.replace('-', '--')
        return escaped_dashes.replace(' ', '-s')

    caching_decorator = cache_with_key(keyfunc)
    return caching_decorator(func)

def preview_url_cache_key(url: str) -> str:
    """Build the 'preview_url:<digest>' cache key from the digest of `url`."""
    url_digest = make_safe_digest(url)
    return "preview_url:%s" % (url_digest,)

def display_recipient_cache_key(recipient_id: int) -> str:
    """Build the 'display_recipient_dict:<recipient_id>' cache key."""
    key_template = "display_recipient_dict:%d"
    return key_template % (recipient_id,)

def user_profile_by_email_cache_key(email: str) -> str:
    """Build the 'user_profile_by_email:<digest>' key from the stripped email."""
    # See the comment in zerver/lib/avatar_hash.py:gravatar_hash for why we
    # are proactively encoding email addresses even though they will
    # with high likelihood be ASCII-only for the foreseeable future.
    email_digest = make_safe_digest(email.strip())
    return 'user_profile_by_email:%s' % (email_digest,)

def user_profile_cache_key_id(email: str, realm_id: int) -> str:
    """Build the 'user_profile:<email digest>:<realm_id>' cache key."""
    email_digest = make_safe_digest(email.strip())
    return u"user_profile:%s:%s" % (email_digest, realm_id)

def user_profile_cache_key(email: str, realm: 'Realm') -> str:
    """Same key as user_profile_cache_key_id, taking the realm object instead of its id."""
    realm_id = realm.id
    return user_profile_cache_key_id(email, realm_id)

def bot_profile_cache_key(email: str) -> str:
    """Build the 'bot_profile:<digest>' key from the stripped email."""
    email_digest = make_safe_digest(email.strip())
    return "bot_profile:%s" % (email_digest,)

def user_profile_by_id_cache_key(user_profile_id: int) -> str:
    """Build the 'user_profile_by_id:<id>' cache key."""
    key_template = "user_profile_by_id:%s"
    return key_template % (user_profile_id,)

def user_profile_by_api_key_cache_key(api_key: str) -> str:
    """Build the 'user_profile_by_api_key:<api_key>' cache key."""
    key_template = "user_profile_by_api_key:%s"
    return key_template % (api_key,)

# UserProfile fields whose change makes flush_user_profile drop the
# realm_user_dicts cache entry for the user's realm.
realm_user_dict_fields = [
    'id',
    'full_name',
    'short_name',
    'email',
    'avatar_source',
    'avatar_version',
    'is_active',
    'is_realm_admin',
    'is_bot',
    'realm_id',
    'timezone',
    'date_joined',
    'is_guest',
]  # type: List[str]

def realm_user_dicts_cache_key(realm_id: int) -> str:
    """Build the 'realm_user_dicts:<realm_id>' cache key."""
    key_template = "realm_user_dicts:%s"
    return key_template % (realm_id,)

def active_user_ids_cache_key(realm_id: int) -> str:
    """Build the 'active_user_ids:<realm_id>' cache key."""
    key_template = "active_user_ids:%s"
    return key_template % (realm_id,)

def active_non_guest_user_ids_cache_key(realm_id: int) -> str:
    """Build the 'active_non_guest_user_ids:<realm_id>' cache key."""
    key_template = "active_non_guest_user_ids:%s"
    return key_template % (realm_id,)

# Fields whose change on a bot's UserProfile makes flush_user_profile
# drop the bot_dicts_in_realm cache entry for the bot's realm.
bot_dict_fields = [
    'id',
    'full_name',
    'short_name',
    'bot_type',
    'email',
    'is_active',
    'default_sending_stream__name',
    'realm_id',
    'default_events_register_stream__name',
    'default_all_public_streams',
    'api_key',
    'bot_owner__email',
    'avatar_source',
    'avatar_version',
]  # type: List[str]

def bot_dicts_in_realm_cache_key(realm: 'Realm') -> str:
    """Build the 'bot_dicts_in_realm:<realm.id>' cache key."""
    realm_id = realm.id
    return "bot_dicts_in_realm:%s" % (realm_id,)

def get_stream_cache_key(stream_name: str, realm_id: int) -> str:
    """Build the 'stream_by_realm_and_name:<realm_id>:<digest>' cache key.

    The stream name is stripped and lowercased before being digested.
    """
    normalized_name = stream_name.strip().lower()
    name_digest = make_safe_digest(normalized_name)
    return "stream_by_realm_and_name:%s:%s" % (realm_id, name_digest)

def delete_user_profile_caches(user_profiles: Iterable['UserProfile']) -> None:
    """Delete the per-user cache entries of each given profile.

    For every profile this covers the by-email, by-id, by-API-key (one
    per key from get_all_api_keys) and email+realm entries; all keys are
    removed with a single cache_delete_many call.
    """
    # Imported here to avoid cyclic dependency.
    from zerver.lib.users import get_all_api_keys

    keys = []
    for profile in user_profiles:
        keys.append(user_profile_by_email_cache_key(profile.delivery_email))
        keys.append(user_profile_by_id_cache_key(profile.id))
        keys.extend(user_profile_by_api_key_cache_key(api_key)
                    for api_key in get_all_api_keys(profile))
        keys.append(user_profile_cache_key(profile.email, profile.realm))

    cache_delete_many(keys)

def delete_display_recipient_cache(user_profile: 'UserProfile') -> None:
    """Delete the display-recipient cache entry of every recipient the user has a Subscription to."""
    from zerver.models import Subscription  # We need to import here to avoid cyclic dependency.
    subscriptions = Subscription.objects.filter(user_profile=user_profile)
    keys = [display_recipient_cache_key(recipient_id)
            for recipient_id in subscriptions.values_list('recipient_id', flat=True)]
    cache_delete_many(keys)

# Called by models.py to flush the user_profile cache whenever we save
# a user_profile object
def flush_user_profile(sender: Any, **kwargs: Any) -> None:
    """Drop the cache entries that depend on the saved user profile.

    The profile is taken from kwargs['instance'].  kwargs['update_fields'],
    when present and not None, limits which realm-level caches are dropped
    to those whose underlying fields were saved.
    """
    user_profile = kwargs['instance']
    delete_user_profile_caches([user_profile])

    def changed(fields: List[str]) -> bool:
        saved_fields = kwargs.get('update_fields')
        if saved_fields is None:
            # adds/deletes should invalidate the cache
            return True

        saved_field_set = set(saved_fields)
        return any(field in saved_field_set for field in fields)

    # Invalidate our active_users_in_realm info dict if any user has changed
    # the fields in the dict or become (in)active
    if changed(realm_user_dict_fields):
        cache_delete(realm_user_dicts_cache_key(user_profile.realm_id))

    if changed(['is_active']):
        cache_delete(active_user_ids_cache_key(user_profile.realm_id))
        cache_delete(active_non_guest_user_ids_cache_key(user_profile.realm_id))

    if changed(['is_guest']):
        cache_delete(active_non_guest_user_ids_cache_key(user_profile.realm_id))

    if changed(['email', 'full_name', 'short_name', 'id', 'is_mirror_dummy']):
        delete_display_recipient_cache(user_profile)

    # Invalidate our bots_in_realm info dict if any bot has
    # changed the fields in the dict or become (in)active
    if user_profile.is_bot and changed(bot_dict_fields):
        cache_delete(bot_dicts_in_realm_cache_key(user_profile.realm))

    # Invalidate realm-wide alert words cache if any user in the realm has changed
    # alert words
    if changed(['alert_words']):
        cache_delete(realm_alert_words_cache_key(user_profile.realm))

# Called by models.py to flush various caches whenever we save
# a Realm object.  The main tricky thing here is that Realm info is
# generally cached indirectly through user_profile objects.
def flush_realm(sender: Any, **kwargs: Any) -> None:
    """Drop the cache entries that depend on the saved realm.

    The per-user caches of the realm's active users are always deleted.
    The realm-wide caches are deleted only when the realm is deactivated
    or when `string_id` is among kwargs['update_fields'].
    """
    realm = kwargs['instance']
    users = realm.get_active_users()
    delete_user_profile_caches(users)

    # Read update_fields with .get(), as flush_user_profile and flush_stream
    # do, so a call that omits the argument is treated like
    # update_fields=None instead of raising KeyError.
    update_fields = kwargs.get('update_fields')
    if realm.deactivated or (update_fields is not None and
                             "string_id" in update_fields):
        cache_delete(realm_user_dicts_cache_key(realm.id))
        cache_delete(active_user_ids_cache_key(realm.id))
        cache_delete(bot_dicts_in_realm_cache_key(realm))
        cache_delete(realm_alert_words_cache_key(realm))
        cache_delete(active_non_guest_user_ids_cache_key(realm.id))

def realm_alert_words_cache_key(realm: 'Realm') -> str:
    """Build the 'realm_alert_words:<realm.string_id>' cache key."""
    string_id = realm.string_id
    return "realm_alert_words:%s" % (string_id,)

# Called by models.py to flush the stream cache whenever we save a stream
# object.
def flush_stream(sender: Any, **kwargs: Any) -> None:
    """Re-cache the saved stream and, where needed, drop its realm's bot dicts cache.

    The stream from kwargs['instance'] is written to the cache under its
    realm+name key.  The realm's bot_dicts_in_realm entry is deleted when
    update_fields is missing/None, or when 'name' was saved and some
    UserProfile has this stream as its default sending or default
    events-register stream.
    """
    from zerver.models import UserProfile
    stream = kwargs['instance']
    stream_key = get_stream_cache_key(stream.name, stream.realm_id)
    cache_set_many({stream_key: (stream,)})

    update_fields = kwargs.get('update_fields')
    # `and` binds tighter than `or`; the parentheses spell that out.  The
    # database query only runs when update_fields is given and has 'name'.
    if update_fields is None or (
            'name' in update_fields and
            UserProfile.objects.filter(
                Q(default_sending_stream=stream) |
                Q(default_events_register_stream=stream)).exists()):
        cache_delete(bot_dicts_in_realm_cache_key(stream.realm))

def to_dict_cache_key_id(message_id: int) -> str:
    """Build the 'message_dict:<message_id>' cache key."""
    key_template = 'message_dict:%d'
    return key_template % (message_id,)

def to_dict_cache_key(message: 'Message') -> str:
    """Same key as to_dict_cache_key_id, taking the message object instead of its id."""
    message_id = message.id
    return to_dict_cache_key_id(message_id)

def open_graph_description_cache_key(content: Any, request: HttpRequest) -> str:
    """Build the 'open_graph_description_path:<digest>' key from the request path.

    Only request.META['PATH_INFO'] is used; `content` does not affect the key.
    """
    path_digest = make_safe_digest(request.META['PATH_INFO'])
    return 'open_graph_description_path:%s' % (path_digest,)

def flush_message(sender: Any, **kwargs: Any) -> None:
    """Delete the cached message dict of the message in kwargs['instance']."""
    message_id = kwargs['instance'].id
    cache_delete(to_dict_cache_key_id(message_id))

def flush_submessage(sender: Any, **kwargs: Any) -> None:
    """Delete the cached message dict of the submessage's parent message."""
    # submessages are not cached directly, they are part of their
    # parent messages
    parent_message_id = kwargs['instance'].message_id
    cache_delete(to_dict_cache_key_id(parent_message_id))

# Type alias for a decorator: a callable that takes a function and returns a function.
DECORATOR = Callable[[Callable[..., Any]], Callable[..., Any]]

def ignore_unhashable_lru_cache(maxsize: int=128, typed: bool=False) -> DECORATOR:
    """
    This is a wrapper over lru_cache function. It adds following features on
    top of lru_cache:

        * It will not cache result of functions with unhashable arguments.
        * It will clear cache whenever zerver.lib.cache.KEY_PREFIX changes.

    The returned wrapper exposes the underlying `cache_info` and
    `cache_clear` functions as attributes.
    """
    internal_decorator = lru_cache(maxsize=maxsize, typed=typed)

    def decorator(user_function: Callable[..., Any]) -> Callable[..., Any]:
        if settings.DEVELOPMENT and not settings.TEST_SUITE:  # nocoverage
            # In the development environment, we want every file
            # change to refresh the source files from disk.
            return user_function
        cached_function = internal_decorator(user_function)

        def wrapper(*args: Any, **kwargs: Any) -> Any:
            seen_prefix = getattr(cached_function, 'key_prefix', None)
            if seen_prefix is None:
                # First call: remember the prefix the cache was filled under.
                cached_function.key_prefix = KEY_PREFIX
            elif seen_prefix != KEY_PREFIX:
                # Clear cache when cache.KEY_PREFIX changes. This is used in
                # tests.
                cached_function.cache_clear()
                cached_function.key_prefix = KEY_PREFIX

            try:
                return cached_function(*args, **kwargs)
            except TypeError:
                # args or kwargs contains an element which is unhashable. In
                # this case we don't cache the result.
                # NOTE: a TypeError raised inside user_function itself also
                # lands here, and the function is then called a second time
                # below.
                pass

            # Deliberately calling this function from outside of exception
            # handler to get a more descriptive traceback. Otherwise the
            # traceback can include the exception from cached_function as
            # well.
            return user_function(*args, **kwargs)

        setattr(wrapper, 'cache_info', cached_function.cache_info)
        setattr(wrapper, 'cache_clear', cached_function.cache_clear)
        return wrapper

    return decorator