zulip/zerver/lib/message_cache.py

import copy
import zlib
from collections.abc import Iterable
from datetime import datetime
from email.headerregistry import Address
from typing import Any, TypedDict

import orjson

from zerver.lib.avatar import get_avatar_field, get_avatar_for_inaccessible_user
from zerver.lib.cache import cache_set_many, cache_with_key, to_dict_cache_key, to_dict_cache_key_id
from zerver.lib.display_recipient import bulk_fetch_display_recipients
from zerver.lib.markdown import render_message_markdown, topic_links
from zerver.lib.markdown import version as markdown_version
from zerver.lib.query_helpers import query_for_ids
from zerver.lib.timestamp import datetime_to_timestamp
from zerver.lib.topic import DB_TOPIC_NAME, TOPIC_LINKS, TOPIC_NAME
from zerver.lib.types import DisplayRecipientT, EditHistoryEvent, UserDisplayRecipient
from zerver.models import Message, Reaction, Realm, Recipient, Stream, SubMessage, UserProfile
from zerver.models.realms import get_fake_email_domain


class RawReactionRow(TypedDict):
    """One raw reaction row, as consumed by
    ReactionDict.build_dict_from_raw_db_row.

    The `user_profile__*` keys use the double-underscore naming of a
    `.values()` query that reaches through the user_profile relation.
    """

    emoji_code: str
    emoji_name: str
    # ID of the message the reaction is attached to; used to sew
    # reactions onto their messages.
    message_id: int
    reaction_type: str
    # Details of the user who reacted.
    user_profile__email: str
    user_profile__full_name: str
    user_profile_id: int


def sew_messages_and_reactions(
    messages: list[dict[str, Any]], reactions: list[dict[str, Any]]
) -> list[dict[str, Any]]:
    """Attach every reaction row to the message it belongs to.

    Each message dict is mutated in place: it first receives a fresh,
    empty "reactions" list, and then every reaction is appended to the
    message whose "id" equals the reaction's "message_id".

    Returns the messages as a list with one entry per distinct
    message ID.  Raises KeyError if a reaction refers to a message ID
    that is not present in `messages`.
    """
    # Reset up front so that messages without reactions still carry the key.
    for msg in messages:
        msg["reactions"] = []

    # Index the messages by ID so placing a reaction is a single lookup.
    messages_by_id: dict[Any, dict[str, Any]] = {}
    for msg in messages:
        messages_by_id[msg["id"]] = msg

    for row in reactions:
        target = messages_by_id[row["message_id"]]
        target["reactions"].append(row)

    return [*messages_by_id.values()]


def sew_messages_and_submessages(
    messages: list[dict[str, Any]], submessages: list[dict[str, Any]]
) -> None:
    """Attach every submessage row to its parent message, in place.

    Each message dict first receives a fresh, empty "submessages" list;
    each submessage is then appended to the message whose "id" equals
    the submessage's "message_id".  Submessages that match no message
    are skipped.
    """
    # This is super similar to sew_messages_and_reactions.
    for msg in messages:
        msg["submessages"] = []

    messages_by_id = dict((msg["id"], msg) for msg in messages)

    for row in submessages:
        parent = messages_by_id.get(row["message_id"])
        if parent is None:
            # No such message in this batch; nothing to attach to.
            continue
        parent["submessages"].append(row)

def extract_message_dict(message_bytes: bytes) -> dict[str, Any]:
    """Decode a stored message blob: zlib-decompress it, then parse the JSON.

    This is the inverse of stringify_message_dict.
    """
    json_bytes = zlib.decompress(message_bytes)
    return orjson.loads(json_bytes)


def stringify_message_dict(message_dict: dict[str, Any]) -> bytes:
    """Encode a message dict for storage: serialize to JSON, then zlib-compress.

    This is the inverse of extract_message_dict.
    """
    json_bytes = orjson.dumps(message_dict)
    return zlib.compress(json_bytes)


@cache_with_key(to_dict_cache_key, timeout=3600 * 24)
def message_to_encoded_cache(message: Message, realm_id: int | None = None) -> bytes:
    """Return the encoded (compressed JSON) form of a single message.

    The result is cached through cache_with_key, keyed by
    to_dict_cache_key, with a timeout of 3600 * 24 (presumably seconds,
    i.e. one day).
    """
    encoded_by_id = MessageDict.messages_to_encoded_cache([message], realm_id)
    return encoded_by_id[message.id]


def update_message_cache(
    changed_messages: Iterable[Message], realm_id: int | None = None
) -> list[int]:
    """Refresh the to_dict cache entries (used for serving messages)
    for the given messages.

    Each message is re-encoded and written to the remote cache under
    its to_dict_cache_key_id key.  Returns the IDs of the messages whose
    entries were written.
    """
    encoded_by_id = MessageDict.messages_to_encoded_cache(changed_messages, realm_id)

    # Each cached value is stored wrapped in a 1-tuple.
    cache_set_many(
        {to_dict_cache_key_id(msg_id): (encoded,) for msg_id, encoded in encoded_by_id.items()}
    )
    return list(encoded_by_id)


def save_message_rendered_content(message: Message, content: str) -> str | None:
    """Render `content` for `message`, persist the result, and return it.

    The message's rendered_content and rendered_content_version are
    updated (the latter to the current markdown_version) and saved via
    message.save_rendered_content().

    Returns the rendered HTML, or None when rendering produced no
    result; in that case None is also what gets stored on the message.
    Callers must therefore be prepared for a None return value.
    """
    rendering_result = render_message_markdown(message, content, realm=message.get_realm())
    rendered_content = (
        rendering_result.rendered_content if rendering_result is not None else None
    )
    message.rendered_content = rendered_content
    message.rendered_content_version = markdown_version
    message.save_rendered_content()
    return rendered_content


class ReactionDict:
    """Builds the dictionary representation of a reaction from a raw
    database row."""

    @staticmethod
    def build_dict_from_raw_db_row(row: RawReactionRow) -> dict[str, Any]:
        """Convert one raw reaction row into its dictionary form.

        The row's message_id is not included in the result.
        """
        reacting_user_id = row["user_profile_id"]

        # TODO: We plan to remove this redundant user dictionary once
        # clients are updated to support accessing use user_id.  See
        # https://github.com/zulip/zulip/pull/14711 for details.
        #
        # When we do that, we can likely update the `.values()` query to
        # not fetch the extra user_profile__* fields from the database
        # as a small performance optimization.
        legacy_user = dict(
            email=row["user_profile__email"],
            id=reacting_user_id,
            full_name=row["user_profile__full_name"],
        )

        return dict(
            emoji_name=row["emoji_name"],
            emoji_code=row["emoji_code"],
            reaction_type=row["reaction_type"],
            user=legacy_user,
            user_id=reacting_user_id,
        )


class MessageDict:
    """MessageDict is the core class responsible for marshalling Message
    objects obtained from the database into a format that can be sent
    to clients via the Zulip API, whether via `GET /messages`,
    outgoing webhooks, or other code paths.  There are two core flows through
    which this class is used:

    * For just-sent messages, we construct a single `wide_dict` object
      containing all the data for the message and the related
      UserProfile models (sender_info and recipient_info); this object
      can be stored in queues, caches, etc., and then later turned
      into an API-format JSONable dictionary via finalize_payload.

    * When fetching messages from the database, we fetch their data in
      bulk using messages_for_ids, which makes use of caching, bulk
      fetches that skip the Django ORM, etc., to provide an optimized
      interface for fetching hundreds of thousands of messages from
      the database and then turning them into API-format JSON
      dictionaries.

    """

    @staticmethod
    def wide_dict(message: Message, realm_id: int | None = None) -> dict[str, Any]:
        """Build the `wide_dict` for a single message.

        The result contains the cacheable message fields plus the
        hydrated sender and recipient information.  finalize_payload()
        has not been applied to it.
        """
        # Going through message_to_encoded_cache gets us the cacheable
        # fields for the message, with the side effect of populating
        # the cache.
        wide = extract_message_dict(message_to_encoded_cache(message, realm_id))

        # The steps below are similar to what we do in
        # post_process_dicts(), except we don't call finalize_payload(),
        # since that step happens later in the queue processor.
        batch = [wide]
        MessageDict.bulk_hydrate_sender_info(batch)
        MessageDict.bulk_hydrate_recipient_info(batch)

        return wide

    @staticmethod
    def post_process_dicts(
        objs: list[dict[str, Any]],
        apply_markdown: bool,
        client_gravatar: bool,
        realm: Realm,
    ) -> None:
        """Hydrate and finalize a batch of message dicts, in place.

        Sender and recipient info is hydrated in bulk, and then each
        dict is passed through finalize_payload() using the host of
        `realm`.

        NOTE: This function mutates the objects in the `objs` list,
              rather than making shallow copies.  It might be safer
              to make shallow copies here, but performance is
              somewhat important here, as we are often fetching
              hundreds of messages.
        """
        MessageDict.bulk_hydrate_sender_info(objs)
        MessageDict.bulk_hydrate_recipient_info(objs)

        for message_dict in objs:
            MessageDict.finalize_payload(
                message_dict,
                apply_markdown,
                client_gravatar,
                skip_copy=True,
                # A dict without the flag is treated as having an
                # accessible sender.
                can_access_sender=message_dict.get("can_access_sender", True),
                realm_host=realm.host,
            )

    @staticmethod
    def finalize_payload(
        obj: dict[str, Any],
        apply_markdown: bool,
        client_gravatar: bool,
        keep_rendered_content: bool = False,
        skip_copy: bool = False,
        can_access_sender: bool = True,
        realm_host: str = "",
    ) -> dict[str, Any]:
        """Turn a hydrated message dict into its final API-format form.

        By default, we make a shallow copy of the incoming dict to avoid
        mutation-related bugs.  Code paths that are passing a unique object
        can pass skip_copy=True to avoid this extra work.

        * apply_markdown: if true, "content" is replaced by the rendered
          content and "content_type" is "text/html"; otherwise the raw
          content is kept and "content_type" is "text/x-markdown".
        * client_gravatar: forwarded to set_sender_avatar(); forced to
          False unless the sender's email address is visible to everyone.
        * keep_rendered_content: if true, "rendered_content" is left in
          the result instead of being removed.
        * can_access_sender: if false, the sender's name, email and
          avatar are replaced by placeholders; `realm_host` is used to
          build the placeholder email's domain.

        Returns the finalized dict (the same object as `obj` when
        skip_copy=True).  Internal-only sender/recipient fields are
        removed from it.
        """
        if not skip_copy:
            obj = copy.copy(obj)

        if obj["sender_email_address_visibility"] != UserProfile.EMAIL_ADDRESS_VISIBILITY_EVERYONE:
            # If email address of the sender is only available to administrators,
            # clients cannot compute gravatars, so we force-set it to false.
            # If we plumbed the current user's role, we could allow client_gravatar=True
            # here if the current user's role has access to the target user's email address.
            client_gravatar = False

        if not can_access_sender:
            # Enforce inability to access details of inaccessible
            # users. We should be able to remove the realm_host and
            # can_access_user plumbing to this function if/when we
            # shift the Zulip API to not send these denormalized
            # fields about message senders in favor of just sending
            # the sender's user ID.
            obj["sender_full_name"] = str(UserProfile.INACCESSIBLE_USER_NAME)
            sender_id = obj["sender_id"]
            obj["sender_email"] = Address(
                username=f"user{sender_id}", domain=get_fake_email_domain(realm_host)
            ).addr_spec

        MessageDict.set_sender_avatar(obj, client_gravatar, can_access_sender)
        if apply_markdown:
            obj["content_type"] = "text/html"
            obj["content"] = obj["rendered_content"]
        else:
            obj["content_type"] = "text/x-markdown"

        if "edit_history" in obj:
            # The copy made above is shallow, so the edit history entries
            # are still shared with the caller's dict.  Build stripped
            # copies of the entries instead of deleting the key in place,
            # so that the caller's data is never modified.
            obj["edit_history"] = [
                {
                    key: value
                    for key, value in item.items()
                    if key != "prev_rendered_content_version"
                }
                for item in obj["edit_history"]
            ]

        if not keep_rendered_content:
            del obj["rendered_content"]

        # Drop the fields that only exist to support the processing above.
        for internal_field in (
            "sender_realm_id",
            "sender_avatar_source",
            "sender_delivery_email",
            "sender_avatar_version",
            "recipient_type",
            "recipient_type_id",
            "sender_is_mirror_dummy",
            "sender_email_address_visibility",
        ):
            del obj[internal_field]
        obj.pop("can_access_sender", None)
        return obj

    @staticmethod
    def sew_submessages_and_reactions_to_msgs(
        messages: list[dict[str, Any]],
    ) -> list[dict[str, Any]]:
        """Fetch the submessages and reactions for `messages` and attach
        them to the message dicts.

        The message dicts are mutated in place; the return value is
        whatever sew_messages_and_reactions() returns for them.
        """
        message_ids = [message["id"] for message in messages]

        sew_messages_and_submessages(messages, SubMessage.get_raw_db_rows(message_ids))

        return sew_messages_and_reactions(messages, Reaction.get_raw_db_rows(message_ids))

    @staticmethod
    def messages_to_encoded_cache(
        messages: Iterable[Message], realm_id: int | None = None
    ) -> dict[int, bytes]:
        """Return the encoded (compressed JSON) form of each message,
        keyed by message ID."""
        encoded_by_id: dict[int, bytes] = {}
        for message_dict in MessageDict.messages_to_encoded_cache_helper(messages, realm_id):
            encoded_by_id[message_dict["id"]] = stringify_message_dict(message_dict)
        return encoded_by_id

    @staticmethod
    def messages_to_encoded_cache_helper(
        messages: Iterable[Message], realm_id: int | None = None
    ) -> list[dict[str, Any]]:
        """Build the message dicts for already-fetched Message objects.

        Near duplicate of the build_message_dict + get_raw_db_rows
        code path that accepts already fetched Message objects
        rather than message IDs.

        If `realm_id` is given, it is used as the rendering realm for
        every message.  Otherwise the rendering realm is the stream's
        realm for stream messages and the message's own realm for
        everything else.
        """
        # Realm ID of each stream looked up so far, keyed by stream ID,
        # so that a batch with many messages in the same stream costs
        # one Stream query per stream rather than one per message.
        stream_realm_ids: dict[int, int] = {}

        def get_rendering_realm_id(message: Message) -> int:
            # realm_id can differ among users, currently only possible
            # with cross realm bots.
            if realm_id is not None:
                return realm_id
            recipient = message.recipient
            if recipient.type != Recipient.STREAM:
                return message.realm_id
            stream_id = recipient.type_id
            if stream_id not in stream_realm_ids:
                stream_realm_ids[stream_id] = Stream.objects.get(id=stream_id).realm_id
            return stream_realm_ids[stream_id]

        # The keys mirror the column names of the `.values()` rows that
        # build_dict_from_raw_db_row expects.
        message_rows = [
            {
                "id": message.id,
                DB_TOPIC_NAME: message.topic_name(),
                "date_sent": message.date_sent,
                "last_edit_time": message.last_edit_time,
                "edit_history": message.edit_history,
                "content": message.content,
                "rendered_content": message.rendered_content,
                "rendered_content_version": message.rendered_content_version,
                "recipient_id": message.recipient.id,
                "recipient__type": message.recipient.type,
                "recipient__type_id": message.recipient.type_id,
                "rendering_realm_id": get_rendering_realm_id(message),
                "sender_id": message.sender.id,
                "sending_client__name": message.sending_client.name,
                "sender__realm_id": message.sender.realm_id,
            }
            for message in messages
        ]

        MessageDict.sew_submessages_and_reactions_to_msgs(message_rows)
        return [MessageDict.build_dict_from_raw_db_row(row) for row in message_rows]

    @staticmethod
    def ids_to_dict(needed_ids: list[int]) -> list[dict[str, Any]]:
        """Fetch the messages with the given IDs and build their
        message dicts.

        This is a special purpose function optimized for
        callers like get_messages_backend().
        """
        # Uses index: zerver_message_pkey
        rows = Message.objects.filter(id__in=needed_ids).values(
            "id",
            DB_TOPIC_NAME,
            "date_sent",
            "last_edit_time",
            "edit_history",
            "content",
            "rendered_content",
            "rendered_content_version",
            "recipient_id",
            "recipient__type",
            "recipient__type_id",
            "sender_id",
            "sending_client__name",
            "sender__realm_id",
        )

        # NOTE(review): `rows` is the object returned by `.values()`, not
        # a plain list.  The sewing step adds keys to the row dicts in
        # place, so this presumably relies on the rows being the same
        # objects when iterated again below -- confirm.
        MessageDict.sew_submessages_and_reactions_to_msgs(rows)
        return list(map(MessageDict.build_dict_from_raw_db_row, rows))

    @staticmethod
    def build_dict_from_raw_db_row(row: dict[str, Any]) -> dict[str, Any]:
        """Build a message dict from a single raw row.

        row is a row from a .values() call (or a dict shaped like
        one), and it needs to have all the relevant fields populated,
        including the sewn-in "reactions" and "submessages".
        """
        sender_realm_id = row["sender__realm_id"]
        # Rows that don't carry an explicit rendering realm are rendered
        # in the sender's realm.
        rendering_realm_id = row.get("rendering_realm_id", sender_realm_id)

        return MessageDict.build_message_dict(
            message_id=row["id"],
            # Content and edit state
            content=row["content"],
            topic_name=row[DB_TOPIC_NAME],
            date_sent=row["date_sent"],
            last_edit_time=row["last_edit_time"],
            edit_history_json=row["edit_history"],
            rendered_content=row["rendered_content"],
            rendered_content_version=row["rendered_content_version"],
            rendering_realm_id=rendering_realm_id,
            # Sender
            sender_id=row["sender_id"],
            sender_realm_id=sender_realm_id,
            sending_client_name=row["sending_client__name"],
            # Recipient
            recipient_id=row["recipient_id"],
            recipient_type=row["recipient__type"],
            recipient_type_id=row["recipient__type_id"],
            # Sewn-in related rows
            reactions=row["reactions"],
            submessages=row["submessages"],
        )

    @staticmethod
    def build_message_dict(
        message_id: int,
        last_edit_time: datetime | None,
        edit_history_json: str | None,
        content: str,
        topic_name: str,
        date_sent: datetime,
        rendered_content: str | None,
        rendered_content_version: int | None,
        sender_id: int,
        sender_realm_id: int,
        sending_client_name: str,
        rendering_realm_id: int,
        recipient_id: int,
        recipient_type: int,
        recipient_type_id: int,
        reactions: list[RawReactionRow],
        submessages: list[dict[str, Any]],
    ) -> dict[str, Any]:
        """Assemble the message dict from already-extracted field values.

        If the stored rendered content needs to be (re)rendered according
        to Message.need_to_render_content, the message is loaded from the
        database, rendered, and saved as a side effect.  If no rendered
        content is available even after that, a placeholder note is used
        instead and the message is not treated as a /me message.
        """
        obj: dict[str, Any] = {
            "id": message_id,
            "sender_id": sender_id,
            "content": content,
            "recipient_type_id": recipient_type_id,
            "recipient_type": recipient_type,
            "recipient_id": recipient_id,
            "timestamp": datetime_to_timestamp(date_sent),
            "client": sending_client_name,
            TOPIC_NAME: topic_name,
            "sender_realm_id": sender_realm_id,
            # Render topic_links with the stream's realm instead of the
            # sender's realm; this is important for messages sent by
            # cross-realm bots like NOTIFICATION_BOT.
            TOPIC_LINKS: topic_links(rendering_realm_id, topic_name),
        }

        if last_edit_time is not None:
            obj["last_edit_timestamp"] = datetime_to_timestamp(last_edit_time)
            assert edit_history_json is not None
            history_events: list[EditHistoryEvent] = orjson.loads(edit_history_json)
            obj["edit_history"] = history_events

        needs_rendering = Message.need_to_render_content(
            rendered_content, rendered_content_version, markdown_version
        )
        if needs_rendering:
            # We really shouldn't be rendering objects in this method, but there is
            # a scenario where we upgrade the version of Markdown and fail to run
            # management commands to re-render historical messages, and then we
            # need to have side effects.  This method is optimized to not need full
            # blown ORM objects, but the Markdown renderer is unfortunately highly
            # coupled to Message, and we also need to persist the new rendered content.
            # If we don't have a message object passed in, we get one here.  The cost
            # of going to the DB here should be overshadowed by the cost of rendering
            # and updating the row.
            # TODO: see #1379 to eliminate Markdown dependencies
            stale_message = Message.objects.select_related("sender").get(id=message_id)

            assert stale_message is not None  # Hint for mypy.
            # It's unfortunate that we need to have side effects on the message
            # in some cases.
            rendered_content = save_message_rendered_content(stale_message, content)

        if rendered_content is None:
            # Rendering failed; show a placeholder note to the reader.
            obj["rendered_content"] = (
                "<p>[Zulip note: Sorry, we could not understand the formatting of your message]</p>"
            )
            obj["is_me_message"] = False
        else:
            obj["rendered_content"] = rendered_content
            obj["is_me_message"] = Message.is_status_message(content, rendered_content)

        obj["reactions"] = list(map(ReactionDict.build_dict_from_raw_db_row, reactions))
        obj["submessages"] = submessages
        return obj

    @staticmethod
    def bulk_hydrate_sender_info(objs: list[dict[str, Any]]) -> None:
        sender_ids = list({obj["sender_id"] for obj in objs})

        if not sender_ids:
            return

        query = UserProfile.objects.values(
            "id",
            "full_name",
            "delivery_email",
            "email",
            "realm__string_id",
            "avatar_source",
            "avatar_version",
            "is_mirror_dummy",
            "email_address_visibility",
        )

        rows = query_for_ids(query, sender_ids, "zerver_userprofile.id")

        sender_dict = {row["id"]: row for row in rows}

        for obj in objs:
            sender_id = obj["sender_id"]
            user_row = sender_dict[sender_id]
            obj["sender_full_name"] = user_row["full_name"]
            obj["sender_email"] = user_row["email"]
            obj["sender_delivery_email"] = user_row["delivery_email"]
            obj["sender_realm_str"] = user_row["realm__string_id"]
            obj["sender_avatar_source"] = user_row["avatar_source"]
            obj["sender_avatar_version"] = user_row["avatar_version"]
            obj["sender_is_mirror_dummy"] = user_row["is_mirror_dummy"]
            obj["sender_email_address_visibility"] = user_row["email_address_visibility"]

    @staticmethod
    def hydrate_recipient_info(obj: dict[str, Any], display_recipient: DisplayRecipientT) -> None:
        """Set "display_recipient", "type" and (for streams) "stream_id"
        on a message dict, in place.

        This method hydrates recipient info with things
        like full names and emails of senders.  Eventually
        our clients should be able to hydrate these fields
        themselves with info they already have on users.

        Raises AssertionError for an unrecognized recipient type.
        """
        recipient_type = obj["recipient_type"]
        recipient_type_id = obj["recipient_type_id"]
        sender_entry: UserDisplayRecipient = {
            "email": obj["sender_email"],
            "full_name": obj["sender_full_name"],
            "id": obj["sender_id"],
            "is_mirror_dummy": obj["sender_is_mirror_dummy"],
        }

        if recipient_type == Recipient.STREAM:
            display_type = "stream"
        elif recipient_type in (Recipient.DIRECT_MESSAGE_GROUP, Recipient.PERSONAL):
            assert not isinstance(display_recipient, str)
            display_type = "private"
            if len(display_recipient) == 1:
                other_entry = display_recipient[0]
                # add the sender in if this isn't a message between
                # someone and themself, keeping the pair ordered by email
                if sender_entry["email"] != other_entry["email"]:
                    display_recipient = sorted(
                        [sender_entry, other_entry], key=lambda user: user["email"]
                    )
        else:
            raise AssertionError(f"Invalid recipient type {recipient_type}")

        obj["display_recipient"] = display_recipient
        obj["type"] = display_type
        if display_type == "stream":
            obj["stream_id"] = recipient_type_id

    @staticmethod
    def bulk_hydrate_recipient_info(objs: list[dict[str, Any]]) -> None:
        """Hydrate the recipient info of every message dict, in place.

        The display recipients for all distinct recipients are fetched
        with one bulk_fetch_display_recipients() call and then applied
        to each dict via hydrate_recipient_info().
        """
        # A set, so that each distinct recipient is only requested once.
        wanted_recipients = set()
        for message_dict in objs:
            wanted_recipients.add(
                (
                    message_dict["recipient_id"],
                    message_dict["recipient_type"],
                    message_dict["recipient_type_id"],
                )
            )

        display_recipients = bulk_fetch_display_recipients(wanted_recipients)

        for message_dict in objs:
            recipient_id = message_dict["recipient_id"]
            MessageDict.hydrate_recipient_info(message_dict, display_recipients[recipient_id])

    @staticmethod
    def set_sender_avatar(
        obj: dict[str, Any], client_gravatar: bool, can_access_sender: bool = True
    ) -> None:
        """Set "avatar_url" on a message dict, in place.

        For an inaccessible sender, the avatar for inaccessible users is
        used.  Otherwise the value is computed by get_avatar_field() from
        the sender fields already present on the dict.
        """
        if can_access_sender:
            obj["avatar_url"] = get_avatar_field(
                user_id=obj["sender_id"],
                realm_id=obj["sender_realm_id"],
                email=obj["sender_delivery_email"],
                avatar_source=obj["sender_avatar_source"],
                avatar_version=obj["sender_avatar_version"],
                medium=False,
                client_gravatar=client_gravatar,
            )
        else:
            obj["avatar_url"] = get_avatar_for_inaccessible_user()