zulip/zerver/lib/message_cache.py

577 lines
23 KiB
Python

import copy
import zlib
from collections.abc import Iterable
from datetime import datetime
from email.headerregistry import Address
from typing import Any, TypedDict
import orjson
from zerver.lib.avatar import get_avatar_field, get_avatar_for_inaccessible_user
from zerver.lib.cache import cache_set_many, cache_with_key, to_dict_cache_key, to_dict_cache_key_id
from zerver.lib.display_recipient import bulk_fetch_display_recipients
from zerver.lib.markdown import render_message_markdown, topic_links
from zerver.lib.markdown import version as markdown_version
from zerver.lib.query_helpers import query_for_ids
from zerver.lib.timestamp import datetime_to_timestamp
from zerver.lib.topic import DB_TOPIC_NAME, TOPIC_LINKS, TOPIC_NAME
from zerver.lib.types import DisplayRecipientT, EditHistoryEvent, UserDisplayRecipient
from zerver.models import Message, Reaction, Realm, Recipient, Stream, SubMessage, UserProfile
from zerver.models.realms import get_fake_email_domain
class RawReactionRow(TypedDict):
emoji_code: str
emoji_name: str
message_id: int
reaction_type: str
user_profile__email: str
user_profile__full_name: str
user_profile_id: int
def sew_messages_and_reactions(
messages: list[dict[str, Any]], reactions: list[dict[str, Any]]
) -> list[dict[str, Any]]:
"""Given a iterable of messages and reactions stitch reactions
into messages.
"""
# Add all messages with empty reaction item
for message in messages:
message["reactions"] = []
# Convert list of messages into dictionary to make reaction stitching easy
converted_messages = {message["id"]: message for message in messages}
for reaction in reactions:
converted_messages[reaction["message_id"]]["reactions"].append(reaction)
return list(converted_messages.values())
def sew_messages_and_submessages(
messages: list[dict[str, Any]], submessages: list[dict[str, Any]]
) -> None:
# This is super similar to sew_messages_and_reactions.
for message in messages:
message["submessages"] = []
message_dict = {message["id"]: message for message in messages}
for submessage in submessages:
message_id = submessage["message_id"]
if message_id in message_dict:
message = message_dict[message_id]
message["submessages"].append(submessage)
def extract_message_dict(message_bytes: bytes) -> dict[str, Any]:
return orjson.loads(zlib.decompress(message_bytes))
def stringify_message_dict(message_dict: dict[str, Any]) -> bytes:
return zlib.compress(orjson.dumps(message_dict))
@cache_with_key(to_dict_cache_key, timeout=3600 * 24)
def message_to_encoded_cache(message: Message, realm_id: int | None = None) -> bytes:
return MessageDict.messages_to_encoded_cache([message], realm_id)[message.id]
def update_message_cache(
changed_messages: Iterable[Message], realm_id: int | None = None
) -> list[int]:
"""Updates the message as stored in the to_dict cache (for serving
messages)."""
items_for_remote_cache = {}
message_ids = []
changed_messages_to_dict = MessageDict.messages_to_encoded_cache(changed_messages, realm_id)
for msg_id, msg in changed_messages_to_dict.items():
message_ids.append(msg_id)
key = to_dict_cache_key_id(msg_id)
items_for_remote_cache[key] = (msg,)
cache_set_many(items_for_remote_cache)
return message_ids
def save_message_rendered_content(message: Message, content: str) -> str:
rendering_result = render_message_markdown(message, content, realm=message.get_realm())
rendered_content = None
if rendering_result is not None:
rendered_content = rendering_result.rendered_content
message.rendered_content = rendered_content
message.rendered_content_version = markdown_version
message.save_rendered_content()
return rendered_content
class ReactionDict:
@staticmethod
def build_dict_from_raw_db_row(row: RawReactionRow) -> dict[str, Any]:
return {
"emoji_name": row["emoji_name"],
"emoji_code": row["emoji_code"],
"reaction_type": row["reaction_type"],
# TODO: We plan to remove this redundant user dictionary once
# clients are updated to support accessing use user_id. See
# https://github.com/zulip/zulip/pull/14711 for details.
#
# When we do that, we can likely update the `.values()` query to
# not fetch the extra user_profile__* fields from the database
# as a small performance optimization.
"user": {
"email": row["user_profile__email"],
"id": row["user_profile_id"],
"full_name": row["user_profile__full_name"],
},
"user_id": row["user_profile_id"],
}
class MessageDict:
"""MessageDict is the core class responsible for marshalling Message
objects obtained from the database into a format that can be sent
to clients via the Zulip API, whether via `GET /messages`,
outgoing webhooks, or other code paths. There are two core flows through
which this class is used:
* For just-sent messages, we construct a single `wide_dict` object
containing all the data for the message and the related
UserProfile models (sender_info and recipient_info); this object
can be stored in queues, caches, etc., and then later turned
into an API-format JSONable dictionary via finalize_payload.
* When fetching messages from the database, we fetch their data in
bulk using messages_for_ids, which makes use of caching, bulk
fetches that skip the Django ORM, etc., to provide an optimized
interface for fetching hundreds of thousands of messages from
the database and then turning them into API-format JSON
dictionaries.
"""
@staticmethod
def wide_dict(message: Message, realm_id: int | None = None) -> dict[str, Any]:
"""
The next two lines get the cacheable field related
to our message object, with the side effect of
populating the cache.
"""
encoded_object_bytes = message_to_encoded_cache(message, realm_id)
obj = extract_message_dict(encoded_object_bytes)
"""
The steps below are similar to what we do in
post_process_dicts(), except we don't call finalize_payload(),
since that step happens later in the queue
processor.
"""
MessageDict.bulk_hydrate_sender_info([obj])
MessageDict.bulk_hydrate_recipient_info([obj])
return obj
@staticmethod
def post_process_dicts(
objs: list[dict[str, Any]],
apply_markdown: bool,
client_gravatar: bool,
realm: Realm,
) -> None:
"""
NOTE: This function mutates the objects in
the `objs` list, rather than making
shallow copies. It might be safer to
make shallow copies here, but performance
is somewhat important here, as we are
often fetching hundreds of messages.
"""
MessageDict.bulk_hydrate_sender_info(objs)
MessageDict.bulk_hydrate_recipient_info(objs)
for obj in objs:
can_access_sender = obj.get("can_access_sender", True)
MessageDict.finalize_payload(
obj,
apply_markdown,
client_gravatar,
skip_copy=True,
can_access_sender=can_access_sender,
realm_host=realm.host,
)
@staticmethod
def finalize_payload(
obj: dict[str, Any],
apply_markdown: bool,
client_gravatar: bool,
keep_rendered_content: bool = False,
skip_copy: bool = False,
can_access_sender: bool = True,
realm_host: str = "",
) -> dict[str, Any]:
"""
By default, we make a shallow copy of the incoming dict to avoid
mutation-related bugs. Code paths that are passing a unique object
can pass skip_copy=True to avoid this extra work.
"""
if not skip_copy:
obj = copy.copy(obj)
if obj["sender_email_address_visibility"] != UserProfile.EMAIL_ADDRESS_VISIBILITY_EVERYONE:
# If email address of the sender is only available to administrators,
# clients cannot compute gravatars, so we force-set it to false.
# If we plumbed the current user's role, we could allow client_gravatar=True
# here if the current user's role has access to the target user's email address.
client_gravatar = False
if not can_access_sender:
# Enforce inability to access details of inaccessible
# users. We should be able to remove the realm_host and
# can_access_user plumbing to this function if/when we
# shift the Zulip API to not send these denormalized
# fields about message senders favor of just sending the
# sender's user ID.
obj["sender_full_name"] = str(UserProfile.INACCESSIBLE_USER_NAME)
sender_id = obj["sender_id"]
obj["sender_email"] = Address(
username=f"user{sender_id}", domain=get_fake_email_domain(realm_host)
).addr_spec
MessageDict.set_sender_avatar(obj, client_gravatar, can_access_sender)
if apply_markdown:
obj["content_type"] = "text/html"
obj["content"] = obj["rendered_content"]
else:
obj["content_type"] = "text/x-markdown"
for item in obj.get("edit_history", []):
if "prev_rendered_content_version" in item:
del item["prev_rendered_content_version"]
if not keep_rendered_content:
del obj["rendered_content"]
del obj["sender_realm_id"]
del obj["sender_avatar_source"]
del obj["sender_delivery_email"]
del obj["sender_avatar_version"]
del obj["recipient_type"]
del obj["recipient_type_id"]
del obj["sender_is_mirror_dummy"]
del obj["sender_email_address_visibility"]
if "can_access_sender" in obj:
del obj["can_access_sender"]
return obj
@staticmethod
def sew_submessages_and_reactions_to_msgs(
messages: list[dict[str, Any]],
) -> list[dict[str, Any]]:
msg_ids = [msg["id"] for msg in messages]
submessages = SubMessage.get_raw_db_rows(msg_ids)
sew_messages_and_submessages(messages, submessages)
reactions = Reaction.get_raw_db_rows(msg_ids)
return sew_messages_and_reactions(messages, reactions)
@staticmethod
def messages_to_encoded_cache(
messages: Iterable[Message], realm_id: int | None = None
) -> dict[int, bytes]:
messages_dict = MessageDict.messages_to_encoded_cache_helper(messages, realm_id)
encoded_messages = {msg["id"]: stringify_message_dict(msg) for msg in messages_dict}
return encoded_messages
@staticmethod
def messages_to_encoded_cache_helper(
messages: Iterable[Message], realm_id: int | None = None
) -> list[dict[str, Any]]:
# Near duplicate of the build_message_dict + get_raw_db_rows
# code path that accepts already fetched Message objects
# rather than message IDs.
def get_rendering_realm_id(message: Message) -> int:
# realm_id can differ among users, currently only possible
# with cross realm bots.
if realm_id is not None:
return realm_id
if message.recipient.type == Recipient.STREAM:
return Stream.objects.get(id=message.recipient.type_id).realm_id
return message.realm_id
message_rows = [
{
"id": message.id,
DB_TOPIC_NAME: message.topic_name(),
"date_sent": message.date_sent,
"last_edit_time": message.last_edit_time,
"edit_history": message.edit_history,
"content": message.content,
"rendered_content": message.rendered_content,
"rendered_content_version": message.rendered_content_version,
"recipient_id": message.recipient.id,
"recipient__type": message.recipient.type,
"recipient__type_id": message.recipient.type_id,
"rendering_realm_id": get_rendering_realm_id(message),
"sender_id": message.sender.id,
"sending_client__name": message.sending_client.name,
"sender__realm_id": message.sender.realm_id,
}
for message in messages
]
MessageDict.sew_submessages_and_reactions_to_msgs(message_rows)
return [MessageDict.build_dict_from_raw_db_row(row) for row in message_rows]
@staticmethod
def ids_to_dict(needed_ids: list[int]) -> list[dict[str, Any]]:
# This is a special purpose function optimized for
# callers like get_messages_backend().
fields = [
"id",
DB_TOPIC_NAME,
"date_sent",
"last_edit_time",
"edit_history",
"content",
"rendered_content",
"rendered_content_version",
"recipient_id",
"recipient__type",
"recipient__type_id",
"sender_id",
"sending_client__name",
"sender__realm_id",
]
# Uses index: zerver_message_pkey
messages = Message.objects.filter(id__in=needed_ids).values(*fields)
MessageDict.sew_submessages_and_reactions_to_msgs(messages)
return [MessageDict.build_dict_from_raw_db_row(row) for row in messages]
@staticmethod
def build_dict_from_raw_db_row(row: dict[str, Any]) -> dict[str, Any]:
"""
row is a row from a .values() call, and it needs to have
all the relevant fields populated
"""
return MessageDict.build_message_dict(
message_id=row["id"],
last_edit_time=row["last_edit_time"],
edit_history_json=row["edit_history"],
content=row["content"],
topic_name=row[DB_TOPIC_NAME],
date_sent=row["date_sent"],
rendered_content=row["rendered_content"],
rendered_content_version=row["rendered_content_version"],
sender_id=row["sender_id"],
sender_realm_id=row["sender__realm_id"],
sending_client_name=row["sending_client__name"],
rendering_realm_id=row.get("rendering_realm_id", row["sender__realm_id"]),
recipient_id=row["recipient_id"],
recipient_type=row["recipient__type"],
recipient_type_id=row["recipient__type_id"],
reactions=row["reactions"],
submessages=row["submessages"],
)
@staticmethod
def build_message_dict(
message_id: int,
last_edit_time: datetime | None,
edit_history_json: str | None,
content: str,
topic_name: str,
date_sent: datetime,
rendered_content: str | None,
rendered_content_version: int | None,
sender_id: int,
sender_realm_id: int,
sending_client_name: str,
rendering_realm_id: int,
recipient_id: int,
recipient_type: int,
recipient_type_id: int,
reactions: list[RawReactionRow],
submessages: list[dict[str, Any]],
) -> dict[str, Any]:
obj = dict(
id=message_id,
sender_id=sender_id,
content=content,
recipient_type_id=recipient_type_id,
recipient_type=recipient_type,
recipient_id=recipient_id,
timestamp=datetime_to_timestamp(date_sent),
client=sending_client_name,
)
obj[TOPIC_NAME] = topic_name
obj["sender_realm_id"] = sender_realm_id
# Render topic_links with the stream's realm instead of the
# sender's realm; this is important for messages sent by
# cross-realm bots like NOTIFICATION_BOT.
obj[TOPIC_LINKS] = topic_links(rendering_realm_id, topic_name)
if last_edit_time is not None:
obj["last_edit_timestamp"] = datetime_to_timestamp(last_edit_time)
assert edit_history_json is not None
edit_history: list[EditHistoryEvent] = orjson.loads(edit_history_json)
obj["edit_history"] = edit_history
if Message.need_to_render_content(
rendered_content, rendered_content_version, markdown_version
):
# We really shouldn't be rendering objects in this method, but there is
# a scenario where we upgrade the version of Markdown and fail to run
# management commands to re-render historical messages, and then we
# need to have side effects. This method is optimized to not need full
# blown ORM objects, but the Markdown renderer is unfortunately highly
# coupled to Message, and we also need to persist the new rendered content.
# If we don't have a message object passed in, we get one here. The cost
# of going to the DB here should be overshadowed by the cost of rendering
# and updating the row.
# TODO: see #1379 to eliminate Markdown dependencies
message = Message.objects.select_related("sender").get(id=message_id)
assert message is not None # Hint for mypy.
# It's unfortunate that we need to have side effects on the message
# in some cases.
rendered_content = save_message_rendered_content(message, content)
if rendered_content is not None:
obj["rendered_content"] = rendered_content
else:
obj["rendered_content"] = (
"<p>[Zulip note: Sorry, we could not understand the formatting of your message]</p>"
)
if rendered_content is not None:
obj["is_me_message"] = Message.is_status_message(content, rendered_content)
else:
obj["is_me_message"] = False
obj["reactions"] = [
ReactionDict.build_dict_from_raw_db_row(reaction) for reaction in reactions
]
obj["submessages"] = submessages
return obj
@staticmethod
def bulk_hydrate_sender_info(objs: list[dict[str, Any]]) -> None:
sender_ids = list({obj["sender_id"] for obj in objs})
if not sender_ids:
return
query = UserProfile.objects.values(
"id",
"full_name",
"delivery_email",
"email",
"realm__string_id",
"avatar_source",
"avatar_version",
"is_mirror_dummy",
"email_address_visibility",
)
rows = query_for_ids(query, sender_ids, "zerver_userprofile.id")
sender_dict = {row["id"]: row for row in rows}
for obj in objs:
sender_id = obj["sender_id"]
user_row = sender_dict[sender_id]
obj["sender_full_name"] = user_row["full_name"]
obj["sender_email"] = user_row["email"]
obj["sender_delivery_email"] = user_row["delivery_email"]
obj["sender_realm_str"] = user_row["realm__string_id"]
obj["sender_avatar_source"] = user_row["avatar_source"]
obj["sender_avatar_version"] = user_row["avatar_version"]
obj["sender_is_mirror_dummy"] = user_row["is_mirror_dummy"]
obj["sender_email_address_visibility"] = user_row["email_address_visibility"]
@staticmethod
def hydrate_recipient_info(obj: dict[str, Any], display_recipient: DisplayRecipientT) -> None:
"""
This method hyrdrates recipient info with things
like full names and emails of senders. Eventually
our clients should be able to hyrdrate these fields
themselves with info they already have on users.
"""
recipient_type = obj["recipient_type"]
recipient_type_id = obj["recipient_type_id"]
sender_is_mirror_dummy = obj["sender_is_mirror_dummy"]
sender_email = obj["sender_email"]
sender_full_name = obj["sender_full_name"]
sender_id = obj["sender_id"]
if recipient_type == Recipient.STREAM:
display_type = "stream"
elif recipient_type in (Recipient.DIRECT_MESSAGE_GROUP, Recipient.PERSONAL):
assert not isinstance(display_recipient, str)
display_type = "private"
if len(display_recipient) == 1:
# add the sender in if this isn't a message between
# someone and themself, preserving ordering
recip: UserDisplayRecipient = {
"email": sender_email,
"full_name": sender_full_name,
"id": sender_id,
"is_mirror_dummy": sender_is_mirror_dummy,
}
if recip["email"] < display_recipient[0]["email"]:
display_recipient = [recip, display_recipient[0]]
elif recip["email"] > display_recipient[0]["email"]:
display_recipient = [display_recipient[0], recip]
else:
raise AssertionError(f"Invalid recipient type {recipient_type}")
obj["display_recipient"] = display_recipient
obj["type"] = display_type
if obj["type"] == "stream":
obj["stream_id"] = recipient_type_id
@staticmethod
def bulk_hydrate_recipient_info(objs: list[dict[str, Any]]) -> None:
recipient_tuples = { # We use set to eliminate duplicate tuples.
(
obj["recipient_id"],
obj["recipient_type"],
obj["recipient_type_id"],
)
for obj in objs
}
display_recipients = bulk_fetch_display_recipients(recipient_tuples)
for obj in objs:
MessageDict.hydrate_recipient_info(obj, display_recipients[obj["recipient_id"]])
@staticmethod
def set_sender_avatar(
obj: dict[str, Any], client_gravatar: bool, can_access_sender: bool = True
) -> None:
if not can_access_sender:
obj["avatar_url"] = get_avatar_for_inaccessible_user()
return
sender_id = obj["sender_id"]
sender_realm_id = obj["sender_realm_id"]
sender_delivery_email = obj["sender_delivery_email"]
sender_avatar_source = obj["sender_avatar_source"]
sender_avatar_version = obj["sender_avatar_version"]
obj["avatar_url"] = get_avatar_field(
user_id=sender_id,
realm_id=sender_realm_id,
email=sender_delivery_email,
avatar_source=sender_avatar_source,
avatar_version=sender_avatar_version,
medium=False,
client_gravatar=client_gravatar,
)