zulip/zerver/lib/subscription_info.py

import itertools
from collections.abc import Callable, Collection, Iterable, Mapping
from operator import itemgetter
from typing import Any

from django.core.exceptions import ValidationError
from django.db import connection
from django.db.models import QuerySet
from django.utils.translation import gettext as _
from psycopg2.sql import SQL

from zerver.lib.exceptions import JsonableError
from zerver.lib.stream_color import STREAM_ASSIGNMENT_COLORS
from zerver.lib.stream_subscription import (
    get_active_subscriptions_for_stream_id,
    get_stream_subscriptions_for_user,
)
from zerver.lib.stream_traffic import get_average_weekly_stream_traffic, get_streams_traffic
from zerver.lib.streams import get_web_public_streams_queryset, subscribed_to_stream
from zerver.lib.timestamp import datetime_to_timestamp, timestamp_to_datetime
from zerver.lib.types import (
    APIStreamDict,
    NeverSubscribedStreamDict,
    RawStreamDict,
    RawSubscriptionDict,
    SubscriptionInfo,
    SubscriptionStreamDict,
)
from zerver.models import Realm, Stream, Subscription, UserProfile
from zerver.models.streams import get_all_streams


def get_web_public_subs(realm: Realm) -> SubscriptionInfo:
    """Build a SubscriptionInfo listing every web-public stream of `realm`.

    Each stream returned by get_web_public_streams_queryset is reported
    as a subscription.  Since no real Subscription row is consulted, the
    per-subscription fields are filled with fixed values simulating a
    freshly created subscription, and colors are handed out round-robin
    from STREAM_ASSIGNMENT_COLORS.  The unsubscribed and never_subscribed
    lists are always empty.
    """
    palette_size = len(STREAM_ASSIGNMENT_COLORS)

    subscribed = []
    for position, stream in enumerate(get_web_public_streams_queryset(realm)):
        # No precomputed traffic data is available here, so an empty
        # mapping is passed as the recent-traffic argument.
        weekly_traffic = get_average_weekly_stream_traffic(stream.id, stream.date_created, {})

        subscribed.append(
            SubscriptionStreamDict(
                is_archived=stream.deactivated,
                audible_notifications=True,
                can_remove_subscribers_group=stream.can_remove_subscribers_group_id,
                # Cycle through the palette in order, wrapping at the end.
                color=STREAM_ASSIGNMENT_COLORS[position % palette_size],
                creator_id=stream.creator_id,
                date_created=datetime_to_timestamp(stream.date_created),
                description=stream.description,
                desktop_notifications=True,
                email_notifications=True,
                first_message_id=stream.first_message_id,
                history_public_to_subscribers=stream.history_public_to_subscribers,
                in_home_view=True,
                invite_only=stream.invite_only,
                is_announcement_only=(
                    stream.stream_post_policy == Stream.STREAM_POST_POLICY_ADMINS
                ),
                is_muted=False,
                is_web_public=stream.is_web_public,
                message_retention_days=stream.message_retention_days,
                name=stream.name,
                pin_to_top=False,
                push_notifications=True,
                rendered_description=stream.rendered_description,
                stream_id=stream.id,
                stream_post_policy=stream.stream_post_policy,
                stream_weekly_traffic=weekly_traffic,
                wildcard_mentions_notify=True,
            )
        )

    return SubscriptionInfo(
        subscriptions=subscribed,
        unsubscribed=[],
        never_subscribed=[],
    )


def build_unsubscribed_sub_from_stream_dict(
    user: UserProfile, sub_dict: RawSubscriptionDict, stream_dict: APIStreamDict
) -> SubscriptionStreamDict:
    """Convert an API-format stream dict plus a subscription row into a
    SubscriptionStreamDict.

    The API-format fields are first mapped back onto the raw (database
    style) field names so that build_stream_dict_for_sub can be reused.

    This function is only called from `apply_event` code.
    """
    raw_fields = RawStreamDict(
        can_remove_subscribers_group_id=stream_dict["can_remove_subscribers_group"],
        creator_id=stream_dict["creator_id"],
        # The API carries a timestamp; the raw dict carries a datetime.
        date_created=timestamp_to_datetime(stream_dict["date_created"]),
        deactivated=stream_dict["is_archived"],
        description=stream_dict["description"],
        first_message_id=stream_dict["first_message_id"],
        history_public_to_subscribers=stream_dict["history_public_to_subscribers"],
        invite_only=stream_dict["invite_only"],
        is_web_public=stream_dict["is_web_public"],
        message_retention_days=stream_dict["message_retention_days"],
        name=stream_dict["name"],
        rendered_description=stream_dict["rendered_description"],
        id=stream_dict["stream_id"],
        stream_post_policy=stream_dict["stream_post_policy"],
    )

    # Passing recent_traffic=None skips the traffic computation; the
    # value already present in stream_dict is copied over instead, which
    # avoids an extra database query.
    result = build_stream_dict_for_sub(
        user=user,
        sub_dict=sub_dict,
        raw_stream_dict=raw_fields,
        recent_traffic=None,
    )
    result["stream_weekly_traffic"] = stream_dict["stream_weekly_traffic"]
    return result


def build_stream_dict_for_sub(
    user: UserProfile,
    sub_dict: RawSubscriptionDict,
    raw_stream_dict: RawStreamDict,
    recent_traffic: dict[int, int] | None,
) -> SubscriptionStreamDict:
    """Combine a stream row and a subscription row into one API dict.

    Stream-level values come from `raw_stream_dict`, per-user values from
    `sub_dict`.  `stream_weekly_traffic` is computed from `recent_traffic`
    when that is provided and is None otherwise.  The `user` argument is
    accepted but not read by this function.

    Our caller may add a subscribers field.
    """
    muted = sub_dict["is_muted"]
    post_policy = raw_stream_dict["stream_post_policy"]

    # A computed field that does not come directly from the data models.
    weekly_traffic = (
        None
        if recent_traffic is None
        else get_average_weekly_stream_traffic(
            raw_stream_dict["id"], raw_stream_dict["date_created"], recent_traffic
        )
    )

    return SubscriptionStreamDict(
        is_archived=raw_stream_dict["deactivated"],
        audible_notifications=sub_dict["audible_notifications"],
        can_remove_subscribers_group=raw_stream_dict["can_remove_subscribers_group_id"],
        color=sub_dict["color"],
        creator_id=raw_stream_dict["creator_id"],
        date_created=datetime_to_timestamp(raw_stream_dict["date_created"]),
        description=raw_stream_dict["description"],
        desktop_notifications=sub_dict["desktop_notifications"],
        email_notifications=sub_dict["email_notifications"],
        first_message_id=raw_stream_dict["first_message_id"],
        history_public_to_subscribers=raw_stream_dict["history_public_to_subscribers"],
        # Backwards-compatibility for clients that haven't been updated
        # for the in_home_view => is_muted API migration.
        in_home_view=not muted,
        invite_only=raw_stream_dict["invite_only"],
        # Backwards-compatibility for clients that haven't been updated
        # for the is_announcement_only -> stream_post_policy migration.
        is_announcement_only=post_policy == Stream.STREAM_POST_POLICY_ADMINS,
        is_muted=muted,
        is_web_public=raw_stream_dict["is_web_public"],
        message_retention_days=raw_stream_dict["message_retention_days"],
        name=raw_stream_dict["name"],
        pin_to_top=sub_dict["pin_to_top"],
        push_notifications=sub_dict["push_notifications"],
        rendered_description=raw_stream_dict["rendered_description"],
        stream_id=raw_stream_dict["id"],
        stream_post_policy=post_policy,
        stream_weekly_traffic=weekly_traffic,
        wildcard_mentions_notify=sub_dict["wildcard_mentions_notify"],
    )


def build_stream_dict_for_never_sub(
    raw_stream_dict: RawStreamDict,
    recent_traffic: dict[int, int] | None,
) -> NeverSubscribedStreamDict:
    """Build the API dict for a stream that has no subscription row.

    Only stream-level fields are included.  `stream_weekly_traffic` is
    computed from `recent_traffic` when that is provided and is None
    otherwise.

    Our caller may add a subscribers field.
    """
    post_policy = raw_stream_dict["stream_post_policy"]

    weekly_traffic = (
        None
        if recent_traffic is None
        else get_average_weekly_stream_traffic(
            raw_stream_dict["id"], raw_stream_dict["date_created"], recent_traffic
        )
    )

    return NeverSubscribedStreamDict(
        is_archived=raw_stream_dict["deactivated"],
        can_remove_subscribers_group=raw_stream_dict["can_remove_subscribers_group_id"],
        creator_id=raw_stream_dict["creator_id"],
        date_created=datetime_to_timestamp(raw_stream_dict["date_created"]),
        description=raw_stream_dict["description"],
        first_message_id=raw_stream_dict["first_message_id"],
        history_public_to_subscribers=raw_stream_dict["history_public_to_subscribers"],
        invite_only=raw_stream_dict["invite_only"],
        # Backwards-compatibility addition of removed field.
        is_announcement_only=post_policy == Stream.STREAM_POST_POLICY_ADMINS,
        is_web_public=raw_stream_dict["is_web_public"],
        message_retention_days=raw_stream_dict["message_retention_days"],
        name=raw_stream_dict["name"],
        rendered_description=raw_stream_dict["rendered_description"],
        stream_id=raw_stream_dict["id"],
        stream_post_policy=post_policy,
        stream_weekly_traffic=weekly_traffic,
    )


def validate_user_access_to_subscribers(user_profile: UserProfile | None, stream: Stream) -> None:
    """Check that `user_profile` may view the subscribers of `stream`.

    Delegates to validate_user_access_to_subscribers_helper, which raises
    a ValidationError when no user is given or the user and the stream
    are in different realms, and a JsonableError when:
    * The user cannot access public streams and the stream is not
      invite only.
    * The stream is invite only and the user is neither an organization
      administrator nor subscribed to it.
    """

    def is_subscribed(user: UserProfile) -> bool:
        # Deferred so that the subscription lookup only runs if the
        # helper actually needs the answer.
        return subscribed_to_stream(user, stream.id)

    stream_fields = {
        "realm_id": stream.realm_id,
        "is_web_public": stream.is_web_public,
        "invite_only": stream.invite_only,
    }
    validate_user_access_to_subscribers_helper(user_profile, stream_fields, is_subscribed)


def validate_user_access_to_subscribers_helper(
    user_profile: UserProfile | None,
    stream_dict: Mapping[str, Any],
    check_user_subscribed: Callable[[UserProfile], bool],
) -> None:
    """Core of validate_user_access_to_subscribers, working from a plain
    mapping of stream fields rather than a full stream object.

    `stream_dict` must provide "realm_id", "is_web_public" and
    "invite_only".  `check_user_subscribed` reports whether the user is
    subscribed to the stream; it is invoked lazily, and only on the
    paths that need the answer, because the two callers have different
    costs for it:

    * `bulk_get_subscriber_user_ids` already knows the answer and calls
      this in a loop, so no database query should happen at all;
    * `validate_user_access_to_subscribers` pays a database query for
      the answer, so it should only be asked for when unavoidable.

    Returns None when access is allowed.  Raises ValidationError when
    the caller is doing the wrong thing (missing user, or user and
    stream in different realms; maybe these should be AssertionErrors),
    and JsonableError for 400 type errors.
    """
    if user_profile is None:
        raise ValidationError("Missing user to validate access for")

    if user_profile.realm_id != stream_dict["realm_id"]:
        raise ValidationError("Requesting user not in given realm")

    # Web-public streams are open to everyone, guests included, since
    # anyone can freely become a subscriber to them.
    if stream_dict["is_web_public"]:
        return

    # Apart from web-public streams, a guest has to be subscribed to a
    # stream (even a public one) to see its subscribers.
    if user_profile.is_guest:
        if check_user_subscribed(user_profile):
            return
        # An unsubscribed guest is not rejected here; it falls through
        # to the checks below.

    if not (user_profile.can_access_public_streams() or stream_dict["invite_only"]):
        raise JsonableError(_("Subscriber data is not available for this channel"))

    # Organization administrators can view subscribers for all streams.
    if user_profile.is_realm_admin:
        return

    # Nothing further to verify unless the stream is private.
    if not stream_dict["invite_only"]:
        return

    if not check_user_subscribed(user_profile):
        raise JsonableError(_("Unable to retrieve subscribers for private channel"))


def bulk_get_subscriber_user_ids(
    stream_dicts: Collection[Mapping[str, Any]],
    user_profile: UserProfile,
    subscribed_stream_ids: set[int],
) -> dict[int, list[int]]:
    """Return the subscriber user IDs of each stream `user_profile` may inspect.

    `stream_dicts` are stream rows providing at least "id" and
    "recipient_id", plus the keys read by
    validate_user_access_to_subscribers_helper.  `subscribed_stream_ids`
    is the set of stream IDs that `user_profile` is subscribed to; it is
    used to answer the helper's subscription check without a database
    query.

    The result has one entry per stream in `stream_dicts`, keyed by
    stream ID.  Streams for which the access check raises JsonableError,
    and streams with no matching subscription rows, map to an empty
    list.  Other entries list the user IDs of rows that are `active` and
    `is_user_active`, in ascending order.  A ValidationError from the
    access check is not caught and propagates to the caller.
    """
    target_stream_dicts = []
    is_subscribed: bool
    # This closure reads `is_subscribed` when it is called, so updating
    # the variable on each iteration is enough; no per-stream callable
    # needs to be created.
    check_user_subscribed = lambda user_profile: is_subscribed

    for stream_dict in stream_dicts:
        stream_id = stream_dict["id"]
        is_subscribed = stream_id in subscribed_stream_ids

        try:
            validate_user_access_to_subscribers_helper(
                user_profile,
                stream_dict,
                check_user_subscribed,
            )
        except JsonableError:
            # No access to this stream's subscribers: leave its entry empty.
            continue
        target_stream_dicts.append(stream_dict)

    recip_to_stream_id = {stream["recipient_id"]: stream["id"] for stream in target_stream_dicts}
    recipient_ids = sorted(stream["recipient_id"] for stream in target_stream_dicts)

    result: dict[int, list[int]] = {stream["id"]: [] for stream in stream_dicts}
    if not recipient_ids:
        # Nothing to look up, so skip the query entirely.
        return result

    # The raw SQL below leads to more than a 2x speedup when tested with
    # 20k+ total subscribers.  (For large realms with lots of default
    # streams, this function deals with LOTS of data, so it is important
    # to optimize.)
    query = SQL(
        """
        SELECT
            zerver_subscription.recipient_id,
            zerver_subscription.user_profile_id
        FROM
            zerver_subscription
        WHERE
            zerver_subscription.recipient_id in %(recipient_ids)s AND
            zerver_subscription.active AND
            zerver_subscription.is_user_active
        ORDER BY
            zerver_subscription.recipient_id,
            zerver_subscription.user_profile_id
        """
    )

    # The context manager closes the cursor even if the query raises.
    with connection.cursor() as cursor:
        cursor.execute(query, {"recipient_ids": tuple(recipient_ids)})
        rows = cursor.fetchall()

    # Using groupby/itemgetter here is important for performance, at scale.
    # It makes it so that all interpreter overhead is just O(N) in nature.
    # groupby only merges adjacent rows, which the ORDER BY on
    # recipient_id above guarantees.
    for recip_id, recip_rows in itertools.groupby(rows, itemgetter(0)):
        result[recip_to_stream_id[recip_id]] = [row[1] for row in recip_rows]

    return result


def get_subscribers_query(
    stream: Stream, requesting_user: UserProfile | None
) -> QuerySet[Subscription]:
    """Build a query for the subscriptions of `stream`.

    Access is checked first with validate_user_access_to_subscribers, so
    any exception raised by that check propagates to the caller.  The
    query itself comes from get_active_subscriptions_for_stream_id with
    include_deactivated_users=False.

    The caller can refine this query with select_related(), values(), etc. depending
    on whether it wants objects or just certain fields
    """
    validate_user_access_to_subscribers(requesting_user, stream)

    subscriptions = get_active_subscriptions_for_stream_id(
        stream.id, include_deactivated_users=False
    )
    return subscriptions


def has_metadata_access_to_previously_subscribed_stream(
    user_profile: UserProfile, stream_dict: SubscriptionStreamDict
) -> bool:
    """Report whether `user_profile` may see this stream's metadata.

    Web-public streams are always visible.  Otherwise the user must be
    able to access public streams, and for invite-only streams must
    additionally be an organization administrator.
    """
    if stream_dict["is_web_public"]:
        return True

    if not user_profile.can_access_public_streams():
        return False

    # Private streams are limited to administrators; public ones are
    # visible to anyone who got this far.
    return user_profile.is_realm_admin if stream_dict["invite_only"] else True


# In general, it's better to avoid using .values() because it makes
# the code pretty ugly, but in this case, it has significant
# performance impact for loading / for users with large numbers of
# subscriptions, so it's worth optimizing.
def gather_subscriptions_helper(
    user_profile: UserProfile,
    include_subscribers: bool = True,
    include_archived_channels: bool = False,
) -> SubscriptionInfo:
    """Collect the stream data for `user_profile`, split into three lists.

    * subscriptions: streams with an active subscription row.
    * unsubscribed: streams with an inactive subscription row, kept only
      when has_metadata_access_to_previously_subscribed_stream allows it.
    * never_subscribed: other streams the user is allowed to see.

    Each list is sorted by stream name.  With `include_subscribers`,
    every entry also gets a "subscribers" list from
    bulk_get_subscriber_user_ids.  `include_archived_channels` is
    forwarded to get_all_streams.
    """
    realm = user_profile.realm
    all_streams = get_all_streams(
        realm, include_archived_channels=include_archived_channels
    ).values(
        *Stream.API_FIELDS,
        # The realm_id and recipient_id are generally not needed in the API.
        "realm_id",
        "recipient_id",
    )

    # Index the stream rows both by recipient ID and by stream ID.
    recip_id_to_stream_id: dict[int, int] = {}
    all_streams_map: dict[int, RawStreamDict] = {}
    for stream_row in all_streams:
        recip_id_to_stream_id[stream_row["recipient_id"]] = stream_row["id"]
        all_streams_map[stream_row["id"]] = stream_row

    subscription_rows: Iterable[RawSubscriptionDict] = (
        get_stream_subscriptions_for_user(user_profile)
        .values(
            *Subscription.API_FIELDS,
            "recipient_id",
            "active",
        )
        .order_by("recipient_id")
    )

    # Keep only subscriptions whose stream is among the rows fetched
    # above, paired with the ID of that stream.
    subs_with_stream_ids: list[tuple[int, RawSubscriptionDict]] = [
        (recip_id_to_stream_id[row["recipient_id"]], row)
        for row in subscription_rows
        if recip_id_to_stream_id.get(row["recipient_id"])
    ]

    sub_unsub_stream_ids = {stream_id for stream_id, _sub_dict in subs_with_stream_ids}
    recent_traffic = get_streams_traffic(stream_ids=set(sub_unsub_stream_ids), realm=realm)

    # The three result lists.
    subscribed: list[SubscriptionStreamDict] = []
    unsubscribed: list[SubscriptionStreamDict] = []
    never_subscribed: list[NeverSubscribedStreamDict] = []

    for stream_id, sub_dict in subs_with_stream_ids:
        stream_dict = build_stream_dict_for_sub(
            user=user_profile,
            sub_dict=sub_dict,
            raw_stream_dict=all_streams_map[stream_id],
            recent_traffic=recent_traffic,
        )

        # Whether the subscription is active is represented in this
        # structure by which list the stream lands in.
        if sub_dict["active"]:
            subscribed.append(stream_dict)
            continue

        # Users who are no longer subscribed to a stream that they don't
        # have metadata access to will not receive metadata related to
        # this stream, and their clients will see it as an unknown stream
        # if referenced somewhere (e.g. a markdown stream link), just
        # like they would see a reference to a private stream they had
        # never been subscribed to.
        if has_metadata_access_to_previously_subscribed_stream(user_profile, stream_dict):
            unsubscribed.append(stream_dict)

    if user_profile.can_access_public_streams():
        candidate_stream_ids = {row["id"] for row in all_streams if not row["deactivated"]}
    else:
        candidate_stream_ids = {row["id"] for row in all_streams if row["is_web_public"]}
    never_subscribed_stream_ids = candidate_stream_ids - sub_unsub_stream_ids

    for stream_id in never_subscribed_stream_ids:
        raw_stream_dict = all_streams_map[stream_id]
        # Private streams are only listed for organization administrators.
        if raw_stream_dict["invite_only"] and not user_profile.is_realm_admin:
            continue
        never_subscribed.append(
            build_stream_dict_for_never_sub(
                raw_stream_dict=raw_stream_dict, recent_traffic=recent_traffic
            )
        )

    if include_subscribers:
        # The highly optimized bulk_get_subscriber_user_ids wants to know which
        # streams we are subscribed to, for validation purposes, and it uses that
        # info to know if it's allowed to find OTHER subscribers.
        subscribed_stream_ids = {
            stream_id for stream_id, sub_dict in subs_with_stream_ids if sub_dict["active"]
        }

        subscriber_map = bulk_get_subscriber_user_ids(
            all_streams,
            user_profile,
            subscribed_stream_ids,
        )

        for stream_dict in itertools.chain(subscribed, unsubscribed):
            assert isinstance(stream_dict["stream_id"], int)
            stream_dict["subscribers"] = subscriber_map[stream_dict["stream_id"]]

        for slim_stream_dict in never_subscribed:
            assert isinstance(slim_stream_dict["stream_id"], int)
            slim_stream_dict["subscribers"] = subscriber_map[slim_stream_dict["stream_id"]]

    by_name = itemgetter("name")
    subscribed.sort(key=by_name)
    unsubscribed.sort(key=by_name)
    never_subscribed.sort(key=by_name)

    return SubscriptionInfo(
        subscriptions=subscribed,
        unsubscribed=unsubscribed,
        never_subscribed=never_subscribed,
    )


def gather_subscriptions(
    user_profile: UserProfile,
    include_subscribers: bool = False,
) -> tuple[list[SubscriptionStreamDict], list[SubscriptionStreamDict]]:
    """Return the (subscribed, unsubscribed) stream lists for `user_profile`.

    Thin wrapper around gather_subscriptions_helper that drops the
    never-subscribed list from its result.
    """
    info = gather_subscriptions_helper(user_profile, include_subscribers=include_subscribers)
    return (info.subscriptions, info.unsubscribed)