2020-08-11 01:47:49 +02:00
|
|
|
# Zulip's main Markdown implementation. See docs/subsystems/markdown.md for
|
|
|
|
# detailed documentation on our Markdown syntax.
|
2021-08-04 10:10:20 +02:00
|
|
|
import cgi
|
2020-06-18 23:14:53 +02:00
|
|
|
import datetime
|
2020-06-11 00:54:34 +02:00
|
|
|
import html
|
2012-10-22 05:06:28 +02:00
|
|
|
import logging
|
2020-06-11 00:54:34 +02:00
|
|
|
import re
|
|
|
|
import time
|
2017-11-05 05:30:31 +01:00
|
|
|
import urllib
|
2019-12-12 01:28:29 +01:00
|
|
|
import urllib.parse
|
2021-06-13 00:51:30 +02:00
|
|
|
from collections import deque
|
2020-06-11 21:44:23 +02:00
|
|
|
from dataclasses import dataclass
|
2023-02-09 23:39:41 +01:00
|
|
|
from functools import lru_cache
|
2021-03-18 02:11:28 +01:00
|
|
|
from typing import (
|
|
|
|
Any,
|
|
|
|
Callable,
|
|
|
|
Dict,
|
|
|
|
Generic,
|
|
|
|
List,
|
|
|
|
Match,
|
|
|
|
Optional,
|
|
|
|
Pattern,
|
|
|
|
Set,
|
|
|
|
Tuple,
|
2022-04-27 02:23:56 +02:00
|
|
|
TypedDict,
|
2021-03-18 02:11:28 +01:00
|
|
|
TypeVar,
|
|
|
|
Union,
|
2023-08-10 21:00:45 +02:00
|
|
|
cast,
|
2021-03-18 02:11:28 +01:00
|
|
|
)
|
2023-02-10 00:45:11 +01:00
|
|
|
from urllib.parse import parse_qs, urlencode, urljoin, urlsplit
|
2020-06-03 06:37:07 +02:00
|
|
|
from xml.etree.ElementTree import Element, SubElement
|
2013-05-01 22:49:16 +02:00
|
|
|
|
2020-06-11 00:54:34 +02:00
|
|
|
import ahocorasick
|
|
|
|
import dateutil.parser
|
|
|
|
import dateutil.tz
|
2021-08-04 10:10:20 +02:00
|
|
|
import lxml.etree
|
2020-06-11 00:54:34 +02:00
|
|
|
import markdown
|
2020-10-19 06:37:43 +02:00
|
|
|
import markdown.blockprocessors
|
|
|
|
import markdown.inlinepatterns
|
|
|
|
import markdown.postprocessors
|
|
|
|
import markdown.treeprocessors
|
|
|
|
import markdown.util
|
CVE-2021-41115: Use re2 for user-supplied linkifier patterns.
Zulip attempts to validate that the regular expressions that admins
enter for linkifiers are well-formatted, and only contain a specific
subset of regex grammar. The process of checking these
properties (via a regex!) can cause denial-of-service via
backtracking.
Furthermore, this validation itself does not prevent the creation of
linkifiers which themselves cause denial-of-service when they are
executed. As the validator accepts literally anything inside of a
`(?P<word>...)` block, any quadratic backtracking expression can be
hidden therein.
Switch user-provided linkifier patterns to be matched in the Markdown
processor by the `re2` library, which is guaranteed constant-time.
This somewhat limits the possible features of the regular
expression (notably, look-ahead and -behind, and back-references);
however, these features had never been advertised as working in the
context of linkifiers.
A migration removes any existing linkifiers which would not function
under re2, after printing them for posterity during the upgrade; they
are unlikely to be common, and are impossible to fix automatically.
The denial-of-service in the linkifier validator was discovered by
@erik-krogh and @yoff, as GHSL-2021-118.
2021-09-29 01:27:54 +02:00
|
|
|
import re2
|
2023-08-10 21:00:45 +02:00
|
|
|
import regex
|
2014-05-21 08:11:29 +02:00
|
|
|
import requests
|
linkifier: Support URL templates for linkifiers.
This swaps out url_format_string from all of our APIs and replaces it
with url_template. Note that the documentation changes in the following
commits will be squashed with this commit.
We change the "url_format" key to "url_template" for the
realm_linkifiers events in event_schema, along with updating
LinkifierDict. "url_template" is the name chosen to normalize
mixed usages of "url_format_string" and "url_format" throughout
the backend.
The markdown processor is updated to stop handling the format string
interpolation and delegate the task template expansion to the uri_template
library instead.
This change affects many test cases. We mostly just replace "%(name)s"
with "{name}", "url_format_string" with "url_template" to make sure that
they still pass. There are some test cases dedicated for testing "%"
escaping, which aren't relevant anymore and are subject to removal.
But for now we keep most of them as-is, and make sure that "%" is always
escaped since we do not use it for variable substitution any more.
Since url_format_string is not populated anymore, a migration is created
to remove this field entirely, and make url_template non-nullable since
we will always populate it. Note that it is possible to have
url_template being null after migration 0422 and before 0424, but
in practice, url_template will not be None after backfilling and the
backend now is always setting url_template.
With the removal of url_format_string, RealmFilter model will now be cleaned
with URL template checks, and the old checks for escapes are removed.
We also modified RealmFilter.clean to skip the validation when the
url_template is invalid. This avoids raising multiple ValidationError's
when calling full_clean on a linkifier. But we might eventually want to
have a more centric approach to data validation instead of having
the same validation in both the clean method and the validator.
Fixes #23124.
Signed-off-by: Zixuan James Li <p359101898@gmail.com>
2022-10-05 20:55:31 +02:00
|
|
|
import uri_template
|
2013-03-08 06:27:16 +01:00
|
|
|
from django.conf import settings
|
2020-11-11 00:39:09 +01:00
|
|
|
from markdown.blockparser import BlockParser
|
2020-06-11 00:54:34 +02:00
|
|
|
from markdown.extensions import codehilite, nl2br, sane_lists, tables
|
2021-10-21 23:20:56 +02:00
|
|
|
from soupsieve import escape as css_escape
|
2020-09-22 03:10:16 +02:00
|
|
|
from tlds import tld_set
|
2023-08-02 23:53:10 +02:00
|
|
|
from typing_extensions import TypeAlias
|
2013-01-31 19:57:25 +01:00
|
|
|
|
2022-12-04 20:51:44 +01:00
|
|
|
from zerver.lib import mention
|
2022-04-14 21:57:20 +02:00
|
|
|
from zerver.lib.cache import cache_with_key
|
2016-04-28 05:40:58 +02:00
|
|
|
from zerver.lib.camo import get_camo_url
|
2021-05-15 12:02:50 +02:00
|
|
|
from zerver.lib.emoji import EMOTICON_RE, codepoint_to_name, name_to_codepoint, translate_emoticons
|
2023-08-10 21:00:45 +02:00
|
|
|
from zerver.lib.emoji_utils import emoji_to_hex_codepoint, unqualify_emoji
|
2022-11-17 09:30:48 +01:00
|
|
|
from zerver.lib.exceptions import MarkdownRenderingError
|
2020-06-25 15:00:33 +02:00
|
|
|
from zerver.lib.markdown import fenced_code
|
|
|
|
from zerver.lib.markdown.fenced_code import FENCE_RE
|
2022-06-13 06:02:57 +02:00
|
|
|
from zerver.lib.mention import (
|
|
|
|
BEFORE_MENTION_ALLOWED_REGEX,
|
|
|
|
FullNameInfo,
|
|
|
|
MentionBackend,
|
|
|
|
MentionData,
|
|
|
|
)
|
2021-05-07 03:54:25 +02:00
|
|
|
from zerver.lib.outgoing_http import OutgoingSession
|
2021-03-23 10:34:55 +01:00
|
|
|
from zerver.lib.subdomains import is_static_or_current_realm_url
|
2020-06-11 00:54:34 +02:00
|
|
|
from zerver.lib.tex import render_tex
|
2019-01-04 16:22:04 +01:00
|
|
|
from zerver.lib.thumbnail import user_uploads_or_external
|
2023-05-29 18:19:45 +02:00
|
|
|
from zerver.lib.timeout import timeout
|
timezone: Correct common_timezones dictionary.
The changes are as follows:
• Fix one day offset in all western zones.
• Correct CST from -64800 to -21600 and CDT from -68400 to -18000.
• Disambiguate PST in favor of -28800 over +28800.
• Add GMT, UTC, WET, previously excluded for being at offset 0.
• Add ACDT, AEDT, AKST, MET, MSK, NST, NZDT, PKT, which the previous
code did not find.
• Remove numbered abbreviations -12, …, +14, which are unnecessary.
• Remove MSD and PKST, which are no longer used.
Hardcode the dict and verify it with a test, so that future
discrepancies won’t go silently unnoticed.
Signed-off-by: Anders Kaseorg <anders@zulip.com>
2021-01-27 22:12:36 +01:00
|
|
|
from zerver.lib.timezone import common_timezones
|
2021-03-30 12:38:49 +02:00
|
|
|
from zerver.lib.types import LinkifierDict
|
2020-06-11 00:54:34 +02:00
|
|
|
from zerver.lib.url_encoding import encode_stream, hash_util_encode
|
2022-04-14 21:52:41 +02:00
|
|
|
from zerver.lib.url_preview.types import UrlEmbedData, UrlOEmbedData
|
2023-07-14 12:37:29 +02:00
|
|
|
from zerver.models import (
|
|
|
|
EmojiInfo,
|
|
|
|
Message,
|
|
|
|
Realm,
|
|
|
|
get_name_keyed_dict_for_active_realm_emoji,
|
|
|
|
linkifiers_for_realm,
|
|
|
|
)
|
2013-04-29 22:22:07 +02:00
|
|
|
|
2021-02-12 08:20:45 +01:00
|
|
|
# Generic type variable; its users are not visible in this part of the file.
ReturnT = TypeVar("ReturnT")


# Taken from
# https://html.spec.whatwg.org/multipage/system-state.html#safelisted-scheme
html_safelisted_schemes = (
    "bitcoin",
    "geo",
    "im",
    "irc",
    "ircs",
    "magnet",
    "mailto",
    "matrix",
    "mms",
    "news",
    "nntp",
    "openpgp4fpr",
    "sip",
    "sms",
    "smsto",
    "ssh",
    "tel",
    "urn",
    "webcal",
    "wtai",
    "xmpp",
)
# The safelisted schemes above, preceded by the ordinary web/file schemes.
allowed_schemes = ("http", "https", "ftp", "file", *html_safelisted_schemes)
|
2021-08-05 09:01:18 +02:00
|
|
|
|
|
|
|
|
2020-07-13 17:54:58 +02:00
|
|
|
class LinkInfo(TypedDict):
    """Typed dictionary describing a link found in the element tree."""

    # Element that contains the link.
    parent: Element
    # Optional title associated with the link.
    title: Optional[str]
    # Optional position; NOTE(review): presumably an index into `parent` -- confirm at use sites.
    index: Optional[int]
    # Optional element slated for removal; NOTE(review): confirm semantics at use sites.
    remove: Optional[Element]
|
|
|
|
|
2021-02-12 08:19:30 +01:00
|
|
|
|
2021-06-17 12:20:40 +02:00
|
|
|
@dataclass
class MessageRenderingResult:
    """Rendered message content together with metadata fields about it."""

    rendered_content: str
    mentions_topic_wildcard: bool
    mentions_stream_wildcard: bool
    mentions_user_ids: Set[int]
    mentions_user_group_ids: Set[int]
    alert_words: Set[str]
    links_for_preview: Set[str]
    user_ids_with_alert_words: Set[int]
    potential_attachment_path_ids: List[str]

    def has_wildcard_mention(self) -> bool:
        """Return whether either wildcard-mention flag (stream or topic) is set."""
        return self.mentions_stream_wildcard or self.mentions_topic_wildcard
|
|
|
|
|
2021-06-17 12:20:40 +02:00
|
|
|
|
2021-12-27 19:17:49 +01:00
|
|
|
@dataclass
class DbData:
    """Bundle of data handed to rendering helpers in this module.

    NOTE(review): the producer of this object is not visible here; the
    field comments below only state what this file shows.
    """

    mention_data: MentionData
    # rewrite_local_links_to_relative() strips `realm_uri + "/"` from local links.
    realm_uri: str
    # May be None; NOTE(review): presumably when there are no alert words -- confirm.
    realm_alert_words_automaton: Optional[ahocorasick.Automaton]
    active_realm_emoji: Dict[str, EmojiInfo]
    sent_by_bot: bool
    # NOTE(review): presumably maps stream name to stream id -- confirm against producer.
    stream_names: Dict[str, int]
    translate_emoticons: bool
|
|
|
|
|
2018-11-07 16:07:34 +01:00
|
|
|
|
2020-08-11 01:47:49 +02:00
|
|
|
# Format version of the Markdown rendering; stored along with rendered
# messages so that we can efficiently determine what needs to be re-rendered
version = 1

_T = TypeVar("_T")
# Either an Element, a string, or None.
ElementStringNone: TypeAlias = Union[Element, Optional[str]]

# Matches `:name:` syntax; the named group `syntax` includes both colons.
EMOJI_REGEX = r"(?P<syntax>:[\w\-\+]+:)"
|
2017-09-14 22:11:34 +02:00
|
|
|
|
2021-02-12 08:19:30 +01:00
|
|
|
|
2021-03-18 02:11:28 +01:00
|
|
|
def verbose_compile(pattern: str) -> Pattern[str]:
    """Compile `pattern` wrapped as `^(.*?)<pattern>(.*?)$`.

    The result is compiled with re.DOTALL and re.VERBOSE, so whitespace
    and `#` comments inside `pattern` are ignored by the regex engine.
    Because the wrapper text follows `pattern` on the same line, a
    pattern ending in an unterminated `#` comment would swallow the
    trailing `(.*?)$`; end such patterns with a newline.
    """
    wrapped = f"^(.*?){pattern}(.*?)$"
    return re.compile(wrapped, re.DOTALL | re.VERBOSE)
|
|
|
|
|
2021-02-12 08:19:30 +01:00
|
|
|
|
2022-06-13 06:02:57 +02:00
|
|
|
# Verbose-mode regex source for `#**stream name**` links; compile it with
# re.VERBOSE so the layout whitespace and inline comments are ignored.
STREAM_LINK_REGEX = rf"""
                     {BEFORE_MENTION_ALLOWED_REGEX} # Start after whitespace or specified chars
                     \#\*\*                         # and after hash sign followed by double asterisks
                         (?P<stream_name>[^\*]+)    # stream name can contain anything
                     \*\*                           # ends by double asterisks
                    """
|
|
|
|
|
2021-02-12 08:19:30 +01:00
|
|
|
|
2023-02-09 23:39:41 +01:00
|
|
|
@lru_cache(None)
def get_compiled_stream_link_regex() -> Pattern[str]:
    """Return the compiled STREAM_LINK_REGEX; the result is memoized by lru_cache."""
    # Not using verbose_compile as it adds ^(.*?) and
    # (.*?)$ which cause extra overhead of matching
    # pattern which is not required.
    # With new InlineProcessor these extra patterns
    # are not required.
    return re.compile(
        STREAM_LINK_REGEX,
        re.DOTALL | re.VERBOSE,
    )
|
2019-01-22 20:16:39 +01:00
|
|
|
|
2021-02-12 08:19:30 +01:00
|
|
|
|
2022-06-13 06:02:57 +02:00
|
|
|
# Verbose-mode regex source for `#**stream name>topic name**` links; compile
# it with re.VERBOSE so the layout whitespace and inline comments are ignored.
STREAM_TOPIC_LINK_REGEX = rf"""
                     {BEFORE_MENTION_ALLOWED_REGEX}  # Start after whitespace or specified chars
                     \#\*\*                          # and after hash sign followed by double asterisks
                         (?P<stream_name>[^\*>]+)    # stream name can contain anything except >
                         >                           # > acts as separator
                         (?P<topic_name>[^\*]+)      # topic name can contain anything
                     \*\*                            # ends by double asterisks
                   """
|
|
|
|
|
2021-02-12 08:19:30 +01:00
|
|
|
|
2023-02-09 23:39:41 +01:00
|
|
|
@lru_cache(None)
def get_compiled_stream_topic_link_regex() -> Pattern[str]:
    """Return the compiled STREAM_TOPIC_LINK_REGEX; the result is memoized by lru_cache."""
    # Not using verbose_compile as it adds ^(.*?) and
    # (.*?)$ which cause extra overhead of matching
    # pattern which is not required.
    # With new InlineProcessor these extra patterns
    # are not required.
    return re.compile(
        STREAM_TOPIC_LINK_REGEX,
        re.DOTALL | re.VERBOSE,
    )
|
2019-06-21 17:31:16 +02:00
|
|
|
|
2021-02-12 08:19:30 +01:00
|
|
|
|
2023-02-09 23:39:41 +01:00
|
|
|
@lru_cache(None)
def get_web_link_regex() -> Pattern[str]:
    """Build and return the compiled regex that detects bare links in text.

    The pattern exposes the matched link as the named group `url` and is
    compiled through verbose_compile(), so it is additionally wrapped in
    `^(.*?)` ... `(.*?)$`.  The result is memoized by lru_cache; use
    get_web_link_regex.cache_clear() to force a rebuild (for example after
    changing settings.ENABLE_FILE_LINKS).
    """
    # We create this one time, but not at startup.  So the
    # first message rendered in any process will have some
    # extra costs.  It's roughly 75ms to run this code, so
    # caching the value is super important here.

    tlds = "|".join(list_of_tlds())

    # A link starts at a word boundary, and ends at space, punctuation, or end-of-input.
    #
    # We detect a URL either by the `https?://` or by building around the TLD.

    # In lieu of having a recursive regex (which python doesn't support) to match
    # arbitrary numbers of nested matching parenthesis, we manually build a regexp that
    # can match a fixed number of nesting levels (set by the loop below).
    # The inner_paren_contents chunk matches the innermost non-parenthesis-holding text,
    # and the paren_group matches text with, optionally, a matching set of parens
    inner_paren_contents = r"[^\s()\"]*"
    # NOTE: the trailing newline before the closing quotes matters: it ends the
    # last verbose-mode comment so that text substituted after it is not swallowed.
    paren_group = r"""
                    [^\s()\"]*?            # Containing characters that won't end the URL
                    (?: \( %s \)           # and more characters in matched parens
                        [^\s()\"]*?        # followed by more characters
                    )*                     # zero-or-more sets of paired parens
                   """
    nested_paren_chunk = paren_group
    # Each pass fills the single %s placeholder with another paren_group,
    # which itself carries a fresh placeholder one level deeper.
    for _ in range(6):
        nested_paren_chunk = nested_paren_chunk % (paren_group,)
    # Close off the innermost placeholder with paren-free contents.
    nested_paren_chunk = nested_paren_chunk % (inner_paren_contents,)

    file_links = r"| (?:file://(/[^/ ]*)+/?)" if settings.ENABLE_FILE_LINKS else r""
    link_pattern = rf"""
        (?<![^\s'"\(,:<])    # Start after whitespace or specified chars
                             # (Double-negative lookbehind to allow start-of-string)
        (?P<url>             # Main group
            (?:(?:           # Domain part
                https?://[\w.:@-]+?   # If it has a protocol, anything goes.
               |(?:                   # Or, if not, be more strict to avoid false-positives
                    (?:[\w-]+\.)+     # One or more domain components, separated by dots
                    (?:{tlds})        # TLDs
                )
            )
            (?:/             # A path, beginning with /
                {nested_paren_chunk}           # zero-to-6 sets of paired parens
            )?)              # Path is optional
            | (?:[\w.-]+\@[\w.-]+\.[\w]+) # Email is separate, since it can't have a path
            {file_links}               # File path start with file:///, enable by setting ENABLE_FILE_LINKS=True
            | (?:bitcoin:[13][a-km-zA-HJ-NP-Z1-9]{{25,34}})  # Bitcoin address pattern, see https://mokagio.github.io/tech-journal/2014/11/21/regex-bitcoin.html
        )
        (?=                            # URL must be followed by (not included in group)
            [!:;\?\),\.\'\"\>]*         # Optional punctuation characters
            (?:\Z|\s)                  # followed by whitespace or end of string
        )
        """
    return verbose_compile(link_pattern)
|
2018-11-03 17:12:15 +01:00
|
|
|
|
2021-02-12 08:19:30 +01:00
|
|
|
|
2018-11-03 17:12:15 +01:00
|
|
|
def clear_state_for_testing() -> None:
    """Drop the memoized web-link regex so the next call rebuilds it."""
    # The link regex never changes in production, but our tests
    # try out both sides of ENABLE_FILE_LINKS, so we need
    # a way to clear it.
    get_web_link_regex.cache_clear()
|
2018-11-03 17:12:15 +01:00
|
|
|
|
2021-02-12 08:19:30 +01:00
|
|
|
|
2020-06-26 20:54:05 +02:00
|
|
|
# getLogger() is called without a name, so this is the root logger.
markdown_logger = logging.getLogger()
|
2018-07-03 07:25:29 +02:00
|
|
|
|
2021-02-12 08:19:30 +01:00
|
|
|
|
2018-11-07 16:07:34 +01:00
|
|
|
def rewrite_local_links_to_relative(db_data: Optional[DbData], link: str) -> str:
    """If the link points to a local destination (e.g. #narrow/...),
    generate a relative link that will open it in the current window.

    A link counts as local when it starts with `<realm_uri>/#` or
    `<realm_uri>/user_uploads/`; the `<realm_uri>/` prefix is then removed.
    Any other link, or any link when `db_data` is falsy, is returned
    unchanged.
    """
    if db_data:
        realm_uri_prefix = db_data.realm_uri + "/"
        local_prefixes = (realm_uri_prefix + "#", realm_uri_prefix + "user_uploads/")
        if link.startswith(local_prefixes):
            return link[len(realm_uri_prefix) :]

    return link
|
|
|
|
|
2021-02-12 08:19:30 +01:00
|
|
|
|
|
|
|
def url_embed_preview_enabled(
    message: Optional[Message] = None, realm: Optional[Realm] = None, no_previews: bool = False
) -> bool:
    """Return whether URL embed previews should be generated.

    Previews are off when settings.INLINE_URL_EMBED_PREVIEW is false or
    `no_previews` is set.  Otherwise the realm's `inline_url_embed_preview`
    value decides; the realm is taken from `realm`, or from
    `message.get_realm()` when only a message is given.
    """
    if not settings.INLINE_URL_EMBED_PREVIEW or no_previews:
        return False

    if realm is None and message is not None:
        realm = message.get_realm()

    if realm is None:
        # realm can be None for odd use cases
        # like generating documentation or running
        # test code
        return True

    return realm.inline_url_embed_preview
|
|
|
|
|
2021-02-12 08:19:30 +01:00
|
|
|
|
|
|
|
def image_preview_enabled(
    message: Optional[Message] = None, realm: Optional[Realm] = None, no_previews: bool = False
) -> bool:
    """Return whether inline image previews should be generated.

    Previews are off when settings.INLINE_IMAGE_PREVIEW is false or
    `no_previews` is set.  Otherwise the realm's `inline_image_preview`
    value decides; the realm is taken from `realm`, or from
    `message.get_realm()` when only a message is given.
    """
    if not settings.INLINE_IMAGE_PREVIEW or no_previews:
        return False

    if realm is None and message is not None:
        realm = message.get_realm()

    if realm is None:
        # realm can be None for odd use cases
        # like generating documentation or running
        # test code
        return True

    return realm.inline_image_preview
|
|
|
|
|
2021-02-12 08:19:30 +01:00
|
|
|
|
2018-05-11 01:42:51 +02:00
|
|
|
def list_of_tlds() -> List[str]:
    """Return the TLDs from `tld_set`, longest first, minus a few excluded ones."""
    # Skip a few overly-common false-positives from file extensions
    common_false_positives = {"java", "md", "mov", "py", "zip"}
    candidates = tld_set - common_false_positives
    return sorted(candidates, key=len, reverse=True)
|
2013-04-02 17:08:00 +02:00
|
|
|
|
2021-02-12 08:19:30 +01:00
|
|
|
|
|
|
|
def walk_tree(
    root: Element, processor: Callable[[Element], Optional[_T]], stop_after_first: bool = False
) -> List[_T]:
    """Collect the non-None results of `processor` over the descendants of `root`.

    Descendants are visited in breadth-first order (FIFO queue).  `root`
    itself is never passed to `processor`; only its descendants are.  With
    `stop_after_first`, the walk ends as soon as one result is found.
    """
    results = []
    queue = deque([root])

    while queue:
        current = queue.popleft()
        for child in current:
            queue.append(child)

            result = processor(child)
            if result is None:
                continue
            results.append(result)
            if stop_after_first:
                return results

    return results
|
|
|
|
|
2021-02-12 08:19:30 +01:00
|
|
|
|
2020-06-11 21:44:23 +02:00
|
|
|
@dataclass
class ElementFamily:
    """An element together with its parent, grandparent and blockquote status."""

    # None when the parent has no recorded parent of its own.
    grandparent: Optional[Element]
    parent: Element
    child: Element
    # Filled by walk_tree_with_family() from has_blockquote_ancestor().
    in_blockquote: bool
|
2017-12-25 21:35:23 +01:00
|
|
|
|
2021-02-12 08:19:30 +01:00
|
|
|
|
2020-05-09 02:59:28 +02:00
|
|
|
# Type variable for the `result` payload of ResultWithFamily.
T = TypeVar("T")
|
|
|
|
|
2021-02-12 08:19:30 +01:00
|
|
|
|
2020-05-09 02:59:28 +02:00
|
|
|
class ResultWithFamily(Generic[T]):
    """A processor result paired with the ElementFamily it was found in."""

    family: ElementFamily
    result: T

    def __init__(self, family: ElementFamily, result: T) -> None:
        self.family = family
        self.result = result
|
2017-12-25 21:35:23 +01:00
|
|
|
|
2021-02-12 08:19:30 +01:00
|
|
|
|
2020-05-09 03:08:01 +02:00
|
|
|
class ElementPair:
    """An element plus a link to the pair of its parent (None when there is none)."""

    parent: Optional["ElementPair"]
    value: Element

    def __init__(self, parent: Optional["ElementPair"], value: Element) -> None:
        self.parent = parent
        self.value = value
|
2018-03-10 07:51:01 +01:00
|
|
|
|
2021-02-12 08:19:30 +01:00
|
|
|
|
|
|
|
def walk_tree_with_family(
    root: Element,
    processor: Callable[[Element], Optional[_T]],
) -> List[ResultWithFamily[_T]]:
    """Like a breadth-first walk over the descendants of `root`, but with context.

    For every descendant where `processor` returns a non-None value, the
    value is wrapped in a ResultWithFamily that records the element, its
    parent, its grandparent (None for direct children of `root`) and
    whether has_blockquote_ancestor() reports a blockquote among the
    parent and its ancestors.  `root` itself is never passed to
    `processor`.
    """
    results = []

    queue = deque([ElementPair(parent=None, value=root)])
    while queue:
        parent_pair = queue.popleft()
        for child in parent_pair.value:
            queue.append(ElementPair(parent=parent_pair, value=child))
            result = processor(child)
            if result is None:
                continue

            grandparent_pair = parent_pair.parent
            grandparent: Optional[Element] = (
                grandparent_pair.value if grandparent_pair is not None else None
            )
            family = ElementFamily(
                grandparent=grandparent,
                parent=parent_pair.value,
                child=child,
                in_blockquote=has_blockquote_ancestor(parent_pair),
            )

            results.append(
                ResultWithFamily(
                    family=family,
                    result=result,
                )
            )

    return results
|
|
|
|
|
2021-02-12 08:19:30 +01:00
|
|
|
|
2019-07-11 16:26:31 +02:00
|
|
|
def has_blockquote_ancestor(element_pair: Optional["ElementPair"]) -> bool:
    """Return True if `element_pair` or any pair up its parent chain is a blockquote.

    The chain is followed through `.parent` until it runs out (None).  The
    walk is iterative, so very deep trees cannot exhaust the interpreter's
    recursion limit.
    """
    current = element_pair
    while current is not None:
        if current.value.tag == "blockquote":
            return True
        current = current.parent
    return False
|
|
|
|
|
2021-02-12 08:19:30 +01:00
|
|
|
|
2023-04-26 03:09:19 +02:00
|
|
|
@cache_with_key(lambda tweet_id: tweet_id, cache_name="database")
def fetch_tweet_data(tweet_id: str) -> Optional[Dict[str, Any]]:
    """Always raise NotImplementedError; no network request is made.

    Raises:
        NotImplementedError: unconditionally.
    """
    # Twitter removed support for the v1 API that this integration
    # used. Given that, there's no point wasting time trying to make
    # network requests to Twitter. But we leave this function, because
    # existing cached renderings for Tweets are useful. We throw an
    # exception rather than returning `None` to avoid caching that the
    # link doesn't exist.
    raise NotImplementedError("Twitter desupported their v1 API")
|
2013-03-11 16:23:34 +01:00
|
|
|
|
2021-02-12 08:19:30 +01:00
|
|
|
|
2021-05-07 03:54:25 +02:00
|
|
|
class OpenGraphSession(OutgoingSession):
    """OutgoingSession preconfigured with role "markdown" and a timeout of 1."""

    def __init__(self) -> None:
        super().__init__(role="markdown", timeout=1)
|
|
|
|
|
|
|
|
|
2018-05-11 01:42:51 +02:00
|
|
|
def fetch_open_graph_image(url: str) -> Optional[Dict[str, Any]]:
    """Fetch `url` and extract its Open Graph image, title and description.

    The response is parsed incrementally and parsing stops at the start of
    the document body, so only `<meta>` tags seen before `<body>` are
    considered.

    Returns:
        A dict with keys "image", "title" and "desc" ("title"/"desc" may
        be None), or None when the request fails with a
        requests.RequestException, the status is not OK, the content type
        is missing or is neither HTML nor XHTML, or no `og:image` was found.
    """
    og: Dict[str, Optional[str]] = {"image": None, "title": None, "desc": None}

    try:
        with OpenGraphSession().get(
            url, headers={"Accept": "text/html,application/xhtml+xml"}, stream=True
        ) as res:
            if res.status_code != requests.codes.ok:
                return None

            # A response without a Content-Type header is treated like an
            # unsupported type instead of raising KeyError.
            mimetype, _ = cgi.parse_header(res.headers.get("Content-Type", ""))
            if mimetype not in ("text/html", "application/xhtml+xml"):
                return None
            # Named is_html so the imported `html` module is not shadowed.
            is_html = mimetype == "text/html"

            res.raw.decode_content = True
            for _, element in lxml.etree.iterparse(
                res.raw, events=("start",), no_network=True, remove_comments=True, html=is_html
            ):
                parent = element.getparent()
                if parent is not None:
                    # Reduce memory usage.
                    parent.text = None
                    parent.remove(element)

                if element.tag in ("body", "{http://www.w3.org/1999/xhtml}body"):
                    break
                elif element.tag in ("meta", "{http://www.w3.org/1999/xhtml}meta"):
                    prop = element.get("property")
                    if prop == "og:image":
                        content = element.get("content")
                        if content is not None:
                            # Resolve relative image URLs against the final response URL.
                            og["image"] = urljoin(res.url, content)
                    elif prop == "og:title":
                        og["title"] = element.get("content")
                    elif prop == "og:description":
                        og["desc"] = element.get("content")

    except requests.RequestException:
        return None

    return None if og["image"] is None else og
|
2014-05-21 08:11:29 +02:00
|
|
|
|
2021-02-12 08:19:30 +01:00
|
|
|
|
2018-05-11 01:42:51 +02:00
|
|
|
def get_tweet_id(url: str) -> Optional[str]:
    """Return the tweet ID embedded in a twitter.com status URL, or None.

    Only hosts equal to `twitter.com` or ending in `.twitter.com` are
    accepted.  The ID is read from the path, or from the fragment for
    old-style `twitter.com/#!/user/status/<id>` URLs.  URLs that cannot be
    parsed at all (for example an unbalanced `[` in the host) yield None
    rather than raising.
    """
    try:
        parsed_url = urllib.parse.urlparse(url)
    except ValueError:
        # urlparse rejects some malformed netlocs; such a URL is not a tweet.
        return None

    netloc = parsed_url.netloc
    if not (netloc == "twitter.com" or netloc.endswith(".twitter.com")):
        return None

    to_match = parsed_url.path
    # In old-style twitter.com/#!/wdaher/status/1231241234-style URLs,
    # we need to look at the fragment instead
    if parsed_url.path == "/" and len(parsed_url.fragment) > 5:
        to_match = parsed_url.fragment

    tweet_id_match = re.match(
        r"^!?/.*?/status(es)?/(?P<tweetid>\d{10,30})(/photo/[0-9])?/?$", to_match
    )
    if not tweet_id_match:
        return None
    return tweet_id_match.group("tweetid")
|
|
|
|
|
2021-02-12 08:19:30 +01:00
|
|
|
|
2021-03-23 10:34:55 +01:00
|
|
|
class InlineImageProcessor(markdown.treeprocessors.Treeprocessor):
    """
    Rewrite inline img tags to serve external content via Camo.

    This rewrites all images, except ones that are served from the current
    realm or global STATIC_URL. This is to ensure that each realm only loads
    images that are hosted on that realm or by the global installation,
    avoiding information leakage to external domains or between realms. We need
    to disable proxying of images hosted on the same realm, because otherwise
    we will break images in /user_uploads/, which require authorization to
    view.
    """

    def __init__(self, zmd: "ZulipMarkdown") -> None:
        super().__init__(zmd)
        # Kept so run() can read zmd.zulip_realm.
        self.zmd = zmd

    def run(self, root: Element) -> None:
        """Replace the `src` of each external `<img>` under `root` with its Camo URL."""
        # Get all URLs from the blob
        found_imgs = walk_tree(root, lambda e: e if e.tag == "img" else None)
        for img in found_imgs:
            url = img.get("src")
            assert url is not None
            if is_static_or_current_realm_url(url, self.zmd.zulip_realm):
                # Don't rewrite images on our own site (e.g. emoji, user uploads).
                continue
            img.set("src", get_camo_url(url))
|
2013-04-30 21:37:22 +02:00
|
|
|
|
2021-02-12 08:19:30 +01:00
|
|
|
|
2020-02-14 00:09:22 +01:00
|
|
|
class BacktickInlineProcessor(markdown.inlinepatterns.BacktickInlineProcessor):
    """Return a `<code>` element containing the matching text."""

    def handleMatch(  # type: ignore[override] # https://github.com/python/mypy/issues/10197
        self, m: Match[str], data: str
    ) -> Union[Tuple[None, None, None], Tuple[Element, int, int]]:
        """Delegate to upstream, then restore the unstripped code text.

        Returns upstream's `(element, start, end)` tuple unchanged, apart
        from the element's text being replaced when group 3 matched.
        """
        # Let upstream's implementation do its job as it is, we'll
        # just replace the text to not strip the group because it
        # makes it impossible to put leading/trailing whitespace in
        # an inline code span.
        ret = super().handleMatch(m, data)
        el = ret[0]
        if el is not None and m.group(3):
            # upstream's code here is: m.group(3).strip() rather than m.group(3).
            el.text = markdown.util.AtomicString(markdown.util.code_escape(m.group(3)))
        return ret
|
2017-11-22 02:27:19 +01:00
|
|
|
|
2021-02-12 08:19:30 +01:00
|
|
|
|
2021-07-01 22:53:01 +02:00
|
|
|
# List from https://support.google.com/chromeos/bin/answer.py?hl=en&answer=183093
IMAGE_EXTENSIONS = [".bmp", ".gif", ".jpe", ".jpeg", ".jpg", ".png", ".webp"]
|
2021-07-01 22:53:01 +02:00
|
|
|
|
|
|
|
|
2013-03-08 06:27:16 +01:00
|
|
|
class InlineInterestingLinkProcessor(markdown.treeprocessors.Treeprocessor):
|
2014-01-10 19:04:57 +01:00
|
|
|
# Height threshold (in the units of the "h" field of a tweet media size)
# used by twitter_link when picking which size of a photo to embed.
TWITTER_MAX_IMAGE_HEIGHT = 400
# NOTE(review): presumably the maximum number of tweets previewed per
# message; the code that enforces it is not in this part of the file.
TWITTER_MAX_TO_PREVIEW = 3
# NOTE(review): presumably the cap on inline previews rendered for one
# message; confirm against run().
INLINE_PREVIEW_LIMIT_PER_MESSAGE = 24
|
2014-01-28 22:17:12 +01:00
|
|
|
|
2022-10-06 22:58:37 +02:00
|
|
|
def __init__(self, zmd: "ZulipMarkdown") -> None:
    """Initialize the tree processor and remember the Markdown instance.

    The instance is kept as ``self.zmd`` because the other methods of this
    class read per-render state from it (for example
    ``image_preview_enabled``, ``zulip_message`` and ``zulip_db_data``).
    """
    super().__init__(zmd)
    self.zmd = zmd
|
|
|
|
|
2019-12-06 09:18:02 +01:00
|
|
|
def add_a(
    self,
    root: Element,
    image_url: str,
    link: str,
    title: Optional[str] = None,
    desc: Optional[str] = None,
    class_attr: str = "message_inline_image",
    data_id: Optional[str] = None,
    insertion_index: Optional[int] = None,
    already_thumbnailed: bool = False,
) -> None:
    """Add a ``<div><a><img/></a></div>`` image preview under ``root``.

    The wrapper ``div`` gets ``class_attr`` as its class and is inserted at
    ``insertion_index`` when given, otherwise appended.  The anchor points at
    ``link`` and optionally carries ``title`` and ``data-id`` attributes.

    When thumbnailing is enabled in settings, the image has not already been
    thumbnailed, and ``user_uploads_or_external(image_url)`` is true, the
    ``src`` is routed through ``/thumbnail`` and a ``data-src-fullsize``
    attribute is added; otherwise ``image_url`` is used directly.

    As a side effect, marks the message being rendered (if any) as having an
    image whenever ``class_attr`` contains ``message_inline_image``.
    """
    # NOTE(review): desc is normalized here but never written into the
    # output below (the "desc" element is created without text).
    if desc is None:
        desc = ""

    # Update message.has_image attribute.
    if "message_inline_image" in class_attr:
        if self.zmd.zulip_message:
            self.zmd.zulip_message.has_image = True

    if insertion_index is None:
        div = SubElement(root, "div")
    else:
        div = Element("div")
        root.insert(insertion_index, div)
    div.set("class", class_attr)

    a = SubElement(div, "a")
    a.set("href", link)
    if title is not None:
        a.set("title", title)
    if data_id is not None:
        a.set("data-id", data_id)

    img = SubElement(a, "img")
    use_thumbnailer = (
        settings.THUMBNAIL_IMAGES
        and (not already_thumbnailed)
        and user_uploads_or_external(image_url)
    )
    if use_thumbnailer:
        # We strip leading '/' from relative URLs here to ensure
        # consistency in what gets passed to /thumbnail
        image_url = image_url.lstrip("/")
        for attribute, size in (("src", "thumbnail"), ("data-src-fullsize", "full")):
            img.set(attribute, "/thumbnail?" + urlencode({"url": image_url, "size": size}))
    else:
        img.set("src", image_url)

    if class_attr == "message_inline_ref":
        summary_div = SubElement(div, "div")

        title_div = SubElement(summary_div, "div")
        title_div.set("class", "message_inline_image_title")
        title_div.text = title

        desc_div = SubElement(summary_div, "desc")
        desc_div.set("class", "message_inline_image_desc")
|
|
|
|
|
2022-04-14 21:52:41 +02:00
|
|
|
def add_oembed_data(self, root: Element, link: str, extracted_data: UrlOEmbedData) -> None:
    """Add an inline preview for oEmbed data of type "photo" or "video".

    Nothing is added when the data has no image, or when its type is
    neither "photo" nor "video".
    """
    image = extracted_data.image
    if image is None:
        # Don't add an embed if an image is not found
        return

    kind = extracted_data.type
    if kind == "photo":
        self.add_a(
            root,
            image_url=image,
            link=link,
            title=extracted_data.title,
        )
    elif kind == "video":
        # The oEmbed HTML travels in data-id; the image is used as-is
        # rather than being sent through the thumbnailer.
        self.add_a(
            root,
            image_url=image,
            link=link,
            title=extracted_data.title,
            desc=extracted_data.description,
            class_attr="embed-video message_inline_image",
            data_id=extracted_data.html,
            already_thumbnailed=True,
        )
|
2019-12-06 09:18:02 +01:00
|
|
|
|
2022-04-14 21:52:41 +02:00
|
|
|
def add_embed(self, root: Element, link: str, extracted_data: UrlEmbedData) -> None:
    """Append a link-preview card for ``link`` to ``root``.

    oEmbed data is handed off to ``add_oembed_data``.  For other data, a
    ``div.message_embed`` is built containing an image anchor (background
    image served via ``get_camo_url``) and a ``div.data-container`` holding
    the optional title link and description.  Nothing is added when the data
    has no image.
    """
    if isinstance(extracted_data, UrlOEmbedData):
        self.add_oembed_data(root, link, extracted_data)
        return

    image = extracted_data.image
    if image is None:
        # Don't add an embed if an image is not found
        return

    container = SubElement(root, "div")
    container.set("class", "message_embed")

    img_link = get_camo_url(image)
    img = SubElement(container, "a")
    # Attribute order is kept stable since it is reflected in the output.
    for attribute, value in (
        ("style", "background-image: url(" + css_escape(img_link) + ")"),
        ("href", link),
        ("class", "message_embed_image"),
    ):
        img.set(attribute, value)

    data_container = SubElement(container, "div")
    data_container.set("class", "data-container")

    title = extracted_data.title
    if title:
        title_elm = SubElement(data_container, "div")
        title_elm.set("class", "message_embed_title")
        a = SubElement(title_elm, "a")
        a.set("href", link)
        a.set("title", title)
        a.text = title

    description = extracted_data.description
    if description:
        description_elm = SubElement(data_container, "div")
        description_elm.set("class", "message_embed_description")
        description_elm.text = description
|
2019-12-06 09:18:02 +01:00
|
|
|
|
2018-05-11 01:42:51 +02:00
|
|
|
def get_actual_image_url(self, url: str) -> str:
    """Map a site-specific image *preview page* URL to the raw image URL.

    Currently only GitHub "blob" pages are rewritten; any other URL is
    returned unchanged.
    See https://github.com/zulip/zulip/issues/4658 for more information.
    """
    parts = urllib.parse.urlparse(url)
    host = parts.netloc
    if host != "github.com" and not host.endswith(".github.com"):
        return url

    # https://github.com/zulip/zulip/blob/main/static/images/logo/zulip-icon-128x128.png ->
    # https://raw.githubusercontent.com/zulip/zulip/main/static/images/logo/zulip-icon-128x128.png
    segments = parts.path.split("/")
    if len(segments) <= 3 or segments[3] != "blob":
        return url

    # Drop the "blob" segment and re-root the path on the raw-content host.
    raw_path = "/".join(segments[:3] + segments[4:])
    return urllib.parse.urljoin("https://raw.githubusercontent.com", raw_path)
|
|
|
|
|
2018-05-11 01:42:51 +02:00
|
|
|
def is_image(self, url: str) -> bool:
    """Return whether ``url`` should be treated as a direct image link.

    Always False when image previews are disabled for this render.  The
    decision is based on the (case-insensitive) extension of the URL path
    being one of IMAGE_EXTENSIONS.
    """
    if not self.zmd.image_preview_enabled:
        return False

    parts = urllib.parse.urlparse(url)
    # remove HTML URLs which end with image extensions that can not be shorted
    if parts.netloc == "pasteboard.co":
        return False

    lowered_path = parts.path.lower()
    for extension in IMAGE_EXTENSIONS:
        if lowered_path.endswith(extension):
            return True
    return False
|
2013-05-21 16:59:09 +02:00
|
|
|
|
2020-07-05 02:40:07 +02:00
|
|
|
def corrected_image_source(self, url: str) -> Optional[str]:
    """Return the actual image URL for linx.li and wikipedia.org links.

    Returns None for any URL that is not one of the handled cases.  This
    is structurally very similar to dropbox_image, and possibly should be
    rewritten to use open graph, but has some value.
    """
    parts = urllib.parse.urlparse(url)
    host = parts.netloc
    file_prefix = "/wiki/File:"

    if host.lower().endswith(".wikipedia.org") and parts.path.startswith(file_prefix):
        # Redirecting from "/wiki/File:" to "/wiki/Special:FilePath/File:"
        # A possible alternative, that avoids the redirect after hitting "Special:"
        # is using the first characters of md5($filename) to generate the URL
        redirected_path = parts.path.replace(file_prefix, "/wiki/Special:FilePath/File:", 1)
        return parts._replace(path=redirected_path).geturl()

    if host == "linx.li":
        return "https://linx.li/s" + parts.path

    return None
|
|
|
|
|
2018-05-11 01:42:51 +02:00
|
|
|
def dropbox_image(self, url: str) -> Optional[Dict[str, Any]]:
    """Return preview info for a Dropbox shared link, or None.

    Only dropbox.com hosts with a path under /s/, /sh/, /sc/ or /photos/
    are considered.  The returned dict comes from fetch_open_graph_image
    (or starts empty) and always has an "is_image" key; for direct image
    files it also gets an "image" key pointing at the raw file.
    """
    # TODO: The returned Dict could possibly be a TypedDict in future.
    parts = urllib.parse.urlparse(url)
    host = parts.netloc
    if host != "dropbox.com" and not host.endswith(".dropbox.com"):
        return None

    path = parts.path
    is_album = path.startswith("/sc/") or path.startswith("/photos/")
    is_shared_link = path.startswith("/s/") or path.startswith("/sh/")
    # Only allow preview Dropbox shared links
    if not (is_shared_link or is_album):
        return None

    # Try to retrieve open graph protocol info for a preview
    # This might be redundant right now for shared links for images.
    # However, we might want to make use of title and description
    # in the future. If the actual image is too big, we might also
    # want to use the open graph image.
    image_info = fetch_open_graph_image(url)

    is_image = is_album or self.is_image(url)

    # If it is from an album or not an actual image file,
    # just use open graph image.
    if is_album or not is_image:
        # Failed to follow link to find an image preview so
        # use placeholder image and guess filename
        if image_info is None:
            return None
        image_info["is_image"] = is_image
        return image_info

    # Otherwise, try to retrieve the actual image.
    # This is because open graph image from Dropbox may have padding
    # and gifs do not work.
    # TODO: What if image is huge? Should we get headers first?
    if image_info is None:
        image_info = {}
    image_info["is_image"] = True
    image_info["image"] = parts._replace(query="raw=1").geturl()
    return image_info
|
|
|
|
|
2018-05-11 01:42:51 +02:00
|
|
|
def youtube_id(self, url: str) -> Optional[str]:
    """Extract the YouTube video id from ``url``, or return None.

    Returns None when image previews are disabled, when the URL is not an
    http(s) URL on a recognized YouTube host/path, or when the extracted
    id contains characters other than ``[0-9A-Za-z_-]``.
    """
    if not self.zmd.image_preview_enabled:
        return None

    parts = urlsplit(url)
    if parts.scheme not in ("http", "https"):
        return None

    host = parts.hostname
    path = parts.path
    video_id: Optional[str] = None
    if host in (
        "m.youtube.com",
        "www.youtube.com",
        "www.youtube-nocookie.com",
        "youtube.com",
        "youtube-nocookie.com",
    ):
        params = parse_qs(parts.query)
        if path in ("/watch", "/watch_popup") and "v" in params:
            video_id = params["v"][0]
        elif path == "/watch_videos" and "video_ids" in params:
            # Comma-separated list; preview the first video only.
            video_id = params["video_ids"][0].split(",", 1)[0]
        elif path.startswith(("/embed/", "/shorts/", "/v/")):
            video_id = path.split("/", 3)[2]
    elif host == "youtu.be" and path.startswith("/"):
        video_id = path[len("/") :]

    if video_id is None or not re.fullmatch(r"[0-9A-Za-z_-]+", video_id):
        return None
    return video_id
|
2016-10-17 22:02:01 +02:00
|
|
|
|
2022-04-14 21:52:41 +02:00
|
|
|
def youtube_title(self, extracted_data: UrlEmbedData) -> Optional[str]:
    """Return the embed title prefixed with "YouTube - ", or None if absent."""
    title = extracted_data.title
    if title is None:
        return None
    return f"YouTube - {title}"
|
|
|
|
|
2018-05-11 01:42:51 +02:00
|
|
|
def youtube_image(self, url: str) -> Optional[str]:
    """Return the default thumbnail URL for a YouTube link, or None.

    None is returned whenever ``youtube_id`` cannot extract a video id.
    """
    yt_id = self.youtube_id(url)
    if yt_id is None:
        return None
    return f"https://i.ytimg.com/vi/{yt_id}/default.jpg"
|
2013-05-21 16:59:09 +02:00
|
|
|
|
2018-05-11 01:42:51 +02:00
|
|
|
def vimeo_id(self, url: str) -> Optional[str]:
    """Extract the numeric Vimeo video id from ``url``, or return None.

    Returns None when image previews are disabled or when ``url`` is not a
    recognized Vimeo video URL.  Recognized shapes are::

        [http[s]]://[www.]vimeo.com/<id>
        [http[s]]://[www.]vimeo.com/channels/[<name>/]<id>
        [http[s]]://[www.]vimeo.com/groups/<group>/videos/<id>

    optionally followed by ``/?``.
    """
    if not self.zmd.image_preview_enabled:
        return None

    # The dot in "vimeo.com" is escaped so that only the literal host
    # matches (an unescaped "." would also accept e.g. "vimeoxcom").
    # Capture groups: 1 = whole URL, 2 = scheme, 3 = "www.", 4 = group
    # name, 5 = the video id.
    vimeo_re = (
        r"^((http|https)?:\/\/(www\.)?vimeo\.com\/"
        r"(?:channels\/(?:\w+\/)?|groups\/"
        r"([^\/]*)\/videos\/|)(\d+)(?:|\/\?))$"
    )
    match = re.match(vimeo_re, url)
    if match is None:
        return None
    return match.group(5)
|
|
|
|
|
2022-04-14 21:52:41 +02:00
|
|
|
def vimeo_title(self, extracted_data: UrlEmbedData) -> Optional[str]:
    """Return the embed title prefixed with "Vimeo - ", or None if absent."""
    title = extracted_data.title
    if title is None:
        return None
    return f"Vimeo - {title}"
|
|
|
|
|
2021-02-12 08:19:30 +01:00
|
|
|
def twitter_text(
    self,
    text: str,
    urls: List[Dict[str, str]],
    user_mentions: List[Dict[str, Any]],
    media: List[Dict[str, Any]],
) -> Element:
    """
    Build a ``<p>`` element for a tweet's text, turning links, mentions
    and media into A tags and Unicode emojis into emoji elements.

    The URLs, user_mentions and media entries come from the Twitter API;
    emojis are located in the text with `POSSIBLE_EMOJI_RE`.

    First, every occurrence of each URL, mention, media short URL and
    known emoji is located in the text (case-insensitively for the
    first three).  Each occurrence is recorded as a dictionary with its
    type, start and end offsets, and either the URL plus link text, or
    (for emojis) the codepoint plus title.

    Next, the occurrences are sorted by start offset.  Walking through
    them, the plain text between the previous occurrence and the current
    one is attached either to the text of the P tag (before the first
    child) or to the tail of the most recently added child.  Occurrences
    that begin inside an already-consumed span are skipped.

    Finally, any remaining text is attached after the last child.
    """
    to_process: List[Dict[str, Any]] = []

    # Build dicts for URLs
    for url_data in urls:
        for hit in re.finditer(re.escape(url_data["url"]), text, re.IGNORECASE):
            to_process.append(
                {
                    "type": "url",
                    "start": hit.start(),
                    "end": hit.end(),
                    "url": url_data["url"],
                    "text": url_data["expanded_url"],
                }
            )

    # Build dicts for mentions
    for user_mention in user_mentions:
        screen_name = user_mention["screen_name"]
        mention_string = "@" + screen_name
        for hit in re.finditer(re.escape(mention_string), text, re.IGNORECASE):
            to_process.append(
                {
                    "type": "mention",
                    "start": hit.start(),
                    "end": hit.end(),
                    "url": "https://twitter.com/" + urllib.parse.quote(screen_name),
                    "text": mention_string,
                }
            )

    # Build dicts for media
    for media_item in media:
        short_url = media_item["url"]
        expanded_url = media_item["expanded_url"]
        for hit in re.finditer(re.escape(short_url), text, re.IGNORECASE):
            to_process.append(
                {
                    "type": "media",
                    "start": hit.start(),
                    "end": hit.end(),
                    "url": short_url,
                    "text": expanded_url,
                }
            )

    # Build dicts for emojis
    for hit in POSSIBLE_EMOJI_RE.finditer(text):
        orig_syntax = hit.group("syntax")
        codepoint = emoji_to_hex_codepoint(unqualify_emoji(orig_syntax))
        if codepoint not in codepoint_to_name:
            continue
        to_process.append(
            {
                "type": "emoji",
                "start": hit.start(),
                "end": hit.end(),
                "codepoint": codepoint,
                "title": ":" + codepoint_to_name[codepoint] + ":",
            }
        )

    to_process.sort(key=lambda entry: entry["start"])

    p = Element("p")
    # Most recently appended child of p; None until the first one is added.
    last_child: Optional[Element] = None

    def attach_text(chunk: str) -> None:
        """
        Helper to put plain text after the most recent child, or at the
        start of the paragraph when there is no child yet.
        """
        if last_child is None:
            p.text = chunk
        else:
            last_child.tail = chunk

    db_data: Optional[DbData] = self.zmd.zulip_db_data
    consumed_upto = 0
    for item in to_process:
        # The text we want to link starts in already linked text skip it
        if item["start"] < consumed_upto:
            continue
        # Add text from the end of last link to the start of the current
        # link
        attach_text(text[consumed_upto : item["start"]])
        consumed_upto = item["end"]
        if item["type"] == "emoji":
            elem = make_emoji(item["codepoint"], item["title"])
        else:
            elem = url_to_a(db_data, item["url"], item["text"])
            assert isinstance(elem, Element)
        last_child = elem
        p.append(elem)

    # Add any unused text
    attach_text(text[consumed_upto:])
    return p
|
|
|
|
|
2018-05-11 01:42:51 +02:00
|
|
|
def twitter_link(self, url: str) -> Optional[Element]:
    """Build the ``div.twitter-tweet`` preview element for a tweet URL.

    Returns None when ``url`` is not a tweet URL, when no tweet data is
    available, when fetching is not implemented, or when anything goes
    wrong while building the preview (the failure is logged as a warning
    so the rest of the message still renders).

    The element contains the author's avatar linked to the tweet, the
    tweet text (see ``twitter_text``), an attribution line, and one
    ``div.twitter-image`` per photo in the tweet's media.
    """
    tweet_id = get_tweet_id(url)

    if tweet_id is None:
        return None

    try:
        res = fetch_tweet_data(tweet_id)
        if res is None:
            return None
        user: Dict[str, Any] = res["user"]
        tweet = Element("div")
        tweet.set("class", "twitter-tweet")
        img_a = SubElement(tweet, "a")
        img_a.set("href", url)
        profile_img = SubElement(img_a, "img")
        profile_img.set("class", "twitter-avatar")
        # For some reason, for, e.g. tweet 285072525413724161,
        # python-twitter does not give us a
        # profile_image_url_https, but instead puts that URL in
        # profile_image_url. So use _https if available, but fall
        # back gracefully.  The fallback key is only looked up when it
        # is actually needed, so data carrying only the _https key
        # does not raise KeyError.
        image_url = user.get("profile_image_url_https")
        if image_url is None:
            image_url = user["profile_image_url"]
        profile_img.set("src", image_url)

        text = html.unescape(res["full_text"])
        urls = res.get("urls", [])
        user_mentions = res.get("user_mentions", [])
        media: List[Dict[str, Any]] = res.get("media", [])
        p = self.twitter_text(text, urls, user_mentions, media)
        tweet.append(p)

        span = SubElement(tweet, "span")
        span.text = "- {} (@{})".format(user["name"], user["screen_name"])

        # Add image previews
        for media_item in media:
            # Only photos have a preview image
            if media_item["type"] != "photo":
                continue

            # Find the image size that is smaller than
            # TWITTER_MAX_IMAGE_HEIGHT px tall or the smallest
            size_name_tuples = sorted(
                media_item["sizes"].items(), reverse=True, key=lambda x: x[1]["h"]
            )
            if not size_name_tuples:
                # No sizes to choose from; without this guard size_name
                # would be unbound (or left over from a previous photo).
                continue
            for size_name, size in size_name_tuples:
                if size["h"] < self.TWITTER_MAX_IMAGE_HEIGHT:
                    break

            media_url = "{}:{}".format(media_item["media_url_https"], size_name)
            img_div = SubElement(tweet, "div")
            img_div.set("class", "twitter-image")
            img_a = SubElement(img_div, "a")
            img_a.set("href", media_item["url"])
            img = SubElement(img_a, "img")
            img.set("src", media_url)

        return tweet
    except NotImplementedError:
        return None
    except Exception:
        # We put this in its own try-except because it requires external
        # connectivity. If Twitter flakes out, we don't want to not-render
        # the entire message; we just want to not show the Twitter preview.
        markdown_logger.warning("Error building Twitter link", exc_info=True)
        return None
|
|
|
|
|
2020-05-09 03:44:56 +02:00
|
|
|
def get_url_data(self, e: Element) -> Optional[Tuple[str, Optional[str]]]:
    """Return ``(href, text)`` for an ``<a>`` element, or None otherwise."""
    if e.tag != "a":
        return None

    url = e.get("href")
    assert url is not None
    return (url, e.text)
|
|
|
|
|
2020-07-13 17:54:58 +02:00
|
|
|
def get_inlining_information(
    self,
    root: Element,
    found_url: ResultWithFamily[Tuple[str, Optional[str]]],
) -> LinkInfo:
    """Work out where a preview for ``found_url`` should be placed.

    The result has four keys:

    * ``parent``: element the preview goes into (``root`` by default, the
      ``<li>`` for links directly in a list item, the grandparent for
      links in a ``<p>``).
    * ``title``: the link text when it differs from the URL, else None.
    * ``index``: insertion position within ``parent``, or None to append.
    * ``remove``: an element that becomes redundant once the preview is
      shown (a bare link, or a paragraph containing only it), or None.
    """
    family = found_url.family
    grandparent = family.grandparent
    parent = family.parent
    ahref_element = family.child
    url, text = found_url.result

    # url != text usually implies a named link, which we opt not to remove
    url_eq_text = text is None or url == text
    info: LinkInfo = {
        "parent": root,
        "title": None if url_eq_text else text,
        "index": None,
        "remove": None,
    }

    tag = parent.tag
    if tag == "li":
        info["parent"] = parent
        if not parent.text and not ahref_element.tail and url_eq_text:
            info["remove"] = ahref_element

    elif tag == "p":
        assert grandparent is not None
        # Position of the paragraph among its siblings (identity match).
        parent_index = next(
            (index for index, uncle in enumerate(grandparent) if uncle is parent),
            None,
        )

        # Append to end of list of grandparent's children as normal
        info["parent"] = grandparent

        paragraph_is_only_this_link = (
            len(parent) == 1
            and (not parent.text or parent.text == "\n")
            and not ahref_element.tail
            and url_eq_text
        )
        if paragraph_is_only_this_link:
            info["remove"] = parent

        if parent_index is not None:
            info["index"] = self.find_proper_insertion_index(grandparent, parent, parent_index)

    return info
|
|
|
|
|
|
|
|
def handle_image_inlining(
    self,
    root: Element,
    found_url: ResultWithFamily[Tuple[str, Optional[str]]],
) -> None:
    """Insert an inline image preview for ``found_url``.

    The preview is placed according to ``get_inlining_information``; the
    element it marks as redundant (if any) is removed afterwards.
    """
    info = self.get_inlining_information(root, found_url)
    url, _text = found_url.result
    target = info["parent"]

    self.add_a(
        target,
        image_url=self.get_actual_image_url(url),
        link=url,
        title=info["title"],
        insertion_index=info["index"],
    )

    redundant = info["remove"]
    if redundant is not None:
        info["parent"].remove(redundant)
|
2017-12-25 21:35:23 +01:00
|
|
|
|
2020-07-13 18:31:07 +02:00
|
|
|
def handle_tweet_inlining(
    self,
    root: Element,
    found_url: ResultWithFamily[Tuple[str, Optional[str]]],
    twitter_data: Element,
) -> None:
    """Wrap ``twitter_data`` in ``div.inline-preview-twitter`` under ``root``.

    The wrapper is inserted at the index computed by
    ``get_inlining_information`` when there is one, otherwise appended.
    """
    info = self.get_inlining_information(root, found_url)
    position = info["index"]

    # NOTE(review): unlike the image and YouTube handlers, this inserts
    # into root rather than info["parent"], and ignores info["remove"];
    # confirm this is intended.
    if position is None:
        wrapper = SubElement(root, "div")
    else:
        wrapper = Element("div")
        root.insert(position, wrapper)

    wrapper.set("class", "inline-preview-twitter")
    wrapper.insert(0, twitter_data)
|
|
|
|
|
2020-07-20 17:09:52 +02:00
|
|
|
def handle_youtube_url_inlining(
    self,
    root: Element,
    found_url: ResultWithFamily[Tuple[str, Optional[str]]],
    yt_image: str,
) -> None:
    """Insert a YouTube preview (thumbnail ``yt_image``) for ``found_url``.

    The video id is stored in the anchor's ``data-id`` attribute, and the
    thumbnail is used as-is rather than going through the thumbnailer.
    """
    info = self.get_inlining_information(root, found_url)
    url, _text = found_url.result
    video_id = self.youtube_id(url)

    self.add_a(
        info["parent"],
        image_url=yt_image,
        link=url,
        class_attr="youtube-video message_inline_image",
        data_id=video_id,
        insertion_index=info["index"],
        already_thumbnailed=True,
    )
|
2020-07-20 17:09:52 +02:00
|
|
|
|
2021-02-12 08:19:30 +01:00
|
|
|
def find_proper_insertion_index(
    self, grandparent: Element, parent: Element, parent_index_in_grandparent: int
) -> int:
    """Return the index in ``grandparent`` where a new inline preview for a
    link found in ``parent`` should be inserted.

    Scanning starts at the sibling right after ``parent`` and skips every
    consecutive inline-preview ``div`` whose link also appears in ``parent``.
    The result is the index of the first sibling that is not such a div,
    or ``len(grandparent)`` when there is none.
    """
    # If there are several inline images from same paragraph, ensure that
    # they are in correct (and not opposite) order by inserting after last
    # inline image from paragraph 'parent'

    # Both collections are loop-invariant, so build them once; sets give
    # constant-time membership checks.
    inline_image_classes = {
        "message_inline_image",
        "message_inline_ref",
        "inline-preview-twitter",
    }
    parent_links = {ele.attrib["href"] for ele in parent.iter(tag="a")}

    insertion_index = parent_index_in_grandparent + 1
    while insertion_index < len(grandparent):
        uncle = grandparent[insertion_index]

        # A missing or empty class attribute yields no class names, which
        # is treated the same as "not an inline preview".
        uncle_classes = uncle.attrib.get("class", "").split()
        if uncle.tag != "div" or inline_image_classes.isdisjoint(uncle_classes):
            return insertion_index

        uncle_link = uncle.find("a")
        if uncle_link is None:
            # A preview div without a direct <a> child cannot be matched to
            # one of this paragraph's links; stop here instead of failing.
            return insertion_index
        if uncle_link.attrib["href"] not in parent_links:
            return insertion_index

        insertion_index += 1

    return insertion_index
|
2017-11-27 10:03:18 +01:00
|
|
|
|
2017-11-05 11:15:10 +01:00
|
|
|
def run(self, root: Element) -> None:
    """Collect the links found under ``root`` and add inline previews.

    Updates the link/image flags on the message being rendered (when there
    is one), records upload path ids for links under ``/user_uploads/``,
    and then, for each distinct previewable URL, tries in order: Dropbox,
    plain image, tweet, YouTube thumbnail, and finally a generic URL embed.
    """
    # Get all URLs from the blob
    found_urls = walk_tree_with_family(root, self.get_url_data)
    unique_urls = {found_url.result[0] for found_url in found_urls}
    # Collect unique URLs which are not quoted as we don't do
    # inline previews for links inside blockquotes.
    unique_previewable_urls = {
        found_url.result[0] for found_url in found_urls if not found_url.family.in_blockquote
    }

    # Set has_link and similar flags whenever a message is processed by Markdown
    if self.zmd.zulip_message:
        self.zmd.zulip_message.has_link = len(found_urls) > 0
        self.zmd.zulip_message.has_image = False  # This is updated in self.add_a

        for url in unique_urls:
            # Due to rewrite_local_links_to_relative, we need to
            # handle both relative URLs beginning with
            # `/user_uploads` and beginning with `user_uploads`.
            # This urllib construction converts the latter into
            # the former.
            parsed_url = urllib.parse.urlsplit(urllib.parse.urljoin("/", url))
            host = parsed_url.netloc

            # Skip links that name a host other than this realm's host.
            if host != "" and (
                self.zmd.zulip_realm is None or host != self.zmd.zulip_realm.host
            ):
                continue

            if not parsed_url.path.startswith("/user_uploads/"):
                continue

            # Everything after the "/user_uploads/" prefix is recorded as a
            # candidate attachment path id.
            path_id = parsed_url.path[len("/user_uploads/") :]
            self.zmd.zulip_rendering_result.potential_attachment_path_ids.append(path_id)

    if len(found_urls) == 0:
        return

    # With too many previewable links, render no previews at all.
    if len(unique_previewable_urls) > self.INLINE_PREVIEW_LIMIT_PER_MESSAGE:
        return

    processed_urls: Set[str] = set()
    rendered_tweet_count = 0

    for found_url in found_urls:
        (url, text) = found_url.result

        # Handle each previewable URL once, at its first occurrence; skip
        # repeats and URLs that only appear inside blockquotes.
        if url in unique_previewable_urls and url not in processed_urls:
            processed_urls.add(url)
        else:
            continue

        dropbox_image = self.dropbox_image(url)
        if dropbox_image is not None:
            class_attr = "message_inline_ref"
            is_image = dropbox_image["is_image"]
            if is_image:
                class_attr = "message_inline_image"
                # Not making use of title and description of images
            self.add_a(
                root,
                image_url=dropbox_image["image"],
                link=url,
                title=dropbox_image.get("title"),
                desc=dropbox_image.get("desc", ""),
                class_attr=class_attr,
                already_thumbnailed=True,
            )
            continue

        if self.is_image(url):
            image_source = self.corrected_image_source(url)
            if image_source is not None:
                # Substitute the corrected source for both the URL and the
                # text of this result before inlining.
                found_url = ResultWithFamily(
                    family=found_url.family,
                    result=(image_source, image_source),
                )
            self.handle_image_inlining(root, found_url)
            continue

        netloc = urlsplit(url).netloc
        if netloc == "" or (
            self.zmd.zulip_realm is not None and netloc == self.zmd.zulip_realm.host
        ):
            # We don't have a strong use case for doing URL preview for relative links.
            continue

        if get_tweet_id(url) is not None:
            if rendered_tweet_count >= self.TWITTER_MAX_TO_PREVIEW:
                # Only render at most one tweet per message
                continue
            twitter_data = self.twitter_link(url)
            if twitter_data is None:
                # This link is not actually a tweet known to twitter
                continue
            rendered_tweet_count += 1
            self.handle_tweet_inlining(root, found_url, twitter_data)
            continue
        youtube = self.youtube_image(url)
        if youtube is not None:
            self.handle_youtube_url_inlining(root, found_url, youtube)
            # NOTE: We don't `continue` here, to allow replacing the URL with
            # the title, if INLINE_URL_EMBED_PREVIEW feature is enabled.
            # The entire preview would ideally be shown only if the feature
            # is enabled, but URL previews are a beta feature and YouTube
            # previews are pretty stable.

        # No generic URL embeds for messages flagged as sent by a bot.
        db_data: Optional[DbData] = self.zmd.zulip_db_data
        if db_data and db_data.sent_by_bot:
            continue

        if not self.zmd.url_embed_preview_enabled:
            continue

        # No embed data is available for this URL: record it in
        # links_for_preview and move on.
        if self.zmd.url_embed_data is None or url not in self.zmd.url_embed_data:
            self.zmd.zulip_rendering_result.links_for_preview.add(url)
            continue

        # Existing but being None means that we did process the
        # URL, but it was not valid to preview.
        extracted_data = self.zmd.url_embed_data[url]
        if extracted_data is None:
            continue

        if youtube is not None:
            title = self.youtube_title(extracted_data)
            if title is not None:
                # A bare URL link gets the video title as its text; a link
                # with custom text keeps that text.
                if url == text:
                    found_url.family.child.text = title
                else:
                    found_url.family.child.text = text
                continue
        self.add_embed(root, url, extracted_data)
        if self.vimeo_id(url):
            title = self.vimeo_title(extracted_data)
            if title:
                if url == text:
                    found_url.family.child.text = title
                else:
                    found_url.family.child.text = text
|
2016-10-27 12:06:44 +02:00
|
|
|
|
2021-02-12 08:19:30 +01:00
|
|
|
|
2021-05-22 12:14:59 +02:00
|
|
|
class CompiledInlineProcessor(markdown.inlinepatterns.InlineProcessor):
    """Inline processor that is handed an already-compiled regular expression."""

    def __init__(self, compiled_re: Pattern[str], zmd: "ZulipMarkdown") -> None:
        # The superclass constructor is intentionally not called: it would
        # compile a pattern itself, whereas the caller supplies a compiled
        # regex here. Set the attributes it would otherwise set.
        self.md = self.zmd = zmd
        self.compiled_re = compiled_re
|
2021-05-22 12:14:59 +02:00
|
|
|
|
|
|
|
|
2018-07-18 14:36:04 +02:00
|
|
|
class Timestamp(markdown.inlinepatterns.Pattern):
    """Render the ``time`` group of a match as an HTML5 ``<time>`` element."""

    def handleMatch(self, match: Match[str]) -> Optional[Element]:
        """Build a ``<time>`` element, or an error ``<span>`` for bad input.

        The input is first parsed as a date/time string; if that fails it is
        interpreted as a numeric UNIX timestamp. Input that is neither (or
        that is out of the representable range) produces a
        ``timestamp-error`` span carrying an explanatory message.
        """
        time_input_string = match.group("time")
        timestamp: Optional[datetime.datetime] = None
        try:
            timestamp = dateutil.parser.parse(time_input_string, tzinfos=common_timezones)
        except (ValueError, OverflowError):
            # dateutil documents OverflowError for dates too large for the
            # platform, in addition to its ValueError-derived ParserError.
            try:
                timestamp = datetime.datetime.fromtimestamp(
                    float(time_input_string), tz=datetime.timezone.utc
                )
            except (ValueError, OverflowError, OSError):
                # float() rejects non-numeric text with ValueError;
                # fromtimestamp() raises OverflowError/OSError for values
                # such as "inf" or "1e20" that are out of range.
                timestamp = None

        if timestamp is None:
            error_element = Element("span")
            error_element.set("class", "timestamp-error")
            error_element.text = markdown.util.AtomicString(
                f"Invalid time format: {time_input_string}"
            )
            return error_element

        # Use HTML5 <time> element for valid timestamps.
        time_element = Element("time")
        if timestamp.tzinfo:
            timestamp = timestamp.astimezone(datetime.timezone.utc)
        else:
            # Naive values are taken to be UTC already.
            timestamp = timestamp.replace(tzinfo=datetime.timezone.utc)
        time_element.set("datetime", timestamp.isoformat().replace("+00:00", "Z"))
        # Set text to initial input, so simple clients translating
        # HTML to text will at least display something.
        time_element.text = markdown.util.AtomicString(time_input_string)
        return time_element
|
2018-07-18 14:36:04 +02:00
|
|
|
|
2021-02-12 08:19:30 +01:00
|
|
|
|
2023-08-10 21:00:45 +02:00
|
|
|
# From https://unicode.org/reports/tr51/#EBNF_and_Regex. Keep this synced with `possible_emoji_regex`.
#
# The whole candidate sequence is captured in the named group `syntax`.
# It is either a pair of regional indicators, or an emoji code point with
# an optional modifier / variation selector (+ keycap) / tag sequence,
# followed by any number of U+200D (zero-width joiner) continuations of
# the same shape. Compiled with VERBOSE, so whitespace in the pattern is
# not significant.
POSSIBLE_EMOJI_RE = regex.compile(
    r"""(?P<syntax>
\p{RI} \p{RI}
| \p{Emoji}
(?: \p{Emoji_Modifier}
| \uFE0F \u20E3?
| [\U000E0020-\U000E007E]+ \U000E007F
)?
(?: \u200D
(?: \p{RI} \p{RI}
| \p{Emoji}
(?: \p{Emoji_Modifier}
| \uFE0F \u20E3?
| [\U000E0020-\U000E007E]+ \U000E007F
)?
)
)*)
""",
    regex.VERBOSE,
)
|
2017-06-19 23:30:14 +02:00
|
|
|
|
2021-02-12 08:19:30 +01:00
|
|
|
|
2018-05-11 01:42:51 +02:00
|
|
|
def make_emoji(codepoint: str, display_string: str) -> Element:
    """Build the ``<span>`` element for the emoji with the given codepoint.

    ``display_string`` becomes the span's text. With its first and last
    characters dropped and underscores turned into spaces, it also serves
    as the title and the ARIA label.
    """
    # Replace underscore in emoji's title with space
    label = display_string[1:-1].replace("_", " ")
    emoji_span = Element(
        "span",
        {
            "class": f"emoji emoji-{codepoint}",
            "title": label,
            "role": "img",
            "aria-label": label,
        },
    )
    emoji_span.text = markdown.util.AtomicString(display_string)
    return emoji_span
|
2013-03-01 22:07:27 +01:00
|
|
|
|
2021-02-12 08:19:30 +01:00
|
|
|
|
2018-05-11 01:42:51 +02:00
|
|
|
def make_realm_emoji(src: str, display_string: str) -> Element:
    """Build the ``<img>`` element for an image-based emoji.

    ``display_string`` is used verbatim as the alt text. The title is the
    same string with its first and last characters dropped and underscores
    replaced by spaces.
    """
    attributes = (
        ("src", src),
        ("class", "emoji"),
        ("alt", display_string),
        ("title", display_string[1:-1].replace("_", " ")),
    )
    image = Element("img")
    for attr_name, attr_value in attributes:
        image.set(attr_name, attr_value)
    return image
|
|
|
|
|
2021-02-12 08:19:30 +01:00
|
|
|
|
2018-01-15 19:36:32 +01:00
|
|
|
class EmoticonTranslation(markdown.inlinepatterns.Pattern):
    """Translates emoticons like `:)` into emoji like `:smile:`."""

    def __init__(self, pattern: str, zmd: "ZulipMarkdown") -> None:
        super().__init__(pattern, zmd)
        self.zmd = zmd

    def handleMatch(self, match: Match[str]) -> Optional[Element]:
        """Return the emoji span for the matched emoticon, or ``None`` when
        there is no rendering data or emoticon translation is turned off."""
        db_data: Optional[DbData] = self.zmd.zulip_db_data
        translation_enabled = db_data is not None and db_data.translate_emoticons
        if not translation_enabled:
            return None

        emoji_syntax = translate_emoticons(match.group("emoticon"))
        # Drop the first and last characters to get the bare emoji name.
        codepoint = name_to_codepoint[emoji_syntax[1:-1]]
        return make_emoji(codepoint, emoji_syntax)
|
|
|
|
|
2021-02-12 08:19:30 +01:00
|
|
|
|
2023-08-10 21:00:45 +02:00
|
|
|
# One code point that lacks the Emoji_Presentation property, optionally
# followed by U+20E3 (the combining enclosing keycap).
TEXT_PRESENTATION_RE = regex.compile(r"\P{Emoji_Presentation}\u20E3?")
|
|
|
|
|
|
|
|
|
|
|
|
class UnicodeEmoji(CompiledInlineProcessor):
    """Turn literal Unicode emoji in message text into emoji spans."""

    def handleMatch(  # type: ignore[override] # https://github.com/python/mypy/issues/10197
        self, match: Match[str], data: str
    ) -> Union[Tuple[None, None, None], Tuple[Element, int, int]]:
        """Return ``(element, start, end)`` for a known emoji, or a triple
        of ``None`` to leave the matched text untouched."""
        raw_emoji = match.group("syntax")

        # We want to avoid turning things like arrows (↔) and keycaps (numbers
        # in boxes) into qualified emoji.
        # More specifically, we skip anything with text in the second column of
        # this table https://unicode.org/Public/emoji/1.0/emoji-data.txt
        if TEXT_PRESENTATION_RE.fullmatch(raw_emoji):
            return None, None, None

        codepoint = emoji_to_hex_codepoint(unqualify_emoji(raw_emoji))
        if codepoint not in codepoint_to_name:
            return None, None, None

        display_string = ":" + codepoint_to_name[codepoint] + ":"
        return make_emoji(codepoint, display_string), match.start(), match.end()
|
2016-06-24 20:03:56 +02:00
|
|
|
|
2021-02-12 08:19:30 +01:00
|
|
|
|
2013-03-01 22:07:27 +01:00
|
|
|
class Emoji(markdown.inlinepatterns.Pattern):
    """Render the matched ``syntax`` group as a realm, "zulip", or Unicode emoji."""

    def __init__(self, pattern: str, zmd: "ZulipMarkdown") -> None:
        super().__init__(pattern, zmd)
        self.zmd = zmd

    def handleMatch(self, match: Match[str]) -> Optional[Union[str, Element]]:
        """Return the emoji element, or the original text for unknown names.

        Lookup order: the realm's active custom emoji, then the special
        ``zulip`` emoji, then the name-to-codepoint table.
        """
        emoji_syntax = match.group("syntax")
        emoji_name = emoji_syntax[1:-1]

        db_data: Optional[DbData] = self.zmd.zulip_db_data
        realm_emoji: Dict[str, EmojiInfo] = (
            {} if db_data is None else db_data.active_realm_emoji
        )

        if emoji_name in realm_emoji:
            return make_realm_emoji(realm_emoji[emoji_name]["source_url"], emoji_syntax)

        if emoji_name == "zulip":
            # We explicitly do not use staticfiles to generate the URL
            # for this, so that it is portable if exported.
            return make_realm_emoji(
                "/static/generated/emoji/images/emoji/unicode/zulip.png", emoji_syntax
            )

        if emoji_name in name_to_codepoint:
            return make_emoji(name_to_codepoint[emoji_name], emoji_syntax)

        return emoji_syntax
|
2013-03-01 22:07:27 +01:00
|
|
|
|
2021-02-12 08:19:30 +01:00
|
|
|
|
2018-05-11 01:42:51 +02:00
|
|
|
def content_has_emoji_syntax(content: str) -> bool:
    """Report whether ``EMOJI_REGEX`` matches anywhere in ``content``."""
    first_match = re.search(EMOJI_REGEX, content)
    return first_match is not None
|
|
|
|
|
2021-02-12 08:19:30 +01:00
|
|
|
|
2017-03-20 16:56:39 +01:00
|
|
|
class Tex(markdown.inlinepatterns.Pattern):
    """Render the ``body`` group of a match as inline TeX."""

    def handleMatch(self, match: Match[str]) -> Union[str, Element]:
        """Return a stash placeholder for the rendered TeX, or a
        ``tex-error`` span showing the source when rendering fails."""
        body = match.group("body")
        rendered = render_tex(body, is_inline=True)

        if rendered is None:
            # Something went wrong while rendering
            error_span = Element("span")
            error_span.set("class", "tex-error")
            error_span.text = markdown.util.AtomicString("$$" + body + "$$")
            return error_span

        return self.md.htmlStash.store(rendered)
|
|
|
|
|
2013-02-01 23:15:05 +01:00
|
|
|
|
2018-05-11 01:42:51 +02:00
|
|
|
def sanitize_url(url: str) -> Optional[str]:
    """
    Sanitize a URL against XSS attacks.

    See the docstring on markdown.inlinepatterns.LinkPattern.sanitize_url.

    Returns the normalized URL, ``""`` when the URL cannot be parsed at all,
    or ``None`` when its scheme is not in ``allowed_schemes``.
    """
    try:
        scheme, netloc, path, params, query, fragment = urllib.parse.urlparse(
            url.replace(" ", "%20")
        )
    except ValueError:
        # Bad URL - so bad it couldn't be parsed.
        return ""

    if scheme == "" and netloc == "":
        if "@" in path:
            # If there is no scheme or netloc and there is a '@' in the path,
            # treat it as a mailto: and set the appropriate scheme
            scheme = "mailto"
        elif path.startswith("/"):
            # Allow domain-relative links
            return urllib.parse.urlunparse(("", "", path, params, query, fragment))
        elif (path, params, query) == ("", "", "") and len(fragment) > 0:
            # Allow fragment links
            return urllib.parse.urlunparse(("", "", "", "", "", fragment))

    # Zulip modification: If scheme is not specified, assume http://
    # We re-enter sanitize_url because netloc etc. need to be re-parsed.
    if not scheme:
        return sanitize_url("http://" + url)

    # Upstream code will accept a URL like javascript://foo because it
    # appears to have a netloc.  Additionally there are plenty of other
    # schemes that do weird things like launch external programs.  To be
    # on the safe side, we allow a fixed set of schemes.
    if scheme not in allowed_schemes:
        return None

    # Upstream code scans path, parameters, and query for colon characters
    # because
    #
    #    some aliases [for javascript:] will appear to urllib.parse to have
    #    no scheme. On top of that relative links (i.e.: "foo/bar.html")
    #    have no scheme.
    #
    # We already converted an empty scheme to http:// above, so we skip
    # the colon check, which would also forbid a lot of legitimate URLs.

    # URL passes all tests. Return URL as-is.
    return urllib.parse.urlunparse((scheme, netloc, path, params, query, fragment))
|
2013-02-01 23:15:05 +01:00
|
|
|
|
2021-02-12 08:19:30 +01:00
|
|
|
|
|
|
|
def url_to_a(
    db_data: Optional[DbData], url: str, text: Optional[str] = None
) -> Union[Element, str]:
    """Build an ``<a>`` element for ``url``.

    When ``sanitize_url`` rejects the URL (returns ``None``), the raw ``url``
    string is returned instead so it renders as plain text. The link text
    defaults to the URL itself when ``text`` is not given.
    """
    href = sanitize_url(url)
    if href is None:
        # Rejected by sanitize_url; render it as plain text.
        return url

    link = Element("a")
    link.set("href", rewrite_local_links_to_relative(db_data, href))
    link.text = markdown.util.AtomicString(url) if text is None else text
    return link
|
|
|
|
|
2021-02-12 08:19:30 +01:00
|
|
|
|
2019-01-22 19:08:33 +01:00
|
|
|
class CompiledPattern(markdown.inlinepatterns.Pattern):
    """Inline pattern that is handed an already-compiled regular expression."""

    def __init__(self, compiled_re: Pattern[str], zmd: "ZulipMarkdown") -> None:
        # The superclass constructor is intentionally not called: it would
        # compile a pattern itself, whereas the caller supplies a compiled
        # regex here. Set the attributes it would otherwise set.
        self.md = self.zmd = zmd
        self.compiled_re = compiled_re
|
2013-06-21 23:42:33 +02:00
|
|
|
|
2021-02-12 08:19:30 +01:00
|
|
|
|
2019-01-22 19:08:33 +01:00
|
|
|
class AutoLink(CompiledPattern):
    """Turn the ``url`` group of a match into a link."""

    def handleMatch(self, match: Match[str]) -> ElementStringNone:
        matched_url = match.group("url")
        rendering_data: Optional[DbData] = self.zmd.zulip_db_data
        return url_to_a(rendering_data, matched_url)
|
2012-10-22 02:32:18 +02:00
|
|
|
|
2021-02-12 08:19:30 +01:00
|
|
|
|
2019-08-11 07:41:34 +02:00
|
|
|
class OListProcessor(sane_lists.SaneOListProcessor):
    """Ordered lists, initialized with a 2-space tab length"""

    def __init__(self, parser: BlockParser) -> None:
        md = parser.md
        # The tab length is 2 only while the superclass initializes, and is
        # set to 4 right afterwards.
        md.tab_length = 2
        super().__init__(parser)
        md.tab_length = 4
|
2012-11-02 18:25:37 +01:00
|
|
|
|
2021-02-12 08:19:30 +01:00
|
|
|
|
2019-08-11 07:41:34 +02:00
|
|
|
class UListProcessor(sane_lists.SaneUListProcessor):
    """Unordered lists, but with 2-space indent"""

    def __init__(self, parser: BlockParser) -> None:
        md = parser.md
        # The tab length is 2 only while the superclass initializes, and is
        # set to 4 right afterwards.
        md.tab_length = 2
        super().__init__(parser)
        md.tab_length = 4
|
2017-03-26 21:14:05 +02:00
|
|
|
|
2021-02-12 08:19:30 +01:00
|
|
|
|
2017-03-26 21:14:05 +02:00
|
|
|
class ListIndentProcessor(markdown.blockprocessors.ListIndentProcessor):
    """Process unordered list blocks.

    Based on markdown.blockprocessors.ListIndentProcessor, but with 2-space indent
    """

    def __init__(self, parser: BlockParser) -> None:
        md = parser.md
        # HACK: Set the tab length to 2 just for the initialization of
        # this class, so that bulleted lists (and only bulleted lists)
        # work off 2-space indentation.
        md.tab_length = 2
        super().__init__(parser)
        md.tab_length = 4
|
2017-03-26 21:14:05 +02:00
|
|
|
|
2021-02-12 08:19:30 +01:00
|
|
|
|
2019-07-31 08:04:32 +02:00
|
|
|
class HashHeaderProcessor(markdown.blockprocessors.HashHeaderProcessor):
    """Process hash headers.

    Based on markdown.blockprocessors.HashHeaderProcessor, but requires space for heading.
    """

    # Original regex for hashheader is
    # RE = re.compile(r'(?:^|\n)(?P<level>#{1,6})(?P<header>(?:\\.|[^\\])*?)#*(?:\n|$)')
    #
    # The only difference is the `\s` after the `level` group: a whitespace
    # character must follow the run of 1-6 `#` characters.
    RE = re.compile(r"(?:^|\n)(?P<level>#{1,6})\s(?P<header>(?:\\.|[^\\])*?)#*(?:\n|$)")
|
2019-07-31 08:04:32 +02:00
|
|
|
|
2021-02-12 08:19:30 +01:00
|
|
|
|
2019-01-08 11:30:13 +01:00
|
|
|
class BlockQuoteProcessor(markdown.blockprocessors.BlockQuoteProcessor):
|
2021-02-12 08:19:30 +01:00
|
|
|
"""Process block quotes.
|
2019-01-08 11:30:13 +01:00
|
|
|
|
2021-02-12 08:19:30 +01:00
|
|
|
Based on markdown.blockprocessors.BlockQuoteProcessor, but with 2-space indent
|
2019-01-08 11:30:13 +01:00
|
|
|
"""
|
|
|
|
|
|
|
|
# Original regex for blockquote is RE = re.compile(r'(^|\n)[ ]{0,3}>[ ]?(.*)')
|
2023-01-03 01:51:16 +01:00
|
|
|
RE = re.compile(r"(^|\n)(?!(?:[ ]{0,3}>\s*(?:$|\n))*(?:$|\n))[ ]{0,3}>[ ]?(.*)")
|
2019-01-08 11:30:13 +01:00
|
|
|
|
2020-10-30 20:10:29 +01:00
|
|
|
# run() is very slightly forked from the base class; see notes below.
|
2021-03-18 02:15:59 +01:00
|
|
|
def run(self, parent: Element, blocks: List[str]) -> None:
|
2020-10-30 20:10:29 +01:00
|
|
|
block = blocks.pop(0)
|
|
|
|
m = self.RE.search(block)
|
|
|
|
if m:
|
2021-02-12 08:19:30 +01:00
|
|
|
before = block[: m.start()] # Lines before blockquote
|
2021-04-25 22:54:23 +02:00
|
|
|
# Pass lines before blockquote in recursively for parsing first.
|
2020-10-30 20:10:29 +01:00
|
|
|
self.parser.parseBlocks(parent, [before])
|
|
|
|
# Remove ``> `` from beginning of each line.
|
2021-02-12 08:20:45 +01:00
|
|
|
block = "\n".join([self.clean(line) for line in block[m.start() :].split("\n")])
|
2020-10-30 20:10:29 +01:00
|
|
|
|
|
|
|
# Zulip modification: The next line is patched to match
|
|
|
|
# CommonMark rather than original Markdown. In original
|
|
|
|
# Markdown, blockquotes with a blank line between them were
|
|
|
|
# merged, which makes it impossible to break a blockquote with
|
|
|
|
# a blank line intentionally.
|
|
|
|
#
|
|
|
|
# This is a new blockquote. Create a new parent element.
|
2022-11-16 06:28:44 +01:00
|
|
|
quote = SubElement(parent, "blockquote")
|
2020-10-30 20:10:29 +01:00
|
|
|
|
|
|
|
# Recursively parse block with blockquote as parent.
|
|
|
|
# change parser state so blockquotes embedded in lists use p tags
|
2021-02-12 08:20:45 +01:00
|
|
|
self.parser.state.set("blockquote")
|
2020-10-30 20:10:29 +01:00
|
|
|
self.parser.parseChunk(quote, block)
|
|
|
|
self.parser.state.reset()
|
|
|
|
|
2019-01-08 11:30:13 +01:00
|
|
|
def clean(self, line: str) -> str:
    """Silence mentions in a quoted line, then strip its quote marker.

    User mentions and user group mentions are rewritten to their silent
    (``@_``) forms before the line is handed to the parent class's
    ``clean``.
    """

    def to_silent_user_mention(found: Match[str]) -> str:
        return "@_**{}**".format(found.group("match"))

    def to_silent_group_mention(found: Match[str]) -> str:
        return "@_*{}*".format(found.group("match"))

    # User mentions are rewritten first, user group mentions second.
    without_user_mentions = mention.MENTIONS_RE.sub(to_silent_user_mention, line)
    silenced = mention.USER_GROUP_MENTIONS_RE.sub(to_silent_group_mention, without_user_mentions)

    # The upstream processor takes care of removing the '>'.
    return super().clean(silenced)
|
|
|
|
|
2021-02-12 08:19:30 +01:00
|
|
|
|
2020-06-11 21:44:23 +02:00
|
|
|
@dataclass
class Fence:
    """One open fenced block, as tracked by MarkdownListPreprocessor.run."""

    # The fence marker text, taken from the "fence" group of FENCE_RE.
    fence_str: str
    # False when the fence's language is "quote" or "quoted", True otherwise.
    is_code: bool
|
|
|
|
|
2021-02-12 08:19:30 +01:00
|
|
|
|
2020-06-25 21:38:36 +02:00
|
|
|
class MarkdownListPreprocessor(markdown.preprocessors.Preprocessor):
    """Let a list that directly follows another block render as a list.

    When a list item comes immediately after a non-empty line that is not
    a list item of the same kind, an empty line is inserted between the
    two so that Markdown treats the item as the start of a list.
    """

    LI_RE = re.compile(r"^[ ]*([*+-]|\d\.)[ ]+(.*)", re.MULTILINE)

    def run(self, lines: List[str]) -> List[str]:
        """Return a copy of ``lines`` with the missing blank lines added."""
        fence_stack: List[Fence] = []
        inside_code = False
        output: List[str] = []

        for current, following in zip(lines, lines[1:]):
            # Track the stack of open fences; a line is skipped whenever
            # any enclosing fence is a code (non-quote) fence.
            fence_match = FENCE_RE.match(current)
            if fence_match:
                marker = fence_match.group("fence")
                lang: Optional[str] = fence_match.group("lang")
                closes_innermost = (
                    not lang and bool(fence_stack) and fence_stack[-1].fence_str == marker
                )
                if closes_innermost:
                    fence_stack.pop()
                else:
                    fence_stack.append(Fence(marker, lang not in ("quote", "quoted")))
                inside_code = any(open_fence.is_code for open_fence in fence_stack)

            output.append(current)
            if inside_code or not current:
                continue

            # A blank line is needed when the next line is a list item and
            # this line is either not a list item, or is one of the other
            # kind (single-character bullet versus "<digit>." marker).
            ahead = self.LI_RE.match(following)
            if ahead is None:
                continue
            here = self.LI_RE.match(current)
            if here is None or (len(here.group(1)) == 1) != (len(ahead.group(1)) == 1):
                output.append("")

        # The final line never gets a blank line after it.
        output.extend(lines[-1:])
        return output
|
|
|
|
|
2021-02-12 08:19:30 +01:00
|
|
|
|
2019-08-30 18:36:14 +02:00
|
|
|
# Name for the outer capture group we use to separate whitespace and
|
|
|
|
# other delimiters from the actual content. This value won't be an
|
|
|
|
# option in user-entered capture groups.
|
CVE-2021-41115: Use re2 for user-supplied linkifier patterns.
Zulip attempts to validate that the regular expressions that admins
enter for linkifiers are well-formatted, and only contain a specific
subset of regex grammar. The process of checking these
properties (via a regex!) can cause denial-of-service via
backtracking.
Furthermore, this validation itself does not prevent the creation of
linkifiers which themselves cause denial-of-service when they are
executed. As the validator accepts literally anything inside of a
`(?P<word>...)` block, any quadratic backtracking expression can be
hidden therein.
Switch user-provided linkifier patterns to be matched in the Markdown
processor by the `re2` library, which is guaranteed constant-time.
This somewhat limits the possible features of the regular
expression (notably, look-head and -behind, and back-references);
however, these features had never been advertised as working in the
context of linkifiers.
A migration removes any existing linkifiers which would not function
under re2, after printing them for posterity during the upgrade; they
are unlikely to be common, and are impossible to fix automatically.
The denial-of-service in the linkifier validator was discovered by
@erik-krogh and @yoff, as GHSL-2021-118.
2021-09-29 01:27:54 +02:00
|
|
|
# Name of the capture group that prepare_linkifier_pattern puts around the
# text allowed right before a linkifier match: start of string, whitespace,
# or one of a few opening delimiters.
BEFORE_CAPTURE_GROUP = "linkifier_before_match"
|
2019-08-30 18:36:14 +02:00
|
|
|
# Name of the capture group that prepare_linkifier_pattern puts around the
# linkifier's own pattern; LinkifierPattern uses its text as the link text.
OUTER_CAPTURE_GROUP = "linkifier_actual_match"
|
CVE-2021-41115: Use re2 for user-supplied linkifier patterns.
Zulip attempts to validate that the regular expressions that admins
enter for linkifiers are well-formatted, and only contain a specific
subset of regex grammar. The process of checking these
properties (via a regex!) can cause denial-of-service via
backtracking.
Furthermore, this validation itself does not prevent the creation of
linkifiers which themselves cause denial-of-service when they are
executed. As the validator accepts literally anything inside of a
`(?P<word>...)` block, any quadratic backtracking expression can be
hidden therein.
Switch user-provided linkifier patterns to be matched in the Markdown
processor by the `re2` library, which is guaranteed constant-time.
This somewhat limits the possible features of the regular
expression (notably, look-head and -behind, and back-references);
however, these features had never been advertised as working in the
context of linkifiers.
A migration removes any existing linkifiers which would not function
under re2, after printing them for posterity during the upgrade; they
are unlikely to be common, and are impossible to fix automatically.
The denial-of-service in the linkifier validator was discovered by
@erik-krogh and @yoff, as GHSL-2021-118.
2021-09-29 01:27:54 +02:00
|
|
|
# Name of the capture group that prepare_linkifier_pattern puts around what
# must follow a linkifier match: end of string, or one character that is
# neither a letter nor a number.
AFTER_CAPTURE_GROUP = "linkifier_after_match"
|
2021-02-12 08:19:30 +01:00
|
|
|
|
|
|
|
|
2021-03-30 12:15:39 +02:00
|
|
|
def prepare_linkifier_pattern(source: str) -> str:
    """Wrap a linkifier pattern with its boundary conditions.

    The returned pattern only matches ``source`` after start-of-string,
    whitespace, or an opening delimiter, and only when it is followed by
    end-of-string or a character that is not a letter or number. The
    three parts are captured as BEFORE_CAPTURE_GROUP, OUTER_CAPTURE_GROUP
    and AFTER_CAPTURE_GROUP respectively.
    """
    leading = rf"""(?P<{BEFORE_CAPTURE_GROUP}>^|\s|['"\(,:<])"""
    matched = f"(?P<{OUTER_CAPTURE_GROUP}>{source})"
    trailing = rf"(?P<{AFTER_CAPTURE_GROUP}>$|[^\pL\pN])"
    return leading + matched + trailing
|
2013-07-15 17:56:45 +02:00
|
|
|
|
2021-02-12 08:19:30 +01:00
|
|
|
|
2013-06-05 17:45:57 +02:00
|
|
|
# Given a regular expression pattern, linkifies groups that match it
|
|
|
|
# using the provided format string to construct the URL.
|
CVE-2021-41115: Use re2 for user-supplied linkifier patterns.
Zulip attempts to validate that the regular expressions that admins
enter for linkifiers are well-formatted, and only contain a specific
subset of regex grammar. The process of checking these
properties (via a regex!) can cause denial-of-service via
backtracking.
Furthermore, this validation itself does not prevent the creation of
linkifiers which themselves cause denial-of-service when they are
executed. As the validator accepts literally anything inside of a
`(?P<word>...)` block, any quadratic backtracking expression can be
hidden therein.
Switch user-provided linkifier patterns to be matched in the Markdown
processor by the `re2` library, which is guaranteed constant-time.
This somewhat limits the possible features of the regular
expression (notably, look-head and -behind, and back-references);
however, these features had never been advertised as working in the
context of linkifiers.
A migration removes any existing linkifiers which would not function
under re2, after printing them for posterity during the upgrade; they
are unlikely to be common, and are impossible to fix automatically.
The denial-of-service in the linkifier validator was discovered by
@erik-krogh and @yoff, as GHSL-2021-118.
2021-09-29 01:27:54 +02:00
|
|
|
class LinkifierPattern(CompiledInlineProcessor):
    """Inline processor that applies one linkifier to the input."""

    def __init__(
        self,
        source_pattern: str,
        url_template: str,
        zmd: "ZulipMarkdown",
    ) -> None:
        """Compile the linkifier pattern with re2 and parse its URL template.

        source_pattern: the linkifier's regular expression; it is wrapped
            by prepare_linkifier_pattern before compilation.
        url_template: URI template expanded with the match's named groups.
        zmd: the Markdown engine this processor belongs to.
        """
        # Do not write errors to stderr (this still raises exceptions)
        re2_options = re2.Options()
        re2_options.log_errors = False

        wrapped_pattern = prepare_linkifier_pattern(source_pattern)
        compiled_re2 = re2.compile(wrapped_pattern, options=re2_options)

        self.prepared_url_template = uri_template.URITemplate(url_template)

        super().__init__(compiled_re2, zmd)

    def handleMatch(  # type: ignore[override] # https://github.com/python/mypy/issues/10197
        self, m: Match[str], data: str
    ) -> Union[Tuple[Element, int, int], Tuple[None, None, None]]:
        """Build the link element for a linkifier match.

        Returns the element with the span of group 2 (the text matched by
        the linkifier's own pattern), or three Nones when url_to_a hands
        back a string instead of an element.
        """
        db_data: Optional[DbData] = self.zmd.zulip_db_data
        expanded_url = self.prepared_url_template.expand(**m.groupdict())
        link_text = markdown.util.AtomicString(m.group(OUTER_CAPTURE_GROUP))

        link = url_to_a(db_data, expanded_url, link_text)
        if isinstance(link, str):
            return None, None, None

        return link, m.start(2), m.end(2)
|
2021-02-12 08:19:30 +01:00
|
|
|
|
2013-06-05 17:45:57 +02:00
|
|
|
|
2021-05-22 12:14:59 +02:00
|
|
|
class UserMentionPattern(CompiledInlineProcessor):
    """Inline processor that renders user and wildcard mentions as spans."""

    def handleMatch(  # type: ignore[override] # https://github.com/python/mypy/issues/10197
        self, m: Match[str], data: str
    ) -> Union[Tuple[None, None, None], Tuple[Element, int, int]]:
        """Render one mention match.

        Returns the ``<span>`` with the full span of the match, or three
        Nones when there is no rendering data, the mention names no known
        user or wildcard, or a name|id mention's name disagrees with the
        user found for that id.
        """
        name = m.group("match")
        silent = m.group("silent") == "_"
        db_data: Optional[DbData] = self.zmd.zulip_db_data
        if db_data is None:
            return None, None, None

        topic_wildcard = mention.user_mention_matches_topic_wildcard(name)
        stream_wildcard = mention.user_mention_matches_stream_wildcard(name)

        # For @**|id** and @**name|id** mention syntaxes.
        id_syntax_match = re.match(r"(?P<full_name>.+)?\|(?P<user_id>\d+)$", name)
        if id_syntax_match:
            claimed_name = id_syntax_match.group("full_name")
            mentioned_id = int(id_syntax_match.group("user_id"))
            user = db_data.mention_data.get_user_by_id(mentioned_id)

            # For @**name|id**, the name has to agree with the full_name
            # of the user found for that id; @**user_1_name|id_for_user_2**
            # is deliberately treated as invalid syntax.
            if claimed_name and user and user.full_name != claimed_name:
                return None, None, None
        else:
            # For @**name** syntax.
            user = db_data.mention_data.get_user_by_name(name)

        user_id: Optional[str] = None
        if stream_wildcard:
            if not silent:
                self.zmd.zulip_rendering_result.mentions_stream_wildcard = True
            user_id = "*"
        elif topic_wildcard:
            if not silent:
                self.zmd.zulip_rendering_result.mentions_topic_wildcard = True
        elif user is not None:
            assert isinstance(user, FullNameInfo)

            if not silent:
                self.zmd.zulip_rendering_result.mentions_user_ids.add(user.id)
            name = user.full_name
            user_id = str(user.id)
        else:
            # Don't highlight @mentions that don't refer to a valid user
            return None, None, None

        css_class = "topic-mention" if topic_wildcard else "user-mention"
        if silent:
            css_class += " silent"

        el = Element("span")
        if user_id:
            el.set("data-user-id", user_id)
        el.set("class", css_class)
        # Silent mentions are shown without the leading "@".
        el.text = markdown.util.AtomicString(f"{name}" if silent else f"@{name}")
        return el, m.start(), m.end()
|
2016-10-26 20:56:17 +02:00
|
|
|
|
2021-02-12 08:19:30 +01:00
|
|
|
|
2021-05-22 12:14:59 +02:00
|
|
|
class UserGroupMentionPattern(CompiledInlineProcessor):
    """Inline processor that renders user group mentions as spans."""

    def handleMatch(  # type: ignore[override] # https://github.com/python/mypy/issues/10197
        self, m: Match[str], data: str
    ) -> Union[Tuple[None, None, None], Tuple[Element, int, int]]:
        """Render one user group mention match.

        Returns the ``<span>`` with the full span of the match, or three
        Nones when there is no rendering data or the name does not refer
        to a known user group.
        """
        name = m.group("match")
        silent = m.group("silent") == "_"
        db_data: Optional[DbData] = self.zmd.zulip_db_data
        if db_data is None:
            return None, None, None

        user_group = db_data.mention_data.get_user_group(name)
        if not user_group:
            # Don't highlight @-mentions that don't refer to a valid user
            # group.
            return None, None, None

        # Only non-silent mentions are recorded in the rendering result.
        if not silent:
            self.zmd.zulip_rendering_result.mentions_user_group_ids.add(user_group.id)
        name = user_group.name

        el = Element("span")
        el.set("data-user-group-id", str(user_group.id))
        if silent:
            css_class, text = "user-group-mention silent", f"{name}"
        else:
            css_class, text = "user-group-mention", f"@{name}"
        el.set("class", css_class)
        el.text = markdown.util.AtomicString(text)
        return el, m.start(), m.end()
|
2017-09-25 09:47:15 +02:00
|
|
|
|
2021-02-12 08:19:30 +01:00
|
|
|
|
2021-05-22 12:14:59 +02:00
|
|
|
class StreamPattern(CompiledInlineProcessor):
    """Inline processor that renders a matched stream name as a link."""

    def find_stream_id(self, name: str) -> Optional[int]:
        """Return the id stored for ``name`` in the rendering data.

        None is returned when there is no rendering data or the name is
        not present in its ``stream_names`` mapping.
        """
        db_data: Optional[DbData] = self.zmd.zulip_db_data
        if db_data is None:
            return None
        return db_data.stream_names.get(name)

    def handleMatch(  # type: ignore[override] # https://github.com/python/mypy/issues/10197
        self, m: Match[str], data: str
    ) -> Union[Tuple[None, None, None], Tuple[Element, int, int]]:
        """Render one match as ``<a class="stream">``, or three Nones
        when no stream id is found for the matched name."""
        name = m.group("stream_name")

        stream_id = self.find_stream_id(name)
        if stream_id is None:
            return None, None, None

        link = Element("a")
        link.set("class", "stream")
        link.set("data-stream-id", str(stream_id))
        # TODO: We should quite possibly not be specifying the
        # href here and instead having the browser auto-add the
        # href when it processes a message with one of these, to
        # provide more clarity to API clients.
        # Also do the same for StreamTopicPattern.
        encoded_stream = encode_stream(stream_id, name)
        link.set("href", f"/#narrow/stream/{encoded_stream}")
        link.text = markdown.util.AtomicString(f"#{name}")
        return link, m.start(), m.end()
|
2016-10-26 20:56:17 +02:00
|
|
|
|
2021-02-12 08:19:30 +01:00
|
|
|
|
2021-05-22 12:14:59 +02:00
|
|
|
class StreamTopicPattern(CompiledInlineProcessor):
    """Inline processor that renders a matched stream and topic as a link."""

    def find_stream_id(self, name: str) -> Optional[int]:
        """Return the id stored for ``name`` in the rendering data.

        None is returned when there is no rendering data or the name is
        not present in its ``stream_names`` mapping.
        """
        db_data: Optional[DbData] = self.zmd.zulip_db_data
        if db_data is None:
            return None
        return db_data.stream_names.get(name)

    def handleMatch(  # type: ignore[override] # https://github.com/python/mypy/issues/10197
        self, m: Match[str], data: str
    ) -> Union[Tuple[None, None, None], Tuple[Element, int, int]]:
        """Render one match as ``<a class="stream-topic">``, or three Nones
        when no stream id is found or the topic group did not match."""
        stream_name = m.group("stream_name")
        topic_name = m.group("topic_name")

        stream_id = self.find_stream_id(stream_name)
        if stream_id is None or topic_name is None:
            return None, None, None

        link = Element("a")
        link.set("class", "stream-topic")
        link.set("data-stream-id", str(stream_id))
        encoded_stream = encode_stream(stream_id, stream_name)
        encoded_topic = hash_util_encode(topic_name)
        link.set("href", f"/#narrow/stream/{encoded_stream}/topic/{encoded_topic}")
        link.text = markdown.util.AtomicString(f"#{stream_name} > {topic_name}")
        return link, m.start(), m.end()
|
2019-06-21 17:31:16 +02:00
|
|
|
|
2021-02-12 08:19:30 +01:00
|
|
|
|
2018-05-11 01:42:51 +02:00
|
|
|
def possible_linked_stream_names(content: str) -> Set[str]:
    """Collect every stream name that ``content`` appears to link to.

    Names come from both plain stream links (STREAM_LINK_REGEX) and
    stream/topic links (STREAM_TOPIC_LINK_REGEX); no check is made here
    that the streams actually exist.
    """
    names: Set[str] = set(re.findall(STREAM_LINK_REGEX, content, re.VERBOSE))
    for topic_link in re.finditer(STREAM_TOPIC_LINK_REGEX, content, re.VERBOSE):
        names.add(topic_link.group("stream_name"))
    return names
|
2017-09-15 00:25:38 +02:00
|
|
|
|
2021-02-12 08:19:30 +01:00
|
|
|
|
2020-04-19 21:13:03 +02:00
|
|
|
class AlertWordNotificationProcessor(markdown.preprocessors.Preprocessor):
    """Preprocessor that records which users' alert words occur in a message.

    The message text itself is never modified; matching user IDs are added
    to ``zmd.zulip_rendering_result.user_ids_with_alert_words``.
    """

    # Characters that may directly precede an alert word for it to count.
    allowed_before_punctuation = {" ", "\n", "(", '"', ".", ",", "'", ";", "[", "*", "`", ">"}
    # Characters that may directly follow an alert word for it to count.
    # Every entry must be a single character, because it is compared against
    # content[index]; a closing double quote is therefore listed on its own.
    allowed_after_punctuation = {
        " ",
        "\n",
        ")",
        '"',
        "?",
        ":",
        ".",
        ",",
        "'",
        ";",
        "]",
        "!",
        "*",
        "`",
    }

    def __init__(self, zmd: "ZulipMarkdown") -> None:
        super().__init__(zmd)
        self.zmd = zmd

    def check_valid_start_position(self, content: str, index: int) -> bool:
        """Return whether an alert word may start right after ``index``.

        ``index`` is the position of the character immediately before the
        candidate word.  A negative index means the word starts at the very
        beginning of the content, which is always acceptable; otherwise the
        preceding character must be in ``allowed_before_punctuation``.
        """
        return index < 0 or content[index] in self.allowed_before_punctuation

    def check_valid_end_position(self, content: str, index: int) -> bool:
        """Return whether an alert word may end right before ``index``.

        ``index`` is the position of the character immediately after the
        candidate word.  Running off the end of the content is always
        acceptable; otherwise the following character must be in
        ``allowed_after_punctuation``.
        """
        return index >= len(content) or content[index] in self.allowed_after_punctuation

    def run(self, lines: List[str]) -> List[str]:
        """Scan ``lines`` for alert words and return ``lines`` unchanged."""
        db_data: Optional[DbData] = self.zmd.zulip_db_data
        if db_data is None:
            return lines

        # We check for alert words here, the set of which are
        # dependent on which users may see this message.
        #
        # Our caller passes in the list of possible_words.  We
        # don't do any special rendering; we just append the alert words
        # we find to the set self.zmd.zulip_rendering_result.user_ids_with_alert_words.
        realm_alert_words_automaton = db_data.realm_alert_words_automaton
        if realm_alert_words_automaton is None:
            return lines

        # Matching is done against the lowercased content.
        content = "\n".join(lines).lower()
        for end_index, (original_value, user_ids) in realm_alert_words_automaton.iter(content):
            # end_index is the offset of the last character of the match, so
            # the neighbours to inspect are one before the first character
            # and one after the last.
            before_index = end_index - len(original_value)
            after_index = end_index + 1
            if self.check_valid_start_position(
                content, before_index
            ) and self.check_valid_end_position(content, after_index):
                self.zmd.zulip_rendering_result.user_ids_with_alert_words.update(user_ids)
        return lines
|
|
|
|
|
2021-02-12 08:19:30 +01:00
|
|
|
|
2019-08-11 13:04:53 +02:00
|
|
|
class LinkInlineProcessor(markdown.inlinepatterns.LinkInlineProcessor):
    """Upstream Markdown link processor with Zulip-specific post-processing."""

    def __init__(self, pattern: str, zmd: "ZulipMarkdown") -> None:
        super().__init__(pattern, zmd)
        self.zmd = zmd

    def zulip_specific_link_changes(self, el: Element) -> Union[None, Element]:
        """Sanitize and normalize a freshly built ``<a>`` element in place.

        Returns None when the URL is rejected by ``sanitize_url`` (the link
        is then not rendered), otherwise the same element with its ``href``
        rewritten and its text made atomic.
        """
        raw_href = el.get("href")
        assert raw_href is not None

        # Sanitize URL or don't parse link. See linkify_tests in markdown_test_cases for banned syntax.
        safe_href = sanitize_url(self.unescape(raw_href.strip()))
        if safe_href is None:
            return None  # no-op; the link is not processed.

        # Rewrite local links to be relative
        db_data: Optional[DbData] = self.zmd.zulip_db_data
        final_href = rewrite_local_links_to_relative(db_data, safe_href)

        # Make changes to <a> tag attributes
        el.set("href", final_href)

        # Show link href if title is empty
        if not (el.text and el.text.strip()):
            el.text = final_href

        # Prevent linkifiers from running on the content of a Markdown link, breaking up the link.
        # This is a monkey-patch, but it might be worth sending a version of this change upstream.
        el.text = markdown.util.AtomicString(el.text)

        return el

    def handleMatch(  # type: ignore[override] # https://github.com/python/mypy/issues/10197
        self, m: Match[str], data: str
    ) -> Union[Tuple[None, None, None], Tuple[Element, int, int]]:
        """Run the upstream handler, then apply the Zulip-specific changes.

        Returns ``(None, None, None)`` when either upstream or the Zulip
        post-processing declines to produce a link.
        """
        upstream_result = super().handleMatch(m, data)
        if upstream_result[0] is None:
            return None, None, None

        link, match_start, index = upstream_result
        adjusted: Optional[Element] = self.zulip_specific_link_changes(link)
        if adjusted is None:
            return None, None, None
        return adjusted, match_start, index
|
2013-07-31 22:53:15 +02:00
|
|
|
|
2021-02-12 08:19:30 +01:00
|
|
|
|
2018-12-20 08:28:40 +01:00
|
|
|
def get_sub_registry(r: markdown.util.Registry, keys: List[str]) -> markdown.util.Registry:
    """Return a new Registry holding only the entries of ``r`` named in ``keys``.

    The retained entries keep the same relative order they had in ``r``.
    Raises KeyError (from the registry lookup) if a key is not registered.
    """
    # Registry is a new class added by Python-Markdown to replace OrderedDict.
    # Since Registry doesn't support .keys(), it is easier to make a new
    # object instead of removing keys from the existing object.
    new_r = markdown.util.Registry()
    for k in keys:
        # get_index_for_name() is the position in priority order (0 runs
        # first), while Registry sorts *higher* priorities first.  Negating
        # the index keeps the original ordering instead of reversing it, and
        # leaves any entry later registered with a positive priority ahead
        # of the retained ones.
        new_r.register(r[k], k, -r.get_index_for_name(k))
    return new_r
|
|
|
|
|
2021-02-12 08:19:30 +01:00
|
|
|
|
2021-03-30 12:15:39 +02:00
|
|
|
# These are used as keys ("linkifiers_keys") to md_engines and the respective
# linkifier caches.
DEFAULT_MARKDOWN_KEY = -1
# Engines built with this key have most processors stripped by
# ZulipMarkdown.handle_zephyr_mirror().
ZEPHYR_MIRROR_MARKDOWN_KEY = -2
|
2016-12-31 03:08:43 +01:00
|
|
|
|
2021-02-12 08:19:30 +01:00
|
|
|
|
2022-10-06 22:46:09 +02:00
|
|
|
class ZulipMarkdown(markdown.Markdown):
    """Python-Markdown engine configured with Zulip's set of processors.

    Instead of taking upstream's defaults, every preprocessor, block
    processor, inline pattern, tree processor and postprocessor is
    registered explicitly by the ``build_*`` methods below.
    """

    # Per-rendering state; these are only declared here, not assigned by
    # this class.
    zulip_message: Optional[Message]
    zulip_realm: Optional[Realm]
    zulip_db_data: Optional[DbData]
    zulip_rendering_result: MessageRenderingResult
    image_preview_enabled: bool
    url_embed_preview_enabled: bool
    url_embed_data: Optional[Dict[str, Optional[UrlEmbedData]]]

    def __init__(
        self,
        linkifiers: List[LinkifierDict],
        linkifiers_key: int,
        email_gateway: bool,
    ) -> None:
        """Create an engine for the given linkifier set.

        ``linkifiers`` are registered as inline patterns, ``linkifiers_key``
        selects the Zephyr-mirror variant when it equals
        ZEPHYR_MIRROR_MARKDOWN_KEY, and ``email_gateway`` disables the
        indented code block processor.
        """
        # NOTE(review): these are assigned before super().__init__(),
        # presumably because the base constructor invokes build_parser(),
        # which reads them -- confirm against Python-Markdown.
        self.linkifiers = linkifiers
        self.linkifiers_key = linkifiers_key
        self.email_gateway = email_gateway

        super().__init__(
            extensions=[
                nl2br.makeExtension(),
                tables.makeExtension(),
                codehilite.makeExtension(
                    linenums=False,
                    guess_lang=False,
                ),
            ],
        )
        self.set_output_format("html")

    def build_parser(self) -> markdown.Markdown:
        """Populate all processor registries and return ``self``."""
        # Build the parser using selected default features from Python-Markdown.
        # The complete list of all available processors can be found in the
        # super().build_parser() function.
        #
        # Note: for any Python-Markdown updates, manually check if we want any
        # of the new features added upstream or not; they wouldn't get
        # included by default.
        self.preprocessors = self.build_preprocessors()
        self.parser = self.build_block_parser()
        self.inlinePatterns = self.build_inlinepatterns()
        self.treeprocessors = self.build_treeprocessors()
        self.postprocessors = self.build_postprocessors()
        self.handle_zephyr_mirror()
        return self

    def build_preprocessors(self) -> markdown.util.Registry:
        """Return the registry of preprocessors used by Zulip."""
        # We disable the following preprocessors from upstream:
        #
        # html_block - insecure
        # reference - references don't make sense in a chat context.
        preprocessors = markdown.util.Registry()
        preprocessors.register(MarkdownListPreprocessor(self), "hanging_lists", 35)
        preprocessors.register(
            markdown.preprocessors.NormalizeWhitespace(self), "normalize_whitespace", 30
        )
        preprocessors.register(fenced_code.FencedBlockPreprocessor(self), "fenced_code_block", 25)
        preprocessors.register(
            AlertWordNotificationProcessor(self), "custom_text_notifications", 20
        )
        return preprocessors

    def build_block_parser(self) -> BlockParser:
        """Return a BlockParser with Zulip's block processors registered."""
        # We disable the following blockparsers from upstream:
        #
        # indent - replaced by ours
        # setextheader - disabled; we only support hashheaders for headings
        # olist - replaced by ours
        # ulist - replaced by ours
        # quote - replaced by ours
        parser = BlockParser(self)
        parser.blockprocessors.register(
            markdown.blockprocessors.EmptyBlockProcessor(parser), "empty", 95
        )
        parser.blockprocessors.register(ListIndentProcessor(parser), "indent", 90)
        if not self.email_gateway:
            parser.blockprocessors.register(
                markdown.blockprocessors.CodeBlockProcessor(parser), "code", 85
            )
        parser.blockprocessors.register(HashHeaderProcessor(parser), "hashheader", 80)
        # We get priority 75 from 'table' extension
        parser.blockprocessors.register(markdown.blockprocessors.HRProcessor(parser), "hr", 70)
        parser.blockprocessors.register(OListProcessor(parser), "olist", 65)
        parser.blockprocessors.register(UListProcessor(parser), "ulist", 60)
        parser.blockprocessors.register(BlockQuoteProcessor(parser), "quote", 55)
        # We get priority 51 from our 'include' extension
        parser.blockprocessors.register(
            markdown.blockprocessors.ParagraphProcessor(parser), "paragraph", 50
        )
        return parser

    def build_inlinepatterns(self) -> markdown.util.Registry:
        """Return the registry of inline patterns, including linkifiers."""
        # We disable the following upstream inline patterns:
        #
        # backtick -        replaced by ours
        # escape -          probably will re-add at some point.
        # link -            replaced by ours
        # image_link -      replaced by ours
        # autolink -        replaced by ours
        # automail -        replaced by ours
        # linebreak -       we use nl2br and consider that good enough
        # html -            insecure
        # reference -       references not useful
        # image_reference - references not useful
        # short_reference - references not useful
        # ---------------------------------------------------
        # strong_em -       for these three patterns,
        # strong2 -         we have our own versions where
        # emphasis2 -       we disable _ for bold and emphasis

        # Declare regexes for clean single line calls to .register().
        #
        # Custom strikethrough syntax: ~~foo~~
        DEL_RE = r"(?<!~)(\~\~)([^~\n]+?)(\~\~)(?!~)"
        # Custom bold/emphasis syntax: **foo** but not __foo__.
        # The text inside single asterisks must not begin or end with
        # whitespace; this is needed for things like
        # "const char *x = (char *)y".
        EMPHASIS_RE = r"(\*)(?!\s+)([^\*^\n]+)(?<!\s)\*"
        STRONG_RE = r"(\*\*)([^\n]+?)\2"
        STRONG_EM_RE = r"(\*\*\*)(?!\s+)([^\*^\n]+)(?<!\s)\*\*\*"
        TEX_RE = r"\B(?<!\$)\$\$(?P<body>[^\n_$](\\\$|[^$\n])*)\$\$(?!\$)\B"
        TIMESTAMP_RE = r"<time:(?P<time>[^>]*?)>"

        # Add inline patterns.  We use a custom numbering of the
        # rules, that preserves the order from upstream but leaves
        # space for us to add our own.
        reg = markdown.util.Registry()
        reg.register(BacktickInlineProcessor(markdown.inlinepatterns.BACKTICK_RE), "backtick", 105)
        reg.register(
            markdown.inlinepatterns.DoubleTagPattern(STRONG_EM_RE, "strong,em"), "strong_em", 100
        )
        reg.register(UserMentionPattern(mention.MENTIONS_RE, self), "usermention", 95)
        reg.register(Tex(TEX_RE, self), "tex", 90)
        reg.register(StreamTopicPattern(get_compiled_stream_topic_link_regex(), self), "topic", 87)
        reg.register(StreamPattern(get_compiled_stream_link_regex(), self), "stream", 85)
        reg.register(Timestamp(TIMESTAMP_RE), "timestamp", 75)
        reg.register(
            UserGroupMentionPattern(mention.USER_GROUP_MENTIONS_RE, self), "usergroupmention", 65
        )
        reg.register(LinkInlineProcessor(markdown.inlinepatterns.LINK_RE, self), "link", 60)
        reg.register(AutoLink(get_web_link_regex(), self), "autolink", 55)
        # Reserve priority 45-54 for linkifiers
        reg = self.register_linkifiers(reg)
        reg.register(
            markdown.inlinepatterns.HtmlInlineProcessor(markdown.inlinepatterns.ENTITY_RE, self),
            "entity",
            40,
        )
        reg.register(markdown.inlinepatterns.SimpleTagPattern(STRONG_RE, "strong"), "strong", 35)
        reg.register(markdown.inlinepatterns.SimpleTagPattern(EMPHASIS_RE, "em"), "emphasis", 30)
        reg.register(markdown.inlinepatterns.SimpleTagPattern(DEL_RE, "del"), "del", 25)
        reg.register(
            markdown.inlinepatterns.SimpleTextInlineProcessor(
                markdown.inlinepatterns.NOT_STRONG_RE
            ),
            "not_strong",
            20,
        )
        reg.register(Emoji(EMOJI_REGEX, self), "emoji", 15)
        reg.register(EmoticonTranslation(EMOTICON_RE, self), "translate_emoticons", 10)
        # We get priority 5 from 'nl2br' extension
        reg.register(UnicodeEmoji(cast(Pattern[str], POSSIBLE_EMOJI_RE), self), "unicodeemoji", 0)
        return reg

    def register_linkifiers(self, registry: markdown.util.Registry) -> markdown.util.Registry:
        """Register one LinkifierPattern per configured linkifier.

        All linkifiers share priority 45 and are named
        ``linkifiers/<pattern>``.  Returns the same ``registry`` object.
        """
        for linkifier in self.linkifiers:
            pattern = linkifier["pattern"]
            registry.register(
                LinkifierPattern(pattern, linkifier["url_template"], self),
                f"linkifiers/{pattern}",
                45,
            )
        return registry

    def build_treeprocessors(self) -> markdown.util.Registry:
        """Return the registry of tree processors used by Zulip."""
        # Here we build all the processors from upstream, plus a few of our own.
        treeprocessors = markdown.util.Registry()
        # We get priority 30 from 'hilite' extension
        treeprocessors.register(markdown.treeprocessors.InlineProcessor(self), "inline", 25)
        treeprocessors.register(markdown.treeprocessors.PrettifyTreeprocessor(self), "prettify", 20)
        treeprocessors.register(markdown.treeprocessors.UnescapeTreeprocessor(self), "unescape", 18)
        treeprocessors.register(
            InlineInterestingLinkProcessor(self), "inline_interesting_links", 15
        )
        if settings.CAMO_URI:
            treeprocessors.register(InlineImageProcessor(self), "rewrite_images_proxy", 10)
        return treeprocessors

    def build_postprocessors(self) -> markdown.util.Registry:
        """Return the registry of postprocessors used by Zulip."""
        # These are the default Python-Markdown processors, unmodified.
        postprocessors = markdown.util.Registry()
        postprocessors.register(markdown.postprocessors.RawHtmlPostprocessor(self), "raw_html", 20)
        postprocessors.register(
            markdown.postprocessors.AndSubstitutePostprocessor(), "amp_substitute", 15
        )
        return postprocessors

    def handle_zephyr_mirror(self) -> None:
        """Strip the engine down for Zephyr-mirrored traffic.

        Does nothing unless ``linkifiers_key`` is ZEPHYR_MIRROR_MARKDOWN_KEY.
        """
        if self.linkifiers_key != ZEPHYR_MIRROR_MARKDOWN_KEY:
            return

        # Disable almost all inline patterns for zephyr mirror
        # users' traffic that is mirrored.  Note that
        # inline_interesting_links is a treeprocessor and thus is
        # not removed
        self.inlinePatterns = get_sub_registry(self.inlinePatterns, ["autolink"])
        # "rewrite_images_proxy" is only registered when settings.CAMO_URI is
        # set, so keep just the names that are actually present rather than
        # failing with a KeyError.
        kept_treeprocessors = [
            name
            for name in ("inline_interesting_links", "rewrite_images_proxy")
            if name in self.treeprocessors
        ]
        self.treeprocessors = get_sub_registry(self.treeprocessors, kept_treeprocessors)
        # insert new 'inline' processor because we have changed self.inlinePatterns
        # but InlineProcessor copies md as self.md in __init__.
        self.treeprocessors.register(
            markdown.treeprocessors.InlineProcessor(self), "inline", 25
        )
        self.preprocessors = get_sub_registry(self.preprocessors, ["custom_text_notifications"])
        self.parser.blockprocessors = get_sub_registry(
            self.parser.blockprocessors, ["paragraph"]
        )
|
|
|
|
|
2013-06-05 17:45:57 +02:00
|
|
|
|
2022-10-06 22:46:09 +02:00
|
|
|
# Cache of built engines, keyed by (linkifiers_key, email_gateway);
# populated by make_md_engine().
md_engines: Dict[Tuple[int, bool], ZulipMarkdown] = {}
# Linkifier definitions per linkifiers_key; make_md_engine() reads the
# entry for the key it is asked to build.
linkifier_data: Dict[int, List[LinkifierDict]] = {}
|
2013-06-05 17:45:57 +02:00
|
|
|
|
2021-02-12 08:19:30 +01:00
|
|
|
|
2021-03-30 12:15:39 +02:00
|
|
|
def make_md_engine(linkifiers_key: int, email_gateway: bool) -> None:
    """(Re)build the cached ZulipMarkdown engine for the given key pair.

    Any existing engine stored under ``(linkifiers_key, email_gateway)`` is
    discarded first, then a fresh one is constructed from
    ``linkifier_data[linkifiers_key]`` and stored in ``md_engines``.
    """
    engine_key = (linkifiers_key, email_gateway)
    # Drop the stale engine before building, so a failed rebuild does not
    # leave the old one behind.
    md_engines.pop(engine_key, None)

    md_engines[engine_key] = ZulipMarkdown(
        linkifiers=linkifier_data[linkifiers_key],
        linkifiers_key=linkifiers_key,
        email_gateway=email_gateway,
    )
|
|
|
|
|
2021-02-12 08:19:30 +01:00
|
|
|
|
2019-05-25 16:10:30 +02:00
|
|
|
# Split the topic name into multiple sections so that we can easily use
# our common single link matching regex on it.  The separators are a
# space and the characters ! ; ) , ' and ".
basic_link_splitter = re.compile(r"[ !;\),\'\"]")
|
2019-05-25 16:10:30 +02:00
|
|
|
|
2022-10-09 04:45:39 +02:00
|
|
|
|
|
|
|
def percent_escape_format_string(format_string: str) -> str:
    """Escape percent-encoded bytes so Python %-interpolation ignores them.

    A "%" followed by two hex digits gets its "%" doubled, unless it is
    already escaped.  That is:

        %(foo)s -> %(foo)s
        %%      -> %%
        %ab     -> %%ab
        %%ab    -> %%ab
        %%%ab   -> %%%%ab

    We do this here, rather than before storing, to make edits to the
    underlying linkifier more straightforward, and because JS does not
    have a real formatter.
    """
    # An optional run of already-escaped "%%" pairs (kept as-is), then a
    # lone "%" introducing two hex digits.
    percent_encoded_byte = re.compile(r"(?<!%)(%%)*%([a-fA-F0-9][a-fA-F0-9])")
    return percent_encoded_byte.sub(r"\1%%\2", format_string)
|
|
|
|
|
|
|
|
|
2022-12-02 01:39:06 +01:00
|
|
|
@dataclass
class TopicLinkMatch:
    """One link found in a topic name."""

    # Target URL of the link.
    url: str
    # Text of the link as shown to clients.
    text: str
    # NOTE(review): presumably the offset of the match within the topic
    # name -- confirm against topic_links().
    index: int
    # NOTE(review): presumably the rank of the linkifier that produced the
    # match, None when not produced by a linkifier -- confirm against
    # topic_links().
    precedence: Optional[int]
|
|
|
|
|
|
|
|
|
2019-06-21 08:54:25 +02:00
|
|
|
# Security note: We don't do any HTML escaping in this
|
|
|
|
# function on the URLs; they are expected to be HTML-escaped when
|
|
|
|
# rendered by clients (just as links rendered into message bodies
|
|
|
|
# are validated and escaped inside `url_to_a`).
|
2021-03-30 12:15:39 +02:00
|
|
|
def topic_links(linkifiers_key: int, topic_name: str) -> List[Dict[str, str]]:
|
2022-12-02 01:39:06 +01:00
|
|
|
matches: List[TopicLinkMatch] = []
|
2021-03-30 12:15:39 +02:00
|
|
|
linkifiers = linkifiers_for_realm(linkifiers_key)
|
2022-12-02 01:39:06 +01:00
|
|
|
precedence = 0
|
2016-06-01 04:46:42 +02:00
|
|
|
|
CVE-2021-41115: Use re2 for user-supplied linkifier patterns.
Zulip attempts to validate that the regular expressions that admins
enter for linkifiers are well-formatted, and only contain a specific
subset of regex grammar. The process of checking these
properties (via a regex!) can cause denial-of-service via
backtracking.
Furthermore, this validation itself does not prevent the creation of
linkifiers which themselves cause denial-of-service when they are
executed. As the validator accepts literally anything inside of a
`(?P<word>...)` block, any quadratic backtracking expression can be
hidden therein.
Switch user-provided linkifier patterns to be matched in the Markdown
processor by the `re2` library, which is guaranteed constant-time.
This somewhat limits the possible features of the regular
expression (notably, look-ahead and look-behind, and back-references);
however, these features had never been advertised as working in the
context of linkifiers.
A migration removes any existing linkifiers which would not function
under re2, after printing them for posterity during the upgrade; they
are unlikely to be common, and are impossible to fix automatically.
The denial-of-service in the linkifier validator was discovered by
@erik-krogh and @yoff, as GHSL-2021-118.
2021-09-29 01:27:54 +02:00
|
|
|
options = re2.Options()
|
|
|
|
options.log_errors = False
|
2021-03-30 12:15:39 +02:00
|
|
|
for linkifier in linkifiers:
|
2021-03-30 12:38:49 +02:00
|
|
|
raw_pattern = linkifier["pattern"]
|
linkifier: Support URL templates for linkifiers.
This swaps out url_format_string from all of our APIs and replaces it
with url_template. Note that the documentation changes in the following
commits will be squashed with this commit.
We change the "url_format" key to "url_template" for the
realm_linkifiers events in event_schema, along with updating
LinkifierDict. "url_template" is the name chosen to normalize
mixed usages of "url_format_string" and "url_format" throughout
the backend.
The markdown processor is updated to stop handling the format string
interpolation and delegate the task template expansion to the uri_template
library instead.
This change affects many test cases. We mostly just replace "%(name)s"
with "{name}", "url_format_string" with "url_template" to make sure that
they still pass. There are some test cases dedicated for testing "%"
escaping, which aren't relevant anymore and are subject to removal.
But for now we keep most of them as-is, and make sure that "%" is always
escaped since we do not use it for variable substitution any more.
Since url_format_string is not populated anymore, a migration is created
to remove this field entirely, and make url_template non-nullable since
we will always populate it. Note that it is possible to have
url_template being null after migration 0422 and before 0424, but
in practice, url_template will not be None after backfilling and the
backend now is always setting url_template.
With the removal of url_format_string, RealmFilter model will now be cleaned
with URL template checks, and the old checks for escapes are removed.
We also modified RealmFilter.clean to skip the validation when the
url_template is invalid. This avoids raising multiple ValidationErrors
when calling full_clean on a linkifier. But we might eventually want to
have a more centric approach to data validation instead of having
the same validation in both the clean method and the validator.
Fixes #23124.
Signed-off-by: Zixuan James Li <p359101898@gmail.com>
2022-10-05 20:55:31 +02:00
|
|
|
prepared_url_template = uri_template.URITemplate(linkifier["url_template"])
|
CVE-2021-41115: Use re2 for user-supplied linkifier patterns.
Zulip attempts to validate that the regular expressions that admins
enter for linkifiers are well-formatted, and only contain a specific
subset of regex grammar. The process of checking these
properties (via a regex!) can cause denial-of-service via
backtracking.
Furthermore, this validation itself does not prevent the creation of
linkifiers which themselves cause denial-of-service when they are
executed. As the validator accepts literally anything inside of a
`(?P<word>...)` block, any quadratic backtracking expression can be
hidden therein.
Switch user-provided linkifier patterns to be matched in the Markdown
processor by the `re2` library, which is guaranteed constant-time.
This somewhat limits the possible features of the regular
expression (notably, look-ahead and look-behind, and back-references);
however, these features had never been advertised as working in the
context of linkifiers.
A migration removes any existing linkifiers which would not function
under re2, after printing them for posterity during the upgrade; they
are unlikely to be common, and are impossible to fix automatically.
The denial-of-service in the linkifier validator was discovered by
@erik-krogh and @yoff, as GHSL-2021-118.
2021-09-29 01:27:54 +02:00
|
|
|
try:
|
|
|
|
pattern = re2.compile(prepare_linkifier_pattern(raw_pattern), options=options)
|
|
|
|
except re2.error:
|
|
|
|
# An invalid regex shouldn't be possible here, and logging
|
|
|
|
# here on an invalid regex would spam the logs with every
|
|
|
|
# message sent; simply move on.
|
|
|
|
continue
|
2022-03-22 01:11:23 +01:00
|
|
|
pos = 0
|
|
|
|
while pos < len(topic_name):
|
|
|
|
m = pattern.search(topic_name, pos)
|
|
|
|
if m is None:
|
|
|
|
break
|
|
|
|
|
2021-01-26 07:32:29 +01:00
|
|
|
match_details = m.groupdict()
|
2021-06-15 11:19:07 +02:00
|
|
|
match_text = match_details[OUTER_CAPTURE_GROUP]
|
2022-03-22 01:11:23 +01:00
|
|
|
|
|
|
|
# Adjust the start point of the match for the next
|
|
|
|
# iteration -- we rewind the non-word character at the
|
|
|
|
# end, if there was one, so a potential next match can
|
|
|
|
# also use it.
|
|
|
|
pos = m.end() - len(match_details[AFTER_CAPTURE_GROUP])
|
|
|
|
|
2021-03-30 12:15:39 +02:00
|
|
|
# We format the linkifier's url string using the matched text.
|
2021-01-26 07:32:29 +01:00
|
|
|
# Also, we include the matched text in the response, so that our clients
|
|
|
|
# don't have to implement any logic of their own to get back the text.
|
|
|
|
matches += [
|
2022-12-02 01:39:06 +01:00
|
|
|
TopicLinkMatch(
|
linkifier: Support URL templates for linkifiers.
This swaps out url_format_string from all of our APIs and replaces it
with url_template. Note that the documentation changes in the following
commits will be squashed with this commit.
We change the "url_format" key to "url_template" for the
realm_linkifiers events in event_schema, along with updating
LinkifierDict. "url_template" is the name chosen to normalize
mixed usages of "url_format_string" and "url_format" throughout
the backend.
The markdown processor is updated to stop handling the format string
interpolation and delegate the task template expansion to the uri_template
library instead.
This change affects many test cases. We mostly just replace "%(name)s"
with "{name}", "url_format_string" with "url_template" to make sure that
they still pass. There are some test cases dedicated for testing "%"
escaping, which aren't relevant anymore and are subject to removal.
But for now we keep most of them as-is, and make sure that "%" is always
escaped since we do not use it for variable substitution any more.
Since url_format_string is not populated anymore, a migration is created
to remove this field entirely, and make url_template non-nullable since
we will always populate it. Note that it is possible to have
url_template being null after migration 0422 and before 0424, but
in practice, url_template will not be None after backfilling and the
backend now is always setting url_template.
With the removal of url_format_string, RealmFilter model will now be cleaned
with URL template checks, and the old checks for escapes are removed.
We also modified RealmFilter.clean to skip the validation when the
url_template is invalid. This avoids raising multiple ValidationError's
when calling full_clean on a linkifier. But we might eventually want to
have a more centric approach to data validation instead of having
the same validation in both the clean method and the validator.
Fixes #23124.
Signed-off-by: Zixuan James Li <p359101898@gmail.com>
2022-10-05 20:55:31 +02:00
|
|
|
url=prepared_url_template.expand(**match_details),
|
2021-01-26 07:32:29 +01:00
|
|
|
text=match_text,
|
2022-12-02 09:40:45 +01:00
|
|
|
index=m.start(),
|
2022-12-02 01:39:06 +01:00
|
|
|
precedence=precedence,
|
2021-01-26 07:32:29 +01:00
|
|
|
)
|
|
|
|
]
|
2022-12-02 01:39:06 +01:00
|
|
|
precedence += 1
|
|
|
|
|
|
|
|
# Sort the matches beforehand so we favor the match with a higher priority and tie-break with the starting index.
|
|
|
|
# Note that we sort it before processing the raw URLs so that linkifiers will be prioritized over them.
|
|
|
|
matches.sort(key=lambda k: (k.precedence, k.index))
|
2019-05-25 16:10:30 +02:00
|
|
|
|
2022-12-02 09:40:45 +01:00
|
|
|
pos = 0
|
2020-10-23 02:43:28 +02:00
|
|
|
# Also make raw URLs navigable.
|
2022-12-02 09:40:45 +01:00
|
|
|
while pos < len(topic_name):
|
|
|
|
# Assuming that basic_link_splitter matches 1 character,
|
|
|
|
# we match segments of the string for URL divided by the matched character.
|
|
|
|
next_split = basic_link_splitter.search(topic_name, pos)
|
|
|
|
end = next_split.start() if next_split is not None else len(topic_name)
|
|
|
|
# We have to match the substring because LINK_REGEX
|
|
|
|
# matches the start of the entire string with "^"
|
|
|
|
link_match = re.match(get_web_link_regex(), topic_name[pos:end])
|
2019-05-25 16:10:30 +02:00
|
|
|
if link_match:
|
2021-01-26 07:32:29 +01:00
|
|
|
actual_match_url = link_match.group("url")
|
|
|
|
result = urlsplit(actual_match_url)
|
2020-09-13 04:46:59 +02:00
|
|
|
if not result.scheme:
|
|
|
|
if not result.netloc:
|
|
|
|
i = (result.path + "/").index("/")
|
|
|
|
result = result._replace(netloc=result.path[:i], path=result.path[i:])
|
|
|
|
url = result._replace(scheme="https").geturl()
|
2021-01-26 07:32:29 +01:00
|
|
|
else:
|
|
|
|
url = actual_match_url
|
2022-12-02 01:39:06 +01:00
|
|
|
matches.append(
|
|
|
|
TopicLinkMatch(
|
|
|
|
url=url,
|
|
|
|
text=actual_match_url,
|
|
|
|
index=pos,
|
|
|
|
precedence=None,
|
|
|
|
)
|
|
|
|
)
|
2022-12-02 09:40:45 +01:00
|
|
|
# Move past the next split point, and start matching the URL from there
|
|
|
|
pos = end + 1
|
2019-05-25 16:10:30 +02:00
|
|
|
|
2022-12-02 01:39:06 +01:00
|
|
|
def are_matches_overlapping(match_a: TopicLinkMatch, match_b: TopicLinkMatch) -> bool:
|
|
|
|
return (match_b.index <= match_a.index < match_b.index + len(match_b.text)) or (
|
|
|
|
match_a.index <= match_b.index < match_a.index + len(match_a.text)
|
|
|
|
)
|
|
|
|
|
|
|
|
# The following removes overlapping intervals depending on the precedence of linkifier patterns.
|
2023-02-22 23:03:47 +01:00
|
|
|
# This uses the same algorithm implemented in web/src/markdown.js.
|
2022-12-02 01:39:06 +01:00
|
|
|
# To avoid mutating matches inside the loop, the final output gets appended to another list.
|
|
|
|
applied_matches: List[TopicLinkMatch] = []
|
|
|
|
for current_match in matches:
|
|
|
|
# When the current match does not overlap with all existing matches,
|
|
|
|
# we are confident that the link should be present in the final output because
|
|
|
|
# 1. Given that the links are sorted by precedence, the current match has the highest priority
|
|
|
|
# among the matches to be checked.
|
|
|
|
# 2. None of the matches with higher priority overlaps with the current match.
|
|
|
|
# This might be optimized to search for overlapping matches in O(logn) time,
|
|
|
|
# but it is kept as-is since performance is not critical for this codepath and for simplicity.
|
|
|
|
if all(
|
|
|
|
not are_matches_overlapping(old_match, current_match) for old_match in applied_matches
|
|
|
|
):
|
|
|
|
applied_matches.append(current_match)
|
|
|
|
# We need to sort applied_matches again because the links were previously ordered by precedence.
|
|
|
|
applied_matches.sort(key=lambda v: v.index)
|
|
|
|
return [{"url": match.url, "text": match.text} for match in applied_matches]
|
2013-12-11 20:06:37 +01:00
|
|
|
|
2021-02-12 08:19:30 +01:00
|
|
|
|
2021-04-13 17:42:48 +02:00
|
|
|
def maybe_update_markdown_engines(linkifiers_key: int, email_gateway: bool) -> None:
    """Ensure a current Markdown engine exists for ``(linkifiers_key, email_gateway)``.

    The linkifiers for ``linkifiers_key`` are fetched and compared with the
    copy cached in ``linkifier_data``.  When nothing is cached yet, or the
    cached copy differs, the cache is refreshed and ``make_md_engine`` is
    called again for every engine already registered under this key.
    Afterwards, the engine for the requested ``email_gateway`` flavor is
    created if it is still absent from ``md_engines``.
    """
    current_linkifiers = linkifiers_for_realm(linkifiers_key)
    cache_is_stale = (
        linkifiers_key not in linkifier_data
        or linkifier_data[linkifiers_key] != current_linkifiers
    )

    if cache_is_stale:
        # Remember the new linkifiers, then refresh the engines built from
        # the old ones.
        linkifier_data[linkifiers_key] = current_linkifiers
        for gateway_flag in (True, False):
            if (linkifiers_key, gateway_flag) not in md_engines:
                # Only existing engines are refreshed here; nothing new is created.
                continue
            make_md_engine(linkifiers_key, gateway_flag)

    requested_engine_key = (linkifiers_key, email_gateway)
    if requested_engine_key not in md_engines:
        # No engine has been built for this key yet, so build it now.
        make_md_engine(linkifiers_key, email_gateway)
|
2013-12-11 20:06:37 +01:00
|
|
|
|
2021-02-12 08:19:30 +01:00
|
|
|
|
2012-10-25 21:38:47 +02:00
|
|
|
# Markdown parser failures should be logged, but the message text itself must
# not end up in the logs for privacy reasons.  As a compromise, every word
# character (anything matched by the regex class \w) is replaced with 'x'
# before logging.
#
# The result is additionally passed through repr(), which makes failures
# easier to reproduce and escapes terminal control codes that could otherwise
# do surprisingly nasty things when a log is viewed.
_privacy_re = re.compile("\\w")


def privacy_clean_markdown(content: str) -> str:
    """Return a privacy-preserving, log-safe rendition of ``content``.

    Every character matched by ``\\w`` is replaced with ``x``; punctuation and
    whitespace are kept so the structure of the input stays visible.  The
    masked text is returned as its ``repr()``, i.e. quoted and with
    non-printable characters escaped.
    """
    masked = _privacy_re.sub("x", content)
    return repr(masked)
|
2012-10-25 21:38:47 +02:00
|
|
|
|
2021-02-12 08:19:30 +01:00
|
|
|
|
|
|
|
def do_convert(
    content: str,
    realm_alert_words_automaton: Optional[ahocorasick.Automaton] = None,
    message: Optional[Message] = None,
    message_realm: Optional[Realm] = None,
    sent_by_bot: bool = False,
    translate_emoticons: bool = False,
    url_embed_data: Optional[Dict[str, Optional[UrlEmbedData]]] = None,
    mention_data: Optional[MentionData] = None,
    email_gateway: bool = False,
    no_previews: bool = False,
) -> MessageRenderingResult:
    """Convert Markdown to HTML, with Zulip-specific settings and hacks.

    The realm used for rendering is chosen as follows:

    * Only ``content`` is passed -> render with the default options
      (e.g. for docs).
    * ``message`` is passed without ``message_realm`` -> the realm is looked
      up from the message.
    * ``message_realm`` is passed -> that realm is used for Markdown purposes.

    Returns the populated ``MessageRenderingResult``.

    Raises ``MarkdownRenderingError`` if anything goes wrong while rendering
    (including the rendered output exceeding
    ``settings.MAX_MESSAGE_LENGTH * 100`` characters); the failure is logged
    with a privacy-sanitized copy of ``content`` before the error is raised.
    """
    if message_realm is None and message is not None:
        message_realm = message.get_realm()

    linkifiers_key = DEFAULT_MARKDOWN_KEY if message_realm is None else message_realm.id

    # Identifier used purely for log/error messages.
    logging_message_id = "unknown"
    if message and hasattr(message, "id") and message.id:
        logging_message_id = "id# " + str(message.id)

    is_zephyr_mirror_content = (
        message is not None
        and message_realm is not None
        and message_realm.is_zephyr_mirror_realm
        and message.sending_client.name == "zephyr_mirror"
    )
    if is_zephyr_mirror_content:
        # Content delivered via zephyr_mirror is rendered with a slightly
        # customized Markdown processor.
        linkifiers_key = ZEPHYR_MIRROR_MARKDOWN_KEY

    maybe_update_markdown_engines(linkifiers_key, email_gateway)
    engine = md_engines[(linkifiers_key, email_gateway)]
    # Reset the parser; otherwise it will get slower over time.
    engine.reset()

    rendering_result: MessageRenderingResult = MessageRenderingResult(
        rendered_content="",
        mentions_topic_wildcard=False,
        mentions_stream_wildcard=False,
        mentions_user_ids=set(),
        mentions_user_group_ids=set(),
        alert_words=set(),
        links_for_preview=set(),
        user_ids_with_alert_words=set(),
        potential_attachment_path_ids=[],
    )

    # Filters such as UserMentionPattern need a message, so the per-render
    # state is attached to the engine before converting.
    engine.zulip_message = message
    engine.zulip_rendering_result = rendering_result
    engine.zulip_realm = message_realm
    engine.zulip_db_data = None  # replaced below when a realm is available
    engine.image_preview_enabled = image_preview_enabled(message, message_realm, no_previews)
    engine.url_embed_preview_enabled = url_embed_preview_enabled(
        message, message_realm, no_previews
    )
    engine.url_embed_data = url_embed_data

    # Pre-fetch data from the DB that is used in the Markdown thread.
    if message_realm is not None:
        # Here we fetch the data structures needed to render
        # mentions/stream mentions from the database, but only
        # if there is syntax in the message that might use them, since
        # the fetches are somewhat expensive and these types of syntax
        # are uncommon enough that it's a useful optimization.
        if mention_data is None:
            mention_data = MentionData(MentionBackend(message_realm.id), content)

        stream_name_info = mention_data.get_stream_name_map(
            possible_linked_stream_names(content)
        )

        active_realm_emoji = (
            get_name_keyed_dict_for_active_realm_emoji(message_realm.id)
            if content_has_emoji_syntax(content)
            else {}
        )

        engine.zulip_db_data = DbData(
            realm_alert_words_automaton=realm_alert_words_automaton,
            mention_data=mention_data,
            active_realm_emoji=active_realm_emoji,
            realm_uri=message_realm.uri,
            sent_by_bot=sent_by_bot,
            stream_names=stream_name_info,
            translate_emoticons=translate_emoticons,
        )

    try:
        # Spend at most 5 seconds rendering; this protects the backend
        # from being overloaded by bugs (e.g. Markdown logic that is
        # extremely inefficient in corner cases) as well as user
        # errors (e.g. a linkifier that makes some syntax
        # infinite-loop).
        rendering_result.rendered_content = timeout(5, lambda: engine.convert(content))

        # Throw an exception if the content is huge; this protects the
        # rest of the codebase from any bugs where we end up rendering
        # something huge.
        max_rendered_length = settings.MAX_MESSAGE_LENGTH * 100
        if len(rendering_result.rendered_content) > max_rendered_length:
            raise MarkdownRenderingError(
                f"Rendered content exceeds {max_rendered_length} characters (message {logging_message_id})"
            )
        return rendering_result
    except Exception:
        # Never log the raw message; log a masked copy instead.
        sanitized_content = privacy_clean_markdown(content)
        markdown_logger.exception(
            "Exception in Markdown parser; input (sanitized) was: %s\n (message %s)",
            sanitized_content,
            logging_message_id,
        )

        raise MarkdownRenderingError
    finally:
        # Clearing these is slightly paranoid, since they are always set
        # right before the engine is actually used, but better safe than
        # sorry.
        engine.zulip_message = None
        engine.zulip_realm = None
        engine.zulip_db_data = None
|
2013-05-21 23:59:27 +02:00
|
|
|
|
2021-02-12 08:19:30 +01:00
|
|
|
|
2020-06-26 23:06:05 +02:00
|
|
|
# Process-wide Markdown rendering statistics.
#
# markdown_time_start holds the clock reading taken by the most recent
# markdown_stats_start() call; the two totals accumulate over every completed
# markdown_stats_start()/markdown_stats_finish() pair.
markdown_time_start = 0.0
markdown_total_time = 0.0
markdown_total_requests = 0


def get_markdown_time() -> float:
    """Return the total number of seconds recorded by markdown_stats_finish()."""
    return markdown_total_time


def get_markdown_requests() -> int:
    """Return how many times markdown_stats_finish() has been called."""
    return markdown_total_requests


def markdown_stats_start() -> None:
    """Record the starting clock reading for a Markdown rendering request."""
    global markdown_time_start
    # Use a monotonic, high-resolution clock: wall-clock time (time.time())
    # can jump when the system clock is adjusted, which would produce
    # negative or inflated durations.  Only differences between readings are
    # ever used, so the clock's arbitrary reference point does not matter.
    markdown_time_start = time.perf_counter()


def markdown_stats_finish() -> None:
    """Add the time elapsed since markdown_stats_start() to the running totals.

    Increments the request counter by one and the accumulated time by the
    difference between the current clock reading and the one stored by the
    most recent markdown_stats_start() call.
    """
    global markdown_total_time
    global markdown_total_requests
    markdown_total_requests += 1
    # Must read the same clock as markdown_stats_start().
    markdown_total_time += time.perf_counter() - markdown_time_start
|
|
|
|
|
|
|
|
|
|
|
|
def markdown_convert(
    content: str,
    realm_alert_words_automaton: Optional[ahocorasick.Automaton] = None,
    message: Optional[Message] = None,
    message_realm: Optional[Realm] = None,
    sent_by_bot: bool = False,
    translate_emoticons: bool = False,
    url_embed_data: Optional[Dict[str, Optional[UrlEmbedData]]] = None,
    mention_data: Optional[MentionData] = None,
    email_gateway: bool = False,
    no_previews: bool = False,
) -> MessageRenderingResult:
    """Render ``content`` via ``do_convert`` while recording timing statistics.

    All arguments are forwarded unchanged to ``do_convert``, and its result is
    returned.  The call is bracketed by ``markdown_stats_start()`` and
    ``markdown_stats_finish()``; if ``do_convert`` raises, the exception
    propagates and ``markdown_stats_finish()`` is not called for that request.
    """
    markdown_stats_start()
    rendering_result = do_convert(
        content=content,
        realm_alert_words_automaton=realm_alert_words_automaton,
        message=message,
        message_realm=message_realm,
        sent_by_bot=sent_by_bot,
        translate_emoticons=translate_emoticons,
        url_embed_data=url_embed_data,
        mention_data=mention_data,
        email_gateway=email_gateway,
        no_previews=no_previews,
    )
    markdown_stats_finish()
    return rendering_result
|