diff --git a/requirements/common.in b/requirements/common.in index 0c6e0d1870..8d62def08f 100644 --- a/requirements/common.in +++ b/requirements/common.in @@ -17,6 +17,7 @@ importlib-metadata ; python_version < "3.10" # for Markdown Pygments jsx-lexer uri-template +regex # Needed for manage.py ipython diff --git a/requirements/dev.txt b/requirements/dev.txt index 04aaaa2706..0fb43ce5de 100644 --- a/requirements/dev.txt +++ b/requirements/dev.txt @@ -2230,7 +2230,9 @@ regex==2023.8.8 \ --hash=sha256:f2181c20ef18747d5f4a7ea513e09ea03bdd50884a11ce46066bb90fe4213675 \ --hash=sha256:f2200e00b62568cfd920127782c61bc1c546062a879cdc741cfcc6976668dfcf \ --hash=sha256:fcbdc5f2b0f1cd0f6a56cdb46fe41d2cce1e644e3b68832f3eeebc5fb0f7712e - # via talon-core + # via + # -r requirements/common.in + # talon-core requests[security]==2.31.0 \ --hash=sha256:58cd2187c01e70e6e26505bca751777aa9f2ee0b7f4300988b709f44e013003f \ --hash=sha256:942c5a758f98d790eaed1a29cb6eefc7ffb0d1cf7af05c3d2791656dbd6ad1e1 @@ -2834,6 +2836,10 @@ types-redis==4.6.0.4 \ --hash=sha256:03a1e1659ae4d8f6543bc2b8b11e94b1ee53937f313b1dc6f67dc7bde7d38fe0 \ --hash=sha256:c475a9d3cf73dd696c3887d30644323fc56f5e00af96151035b3b5b52875c9b3 # via -r requirements/mypy.in +types-regex==2023.6.3.1 \ + --hash=sha256:21880e584e2bada8656abeeb3131287a89dcb215f24f1f5b1503eb9bca37f759 \ + --hash=sha256:f385191206021e48db0f452fe9479812710daf07058c8f6972cfb2f6202df136 + # via -r requirements/mypy.in types-requests==2.31.0.2 \ --hash=sha256:56d181c85b5925cbc59f4489a57e72a8b2166f18273fd8ba7b6fe0c0b986f12a \ --hash=sha256:6aa3f7faf0ea52d728bb18c0a0d1522d9bfd8c72d26ff6f61bfc3d06a411cf40 diff --git a/requirements/mypy.in b/requirements/mypy.in index 484a56f200..f8e38f1883 100644 --- a/requirements/mypy.in +++ b/requirements/mypy.in @@ -21,6 +21,7 @@ types-Pygments types-python-dateutil types-PyYAML types-redis +types-regex types-requests types-stripe types-zxcvbn diff --git a/requirements/prod.txt b/requirements/prod.txt index 51477a0d25..665c2453b5 100644 --- a/requirements/prod.txt +++ b/requirements/prod.txt @@ -1707,7 +1707,9 @@ regex==2023.8.8 \ --hash=sha256:f2181c20ef18747d5f4a7ea513e09ea03bdd50884a11ce46066bb90fe4213675 \ --hash=sha256:f2200e00b62568cfd920127782c61bc1c546062a879cdc741cfcc6976668dfcf \ --hash=sha256:fcbdc5f2b0f1cd0f6a56cdb46fe41d2cce1e644e3b68832f3eeebc5fb0f7712e - # via talon-core + # via + # -r requirements/common.in + # talon-core requests[security]==2.31.0 \ --hash=sha256:58cd2187c01e70e6e26505bca751777aa9f2ee0b7f4300988b709f44e013003f \ --hash=sha256:942c5a758f98d790eaed1a29cb6eefc7ffb0d1cf7af05c3d2791656dbd6ad1e1 diff --git a/tools/setup/emoji/emoji_names.py b/tools/setup/emoji/emoji_names.py index 0bd7f41770..ccac4ad382 100644 --- a/tools/setup/emoji/emoji_names.py +++ b/tools/setup/emoji/emoji_names.py @@ -524,11 +524,11 @@ EMOJI_NAME_MAPS: Dict[str, Dict[str, Any]] = { "1f3ca-200d-2640": {"canonical_name": "woman_swimming", "aliases": []}, "1f3ca-200d-2642": {"canonical_name": "man_swimming", "aliases": []}, "1f3ca": {"canonical_name": "swim", "aliases": []}, - "1f3cb-fe0f-200d-2640-fe0f": {"canonical_name": "woman_lifting_weights", "aliases": []}, - "1f3cb-fe0f-200d-2642-fe0f": {"canonical_name": "man_lifting_weights", "aliases": []}, + "1f3cb-200d-2640": {"canonical_name": "woman_lifting_weights", "aliases": []}, + "1f3cb-200d-2642": {"canonical_name": "man_lifting_weights", "aliases": []}, "1f3cb": {"canonical_name": "lift", "aliases": ["work_out", "weight_lift", "gym"]}, - "1f3cc-fe0f-200d-2640-fe0f": {"canonical_name": "woman_golfing", "aliases": []}, - "1f3cc-fe0f-200d-2642-fe0f": {"canonical_name": "man_golfing", "aliases": []}, + "1f3cc-200d-2640": {"canonical_name": "woman_golfing", "aliases": []}, + "1f3cc-200d-2642": {"canonical_name": "man_golfing", "aliases": []}, "1f3cc": {"canonical_name": "golf", "aliases": []}, "1f3cd": {"canonical_name": "motorcycle", "aliases": []}, "1f3ce": {"canonical_name": "racecar", "aliases": []}, @@ -567,7 +567,7 @@ EMOJI_NAME_MAPS: Dict[str, Dict[str, Any]] = { "1f3ef": {"canonical_name": "shiro", "aliases": []}, "1f3f0": {"canonical_name": "castle", "aliases": []}, "1f3f3-200d-1f308": {"canonical_name": "rainbow_flag", "aliases": []}, - "1f3f3-fe0f-200d-26a7-fe0f": {"canonical_name": "transgender_flag", "aliases": []}, + "1f3f3-200d-26a7": {"canonical_name": "transgender_flag", "aliases": []}, "1f3f3": {"canonical_name": "white_flag", "aliases": ["surrender"]}, "1f3f4-200d-2620": {"canonical_name": "pirate_flag", "aliases": ["jolly_roger", "plunder"]}, "1f3f4-e0067-e0062-e0065-e006e-e0067-e007f": {"canonical_name": "flag_england", "aliases": []}, @@ -648,7 +648,7 @@ EMOJI_NAME_MAPS: Dict[str, Dict[str, Any]] = { "1f43e": {"canonical_name": "paw_prints", "aliases": ["paws"]}, "1f43f": {"canonical_name": "chipmunk", "aliases": []}, "1f440": {"canonical_name": "eyes", "aliases": ["looking"]}, - "1f441-fe0f-200d-1f5e8-fe0f": { + "1f441-200d-1f5e8": { "canonical_name": "eye_in_speech_bubble", "aliases": ["speech", "witness"], }, @@ -1056,8 +1056,8 @@ EMOJI_NAME_MAPS: Dict[str, Dict[str, Any]] = { "1f570": {"canonical_name": "mantelpiece_clock", "aliases": []}, "1f573": {"canonical_name": "hole", "aliases": []}, "1f574": {"canonical_name": "levitating", "aliases": ["hover"]}, - "1f575-fe0f-200d-2640-fe0f": {"canonical_name": "woman_detective", "aliases": []}, - "1f575-fe0f-200d-2642-fe0f": {"canonical_name": "man_detective", "aliases": []}, + "1f575-200d-2640": {"canonical_name": "woman_detective", "aliases": []}, + "1f575-200d-2642": {"canonical_name": "man_detective", "aliases": []}, "1f575": {"canonical_name": "detective", "aliases": ["spy", "sleuth", "agent", "sneaky"]}, "1f576": {"canonical_name": "dark_sunglasses", "aliases": []}, "1f577": {"canonical_name": "spider", "aliases": []}, @@ -2022,8 +2022,8 @@ EMOJI_NAME_MAPS: Dict[str, Dict[str, Any]] = { "26f5": {"canonical_name": "boat", "aliases": ["sailboat"]}, "26f7": {"canonical_name": "skier", "aliases": []}, "26f8": {"canonical_name": "ice_skate", "aliases": []}, - "26f9-fe0f-200d-2640-fe0f": {"canonical_name": "woman_bouncing_ball", "aliases": []}, - "26f9-fe0f-200d-2642-fe0f": {"canonical_name": "man_bouncing_ball", "aliases": []}, + "26f9-200d-2640": {"canonical_name": "woman_bouncing_ball", "aliases": []}, + "26f9-200d-2642": {"canonical_name": "man_bouncing_ball", "aliases": []}, "26f9": {"canonical_name": "ball", "aliases": ["sports"]}, "26fa": {"canonical_name": "tent", "aliases": ["camping"]}, "26fd": {"canonical_name": "fuel_pump", "aliases": ["gas_pump", "petrol_pump"]}, diff --git a/tools/setup/emoji/emoji_setup_utils.py b/tools/setup/emoji/emoji_setup_utils.py index 2baa139e72..95837aa3a7 100644 --- a/tools/setup/emoji/emoji_setup_utils.py +++ b/tools/setup/emoji/emoji_setup_utils.py @@ -3,6 +3,8 @@ from collections import defaultdict from typing import Any, Dict, List +from zerver.lib.emoji_utils import emoji_to_hex_codepoint, hex_codepoint_to_emoji, unqualify_emoji + # Emoji sets that we currently support. EMOJISETS = ["google", "twitter"] @@ -61,18 +63,12 @@ def emoji_names_for_picker(emoji_name_maps: Dict[str, Dict[str, Any]]) -> List[s def get_emoji_code(emoji_dict: Dict[str, Any]) -> str: - # Starting from version 4.0.0, `emoji_datasource` package has started to - # add an emoji presentation variation selector for certain emojis which - # have defined variation sequences. Since in informal environments(like - # texting and chat), it is more appropriate for an emoji to have a colorful - # display so until emoji characters have a text presentation selector, it - # should have a colorful display. Hence we can continue using emoji characters - # without appending emoji presentation selector. - # (http://unicode.org/reports/tr51/index.html#Presentation_Style) - # If `non_qualified` field is present and not None return it otherwise - # return `unified` field. - emoji_code = emoji_dict.get("non_qualified") or emoji_dict["unified"] - return emoji_code.lower() + # There is a `non_qualified` field on `emoji_dict` but it's + # inconsistently present, so we'll always use the unqualified + # emoji by unqualifying it ourselves. This gives us more consistent + # behaviour between emojis, and doesn't rely on the incomplete + # upstream package (https://github.com/iamcal/emoji-data/pull/217). + return emoji_to_hex_codepoint(unqualify_emoji(hex_codepoint_to_emoji(emoji_dict["unified"]))) # Returns a dict from categories to list of codepoints. The list of diff --git a/version.py b/version.py index fa8ef1ce22..48a7a6d6e1 100644 --- a/version.py +++ b/version.py @@ -48,4 +48,4 @@ API_FEATURE_LEVEL = 203 # historical commits sharing the same major version, in which case a # minor version bump suffices. -PROVISION_VERSION = (247, 4) +PROVISION_VERSION = (247, 5) diff --git a/web/src/markdown.js b/web/src/markdown.js index 89e3602765..f39f72669d 100644 --- a/web/src/markdown.js +++ b/web/src/markdown.js @@ -388,7 +388,21 @@ function make_emoji_span(codepoint, title, alt_text) { } function handleUnicodeEmoji({unicode_emoji, get_emoji_name}) { - const codepoint = unicode_emoji.codePointAt(0).toString(16); + // We want to avoid turning things like arrows (↔) and keycaps (numbers + // in boxes) into qualified emoji (images). + // More specifically, we skip anything with text in the second column of + // this table https://unicode.org/Public/emoji/1.0/emoji-data.txt + if (/^\P{Emoji_Presentation}\u20E3?$/u.test(unicode_emoji)) { + return unicode_emoji; + } + + // This unqualifies qualified emoji, which helps us make sure we + // can match both versions. + const unqualified_unicode_emoji = unicode_emoji.replace(/\uFE0F/, ""); + + const codepoint = [...unqualified_unicode_emoji] + .map((char) => char.codePointAt(0).toString(16).padStart(4, "0")) + .join("-"); const emoji_name = get_emoji_name(codepoint); if (emoji_name) { diff --git a/web/third/marked/lib/marked.js b/web/third/marked/lib/marked.js index f8eef60407..80d725a262 100644 --- a/web/third/marked/lib/marked.js +++ b/web/third/marked/lib/marked.js @@ -537,12 +537,12 @@ inline.breaks = merge({}, inline.gfm, { text: replace(inline.gfm.text)('{2,}', '*')() }); +// From https://unicode.org/reports/tr51/#EBNF_and_Regex. Keep this synced with POSSIBLE_EMOJI_RE. +const possible_emoji_regex = /^(\p{RI}\p{RI}|\p{Emoji}(?:\p{Emoji_Modifier}|\u{FE0F}\u{20E3}?|[\u{E0020}-\u{E007E}]+\u{E007F})?(?:\u{200D}(?:\p{RI}\p{RI}|\p{Emoji}(?:\p{Emoji_Modifier}|\u{FE0F}\u{20E3}?|[\u{E0020}-\u{E007E}]+\u{E007F})?))*)/u; + inline.zulip = merge({}, inline.breaks, { emoji: /^:([A-Za-z0-9_\-\+]+?):/, - unicodeemoji: RegExp('^(\ud83c[\udd00-\udfff]|\ud83d[\udc00-\ude4f]|' + - '\ud83d[\ude80-\udeff]|\ud83e[\udd00-\uddff]|' + - '[\u2000-\u206F]|[\u2300-\u27BF]|[\u2B00-\u2BFF]|' + - '[\u3000-\u303F]|[\u3200-\u32FF])'), + unicodeemoji: possible_emoji_regex, usermention: /^@(_?)(?:\*\*([^\*]+)\*\*)/, // Match potentially multi-word string between @** ** groupmention: /^@(_?)(?:\*([^\*]+)\*)/, // Match multi-word string between @* * stream_topic: /^#\*\*([^\*>]+)>([^\*]+)\*\*/, diff --git a/zerver/lib/emoji_utils.py b/zerver/lib/emoji_utils.py new file mode 100644 index 0000000000..3fd693bd50 --- /dev/null +++ b/zerver/lib/emoji_utils.py @@ -0,0 +1,25 @@ +# This file doesn't import from django so that we can use it in `build_emoji` + + +def unqualify_emoji(emoji: str) -> str: + # Starting from version 4.0.0, `emoji_datasource` package has started to + # add an emoji presentation variation selector for certain emojis which + # have defined variation sequences. The emoji presentation selector + # "qualifies" an emoji, and an "unqualified" version of an emoji does + # not have an emoji presentation selector. + # + # Since in informal environments(like texting and chat), it is more + # appropriate for an emoji to have a colorful display so until emoji + # characters have a text presentation selector, it should have a + # colorful display. Hence we can continue using emoji characters + # without appending emoji presentation selector. + # (http://unicode.org/reports/tr51/index.html#Presentation_Style) + return emoji.replace("\ufe0f", "") + + +def emoji_to_hex_codepoint(emoji: str) -> str: + return "-".join(f"{ord(c):04x}" for c in emoji) + + +def hex_codepoint_to_emoji(hex: str) -> str: + return "".join(chr(int(h, 16)) for h in hex.split("-")) diff --git a/zerver/lib/markdown/__init__.py b/zerver/lib/markdown/__init__.py index b5f0e37a66..21b6f9ac7c 100644 --- a/zerver/lib/markdown/__init__.py +++ b/zerver/lib/markdown/__init__.py @@ -25,6 +25,7 @@ from typing import ( TypedDict, TypeVar, Union, + cast, ) from urllib.parse import parse_qs, urlencode, urljoin, urlsplit from xml.etree.ElementTree import Element, SubElement @@ -40,6 +41,7 @@ import markdown.postprocessors import markdown.treeprocessors import markdown.util import re2 +import regex import requests import uri_template from django.conf import settings @@ -53,6 +55,7 @@ from zerver.lib import mention from zerver.lib.cache import cache_with_key from zerver.lib.camo import get_camo_url from zerver.lib.emoji import EMOTICON_RE, codepoint_to_name, name_to_codepoint, translate_emoticons +from zerver.lib.emoji_utils import emoji_to_hex_codepoint, unqualify_emoji from zerver.lib.exceptions import MarkdownRenderingError from zerver.lib.markdown import fenced_code from zerver.lib.markdown.fenced_code import FENCE_RE @@ -840,7 +843,7 @@ class InlineInterestingLinkProcessor(markdown.treeprocessors.Treeprocessor): This works by using the URLs, user_mentions and media data from the twitter API and searching for Unicode emojis in the text using - `UNICODE_EMOJI_RE`. + `POSSIBLE_EMOJI_RE`. The first step is finding the locations of the URLs, mentions, media and emoji in the text. For each match we build a dictionary with type, the start @@ -897,9 +900,9 @@ class InlineInterestingLinkProcessor(markdown.treeprocessors.Treeprocessor): for match in re.finditer(re.escape(short_url), text, re.IGNORECASE) ) # Build dicts for emojis - for match in re.finditer(UNICODE_EMOJI_RE, text, re.IGNORECASE): + for match in POSSIBLE_EMOJI_RE.finditer(text): orig_syntax = match.group("syntax") - codepoint = unicode_emoji_to_codepoint(orig_syntax) + codepoint = emoji_to_hex_codepoint(unqualify_emoji(orig_syntax)) if codepoint in codepoint_to_name: display_string = ":" + codepoint_to_name[codepoint] + ":" to_process.append( @@ -1342,54 +1345,27 @@ class Timestamp(markdown.inlinepatterns.Pattern): return time_element -# All of our emojis (excluding ZWJ sequences) belong to one of these Unicode blocks: -# \U0001f100-\U0001f1ff - Enclosed Alphanumeric Supplement -# \U0001f200-\U0001f2ff - Enclosed Ideographic Supplement -# \U0001f300-\U0001f5ff - Miscellaneous Symbols and Pictographs -# \U0001f600-\U0001f64f - Emoticons (Emoji) -# \U0001f680-\U0001f6ff - Transport and Map Symbols -# \U0001f7e0-\U0001f7eb - Coloured Geometric Shapes (NOTE: Not Unicode standard category name) -# \U0001f900-\U0001f9ff - Supplemental Symbols and Pictographs -# \u2000-\u206f - General Punctuation -# \u2300-\u23ff - Miscellaneous Technical -# \u2400-\u243f - Control Pictures -# \u2440-\u245f - Optical Character Recognition -# \u2460-\u24ff - Enclosed Alphanumerics -# \u2500-\u257f - Box Drawing -# \u2580-\u259f - Block Elements -# \u25a0-\u25ff - Geometric Shapes -# \u2600-\u26ff - Miscellaneous Symbols -# \u2700-\u27bf - Dingbats -# \u2900-\u297f - Supplemental Arrows-B -# \u2b00-\u2bff - Miscellaneous Symbols and Arrows -# \u3000-\u303f - CJK Symbols and Punctuation -# \u3200-\u32ff - Enclosed CJK Letters and Months -UNICODE_EMOJI_RE = ( - "(?P[" - "\U0001F100-\U0001F64F" - "\U0001F680-\U0001F6FF" - "\U0001F7E0-\U0001F7EB" - "\U0001F900-\U0001F9FF" - "\u2000-\u206F" - "\u2300-\u27BF" - "\u2900-\u297F" - "\u2B00-\u2BFF" - "\u3000-\u303F" - "\u3200-\u32FF" - "])" +# From https://unicode.org/reports/tr51/#EBNF_and_Regex. Keep this synced with `possible_emoji_regex`. +POSSIBLE_EMOJI_RE = regex.compile( + r"""(?P +\p{RI} \p{RI} +| \p{Emoji} + (?: \p{Emoji_Modifier} + | \uFE0F \u20E3? + | [\U000E0020-\U000E007E]+ \U000E007F + )? + (?: \u200D + (?: \p{RI} \p{RI} + | \p{Emoji} + (?: \p{Emoji_Modifier} + | \uFE0F \u20E3? + | [\U000E0020-\U000E007E]+ \U000E007F + )? + ) + )*) +""", + regex.VERBOSE, ) -# The equivalent JS regex is \ud83c[\udd00-\udfff]|\ud83d[\udc00-\ude4f]|\ud83d[\ude80-\udeff]| -# \ud83e[\udd00-\uddff]|[\u2000-\u206f]|[\u2300-\u27bf]|[\u2b00-\u2bff]|[\u3000-\u303f]| -# [\u3200-\u32ff]. See below comments for explanation. The JS regex is used by marked.js for -# frontend Unicode emoji processing. -# The JS regex \ud83c[\udd00-\udfff]|\ud83d[\udc00-\ude4f] represents U0001f100-\U0001f64f -# The JS regex \ud83d[\ude80-\udeff] represents \U0001f680-\U0001f6ff -# The JS regex \ud83e[\udd00-\uddff] represents \U0001f900-\U0001f9ff -# The JS regex [\u2000-\u206f] represents \u2000-\u206f -# The JS regex [\u2300-\u27bf] represents \u2300-\u27bf -# Similarly other JS regexes can be mapped to the respective Unicode blocks. -# For more information, please refer to the following article: -# http://crocodillon.com/blog/parsing-emoji-unicode-in-javascript def make_emoji(codepoint: str, display_string: str) -> Element: @@ -1413,11 +1389,6 @@ def make_realm_emoji(src: str, display_string: str) -> Element: return elt -def unicode_emoji_to_codepoint(unicode_emoji: str) -> str: - # Unicode codepoints are minimum of length 4, padded with zeroes - return f"{ord(unicode_emoji):04x}" - - class EmoticonTranslation(markdown.inlinepatterns.Pattern): """Translates emoticons like `:)` into emoji like `:smile:`.""" @@ -1436,15 +1407,28 @@ class EmoticonTranslation(markdown.inlinepatterns.Pattern): return make_emoji(name_to_codepoint[name], translated) -class UnicodeEmoji(markdown.inlinepatterns.Pattern): - def handleMatch(self, match: Match[str]) -> Optional[Element]: +TEXT_PRESENTATION_RE = regex.compile(r"\P{Emoji_Presentation}\u20E3?") + + +class UnicodeEmoji(CompiledInlineProcessor): + def handleMatch( # type: ignore[override] # https://github.com/python/mypy/issues/10197 + self, match: Match[str], data: str + ) -> Union[Tuple[None, None, None], Tuple[Element, int, int]]: orig_syntax = match.group("syntax") - codepoint = unicode_emoji_to_codepoint(orig_syntax) + + # We want to avoid turning things like arrows (↔) and keycaps (numbers + # in boxes) into qualified emoji. + # More specifically, we skip anything with text in the second column of + # this table https://unicode.org/Public/emoji/1.0/emoji-data.txt + if TEXT_PRESENTATION_RE.fullmatch(orig_syntax): + return None, None, None + + codepoint = emoji_to_hex_codepoint(unqualify_emoji(orig_syntax)) if codepoint in codepoint_to_name: display_string = ":" + codepoint_to_name[codepoint] + ":" - return make_emoji(codepoint, display_string) + return make_emoji(codepoint, display_string), match.start(), match.end() else: - return None + return None, None, None class Emoji(markdown.inlinepatterns.Pattern): @@ -2224,7 +2208,7 @@ class ZulipMarkdown(markdown.Markdown): reg.register(Emoji(EMOJI_REGEX, self), "emoji", 15) reg.register(EmoticonTranslation(EMOTICON_RE, self), "translate_emoticons", 10) # We get priority 5 from 'nl2br' extension - reg.register(UnicodeEmoji(UNICODE_EMOJI_RE), "unicodeemoji", 0) + reg.register(UnicodeEmoji(cast(Pattern[str], POSSIBLE_EMOJI_RE), self), "unicodeemoji", 0) return reg def register_linkifiers(self, registry: markdown.util.Registry) -> markdown.util.Registry: diff --git a/zerver/tests/fixtures/markdown_test_cases.json b/zerver/tests/fixtures/markdown_test_cases.json index edee732e85..5be0ea6dcc 100644 --- a/zerver/tests/fixtures/markdown_test_cases.json +++ b/zerver/tests/fixtures/markdown_test_cases.json @@ -588,6 +588,41 @@ "input": ":poop:", "expected_output": "

:poop:

" }, + { + "name": "emoji_sequence_one", + "input": "🤷‍♀ī¸", + "expected_output": "

:woman_shrugging:

" + }, + { + "name": "emoji_sequence_two", + "input": "👁‍🗨 #ī¸âƒŖ", + "expected_output": "

:eye_in_speech_bubble: :hash:

" + }, + { + "name": "unrecognized_emoji_sequence_one", + "input": "pheonix bird from 15.1: đŸĻ‍đŸ”Ĩ", + "expected_output": "

pheonix bird from 15.1: đŸĻ‍đŸ”Ĩ

" + }, + { + "name": "unrecognized_emoji_sequence_two", + "input": "lime from 15.1: 🍋‍🟩", + "expected_output": "

lime from 15.1: 🍋‍🟩

" + }, + { + "name": "unrecognized_emoji_sequence_three", + "input": "normal lemon 🍋, purple lemon? 🍋‍đŸŸĒ", + "expected_output": "

normal lemon :lemon:, purple lemon? 🍋‍đŸŸĒ

" + }, + { + "name": "unrecognized_emoji_sequence_four", + "input": "medium-skinned person riding scooter back and forth: 🧑đŸŊ‍🛴‍↩ī¸", + "expected_output": "

medium-skinned person riding scooter back and forth: 🧑đŸŊ‍🛴‍↩ī¸

" + }, + { + "name": "unrecognized_emoji_sequence_skin_tone", + "input": "man in manual wheelchair: dark skin tone 👨đŸŋ‍đŸĻŊ", + "expected_output": "

man in manual wheelchair: dark skin tone 👨đŸŋ‍đŸĻŊ

" + }, { "name": "emojis_without_space", "input": ":cat:hello:dog::rabbit:", @@ -642,7 +677,7 @@ }, { "name": "miscellaneous_symbols_and_arrows", - "input": "Black upward arrow \u2b06", + "input": "Black upward arrow \u2b06\ufe0f", "expected_output":"

Black upward arrow :up:<\/p>" }, { diff --git a/zerver/tests/test_markdown.py b/zerver/tests/test_markdown.py index 6453300506..979a29f56b 100644 --- a/zerver/tests/test_markdown.py +++ b/zerver/tests/test_markdown.py @@ -22,9 +22,11 @@ from zerver.actions.users import change_user_is_active from zerver.lib.alert_words import get_alert_word_automaton from zerver.lib.camo import get_camo_url from zerver.lib.create_user import create_user -from zerver.lib.emoji import get_emoji_url +from zerver.lib.emoji import codepoint_to_name, get_emoji_url +from zerver.lib.emoji_utils import hex_codepoint_to_emoji from zerver.lib.exceptions import JsonableError, MarkdownRenderingError from zerver.lib.markdown import ( + POSSIBLE_EMOJI_RE, InlineInterestingLinkProcessor, MarkdownListPreprocessor, MessageRenderingResult, @@ -3156,3 +3158,17 @@ class MarkdownErrorTests(ZulipTestCase): result = processor.run(markdown_input) self.assertEqual(result, expected) + + +class MarkdownEmojiTest(ZulipTestCase): + def test_all_emoji_match_regex(self) -> None: + non_matching_emoji = [ + emoji + for codepoint in codepoint_to_name + if not POSSIBLE_EMOJI_RE.fullmatch(emoji := hex_codepoint_to_emoji(codepoint)) + ] + self.assertEqual( + non_matching_emoji, + # unqualified numbers in boxes shouldn't be converted to emoji images, so this is fine + ["#âƒŖ", "*âƒŖ", "0âƒŖ", "1âƒŖ", "2âƒŖ", "3âƒŖ", "4âƒŖ", "5âƒŖ", "6âƒŖ", "7âƒŖ", "8âƒŖ", "9âƒŖ"], + )