From 0289beb784217d8a24f31a91d5c3b72e3a7af691 Mon Sep 17 00:00:00 2001 From: evykassirer Date: Thu, 10 Aug 2023 12:00:45 -0700 Subject: [PATCH] emoji: Match emoji sequences in markdown. Fixes #11767. Previously multi-character emoji sequences weren't matched in the emoji regex, so we'd convert the characters to separate images, breaking the intended display. This change allows us to match the full emoji sequence, and therefore show the correct image. --- requirements/common.in | 1 + requirements/dev.txt | 8 +- requirements/mypy.in | 1 + requirements/prod.txt | 4 +- tools/setup/emoji/emoji_names.py | 20 ++-- tools/setup/emoji/emoji_setup_utils.py | 20 ++-- version.py | 2 +- web/src/markdown.js | 16 ++- web/third/marked/lib/marked.js | 8 +- zerver/lib/emoji_utils.py | 25 +++++ zerver/lib/markdown/__init__.py | 106 ++++++++---------- .../tests/fixtures/markdown_test_cases.json | 37 +++++- zerver/tests/test_markdown.py | 18 ++- 13 files changed, 173 insertions(+), 93 deletions(-) create mode 100644 zerver/lib/emoji_utils.py diff --git a/requirements/common.in b/requirements/common.in index 0c6e0d1870..8d62def08f 100644 --- a/requirements/common.in +++ b/requirements/common.in @@ -17,6 +17,7 @@ importlib-metadata ; python_version < "3.10" # for Markdown Pygments jsx-lexer uri-template +regex # Needed for manage.py ipython diff --git a/requirements/dev.txt b/requirements/dev.txt index 04aaaa2706..0fb43ce5de 100644 --- a/requirements/dev.txt +++ b/requirements/dev.txt @@ -2230,7 +2230,9 @@ regex==2023.8.8 \ --hash=sha256:f2181c20ef18747d5f4a7ea513e09ea03bdd50884a11ce46066bb90fe4213675 \ --hash=sha256:f2200e00b62568cfd920127782c61bc1c546062a879cdc741cfcc6976668dfcf \ --hash=sha256:fcbdc5f2b0f1cd0f6a56cdb46fe41d2cce1e644e3b68832f3eeebc5fb0f7712e - # via talon-core + # via + # -r requirements/common.in + # talon-core requests[security]==2.31.0 \ --hash=sha256:58cd2187c01e70e6e26505bca751777aa9f2ee0b7f4300988b709f44e013003f \ 
--hash=sha256:942c5a758f98d790eaed1a29cb6eefc7ffb0d1cf7af05c3d2791656dbd6ad1e1 @@ -2834,6 +2836,10 @@ types-redis==4.6.0.4 \ --hash=sha256:03a1e1659ae4d8f6543bc2b8b11e94b1ee53937f313b1dc6f67dc7bde7d38fe0 \ --hash=sha256:c475a9d3cf73dd696c3887d30644323fc56f5e00af96151035b3b5b52875c9b3 # via -r requirements/mypy.in +types-regex==2023.6.3.1 \ + --hash=sha256:21880e584e2bada8656abeeb3131287a89dcb215f24f1f5b1503eb9bca37f759 \ + --hash=sha256:f385191206021e48db0f452fe9479812710daf07058c8f6972cfb2f6202df136 + # via -r requirements/mypy.in types-requests==2.31.0.2 \ --hash=sha256:56d181c85b5925cbc59f4489a57e72a8b2166f18273fd8ba7b6fe0c0b986f12a \ --hash=sha256:6aa3f7faf0ea52d728bb18c0a0d1522d9bfd8c72d26ff6f61bfc3d06a411cf40 diff --git a/requirements/mypy.in b/requirements/mypy.in index 484a56f200..f8e38f1883 100644 --- a/requirements/mypy.in +++ b/requirements/mypy.in @@ -21,6 +21,7 @@ types-Pygments types-python-dateutil types-PyYAML types-redis +types-regex types-requests types-stripe types-zxcvbn diff --git a/requirements/prod.txt b/requirements/prod.txt index 51477a0d25..665c2453b5 100644 --- a/requirements/prod.txt +++ b/requirements/prod.txt @@ -1707,7 +1707,9 @@ regex==2023.8.8 \ --hash=sha256:f2181c20ef18747d5f4a7ea513e09ea03bdd50884a11ce46066bb90fe4213675 \ --hash=sha256:f2200e00b62568cfd920127782c61bc1c546062a879cdc741cfcc6976668dfcf \ --hash=sha256:fcbdc5f2b0f1cd0f6a56cdb46fe41d2cce1e644e3b68832f3eeebc5fb0f7712e - # via talon-core + # via + # -r requirements/common.in + # talon-core requests[security]==2.31.0 \ --hash=sha256:58cd2187c01e70e6e26505bca751777aa9f2ee0b7f4300988b709f44e013003f \ --hash=sha256:942c5a758f98d790eaed1a29cb6eefc7ffb0d1cf7af05c3d2791656dbd6ad1e1 diff --git a/tools/setup/emoji/emoji_names.py b/tools/setup/emoji/emoji_names.py index 0bd7f41770..ccac4ad382 100644 --- a/tools/setup/emoji/emoji_names.py +++ b/tools/setup/emoji/emoji_names.py @@ -524,11 +524,11 @@ EMOJI_NAME_MAPS: Dict[str, Dict[str, Any]] = { "1f3ca-200d-2640": 
{"canonical_name": "woman_swimming", "aliases": []}, "1f3ca-200d-2642": {"canonical_name": "man_swimming", "aliases": []}, "1f3ca": {"canonical_name": "swim", "aliases": []}, - "1f3cb-fe0f-200d-2640-fe0f": {"canonical_name": "woman_lifting_weights", "aliases": []}, - "1f3cb-fe0f-200d-2642-fe0f": {"canonical_name": "man_lifting_weights", "aliases": []}, + "1f3cb-200d-2640": {"canonical_name": "woman_lifting_weights", "aliases": []}, + "1f3cb-200d-2642": {"canonical_name": "man_lifting_weights", "aliases": []}, "1f3cb": {"canonical_name": "lift", "aliases": ["work_out", "weight_lift", "gym"]}, - "1f3cc-fe0f-200d-2640-fe0f": {"canonical_name": "woman_golfing", "aliases": []}, - "1f3cc-fe0f-200d-2642-fe0f": {"canonical_name": "man_golfing", "aliases": []}, + "1f3cc-200d-2640": {"canonical_name": "woman_golfing", "aliases": []}, + "1f3cc-200d-2642": {"canonical_name": "man_golfing", "aliases": []}, "1f3cc": {"canonical_name": "golf", "aliases": []}, "1f3cd": {"canonical_name": "motorcycle", "aliases": []}, "1f3ce": {"canonical_name": "racecar", "aliases": []}, @@ -567,7 +567,7 @@ EMOJI_NAME_MAPS: Dict[str, Dict[str, Any]] = { "1f3ef": {"canonical_name": "shiro", "aliases": []}, "1f3f0": {"canonical_name": "castle", "aliases": []}, "1f3f3-200d-1f308": {"canonical_name": "rainbow_flag", "aliases": []}, - "1f3f3-fe0f-200d-26a7-fe0f": {"canonical_name": "transgender_flag", "aliases": []}, + "1f3f3-200d-26a7": {"canonical_name": "transgender_flag", "aliases": []}, "1f3f3": {"canonical_name": "white_flag", "aliases": ["surrender"]}, "1f3f4-200d-2620": {"canonical_name": "pirate_flag", "aliases": ["jolly_roger", "plunder"]}, "1f3f4-e0067-e0062-e0065-e006e-e0067-e007f": {"canonical_name": "flag_england", "aliases": []}, @@ -648,7 +648,7 @@ EMOJI_NAME_MAPS: Dict[str, Dict[str, Any]] = { "1f43e": {"canonical_name": "paw_prints", "aliases": ["paws"]}, "1f43f": {"canonical_name": "chipmunk", "aliases": []}, "1f440": {"canonical_name": "eyes", "aliases": ["looking"]}, - 
"1f441-fe0f-200d-1f5e8-fe0f": { + "1f441-200d-1f5e8": { "canonical_name": "eye_in_speech_bubble", "aliases": ["speech", "witness"], }, @@ -1056,8 +1056,8 @@ EMOJI_NAME_MAPS: Dict[str, Dict[str, Any]] = { "1f570": {"canonical_name": "mantelpiece_clock", "aliases": []}, "1f573": {"canonical_name": "hole", "aliases": []}, "1f574": {"canonical_name": "levitating", "aliases": ["hover"]}, - "1f575-fe0f-200d-2640-fe0f": {"canonical_name": "woman_detective", "aliases": []}, - "1f575-fe0f-200d-2642-fe0f": {"canonical_name": "man_detective", "aliases": []}, + "1f575-200d-2640": {"canonical_name": "woman_detective", "aliases": []}, + "1f575-200d-2642": {"canonical_name": "man_detective", "aliases": []}, "1f575": {"canonical_name": "detective", "aliases": ["spy", "sleuth", "agent", "sneaky"]}, "1f576": {"canonical_name": "dark_sunglasses", "aliases": []}, "1f577": {"canonical_name": "spider", "aliases": []}, @@ -2022,8 +2022,8 @@ EMOJI_NAME_MAPS: Dict[str, Dict[str, Any]] = { "26f5": {"canonical_name": "boat", "aliases": ["sailboat"]}, "26f7": {"canonical_name": "skier", "aliases": []}, "26f8": {"canonical_name": "ice_skate", "aliases": []}, - "26f9-fe0f-200d-2640-fe0f": {"canonical_name": "woman_bouncing_ball", "aliases": []}, - "26f9-fe0f-200d-2642-fe0f": {"canonical_name": "man_bouncing_ball", "aliases": []}, + "26f9-200d-2640": {"canonical_name": "woman_bouncing_ball", "aliases": []}, + "26f9-200d-2642": {"canonical_name": "man_bouncing_ball", "aliases": []}, "26f9": {"canonical_name": "ball", "aliases": ["sports"]}, "26fa": {"canonical_name": "tent", "aliases": ["camping"]}, "26fd": {"canonical_name": "fuel_pump", "aliases": ["gas_pump", "petrol_pump"]}, diff --git a/tools/setup/emoji/emoji_setup_utils.py b/tools/setup/emoji/emoji_setup_utils.py index 2baa139e72..95837aa3a7 100644 --- a/tools/setup/emoji/emoji_setup_utils.py +++ b/tools/setup/emoji/emoji_setup_utils.py @@ -3,6 +3,8 @@ from collections import defaultdict from typing import Any, Dict, List +from 
zerver.lib.emoji_utils import emoji_to_hex_codepoint, hex_codepoint_to_emoji, unqualify_emoji + # Emoji sets that we currently support. EMOJISETS = ["google", "twitter"] @@ -61,18 +63,12 @@ def emoji_names_for_picker(emoji_name_maps: Dict[str, Dict[str, Any]]) -> List[s def get_emoji_code(emoji_dict: Dict[str, Any]) -> str: - # Starting from version 4.0.0, `emoji_datasource` package has started to - # add an emoji presentation variation selector for certain emojis which - # have defined variation sequences. Since in informal environments(like - # texting and chat), it is more appropriate for an emoji to have a colorful - # display so until emoji characters have a text presentation selector, it - # should have a colorful display. Hence we can continue using emoji characters - # without appending emoji presentation selector. - # (http://unicode.org/reports/tr51/index.html#Presentation_Style) - # If `non_qualified` field is present and not None return it otherwise - # return `unified` field. - emoji_code = emoji_dict.get("non_qualified") or emoji_dict["unified"] - return emoji_code.lower() + # There is a `non_qualified` field on `emoji_dict` but it's + # inconsistently present, so we'll always use the unqualified + # emoji by unqualifying it ourselves. This gives us more consistent + # behaviour between emojis, and doesn't rely on the incomplete + # upstream package (https://github.com/iamcal/emoji-data/pull/217). + return emoji_to_hex_codepoint(unqualify_emoji(hex_codepoint_to_emoji(emoji_dict["unified"]))) # Returns a dict from categories to list of codepoints. The list of diff --git a/version.py b/version.py index fa8ef1ce22..48a7a6d6e1 100644 --- a/version.py +++ b/version.py @@ -48,4 +48,4 @@ API_FEATURE_LEVEL = 203 # historical commits sharing the same major version, in which case a # minor version bump suffices. 
-PROVISION_VERSION = (247, 4) +PROVISION_VERSION = (247, 5) diff --git a/web/src/markdown.js b/web/src/markdown.js index 89e3602765..f39f72669d 100644 --- a/web/src/markdown.js +++ b/web/src/markdown.js @@ -388,7 +388,21 @@ function make_emoji_span(codepoint, title, alt_text) { } function handleUnicodeEmoji({unicode_emoji, get_emoji_name}) { - const codepoint = unicode_emoji.codePointAt(0).toString(16); + // We want to avoid turning things like arrows (↔) and keycaps (numbers + // in boxes) into qualified emoji (images). + // More specifically, we skip anything with text in the second column of + // this table https://unicode.org/Public/emoji/1.0/emoji-data.txt + if (/^\P{Emoji_Presentation}\u20E3?$/u.test(unicode_emoji)) { + return unicode_emoji; + } + + // This unqualifies qualified emoji, which helps us make sure we + // can match both versions. + const unqualified_unicode_emoji = unicode_emoji.replace(/\uFE0F/, ""); + + const codepoint = [...unqualified_unicode_emoji] + .map((char) => char.codePointAt(0).toString(16).padStart(4, "0")) + .join("-"); const emoji_name = get_emoji_name(codepoint); if (emoji_name) { diff --git a/web/third/marked/lib/marked.js b/web/third/marked/lib/marked.js index f8eef60407..80d725a262 100644 --- a/web/third/marked/lib/marked.js +++ b/web/third/marked/lib/marked.js @@ -537,12 +537,12 @@ inline.breaks = merge({}, inline.gfm, { text: replace(inline.gfm.text)('{2,}', '*')() }); +// From https://unicode.org/reports/tr51/#EBNF_and_Regex. Keep this synced with POSSIBLE_EMOJI_RE.
+const possible_emoji_regex = /^(\p{RI}\p{RI}|\p{Emoji}(?:\p{Emoji_Modifier}|\u{FE0F}\u{20E3}?|[\u{E0020}-\u{E007E}]+\u{E007F})?(?:\u{200D}(?:\p{RI}\p{RI}|\p{Emoji}(?:\p{Emoji_Modifier}|\u{FE0F}\u{20E3}?|[\u{E0020}-\u{E007E}]+\u{E007F})?))*)/u; + inline.zulip = merge({}, inline.breaks, { emoji: /^:([A-Za-z0-9_\-\+]+?):/, - unicodeemoji: RegExp('^(\ud83c[\udd00-\udfff]|\ud83d[\udc00-\ude4f]|' + - '\ud83d[\ude80-\udeff]|\ud83e[\udd00-\uddff]|' + - '[\u2000-\u206F]|[\u2300-\u27BF]|[\u2B00-\u2BFF]|' + - '[\u3000-\u303F]|[\u3200-\u32FF])'), + unicodeemoji: possible_emoji_regex, usermention: /^@(_?)(?:\*\*([^\*]+)\*\*)/, // Match potentially multi-word string between @** ** groupmention: /^@(_?)(?:\*([^\*]+)\*)/, // Match multi-word string between @* * stream_topic: /^#\*\*([^\*>]+)>([^\*]+)\*\*/, diff --git a/zerver/lib/emoji_utils.py b/zerver/lib/emoji_utils.py new file mode 100644 index 0000000000..3fd693bd50 --- /dev/null +++ b/zerver/lib/emoji_utils.py @@ -0,0 +1,25 @@ +# This file doesn't import from django so that we can use it in `build_emoji` + + +def unqualify_emoji(emoji: str) -> str: + # Starting from version 4.0.0, `emoji_datasource` package has started to + # add an emoji presentation variation selector for certain emojis which + # have defined variation sequences. The emoji presentation selector + # "qualifies" an emoji, and an "unqualified" version of an emoji does + # not have an emoji presentation selector. + # + # Since in informal environments(like texting and chat), it is more + # appropriate for an emoji to have a colorful display so until emoji + # characters have a text presentation selector, it should have a + # colorful display. Hence we can continue using emoji characters + # without appending emoji presentation selector. 
+ # (http://unicode.org/reports/tr51/index.html#Presentation_Style) + return emoji.replace("\ufe0f", "") + + +def emoji_to_hex_codepoint(emoji: str) -> str: + return "-".join(f"{ord(c):04x}" for c in emoji) + + +def hex_codepoint_to_emoji(hex: str) -> str: + return "".join(chr(int(h, 16)) for h in hex.split("-")) diff --git a/zerver/lib/markdown/__init__.py b/zerver/lib/markdown/__init__.py index b5f0e37a66..21b6f9ac7c 100644 --- a/zerver/lib/markdown/__init__.py +++ b/zerver/lib/markdown/__init__.py @@ -25,6 +25,7 @@ from typing import ( TypedDict, TypeVar, Union, + cast, ) from urllib.parse import parse_qs, urlencode, urljoin, urlsplit from xml.etree.ElementTree import Element, SubElement @@ -40,6 +41,7 @@ import markdown.postprocessors import markdown.treeprocessors import markdown.util import re2 +import regex import requests import uri_template from django.conf import settings @@ -53,6 +55,7 @@ from zerver.lib import mention from zerver.lib.cache import cache_with_key from zerver.lib.camo import get_camo_url from zerver.lib.emoji import EMOTICON_RE, codepoint_to_name, name_to_codepoint, translate_emoticons +from zerver.lib.emoji_utils import emoji_to_hex_codepoint, unqualify_emoji from zerver.lib.exceptions import MarkdownRenderingError from zerver.lib.markdown import fenced_code from zerver.lib.markdown.fenced_code import FENCE_RE @@ -840,7 +843,7 @@ class InlineInterestingLinkProcessor(markdown.treeprocessors.Treeprocessor): This works by using the URLs, user_mentions and media data from the twitter API and searching for Unicode emojis in the text using - `UNICODE_EMOJI_RE`. + `POSSIBLE_EMOJI_RE`. The first step is finding the locations of the URLs, mentions, media and emoji in the text. 
For each match we build a dictionary with type, the start @@ -897,9 +900,9 @@ class InlineInterestingLinkProcessor(markdown.treeprocessors.Treeprocessor): for match in re.finditer(re.escape(short_url), text, re.IGNORECASE) ) # Build dicts for emojis - for match in re.finditer(UNICODE_EMOJI_RE, text, re.IGNORECASE): + for match in POSSIBLE_EMOJI_RE.finditer(text): orig_syntax = match.group("syntax") - codepoint = unicode_emoji_to_codepoint(orig_syntax) + codepoint = emoji_to_hex_codepoint(unqualify_emoji(orig_syntax)) if codepoint in codepoint_to_name: display_string = ":" + codepoint_to_name[codepoint] + ":" to_process.append( @@ -1342,54 +1345,27 @@ class Timestamp(markdown.inlinepatterns.Pattern): return time_element -# All of our emojis (excluding ZWJ sequences) belong to one of these Unicode blocks: -# \U0001f100-\U0001f1ff - Enclosed Alphanumeric Supplement -# \U0001f200-\U0001f2ff - Enclosed Ideographic Supplement -# \U0001f300-\U0001f5ff - Miscellaneous Symbols and Pictographs -# \U0001f600-\U0001f64f - Emoticons (Emoji) -# \U0001f680-\U0001f6ff - Transport and Map Symbols -# \U0001f7e0-\U0001f7eb - Coloured Geometric Shapes (NOTE: Not Unicode standard category name) -# \U0001f900-\U0001f9ff - Supplemental Symbols and Pictographs -# \u2000-\u206f - General Punctuation -# \u2300-\u23ff - Miscellaneous Technical -# \u2400-\u243f - Control Pictures -# \u2440-\u245f - Optical Character Recognition -# \u2460-\u24ff - Enclosed Alphanumerics -# \u2500-\u257f - Box Drawing -# \u2580-\u259f - Block Elements -# \u25a0-\u25ff - Geometric Shapes -# \u2600-\u26ff - Miscellaneous Symbols -# \u2700-\u27bf - Dingbats -# \u2900-\u297f - Supplemental Arrows-B -# \u2b00-\u2bff - Miscellaneous Symbols and Arrows -# \u3000-\u303f - CJK Symbols and Punctuation -# \u3200-\u32ff - Enclosed CJK Letters and Months -UNICODE_EMOJI_RE = ( - "(?P<syntax>[" - "\U0001F100-\U0001F64F" - "\U0001F680-\U0001F6FF" - "\U0001F7E0-\U0001F7EB" - "\U0001F900-\U0001F9FF" - "\u2000-\u206F" - "\u2300-\u27BF" -
"\u2900-\u297F" - "\u2B00-\u2BFF" - "\u3000-\u303F" - "\u3200-\u32FF" - "])" +# From https://unicode.org/reports/tr51/#EBNF_and_Regex. Keep this synced with `possible_emoji_regex`. +POSSIBLE_EMOJI_RE = regex.compile( + r"""(?P<syntax> +\p{RI} \p{RI} +| \p{Emoji} + (?: \p{Emoji_Modifier} + | \uFE0F \u20E3? + | [\U000E0020-\U000E007E]+ \U000E007F + )? + (?: \u200D + (?: \p{RI} \p{RI} + | \p{Emoji} + (?: \p{Emoji_Modifier} + | \uFE0F \u20E3? + | [\U000E0020-\U000E007E]+ \U000E007F + )? + ) + )*) +""", + regex.VERBOSE, ) -# The equivalent JS regex is \ud83c[\udd00-\udfff]|\ud83d[\udc00-\ude4f]|\ud83d[\ude80-\udeff]| -# \ud83e[\udd00-\uddff]|[\u2000-\u206f]|[\u2300-\u27bf]|[\u2b00-\u2bff]|[\u3000-\u303f]| -# [\u3200-\u32ff]. See below comments for explanation. The JS regex is used by marked.js for -# frontend Unicode emoji processing. -# The JS regex \ud83c[\udd00-\udfff]|\ud83d[\udc00-\ude4f] represents U0001f100-\U0001f64f -# The JS regex \ud83d[\ude80-\udeff] represents \U0001f680-\U0001f6ff -# The JS regex \ud83e[\udd00-\uddff] represents \U0001f900-\U0001f9ff -# The JS regex [\u2000-\u206f] represents \u2000-\u206f -# The JS regex [\u2300-\u27bf] represents \u2300-\u27bf -# Similarly other JS regexes can be mapped to the respective Unicode blocks.
-# For more information, please refer to the following article: -# http://crocodillon.com/blog/parsing-emoji-unicode-in-javascript def make_emoji(codepoint: str, display_string: str) -> Element: @@ -1413,11 +1389,6 @@ def make_realm_emoji(src: str, display_string: str) -> Element: return elt -def unicode_emoji_to_codepoint(unicode_emoji: str) -> str: - # Unicode codepoints are minimum of length 4, padded with zeroes - return f"{ord(unicode_emoji):04x}" - - class EmoticonTranslation(markdown.inlinepatterns.Pattern): """Translates emoticons like `:)` into emoji like `:smile:`.""" @@ -1436,15 +1407,28 @@ class EmoticonTranslation(markdown.inlinepatterns.Pattern): return make_emoji(name_to_codepoint[name], translated) -class UnicodeEmoji(markdown.inlinepatterns.Pattern): - def handleMatch(self, match: Match[str]) -> Optional[Element]: +TEXT_PRESENTATION_RE = regex.compile(r"\P{Emoji_Presentation}\u20E3?") + + +class UnicodeEmoji(CompiledInlineProcessor): + def handleMatch( # type: ignore[override] # https://github.com/python/mypy/issues/10197 + self, match: Match[str], data: str + ) -> Union[Tuple[None, None, None], Tuple[Element, int, int]]: orig_syntax = match.group("syntax") - codepoint = unicode_emoji_to_codepoint(orig_syntax) + + # We want to avoid turning things like arrows (↔) and keycaps (numbers + # in boxes) into qualified emoji.
+ # More specifically, we skip anything with text in the second column of + # this table https://unicode.org/Public/emoji/1.0/emoji-data.txt + if TEXT_PRESENTATION_RE.fullmatch(orig_syntax): + return None, None, None + + codepoint = emoji_to_hex_codepoint(unqualify_emoji(orig_syntax)) if codepoint in codepoint_to_name: display_string = ":" + codepoint_to_name[codepoint] + ":" - return make_emoji(codepoint, display_string) + return make_emoji(codepoint, display_string), match.start(), match.end() else: - return None + return None, None, None class Emoji(markdown.inlinepatterns.Pattern): @@ -2224,7 +2208,7 @@ class ZulipMarkdown(markdown.Markdown): reg.register(Emoji(EMOJI_REGEX, self), "emoji", 15) reg.register(EmoticonTranslation(EMOTICON_RE, self), "translate_emoticons", 10) # We get priority 5 from 'nl2br' extension - reg.register(UnicodeEmoji(UNICODE_EMOJI_RE), "unicodeemoji", 0) + reg.register(UnicodeEmoji(cast(Pattern[str], POSSIBLE_EMOJI_RE), self), "unicodeemoji", 0) return reg def register_linkifiers(self, registry: markdown.util.Registry) -> markdown.util.Registry: diff --git a/zerver/tests/fixtures/markdown_test_cases.json b/zerver/tests/fixtures/markdown_test_cases.json index edee732e85..5be0ea6dcc 100644 --- a/zerver/tests/fixtures/markdown_test_cases.json +++ b/zerver/tests/fixtures/markdown_test_cases.json @@ -588,6 +588,41 @@ "input": ":poop:", "expected_output": "

:poop:

" }, + { + "name": "emoji_sequence_one", + "input": "🤷‍♀️", + "expected_output": "

:woman_shrugging:

" }, + { + "name": "emoji_sequence_two", + "input": "👁‍🗨 #️⃣", + "expected_output": "

:eye_in_speech_bubble: :hash:

" }, + { + "name": "unrecognized_emoji_sequence_one", + "input": "pheonix bird from 15.1: 🐦‍🔥", + "expected_output": "

pheonix bird from 15.1: 🐦‍🔥

" }, + { + "name": "unrecognized_emoji_sequence_two", + "input": "lime from 15.1: 🍋‍🟩", + "expected_output": "

lime from 15.1: 🍋‍🟩

" }, + { + "name": "unrecognized_emoji_sequence_three", + "input": "normal lemon 🍋, purple lemon? 🍋‍🟪", + "expected_output": "

normal lemon :lemon:, purple lemon? 🍋‍🟪

" }, + { + "name": "unrecognized_emoji_sequence_four", + "input": "medium-skinned person riding scooter back and forth: 🧑🏽‍🛴‍↩️", + "expected_output": "

medium-skinned person riding scooter back and forth: 🧑🏽‍🛴‍↩️

" }, + { + "name": "unrecognized_emoji_sequence_skin_tone", + "input": "man in manual wheelchair: dark skin tone 👨🏿‍🦽", + "expected_output": "

man in manual wheelchair: dark skin tone 👨🏿‍🦽

" + }, { "name": "emojis_without_space", "input": ":cat:hello:dog::rabbit:", @@ -642,7 +677,7 @@ }, { "name": "miscellaneous_symbols_and_arrows", - "input": "Black upward arrow \u2b06", + "input": "Black upward arrow \u2b06\ufe0f", "expected_output":"

Black upward arrow :up:<\/p>" }, { diff --git a/zerver/tests/test_markdown.py b/zerver/tests/test_markdown.py index 6453300506..979a29f56b 100644 --- a/zerver/tests/test_markdown.py +++ b/zerver/tests/test_markdown.py @@ -22,9 +22,11 @@ from zerver.actions.users import change_user_is_active from zerver.lib.alert_words import get_alert_word_automaton from zerver.lib.camo import get_camo_url from zerver.lib.create_user import create_user -from zerver.lib.emoji import get_emoji_url +from zerver.lib.emoji import codepoint_to_name, get_emoji_url +from zerver.lib.emoji_utils import hex_codepoint_to_emoji from zerver.lib.exceptions import JsonableError, MarkdownRenderingError from zerver.lib.markdown import ( + POSSIBLE_EMOJI_RE, InlineInterestingLinkProcessor, MarkdownListPreprocessor, MessageRenderingResult, @@ -3156,3 +3158,17 @@ class MarkdownErrorTests(ZulipTestCase): result = processor.run(markdown_input) self.assertEqual(result, expected) + + +class MarkdownEmojiTest(ZulipTestCase): + def test_all_emoji_match_regex(self) -> None: + non_matching_emoji = [ + emoji + for codepoint in codepoint_to_name + if not POSSIBLE_EMOJI_RE.fullmatch(emoji := hex_codepoint_to_emoji(codepoint)) + ] + self.assertEqual( + non_matching_emoji, + # unqualified numbers in boxes shouldn't be converted to emoji images, so this is fine + ["#⃣", "*⃣", "0⃣", "1⃣", "2⃣", "3⃣", "4⃣", "5⃣", "6⃣", "7⃣", "8⃣", "9⃣"], + )