emoji: Match emoji sequences in markdown.

Fixes #11767. Previously multi-character emoji sequences weren't matched in the emoji regex, so we'd convert the characters to separate images, breaking the intended display. This change allows us to match the full emoji sequence, and therefore show the correct image.
2023-08-10 12:00:45 -07:00 · 2023-08-10 12:00:45 -07:00 · 0289beb784
parent 78f0dca269
commit 0289beb784
13 changed files with 173 additions and 93 deletions
--- a/requirements/common.in
+++ b/requirements/common.in
@ -17,6 +17,7 @@ importlib-metadata ; python_version < "3.10"  # for Markdown
 Pygments
 jsx-lexer
 uri-template
+regex

 # Needed for manage.py
 ipython
--- a/requirements/dev.txt
+++ b/requirements/dev.txt
@ -2230,7 +2230,9 @@ regex==2023.8.8 \
    --hash=sha256:f2181c20ef18747d5f4a7ea513e09ea03bdd50884a11ce46066bb90fe4213675 \
    --hash=sha256:f2200e00b62568cfd920127782c61bc1c546062a879cdc741cfcc6976668dfcf \
    --hash=sha256:fcbdc5f2b0f1cd0f6a56cdb46fe41d2cce1e644e3b68832f3eeebc5fb0f7712e
-    # via talon-core
+    # via
+    #   -r requirements/common.in
+    #   talon-core
 requests[security]==2.31.0 \
    --hash=sha256:58cd2187c01e70e6e26505bca751777aa9f2ee0b7f4300988b709f44e013003f \
    --hash=sha256:942c5a758f98d790eaed1a29cb6eefc7ffb0d1cf7af05c3d2791656dbd6ad1e1
@ -2834,6 +2836,10 @@ types-redis==4.6.0.4 \
    --hash=sha256:03a1e1659ae4d8f6543bc2b8b11e94b1ee53937f313b1dc6f67dc7bde7d38fe0 \
    --hash=sha256:c475a9d3cf73dd696c3887d30644323fc56f5e00af96151035b3b5b52875c9b3
    # via -r requirements/mypy.in
+types-regex==2023.6.3.1 \
+    --hash=sha256:21880e584e2bada8656abeeb3131287a89dcb215f24f1f5b1503eb9bca37f759 \
+    --hash=sha256:f385191206021e48db0f452fe9479812710daf07058c8f6972cfb2f6202df136
+    # via -r requirements/mypy.in
 types-requests==2.31.0.2 \
    --hash=sha256:56d181c85b5925cbc59f4489a57e72a8b2166f18273fd8ba7b6fe0c0b986f12a \
    --hash=sha256:6aa3f7faf0ea52d728bb18c0a0d1522d9bfd8c72d26ff6f61bfc3d06a411cf40
--- a/requirements/mypy.in
+++ b/requirements/mypy.in
@ -21,6 +21,7 @@ types-Pygments
 types-python-dateutil
 types-PyYAML
 types-redis
+types-regex
 types-requests
 types-stripe
 types-zxcvbn
--- a/requirements/prod.txt
+++ b/requirements/prod.txt
@ -1707,7 +1707,9 @@ regex==2023.8.8 \
    --hash=sha256:f2181c20ef18747d5f4a7ea513e09ea03bdd50884a11ce46066bb90fe4213675 \
    --hash=sha256:f2200e00b62568cfd920127782c61bc1c546062a879cdc741cfcc6976668dfcf \
    --hash=sha256:fcbdc5f2b0f1cd0f6a56cdb46fe41d2cce1e644e3b68832f3eeebc5fb0f7712e
-    # via talon-core
+    # via
+    #   -r requirements/common.in
+    #   talon-core
 requests[security]==2.31.0 \
    --hash=sha256:58cd2187c01e70e6e26505bca751777aa9f2ee0b7f4300988b709f44e013003f \
    --hash=sha256:942c5a758f98d790eaed1a29cb6eefc7ffb0d1cf7af05c3d2791656dbd6ad1e1
--- a/tools/setup/emoji/emoji_names.py
+++ b/tools/setup/emoji/emoji_names.py
@ -524,11 +524,11 @@ EMOJI_NAME_MAPS: Dict[str, Dict[str, Any]] = {
    "1f3ca-200d-2640": {"canonical_name": "woman_swimming", "aliases": []},
    "1f3ca-200d-2642": {"canonical_name": "man_swimming", "aliases": []},
    "1f3ca": {"canonical_name": "swim", "aliases": []},
-    "1f3cb-fe0f-200d-2640-fe0f": {"canonical_name": "woman_lifting_weights", "aliases": []},
-    "1f3cb-fe0f-200d-2642-fe0f": {"canonical_name": "man_lifting_weights", "aliases": []},
+    "1f3cb-200d-2640": {"canonical_name": "woman_lifting_weights", "aliases": []},
+    "1f3cb-200d-2642": {"canonical_name": "man_lifting_weights", "aliases": []},
    "1f3cb": {"canonical_name": "lift", "aliases": ["work_out", "weight_lift", "gym"]},
-    "1f3cc-fe0f-200d-2640-fe0f": {"canonical_name": "woman_golfing", "aliases": []},
-    "1f3cc-fe0f-200d-2642-fe0f": {"canonical_name": "man_golfing", "aliases": []},
+    "1f3cc-200d-2640": {"canonical_name": "woman_golfing", "aliases": []},
+    "1f3cc-200d-2642": {"canonical_name": "man_golfing", "aliases": []},
    "1f3cc": {"canonical_name": "golf", "aliases": []},
    "1f3cd": {"canonical_name": "motorcycle", "aliases": []},
    "1f3ce": {"canonical_name": "racecar", "aliases": []},
@ -567,7 +567,7 @@ EMOJI_NAME_MAPS: Dict[str, Dict[str, Any]] = {
    "1f3ef": {"canonical_name": "shiro", "aliases": []},
    "1f3f0": {"canonical_name": "castle", "aliases": []},
    "1f3f3-200d-1f308": {"canonical_name": "rainbow_flag", "aliases": []},
-    "1f3f3-fe0f-200d-26a7-fe0f": {"canonical_name": "transgender_flag", "aliases": []},
+    "1f3f3-200d-26a7": {"canonical_name": "transgender_flag", "aliases": []},
    "1f3f3": {"canonical_name": "white_flag", "aliases": ["surrender"]},
    "1f3f4-200d-2620": {"canonical_name": "pirate_flag", "aliases": ["jolly_roger", "plunder"]},
    "1f3f4-e0067-e0062-e0065-e006e-e0067-e007f": {"canonical_name": "flag_england", "aliases": []},
@ -648,7 +648,7 @@ EMOJI_NAME_MAPS: Dict[str, Dict[str, Any]] = {
    "1f43e": {"canonical_name": "paw_prints", "aliases": ["paws"]},
    "1f43f": {"canonical_name": "chipmunk", "aliases": []},
    "1f440": {"canonical_name": "eyes", "aliases": ["looking"]},
-    "1f441-fe0f-200d-1f5e8-fe0f": {
+    "1f441-200d-1f5e8": {
        "canonical_name": "eye_in_speech_bubble",
        "aliases": ["speech", "witness"],
    },
@ -1056,8 +1056,8 @@ EMOJI_NAME_MAPS: Dict[str, Dict[str, Any]] = {
    "1f570": {"canonical_name": "mantelpiece_clock", "aliases": []},
    "1f573": {"canonical_name": "hole", "aliases": []},
    "1f574": {"canonical_name": "levitating", "aliases": ["hover"]},
-    "1f575-fe0f-200d-2640-fe0f": {"canonical_name": "woman_detective", "aliases": []},
-    "1f575-fe0f-200d-2642-fe0f": {"canonical_name": "man_detective", "aliases": []},
+    "1f575-200d-2640": {"canonical_name": "woman_detective", "aliases": []},
+    "1f575-200d-2642": {"canonical_name": "man_detective", "aliases": []},
    "1f575": {"canonical_name": "detective", "aliases": ["spy", "sleuth", "agent", "sneaky"]},
    "1f576": {"canonical_name": "dark_sunglasses", "aliases": []},
    "1f577": {"canonical_name": "spider", "aliases": []},
@ -2022,8 +2022,8 @@ EMOJI_NAME_MAPS: Dict[str, Dict[str, Any]] = {
    "26f5": {"canonical_name": "boat", "aliases": ["sailboat"]},
    "26f7": {"canonical_name": "skier", "aliases": []},
    "26f8": {"canonical_name": "ice_skate", "aliases": []},
-    "26f9-fe0f-200d-2640-fe0f": {"canonical_name": "woman_bouncing_ball", "aliases": []},
-    "26f9-fe0f-200d-2642-fe0f": {"canonical_name": "man_bouncing_ball", "aliases": []},
+    "26f9-200d-2640": {"canonical_name": "woman_bouncing_ball", "aliases": []},
+    "26f9-200d-2642": {"canonical_name": "man_bouncing_ball", "aliases": []},
    "26f9": {"canonical_name": "ball", "aliases": ["sports"]},
    "26fa": {"canonical_name": "tent", "aliases": ["camping"]},
    "26fd": {"canonical_name": "fuel_pump", "aliases": ["gas_pump", "petrol_pump"]},
--- a/tools/setup/emoji/emoji_setup_utils.py
+++ b/tools/setup/emoji/emoji_setup_utils.py
@ -3,6 +3,8 @@
 from collections import defaultdict
 from typing import Any, Dict, List

+from zerver.lib.emoji_utils import emoji_to_hex_codepoint, hex_codepoint_to_emoji, unqualify_emoji
+
 # Emoji sets that we currently support.
 EMOJISETS = ["google", "twitter"]

@ -61,18 +63,12 @@ def emoji_names_for_picker(emoji_name_maps: Dict[str, Dict[str, Any]]) -> List[s


 def get_emoji_code(emoji_dict: Dict[str, Any]) -> str:
-    # Starting from version 4.0.0, `emoji_datasource` package has started to
-    # add an emoji presentation variation selector for certain emojis which
-    # have defined variation sequences. Since in informal environments(like
-    # texting and chat), it is more appropriate for an emoji to have a colorful
-    # display so until emoji characters have a text presentation selector, it
-    # should have a colorful display. Hence we can continue using emoji characters
-    # without appending emoji presentation selector.
-    # (http://unicode.org/reports/tr51/index.html#Presentation_Style)
-    # If `non_qualified` field is present and not None return it otherwise
-    # return `unified` field.
-    emoji_code = emoji_dict.get("non_qualified") or emoji_dict["unified"]
-    return emoji_code.lower()
+    """Return the unqualified hex code for an emoji entry, derived from its `unified` field."""
+    # The entry may also carry a `non_qualified` field, but upstream only
+    # provides it inconsistently (https://github.com/iamcal/emoji-data/pull/217),
+    # so we ignore it and strip the presentation selectors ourselves. That
+    # way every emoji gets its code the same way.
+    unified_emoji = hex_codepoint_to_emoji(emoji_dict["unified"])
+    unqualified_emoji = unqualify_emoji(unified_emoji)
+    return emoji_to_hex_codepoint(unqualified_emoji)


 # Returns a dict from categories to list of codepoints. The list of
--- a/version.py
+++ b/version.py
@ -48,4 +48,4 @@ API_FEATURE_LEVEL = 203
 #   historical commits sharing the same major version, in which case a
 #   minor version bump suffices.

-PROVISION_VERSION = (247, 4)
+PROVISION_VERSION = (247, 5)
--- a/web/src/markdown.js
+++ b/web/src/markdown.js
@ -388,7 +388,21 @@ function make_emoji_span(codepoint, title, alt_text) {
 }

 function handleUnicodeEmoji({unicode_emoji, get_emoji_name}) {
-    const codepoint = unicode_emoji.codePointAt(0).toString(16);
+    // We want to avoid turning things like arrows (↔) and keycaps (numbers
+    // in boxes) into qualified emoji (images).
+    // More specifically, we skip anything with text in the second column of
+    // this table https://unicode.org/Public/emoji/1.0/emoji-data.txt
+    if (/^\P{Emoji_Presentation}\u20E3?$/u.test(unicode_emoji)) {
+        return unicode_emoji;
+    }
+
+    // Strip every emoji presentation selector (U+FE0F), not just the first:
+    // ZWJ sequences can carry several, and emoji codes are stored unqualified.
+    const unqualified_unicode_emoji = unicode_emoji.replace(/\uFE0F/g, "");
+
+    const codepoint = [...unqualified_unicode_emoji]
+        .map((char) => char.codePointAt(0).toString(16).padStart(4, "0"))
+        .join("-");
     const emoji_name = get_emoji_name(codepoint);

     if (emoji_name) {
--- a/web/third/marked/lib/marked.js
+++ b/web/third/marked/lib/marked.js
@ -537,12 +537,12 @@ inline.breaks = merge({}, inline.gfm, {
  text: replace(inline.gfm.text)('{2,}', '*')()
 });

+// Matches one "possible emoji" anchored at the start of the input: either a
+// pair of regional indicator characters, or an emoji character optionally
+// followed by an emoji modifier, by U+FE0F (optionally with U+20E3), or by a
+// tag sequence terminated by U+E007F. Any number of further such elements
+// may be attached, each introduced by a zero-width joiner (U+200D).
+// From https://unicode.org/reports/tr51/#EBNF_and_Regex. Keep this synced with POSSIBLE_EMOJI_RE.
+const possible_emoji_regex = /^(\p{RI}\p{RI}|\p{Emoji}(?:\p{Emoji_Modifier}|\u{FE0F}\u{20E3}?|[\u{E0020}-\u{E007E}]+\u{E007F})?(?:\u{200D}(?:\p{RI}\p{RI}|\p{Emoji}(?:\p{Emoji_Modifier}|\u{FE0F}\u{20E3}?|[\u{E0020}-\u{E007E}]+\u{E007F})?))*)/u;
+
 inline.zulip = merge({}, inline.breaks, {
  emoji: /^:([A-Za-z0-9_\-\+]+?):/,
-  unicodeemoji: RegExp('^(\ud83c[\udd00-\udfff]|\ud83d[\udc00-\ude4f]|' +
-                       '\ud83d[\ude80-\udeff]|\ud83e[\udd00-\uddff]|' +
-                       '[\u2000-\u206F]|[\u2300-\u27BF]|[\u2B00-\u2BFF]|' +
-                       '[\u3000-\u303F]|[\u3200-\u32FF])'),
+  unicodeemoji: possible_emoji_regex,
  usermention: /^@(_?)(?:\*\*([^\*]+)\*\*)/, // Match potentially multi-word string between @** **
  groupmention: /^@(_?)(?:\*([^\*]+)\*)/, // Match multi-word string between @* *
  stream_topic: /^#\*\*([^\*>]+)>([^\*]+)\*\*/,
--- a/zerver/lib/emoji_utils.py
+++ b/zerver/lib/emoji_utils.py
@ -0,0 +1,25 @@
+# This file doesn't import from django so that we can use it in `build_emoji`
+
+
+def unqualify_emoji(emoji: str) -> str:
+    """Return `emoji` with every emoji presentation selector (U+FE0F) removed.
+
+    The presentation selector is what "qualifies" an emoji; an
+    "unqualified" emoji does not have one. Since version 4.0.0 the
+    `emoji_datasource` package appends the selector to emoji that have
+    defined variation sequences.
+
+    In informal environments such as chat, an emoji should get a colorful
+    display unless it carries an explicit text presentation selector, so
+    we can keep identifying emoji by their unqualified form.
+    (http://unicode.org/reports/tr51/index.html#Presentation_Style)
+    """
+    presentation_selector = "\ufe0f"
+    return "".join(emoji.split(presentation_selector))
+
+
+def emoji_to_hex_codepoint(emoji: str) -> str:
+    """Encode `emoji` as lowercase hex code points joined by "-".
+
+    Each code point is zero-padded to at least four hex digits, so "#"
+    followed by U+20E3 becomes "0023-20e3".
+    """
+    hex_parts = [format(ord(char), "04x") for char in emoji]
+    return "-".join(hex_parts)
+
+
+def hex_codepoint_to_emoji(hex: str) -> str:
+    """Decode "-"-separated hex code points (e.g. "1f937-200d-2640") into a string.
+
+    Every part is parsed with `int(part, 16)`, so a part that is not valid
+    hexadecimal (including an empty one) raises ValueError.
+    """
+    characters = [chr(int(part, 16)) for part in hex.split("-")]
+    return "".join(characters)
--- a/zerver/lib/markdown/init.py
+++ b/zerver/lib/markdown/init.py
@ -25,6 +25,7 @@ from typing import (
    TypedDict,
    TypeVar,
    Union,
+    cast,
 )
 from urllib.parse import parse_qs, urlencode, urljoin, urlsplit
 from xml.etree.ElementTree import Element, SubElement
@ -40,6 +41,7 @@ import markdown.postprocessors
 import markdown.treeprocessors
 import markdown.util
 import re2
+import regex
 import requests
 import uri_template
 from django.conf import settings
@ -53,6 +55,7 @@ from zerver.lib import mention
 from zerver.lib.cache import cache_with_key
 from zerver.lib.camo import get_camo_url
 from zerver.lib.emoji import EMOTICON_RE, codepoint_to_name, name_to_codepoint, translate_emoticons
+from zerver.lib.emoji_utils import emoji_to_hex_codepoint, unqualify_emoji
 from zerver.lib.exceptions import MarkdownRenderingError
 from zerver.lib.markdown import fenced_code
 from zerver.lib.markdown.fenced_code import FENCE_RE
@ -840,7 +843,7 @@ class InlineInterestingLinkProcessor(markdown.treeprocessors.Treeprocessor):

        This works by using the URLs, user_mentions and media data from
        the twitter API and searching for Unicode emojis in the text using
-        `UNICODE_EMOJI_RE`.
+        `POSSIBLE_EMOJI_RE`.

        The first step is finding the locations of the URLs, mentions, media and
        emoji in the text. For each match we build a dictionary with type, the start
@ -897,9 +900,9 @@ class InlineInterestingLinkProcessor(markdown.treeprocessors.Treeprocessor):
                for match in re.finditer(re.escape(short_url), text, re.IGNORECASE)
            )
        # Build dicts for emojis
-        for match in re.finditer(UNICODE_EMOJI_RE, text, re.IGNORECASE):
+        for match in POSSIBLE_EMOJI_RE.finditer(text):
            orig_syntax = match.group("syntax")
-            codepoint = unicode_emoji_to_codepoint(orig_syntax)
+            codepoint = emoji_to_hex_codepoint(unqualify_emoji(orig_syntax))
            if codepoint in codepoint_to_name:
                display_string = ":" + codepoint_to_name[codepoint] + ":"
                to_process.append(
@ -1342,54 +1345,27 @@ class Timestamp(markdown.inlinepatterns.Pattern):
        return time_element


-# All of our emojis (excluding ZWJ sequences) belong to one of these Unicode blocks:
-# \U0001f100-\U0001f1ff - Enclosed Alphanumeric Supplement
-# \U0001f200-\U0001f2ff - Enclosed Ideographic Supplement
-# \U0001f300-\U0001f5ff - Miscellaneous Symbols and Pictographs
-# \U0001f600-\U0001f64f - Emoticons (Emoji)
-# \U0001f680-\U0001f6ff - Transport and Map Symbols
-# \U0001f7e0-\U0001f7eb - Coloured Geometric Shapes (NOTE: Not Unicode standard category name)
-# \U0001f900-\U0001f9ff - Supplemental Symbols and Pictographs
-# \u2000-\u206f         - General Punctuation
-# \u2300-\u23ff         - Miscellaneous Technical
-# \u2400-\u243f         - Control Pictures
-# \u2440-\u245f         - Optical Character Recognition
-# \u2460-\u24ff         - Enclosed Alphanumerics
-# \u2500-\u257f         - Box Drawing
-# \u2580-\u259f         - Block Elements
-# \u25a0-\u25ff         - Geometric Shapes
-# \u2600-\u26ff         - Miscellaneous Symbols
-# \u2700-\u27bf         - Dingbats
-# \u2900-\u297f         - Supplemental Arrows-B
-# \u2b00-\u2bff         - Miscellaneous Symbols and Arrows
-# \u3000-\u303f         - CJK Symbols and Punctuation
-# \u3200-\u32ff         - Enclosed CJK Letters and Months
-UNICODE_EMOJI_RE = (
-    "(?P<syntax>["
-    "\U0001F100-\U0001F64F"
-    "\U0001F680-\U0001F6FF"
-    "\U0001F7E0-\U0001F7EB"
-    "\U0001F900-\U0001F9FF"
-    "\u2000-\u206F"
-    "\u2300-\u27BF"
-    "\u2900-\u297F"
-    "\u2B00-\u2BFF"
-    "\u3000-\u303F"
-    "\u3200-\u32FF"
-    "])"
+# Matches a "possible emoji": either a pair of regional indicator
+# characters, or an emoji character optionally followed by an emoji
+# modifier, by U+FE0F (optionally with U+20E3), or by a tag sequence
+# terminated by U+E007F; any number of further such elements may follow,
+# each introduced by a zero-width joiner (U+200D). The whole match is
+# captured in the named group `syntax`.
+#
+# The pattern is compiled with the `regex` package because it relies on
+# \p{...} Unicode property classes; with regex.VERBOSE the whitespace inside
+# the pattern is layout only.
+#
+# From https://unicode.org/reports/tr51/#EBNF_and_Regex. Keep this synced with `possible_emoji_regex`.
+POSSIBLE_EMOJI_RE = regex.compile(
+    r"""(?P<syntax>
+\p{RI} \p{RI}
+| \p{Emoji}
+  (?: \p{Emoji_Modifier}
+  | \uFE0F \u20E3?
+  | [\U000E0020-\U000E007E]+ \U000E007F
+  )?
+  (?: \u200D
+    (?: \p{RI} \p{RI}
+    | \p{Emoji}
+      (?: \p{Emoji_Modifier}
+      | \uFE0F \u20E3?
+      | [\U000E0020-\U000E007E]+ \U000E007F
+      )?
+    )
+  )*)
+""",
+    regex.VERBOSE,
 )
-# The equivalent JS regex is \ud83c[\udd00-\udfff]|\ud83d[\udc00-\ude4f]|\ud83d[\ude80-\udeff]|
-# \ud83e[\udd00-\uddff]|[\u2000-\u206f]|[\u2300-\u27bf]|[\u2b00-\u2bff]|[\u3000-\u303f]|
-# [\u3200-\u32ff]. See below comments for explanation. The JS regex is used by marked.js for
-# frontend Unicode emoji processing.
-# The JS regex \ud83c[\udd00-\udfff]|\ud83d[\udc00-\ude4f] represents U0001f100-\U0001f64f
-# The JS regex \ud83d[\ude80-\udeff] represents \U0001f680-\U0001f6ff
-# The JS regex \ud83e[\udd00-\uddff] represents \U0001f900-\U0001f9ff
-# The JS regex [\u2000-\u206f] represents \u2000-\u206f
-# The JS regex [\u2300-\u27bf] represents \u2300-\u27bf
-# Similarly other JS regexes can be mapped to the respective Unicode blocks.
-# For more information, please refer to the following article:
-# http://crocodillon.com/blog/parsing-emoji-unicode-in-javascript


 def make_emoji(codepoint: str, display_string: str) -> Element:
@ -1413,11 +1389,6 @@ def make_realm_emoji(src: str, display_string: str) -> Element:
    return elt


-def unicode_emoji_to_codepoint(unicode_emoji: str) -> str:
-    # Unicode codepoints are minimum of length 4, padded with zeroes
-    return f"{ord(unicode_emoji):04x}"
-
-
 class EmoticonTranslation(markdown.inlinepatterns.Pattern):
    """Translates emoticons like `:)` into emoji like `:smile:`."""

@ -1436,15 +1407,28 @@ class EmoticonTranslation(markdown.inlinepatterns.Pattern):
        return make_emoji(name_to_codepoint[name], translated)


-class UnicodeEmoji(markdown.inlinepatterns.Pattern):
-    def handleMatch(self, match: Match[str]) -> Optional[Element]:
+# Matches a single character that lacks the Emoji_Presentation property,
+# optionally followed by U+20E3. Used with fullmatch() to recognize matches
+# that should stay plain text instead of becoming emoji images.
+TEXT_PRESENTATION_RE = regex.compile(r"\P{Emoji_Presentation}\u20E3?")
+
+
+class UnicodeEmoji(CompiledInlineProcessor):
+    """Inline processor that renders Unicode emoji found in message text as emoji elements."""
+
+    def handleMatch(  # type: ignore[override] # https://github.com/python/mypy/issues/10197
+        self, match: Match[str], data: str
+    ) -> Union[Tuple[None, None, None], Tuple[Element, int, int]]:
+        """Build the emoji element for a matched Unicode emoji sequence.
+
+        Returns `(element, match.start(), match.end())` when the unqualified
+        sequence is a key of `codepoint_to_name`. Returns
+        `(None, None, None)` for text-presentation characters and for
+        sequences that are not known emoji.
+        """
         orig_syntax = match.group("syntax")
-        codepoint = unicode_emoji_to_codepoint(orig_syntax)
+
+        # We want to avoid turning things like arrows (↔) and keycaps (numbers
+        # in boxes) into qualified emoji.
+        # More specifically, we skip anything with text in the second column of
+        # this table https://unicode.org/Public/emoji/1.0/emoji-data.txt
+        if TEXT_PRESENTATION_RE.fullmatch(orig_syntax):
+            return None, None, None
+
+        # Emoji codes are keyed by their unqualified form, so drop any
+        # presentation selectors before the lookup.
+        codepoint = emoji_to_hex_codepoint(unqualify_emoji(orig_syntax))
         if codepoint in codepoint_to_name:
             display_string = ":" + codepoint_to_name[codepoint] + ":"
-            return make_emoji(codepoint, display_string)
+            return make_emoji(codepoint, display_string), match.start(), match.end()
         else:
-            return None
+            return None, None, None


 class Emoji(markdown.inlinepatterns.Pattern):
@ -2224,7 +2208,7 @@ class ZulipMarkdown(markdown.Markdown):
        reg.register(Emoji(EMOJI_REGEX, self), "emoji", 15)
        reg.register(EmoticonTranslation(EMOTICON_RE, self), "translate_emoticons", 10)
        # We get priority 5 from 'nl2br' extension
-        reg.register(UnicodeEmoji(UNICODE_EMOJI_RE), "unicodeemoji", 0)
+        reg.register(UnicodeEmoji(cast(Pattern[str], POSSIBLE_EMOJI_RE), self), "unicodeemoji", 0)
        return reg

    def register_linkifiers(self, registry: markdown.util.Registry) -> markdown.util.Registry:
--- a/zerver/tests/fixtures/markdown_test_cases.json
+++ b/zerver/tests/fixtures/markdown_test_cases.json
@ -588,6 +588,41 @@
      "input": ":poop:",
      "expected_output": "<p><span aria-label=\"poop\" class=\"emoji emoji-1f4a9\" role=\"img\" title=\"poop\">:poop:</span></p>"
    },
+    {
+      "name": "emoji_sequence_one",
+      "input": "🤷‍♀️",
+      "expected_output": "<p><span aria-label=\"woman shrugging\" class=\"emoji emoji-1f937-200d-2640\" role=\"img\" title=\"woman shrugging\">:woman_shrugging:</span></p>"
+    },
+    {
+      "name": "emoji_sequence_two",
+      "input": "👁‍🗨 #️⃣",
+      "expected_output": "<p><span aria-label=\"eye in speech bubble\" class=\"emoji emoji-1f441-200d-1f5e8\" role=\"img\" title=\"eye in speech bubble\">:eye_in_speech_bubble:</span> <span aria-label=\"hash\" class=\"emoji emoji-0023-20e3\" role=\"img\" title=\"hash\">:hash:</span></p>"
+    },
+    {
+      "name": "unrecognized_emoji_sequence_one",
+      "input": "pheonix bird from 15.1: 🐦‍🔥",
+      "expected_output": "<p>pheonix bird from 15.1: 🐦‍🔥</p>"
+    },
+    {
+      "name": "unrecognized_emoji_sequence_two",
+      "input": "lime from 15.1: 🍋‍🟩",
+      "expected_output": "<p>lime from 15.1: 🍋‍🟩</p>"
+    },
+    {
+      "name": "unrecognized_emoji_sequence_three",
+      "input": "normal lemon 🍋, purple lemon? 🍋‍🟪",
+      "expected_output": "<p>normal lemon <span aria-label=\"lemon\" class=\"emoji emoji-1f34b\" role=\"img\" title=\"lemon\">:lemon:</span>, purple lemon? 🍋‍🟪</p>"
+    },
+    {
+      "name": "unrecognized_emoji_sequence_four",
+      "input": "medium-skinned person riding scooter back and forth: 🧑🏽‍🛴‍↩️",
+      "expected_output": "<p>medium-skinned person riding scooter back and forth: 🧑🏽‍🛴‍↩️</p>"
+    },
+    {
+      "name": "unrecognized_emoji_sequence_skin_tone",
+      "input": "man in manual wheelchair: dark skin tone 👨🏿‍🦽",
+      "expected_output": "<p>man in manual wheelchair: dark skin tone 👨🏿‍🦽</p>"
+    },
    {
      "name": "emojis_without_space",
      "input": ":cat:hello:dog::rabbit:",
@ -642,7 +677,7 @@
    },
    {
      "name": "miscellaneous_symbols_and_arrows",
-      "input": "Black upward arrow \u2b06",
+      "input": "Black upward arrow \u2b06\ufe0f",
      "expected_output":"<p>Black upward arrow <span aria-label=\"up\" class=\"emoji emoji-2b06\" role=\"img\" title=\"up\">:up:</span><\/p>"
    },
    {
--- a/zerver/tests/test_markdown.py
+++ b/zerver/tests/test_markdown.py
@ -22,9 +22,11 @@ from zerver.actions.users import change_user_is_active
 from zerver.lib.alert_words import get_alert_word_automaton
 from zerver.lib.camo import get_camo_url
 from zerver.lib.create_user import create_user
-from zerver.lib.emoji import get_emoji_url
+from zerver.lib.emoji import codepoint_to_name, get_emoji_url
+from zerver.lib.emoji_utils import hex_codepoint_to_emoji
 from zerver.lib.exceptions import JsonableError, MarkdownRenderingError
 from zerver.lib.markdown import (
+    POSSIBLE_EMOJI_RE,
    InlineInterestingLinkProcessor,
    MarkdownListPreprocessor,
    MessageRenderingResult,
@ -3156,3 +3158,17 @@ class MarkdownErrorTests(ZulipTestCase):

        result = processor.run(markdown_input)
        self.assertEqual(result, expected)
+
+
+class MarkdownEmojiTest(ZulipTestCase):
+    def test_all_emoji_match_regex(self) -> None:
+        """Every known emoji is fully matched by POSSIBLE_EMOJI_RE, except unqualified keycaps."""
+        non_matching_emoji = []
+        for codepoint in codepoint_to_name:
+            emoji = hex_codepoint_to_emoji(codepoint)
+            if POSSIBLE_EMOJI_RE.fullmatch(emoji) is None:
+                non_matching_emoji.append(emoji)
+
+        # unqualified numbers in boxes (a key followed directly by U+20E3)
+        # shouldn't be converted to emoji images, so this is fine
+        unqualified_keycaps = [f"{key}\u20e3" for key in "#*0123456789"]
+        self.assertEqual(non_matching_emoji, unqualified_keycaps)