emoji: Match emoji sequences in markdown.

Fixes #11767.

Previously, multi-codepoint emoji sequences (such as ZWJ sequences)
weren't matched by the emoji regex, so we'd convert each codepoint to
a separate image, breaking the intended display.

This change allows us to match the full emoji sequence, and
therefore show the correct image.
This commit is contained in:
evykassirer 2023-08-10 12:00:45 -07:00 committed by Tim Abbott
parent 78f0dca269
commit 0289beb784
13 changed files with 173 additions and 93 deletions

View File

@ -17,6 +17,7 @@ importlib-metadata ; python_version < "3.10" # for Markdown
Pygments Pygments
jsx-lexer jsx-lexer
uri-template uri-template
regex
# Needed for manage.py # Needed for manage.py
ipython ipython

View File

@ -2230,7 +2230,9 @@ regex==2023.8.8 \
--hash=sha256:f2181c20ef18747d5f4a7ea513e09ea03bdd50884a11ce46066bb90fe4213675 \ --hash=sha256:f2181c20ef18747d5f4a7ea513e09ea03bdd50884a11ce46066bb90fe4213675 \
--hash=sha256:f2200e00b62568cfd920127782c61bc1c546062a879cdc741cfcc6976668dfcf \ --hash=sha256:f2200e00b62568cfd920127782c61bc1c546062a879cdc741cfcc6976668dfcf \
--hash=sha256:fcbdc5f2b0f1cd0f6a56cdb46fe41d2cce1e644e3b68832f3eeebc5fb0f7712e --hash=sha256:fcbdc5f2b0f1cd0f6a56cdb46fe41d2cce1e644e3b68832f3eeebc5fb0f7712e
# via talon-core # via
# -r requirements/common.in
# talon-core
requests[security]==2.31.0 \ requests[security]==2.31.0 \
--hash=sha256:58cd2187c01e70e6e26505bca751777aa9f2ee0b7f4300988b709f44e013003f \ --hash=sha256:58cd2187c01e70e6e26505bca751777aa9f2ee0b7f4300988b709f44e013003f \
--hash=sha256:942c5a758f98d790eaed1a29cb6eefc7ffb0d1cf7af05c3d2791656dbd6ad1e1 --hash=sha256:942c5a758f98d790eaed1a29cb6eefc7ffb0d1cf7af05c3d2791656dbd6ad1e1
@ -2834,6 +2836,10 @@ types-redis==4.6.0.4 \
--hash=sha256:03a1e1659ae4d8f6543bc2b8b11e94b1ee53937f313b1dc6f67dc7bde7d38fe0 \ --hash=sha256:03a1e1659ae4d8f6543bc2b8b11e94b1ee53937f313b1dc6f67dc7bde7d38fe0 \
--hash=sha256:c475a9d3cf73dd696c3887d30644323fc56f5e00af96151035b3b5b52875c9b3 --hash=sha256:c475a9d3cf73dd696c3887d30644323fc56f5e00af96151035b3b5b52875c9b3
# via -r requirements/mypy.in # via -r requirements/mypy.in
types-regex==2023.6.3.1 \
--hash=sha256:21880e584e2bada8656abeeb3131287a89dcb215f24f1f5b1503eb9bca37f759 \
--hash=sha256:f385191206021e48db0f452fe9479812710daf07058c8f6972cfb2f6202df136
# via -r requirements/mypy.in
types-requests==2.31.0.2 \ types-requests==2.31.0.2 \
--hash=sha256:56d181c85b5925cbc59f4489a57e72a8b2166f18273fd8ba7b6fe0c0b986f12a \ --hash=sha256:56d181c85b5925cbc59f4489a57e72a8b2166f18273fd8ba7b6fe0c0b986f12a \
--hash=sha256:6aa3f7faf0ea52d728bb18c0a0d1522d9bfd8c72d26ff6f61bfc3d06a411cf40 --hash=sha256:6aa3f7faf0ea52d728bb18c0a0d1522d9bfd8c72d26ff6f61bfc3d06a411cf40

View File

@ -21,6 +21,7 @@ types-Pygments
types-python-dateutil types-python-dateutil
types-PyYAML types-PyYAML
types-redis types-redis
types-regex
types-requests types-requests
types-stripe types-stripe
types-zxcvbn types-zxcvbn

View File

@ -1707,7 +1707,9 @@ regex==2023.8.8 \
--hash=sha256:f2181c20ef18747d5f4a7ea513e09ea03bdd50884a11ce46066bb90fe4213675 \ --hash=sha256:f2181c20ef18747d5f4a7ea513e09ea03bdd50884a11ce46066bb90fe4213675 \
--hash=sha256:f2200e00b62568cfd920127782c61bc1c546062a879cdc741cfcc6976668dfcf \ --hash=sha256:f2200e00b62568cfd920127782c61bc1c546062a879cdc741cfcc6976668dfcf \
--hash=sha256:fcbdc5f2b0f1cd0f6a56cdb46fe41d2cce1e644e3b68832f3eeebc5fb0f7712e --hash=sha256:fcbdc5f2b0f1cd0f6a56cdb46fe41d2cce1e644e3b68832f3eeebc5fb0f7712e
# via talon-core # via
# -r requirements/common.in
# talon-core
requests[security]==2.31.0 \ requests[security]==2.31.0 \
--hash=sha256:58cd2187c01e70e6e26505bca751777aa9f2ee0b7f4300988b709f44e013003f \ --hash=sha256:58cd2187c01e70e6e26505bca751777aa9f2ee0b7f4300988b709f44e013003f \
--hash=sha256:942c5a758f98d790eaed1a29cb6eefc7ffb0d1cf7af05c3d2791656dbd6ad1e1 --hash=sha256:942c5a758f98d790eaed1a29cb6eefc7ffb0d1cf7af05c3d2791656dbd6ad1e1

View File

@ -524,11 +524,11 @@ EMOJI_NAME_MAPS: Dict[str, Dict[str, Any]] = {
"1f3ca-200d-2640": {"canonical_name": "woman_swimming", "aliases": []}, "1f3ca-200d-2640": {"canonical_name": "woman_swimming", "aliases": []},
"1f3ca-200d-2642": {"canonical_name": "man_swimming", "aliases": []}, "1f3ca-200d-2642": {"canonical_name": "man_swimming", "aliases": []},
"1f3ca": {"canonical_name": "swim", "aliases": []}, "1f3ca": {"canonical_name": "swim", "aliases": []},
"1f3cb-fe0f-200d-2640-fe0f": {"canonical_name": "woman_lifting_weights", "aliases": []}, "1f3cb-200d-2640": {"canonical_name": "woman_lifting_weights", "aliases": []},
"1f3cb-fe0f-200d-2642-fe0f": {"canonical_name": "man_lifting_weights", "aliases": []}, "1f3cb-200d-2642": {"canonical_name": "man_lifting_weights", "aliases": []},
"1f3cb": {"canonical_name": "lift", "aliases": ["work_out", "weight_lift", "gym"]}, "1f3cb": {"canonical_name": "lift", "aliases": ["work_out", "weight_lift", "gym"]},
"1f3cc-fe0f-200d-2640-fe0f": {"canonical_name": "woman_golfing", "aliases": []}, "1f3cc-200d-2640": {"canonical_name": "woman_golfing", "aliases": []},
"1f3cc-fe0f-200d-2642-fe0f": {"canonical_name": "man_golfing", "aliases": []}, "1f3cc-200d-2642": {"canonical_name": "man_golfing", "aliases": []},
"1f3cc": {"canonical_name": "golf", "aliases": []}, "1f3cc": {"canonical_name": "golf", "aliases": []},
"1f3cd": {"canonical_name": "motorcycle", "aliases": []}, "1f3cd": {"canonical_name": "motorcycle", "aliases": []},
"1f3ce": {"canonical_name": "racecar", "aliases": []}, "1f3ce": {"canonical_name": "racecar", "aliases": []},
@ -567,7 +567,7 @@ EMOJI_NAME_MAPS: Dict[str, Dict[str, Any]] = {
"1f3ef": {"canonical_name": "shiro", "aliases": []}, "1f3ef": {"canonical_name": "shiro", "aliases": []},
"1f3f0": {"canonical_name": "castle", "aliases": []}, "1f3f0": {"canonical_name": "castle", "aliases": []},
"1f3f3-200d-1f308": {"canonical_name": "rainbow_flag", "aliases": []}, "1f3f3-200d-1f308": {"canonical_name": "rainbow_flag", "aliases": []},
"1f3f3-fe0f-200d-26a7-fe0f": {"canonical_name": "transgender_flag", "aliases": []}, "1f3f3-200d-26a7": {"canonical_name": "transgender_flag", "aliases": []},
"1f3f3": {"canonical_name": "white_flag", "aliases": ["surrender"]}, "1f3f3": {"canonical_name": "white_flag", "aliases": ["surrender"]},
"1f3f4-200d-2620": {"canonical_name": "pirate_flag", "aliases": ["jolly_roger", "plunder"]}, "1f3f4-200d-2620": {"canonical_name": "pirate_flag", "aliases": ["jolly_roger", "plunder"]},
"1f3f4-e0067-e0062-e0065-e006e-e0067-e007f": {"canonical_name": "flag_england", "aliases": []}, "1f3f4-e0067-e0062-e0065-e006e-e0067-e007f": {"canonical_name": "flag_england", "aliases": []},
@ -648,7 +648,7 @@ EMOJI_NAME_MAPS: Dict[str, Dict[str, Any]] = {
"1f43e": {"canonical_name": "paw_prints", "aliases": ["paws"]}, "1f43e": {"canonical_name": "paw_prints", "aliases": ["paws"]},
"1f43f": {"canonical_name": "chipmunk", "aliases": []}, "1f43f": {"canonical_name": "chipmunk", "aliases": []},
"1f440": {"canonical_name": "eyes", "aliases": ["looking"]}, "1f440": {"canonical_name": "eyes", "aliases": ["looking"]},
"1f441-fe0f-200d-1f5e8-fe0f": { "1f441-200d-1f5e8": {
"canonical_name": "eye_in_speech_bubble", "canonical_name": "eye_in_speech_bubble",
"aliases": ["speech", "witness"], "aliases": ["speech", "witness"],
}, },
@ -1056,8 +1056,8 @@ EMOJI_NAME_MAPS: Dict[str, Dict[str, Any]] = {
"1f570": {"canonical_name": "mantelpiece_clock", "aliases": []}, "1f570": {"canonical_name": "mantelpiece_clock", "aliases": []},
"1f573": {"canonical_name": "hole", "aliases": []}, "1f573": {"canonical_name": "hole", "aliases": []},
"1f574": {"canonical_name": "levitating", "aliases": ["hover"]}, "1f574": {"canonical_name": "levitating", "aliases": ["hover"]},
"1f575-fe0f-200d-2640-fe0f": {"canonical_name": "woman_detective", "aliases": []}, "1f575-200d-2640": {"canonical_name": "woman_detective", "aliases": []},
"1f575-fe0f-200d-2642-fe0f": {"canonical_name": "man_detective", "aliases": []}, "1f575-200d-2642": {"canonical_name": "man_detective", "aliases": []},
"1f575": {"canonical_name": "detective", "aliases": ["spy", "sleuth", "agent", "sneaky"]}, "1f575": {"canonical_name": "detective", "aliases": ["spy", "sleuth", "agent", "sneaky"]},
"1f576": {"canonical_name": "dark_sunglasses", "aliases": []}, "1f576": {"canonical_name": "dark_sunglasses", "aliases": []},
"1f577": {"canonical_name": "spider", "aliases": []}, "1f577": {"canonical_name": "spider", "aliases": []},
@ -2022,8 +2022,8 @@ EMOJI_NAME_MAPS: Dict[str, Dict[str, Any]] = {
"26f5": {"canonical_name": "boat", "aliases": ["sailboat"]}, "26f5": {"canonical_name": "boat", "aliases": ["sailboat"]},
"26f7": {"canonical_name": "skier", "aliases": []}, "26f7": {"canonical_name": "skier", "aliases": []},
"26f8": {"canonical_name": "ice_skate", "aliases": []}, "26f8": {"canonical_name": "ice_skate", "aliases": []},
"26f9-fe0f-200d-2640-fe0f": {"canonical_name": "woman_bouncing_ball", "aliases": []}, "26f9-200d-2640": {"canonical_name": "woman_bouncing_ball", "aliases": []},
"26f9-fe0f-200d-2642-fe0f": {"canonical_name": "man_bouncing_ball", "aliases": []}, "26f9-200d-2642": {"canonical_name": "man_bouncing_ball", "aliases": []},
"26f9": {"canonical_name": "ball", "aliases": ["sports"]}, "26f9": {"canonical_name": "ball", "aliases": ["sports"]},
"26fa": {"canonical_name": "tent", "aliases": ["camping"]}, "26fa": {"canonical_name": "tent", "aliases": ["camping"]},
"26fd": {"canonical_name": "fuel_pump", "aliases": ["gas_pump", "petrol_pump"]}, "26fd": {"canonical_name": "fuel_pump", "aliases": ["gas_pump", "petrol_pump"]},

View File

@ -3,6 +3,8 @@
from collections import defaultdict from collections import defaultdict
from typing import Any, Dict, List from typing import Any, Dict, List
from zerver.lib.emoji_utils import emoji_to_hex_codepoint, hex_codepoint_to_emoji, unqualify_emoji
# Emoji sets that we currently support. # Emoji sets that we currently support.
EMOJISETS = ["google", "twitter"] EMOJISETS = ["google", "twitter"]
@ -61,18 +63,12 @@ def emoji_names_for_picker(emoji_name_maps: Dict[str, Dict[str, Any]]) -> List[s
def get_emoji_code(emoji_dict: Dict[str, Any]) -> str: def get_emoji_code(emoji_dict: Dict[str, Any]) -> str:
# Starting from version 4.0.0, `emoji_datasource` package has started to # There is a `non_qualified` field on `emoji_dict` but it's
# add an emoji presentation variation selector for certain emojis which # inconsistently present, so we'll always use the unqualified
# have defined variation sequences. Since in informal environments(like # emoji by unqualifying it ourselves. This gives us more consistent
# texting and chat), it is more appropriate for an emoji to have a colorful # behaviour between emojis, and doesn't rely on the incomplete
# display so until emoji characters have a text presentation selector, it # upstream package (https://github.com/iamcal/emoji-data/pull/217).
# should have a colorful display. Hence we can continue using emoji characters return emoji_to_hex_codepoint(unqualify_emoji(hex_codepoint_to_emoji(emoji_dict["unified"])))
# without appending emoji presentation selector.
# (http://unicode.org/reports/tr51/index.html#Presentation_Style)
# If `non_qualified` field is present and not None return it otherwise
# return `unified` field.
emoji_code = emoji_dict.get("non_qualified") or emoji_dict["unified"]
return emoji_code.lower()
# Returns a dict from categories to list of codepoints. The list of # Returns a dict from categories to list of codepoints. The list of

View File

@ -48,4 +48,4 @@ API_FEATURE_LEVEL = 203
# historical commits sharing the same major version, in which case a # historical commits sharing the same major version, in which case a
# minor version bump suffices. # minor version bump suffices.
PROVISION_VERSION = (247, 4) PROVISION_VERSION = (247, 5)

View File

@ -388,7 +388,21 @@ function make_emoji_span(codepoint, title, alt_text) {
} }
function handleUnicodeEmoji({unicode_emoji, get_emoji_name}) { function handleUnicodeEmoji({unicode_emoji, get_emoji_name}) {
const codepoint = unicode_emoji.codePointAt(0).toString(16); // We want to avoid turning things like arrows (↔) and keycaps (numbers
// in boxes) into qualified emoji (images).
// More specifically, we skip anything with text in the second column of
// this table https://unicode.org/Public/emoji/1.0/emoji-data.txt
if (/^\P{Emoji_Presentation}\u20E3?$/u.test(unicode_emoji)) {
return unicode_emoji;
}
// This unqualifies qualified emoji, which helps us make sure we
// can match both versions.
const unqualified_unicode_emoji = unicode_emoji.replace(/\uFE0F/, "");
const codepoint = [...unqualified_unicode_emoji]
.map((char) => char.codePointAt(0).toString(16).padStart(4, "0"))
.join("-");
const emoji_name = get_emoji_name(codepoint); const emoji_name = get_emoji_name(codepoint);
if (emoji_name) { if (emoji_name) {

View File

@ -537,12 +537,12 @@ inline.breaks = merge({}, inline.gfm, {
text: replace(inline.gfm.text)('{2,}', '*')() text: replace(inline.gfm.text)('{2,}', '*')()
}); });
// From https://unicode.org/reports/tr51/#EBNF_and_Regex. Keep this synced with POSSIBLE_EMOJI_RE.
const possible_emoji_regex = /^(\p{RI}\p{RI}|\p{Emoji}(?:\p{Emoji_Modifier}|\u{FE0F}\u{20E3}?|[\u{E0020}-\u{E007E}]+\u{E007F})?(?:\u{200D}(?:\p{RI}\p{RI}|\p{Emoji}(?:\p{Emoji_Modifier}|\u{FE0F}\u{20E3}?|[\u{E0020}-\u{E007E}]+\u{E007F})?))*)/u;
inline.zulip = merge({}, inline.breaks, { inline.zulip = merge({}, inline.breaks, {
emoji: /^:([A-Za-z0-9_\-\+]+?):/, emoji: /^:([A-Za-z0-9_\-\+]+?):/,
unicodeemoji: RegExp('^(\ud83c[\udd00-\udfff]|\ud83d[\udc00-\ude4f]|' + unicodeemoji: possible_emoji_regex,
'\ud83d[\ude80-\udeff]|\ud83e[\udd00-\uddff]|' +
'[\u2000-\u206F]|[\u2300-\u27BF]|[\u2B00-\u2BFF]|' +
'[\u3000-\u303F]|[\u3200-\u32FF])'),
usermention: /^@(_?)(?:\*\*([^\*]+)\*\*)/, // Match potentially multi-word string between @** ** usermention: /^@(_?)(?:\*\*([^\*]+)\*\*)/, // Match potentially multi-word string between @** **
groupmention: /^@(_?)(?:\*([^\*]+)\*)/, // Match multi-word string between @* * groupmention: /^@(_?)(?:\*([^\*]+)\*)/, // Match multi-word string between @* *
stream_topic: /^#\*\*([^\*>]+)>([^\*]+)\*\*/, stream_topic: /^#\*\*([^\*>]+)>([^\*]+)\*\*/,

25
zerver/lib/emoji_utils.py Normal file
View File

@ -0,0 +1,25 @@
# This file doesn't import from django so that we can use it in `build_emoji`
def unqualify_emoji(emoji: str) -> str:
    """Return ``emoji`` with every emoji presentation selector (U+FE0F) removed.

    The emoji presentation selector "qualifies" an emoji; the
    "unqualified" version of the same emoji is the sequence without any
    presentation selector. All occurrences are removed, so sequences
    that carry more than one selector are fully unqualified.

    Args:
        emoji: A string of code points, e.g. a single emoji or a
            multi-codepoint emoji sequence.

    Returns:
        ``emoji`` without any U+FE0F code points. Strings that contain
        none (including the empty string) are returned unchanged.
    """
    # Starting from version 4.0.0, the `emoji_datasource` package adds an
    # emoji presentation variation selector to certain emoji which have
    # defined variation sequences.
    #
    # In informal environments (like texting and chat), a colorful
    # display is the more appropriate one for an emoji, so an emoji
    # character should be displayed in color unless it carries a text
    # presentation selector. Hence we can keep using emoji characters
    # without appending the emoji presentation selector.
    # (http://unicode.org/reports/tr51/index.html#Presentation_Style)
    return emoji.replace("\ufe0f", "")
def emoji_to_hex_codepoint(emoji: str) -> str:
    """Return the code points of ``emoji`` as dash-joined lowercase hex.

    Each code point is zero-padded to at least four hex digits, so the
    two code points U+0023 U+20E3 become ``"0023-20e3"``.

    Args:
        emoji: A string of one or more code points.

    Returns:
        The hex code points joined with ``-``; the empty string for
        empty input.
    """
    return "-".join(f"{ord(c):04x}" for c in emoji)
def hex_codepoint_to_emoji(hex: str) -> str:
    """Convert dash-joined hex code points into the string they encode.

    This reverses the format produced by ``emoji_to_hex_codepoint``,
    e.g. ``"0023-20e3"`` becomes the two code points U+0023 U+20E3.

    Args:
        hex: Hex code points separated by ``-``. The parameter name
            shadows the ``hex`` builtin; it is kept so that existing
            keyword callers keep working.

    Returns:
        The string made of the decoded code points, in order.

    Raises:
        ValueError: If any ``-``-separated component is not a valid
            hexadecimal number (this includes the empty string) or is
            outside the range accepted by ``chr``.
    """
    return "".join(chr(int(h, 16)) for h in hex.split("-"))

View File

@ -25,6 +25,7 @@ from typing import (
TypedDict, TypedDict,
TypeVar, TypeVar,
Union, Union,
cast,
) )
from urllib.parse import parse_qs, urlencode, urljoin, urlsplit from urllib.parse import parse_qs, urlencode, urljoin, urlsplit
from xml.etree.ElementTree import Element, SubElement from xml.etree.ElementTree import Element, SubElement
@ -40,6 +41,7 @@ import markdown.postprocessors
import markdown.treeprocessors import markdown.treeprocessors
import markdown.util import markdown.util
import re2 import re2
import regex
import requests import requests
import uri_template import uri_template
from django.conf import settings from django.conf import settings
@ -53,6 +55,7 @@ from zerver.lib import mention
from zerver.lib.cache import cache_with_key from zerver.lib.cache import cache_with_key
from zerver.lib.camo import get_camo_url from zerver.lib.camo import get_camo_url
from zerver.lib.emoji import EMOTICON_RE, codepoint_to_name, name_to_codepoint, translate_emoticons from zerver.lib.emoji import EMOTICON_RE, codepoint_to_name, name_to_codepoint, translate_emoticons
from zerver.lib.emoji_utils import emoji_to_hex_codepoint, unqualify_emoji
from zerver.lib.exceptions import MarkdownRenderingError from zerver.lib.exceptions import MarkdownRenderingError
from zerver.lib.markdown import fenced_code from zerver.lib.markdown import fenced_code
from zerver.lib.markdown.fenced_code import FENCE_RE from zerver.lib.markdown.fenced_code import FENCE_RE
@ -840,7 +843,7 @@ class InlineInterestingLinkProcessor(markdown.treeprocessors.Treeprocessor):
This works by using the URLs, user_mentions and media data from This works by using the URLs, user_mentions and media data from
the twitter API and searching for Unicode emojis in the text using the twitter API and searching for Unicode emojis in the text using
`UNICODE_EMOJI_RE`. `POSSIBLE_EMOJI_RE`.
The first step is finding the locations of the URLs, mentions, media and The first step is finding the locations of the URLs, mentions, media and
emoji in the text. For each match we build a dictionary with type, the start emoji in the text. For each match we build a dictionary with type, the start
@ -897,9 +900,9 @@ class InlineInterestingLinkProcessor(markdown.treeprocessors.Treeprocessor):
for match in re.finditer(re.escape(short_url), text, re.IGNORECASE) for match in re.finditer(re.escape(short_url), text, re.IGNORECASE)
) )
# Build dicts for emojis # Build dicts for emojis
for match in re.finditer(UNICODE_EMOJI_RE, text, re.IGNORECASE): for match in POSSIBLE_EMOJI_RE.finditer(text):
orig_syntax = match.group("syntax") orig_syntax = match.group("syntax")
codepoint = unicode_emoji_to_codepoint(orig_syntax) codepoint = emoji_to_hex_codepoint(unqualify_emoji(orig_syntax))
if codepoint in codepoint_to_name: if codepoint in codepoint_to_name:
display_string = ":" + codepoint_to_name[codepoint] + ":" display_string = ":" + codepoint_to_name[codepoint] + ":"
to_process.append( to_process.append(
@ -1342,54 +1345,27 @@ class Timestamp(markdown.inlinepatterns.Pattern):
return time_element return time_element
# All of our emojis (excluding ZWJ sequences) belong to one of these Unicode blocks: # From https://unicode.org/reports/tr51/#EBNF_and_Regex. Keep this synced with `possible_emoji_regex`.
# \U0001f100-\U0001f1ff - Enclosed Alphanumeric Supplement POSSIBLE_EMOJI_RE = regex.compile(
# \U0001f200-\U0001f2ff - Enclosed Ideographic Supplement r"""(?P<syntax>
# \U0001f300-\U0001f5ff - Miscellaneous Symbols and Pictographs \p{RI} \p{RI}
# \U0001f600-\U0001f64f - Emoticons (Emoji) | \p{Emoji}
# \U0001f680-\U0001f6ff - Transport and Map Symbols (?: \p{Emoji_Modifier}
# \U0001f7e0-\U0001f7eb - Coloured Geometric Shapes (NOTE: Not Unicode standard category name) | \uFE0F \u20E3?
# \U0001f900-\U0001f9ff - Supplemental Symbols and Pictographs | [\U000E0020-\U000E007E]+ \U000E007F
# \u2000-\u206f - General Punctuation )?
# \u2300-\u23ff - Miscellaneous Technical (?: \u200D
# \u2400-\u243f - Control Pictures (?: \p{RI} \p{RI}
# \u2440-\u245f - Optical Character Recognition | \p{Emoji}
# \u2460-\u24ff - Enclosed Alphanumerics (?: \p{Emoji_Modifier}
# \u2500-\u257f - Box Drawing | \uFE0F \u20E3?
# \u2580-\u259f - Block Elements | [\U000E0020-\U000E007E]+ \U000E007F
# \u25a0-\u25ff - Geometric Shapes )?
# \u2600-\u26ff - Miscellaneous Symbols )
# \u2700-\u27bf - Dingbats )*)
# \u2900-\u297f - Supplemental Arrows-B """,
# \u2b00-\u2bff - Miscellaneous Symbols and Arrows regex.VERBOSE,
# \u3000-\u303f - CJK Symbols and Punctuation
# \u3200-\u32ff - Enclosed CJK Letters and Months
UNICODE_EMOJI_RE = (
"(?P<syntax>["
"\U0001F100-\U0001F64F"
"\U0001F680-\U0001F6FF"
"\U0001F7E0-\U0001F7EB"
"\U0001F900-\U0001F9FF"
"\u2000-\u206F"
"\u2300-\u27BF"
"\u2900-\u297F"
"\u2B00-\u2BFF"
"\u3000-\u303F"
"\u3200-\u32FF"
"])"
) )
# The equivalent JS regex is \ud83c[\udd00-\udfff]|\ud83d[\udc00-\ude4f]|\ud83d[\ude80-\udeff]|
# \ud83e[\udd00-\uddff]|[\u2000-\u206f]|[\u2300-\u27bf]|[\u2b00-\u2bff]|[\u3000-\u303f]|
# [\u3200-\u32ff]. See below comments for explanation. The JS regex is used by marked.js for
# frontend Unicode emoji processing.
# The JS regex \ud83c[\udd00-\udfff]|\ud83d[\udc00-\ude4f] represents U0001f100-\U0001f64f
# The JS regex \ud83d[\ude80-\udeff] represents \U0001f680-\U0001f6ff
# The JS regex \ud83e[\udd00-\uddff] represents \U0001f900-\U0001f9ff
# The JS regex [\u2000-\u206f] represents \u2000-\u206f
# The JS regex [\u2300-\u27bf] represents \u2300-\u27bf
# Similarly other JS regexes can be mapped to the respective Unicode blocks.
# For more information, please refer to the following article:
# http://crocodillon.com/blog/parsing-emoji-unicode-in-javascript
def make_emoji(codepoint: str, display_string: str) -> Element: def make_emoji(codepoint: str, display_string: str) -> Element:
@ -1413,11 +1389,6 @@ def make_realm_emoji(src: str, display_string: str) -> Element:
return elt return elt
def unicode_emoji_to_codepoint(unicode_emoji: str) -> str:
# Unicode codepoints are minimum of length 4, padded with zeroes
return f"{ord(unicode_emoji):04x}"
class EmoticonTranslation(markdown.inlinepatterns.Pattern): class EmoticonTranslation(markdown.inlinepatterns.Pattern):
"""Translates emoticons like `:)` into emoji like `:smile:`.""" """Translates emoticons like `:)` into emoji like `:smile:`."""
@ -1436,15 +1407,28 @@ class EmoticonTranslation(markdown.inlinepatterns.Pattern):
return make_emoji(name_to_codepoint[name], translated) return make_emoji(name_to_codepoint[name], translated)
class UnicodeEmoji(markdown.inlinepatterns.Pattern): TEXT_PRESENTATION_RE = regex.compile(r"\P{Emoji_Presentation}\u20E3?")
def handleMatch(self, match: Match[str]) -> Optional[Element]:
class UnicodeEmoji(CompiledInlineProcessor):
def handleMatch( # type: ignore[override] # https://github.com/python/mypy/issues/10197
self, match: Match[str], data: str
) -> Union[Tuple[None, None, None], Tuple[Element, int, int]]:
orig_syntax = match.group("syntax") orig_syntax = match.group("syntax")
codepoint = unicode_emoji_to_codepoint(orig_syntax)
# We want to avoid turning things like arrows (↔) and keycaps (numbers
# in boxes) into qualified emoji.
# More specifically, we skip anything with text in the second column of
# this table https://unicode.org/Public/emoji/1.0/emoji-data.txt
if TEXT_PRESENTATION_RE.fullmatch(orig_syntax):
return None, None, None
codepoint = emoji_to_hex_codepoint(unqualify_emoji(orig_syntax))
if codepoint in codepoint_to_name: if codepoint in codepoint_to_name:
display_string = ":" + codepoint_to_name[codepoint] + ":" display_string = ":" + codepoint_to_name[codepoint] + ":"
return make_emoji(codepoint, display_string) return make_emoji(codepoint, display_string), match.start(), match.end()
else: else:
return None return None, None, None
class Emoji(markdown.inlinepatterns.Pattern): class Emoji(markdown.inlinepatterns.Pattern):
@ -2224,7 +2208,7 @@ class ZulipMarkdown(markdown.Markdown):
reg.register(Emoji(EMOJI_REGEX, self), "emoji", 15) reg.register(Emoji(EMOJI_REGEX, self), "emoji", 15)
reg.register(EmoticonTranslation(EMOTICON_RE, self), "translate_emoticons", 10) reg.register(EmoticonTranslation(EMOTICON_RE, self), "translate_emoticons", 10)
# We get priority 5 from 'nl2br' extension # We get priority 5 from 'nl2br' extension
reg.register(UnicodeEmoji(UNICODE_EMOJI_RE), "unicodeemoji", 0) reg.register(UnicodeEmoji(cast(Pattern[str], POSSIBLE_EMOJI_RE), self), "unicodeemoji", 0)
return reg return reg
def register_linkifiers(self, registry: markdown.util.Registry) -> markdown.util.Registry: def register_linkifiers(self, registry: markdown.util.Registry) -> markdown.util.Registry:

View File

@ -588,6 +588,41 @@
"input": ":poop:", "input": ":poop:",
"expected_output": "<p><span aria-label=\"poop\" class=\"emoji emoji-1f4a9\" role=\"img\" title=\"poop\">:poop:</span></p>" "expected_output": "<p><span aria-label=\"poop\" class=\"emoji emoji-1f4a9\" role=\"img\" title=\"poop\">:poop:</span></p>"
}, },
{
"name": "emoji_sequence_one",
"input": "🤷‍♀️",
"expected_output": "<p><span aria-label=\"woman shrugging\" class=\"emoji emoji-1f937-200d-2640\" role=\"img\" title=\"woman shrugging\">:woman_shrugging:</span></p>"
},
{
"name": "emoji_sequence_two",
"input": "👁‍🗨 #️⃣",
"expected_output": "<p><span aria-label=\"eye in speech bubble\" class=\"emoji emoji-1f441-200d-1f5e8\" role=\"img\" title=\"eye in speech bubble\">:eye_in_speech_bubble:</span> <span aria-label=\"hash\" class=\"emoji emoji-0023-20e3\" role=\"img\" title=\"hash\">:hash:</span></p>"
},
{
"name": "unrecognized_emoji_sequence_one",
"input": "phoenix bird from 15.1: 🐦‍🔥",
"expected_output": "<p>phoenix bird from 15.1: 🐦‍🔥</p>"
},
{
"name": "unrecognized_emoji_sequence_two",
"input": "lime from 15.1: 🍋‍🟩",
"expected_output": "<p>lime from 15.1: 🍋‍🟩</p>"
},
{
"name": "unrecognized_emoji_sequence_three",
"input": "normal lemon 🍋, purple lemon? 🍋‍🟪",
"expected_output": "<p>normal lemon <span aria-label=\"lemon\" class=\"emoji emoji-1f34b\" role=\"img\" title=\"lemon\">:lemon:</span>, purple lemon? 🍋‍🟪</p>"
},
{
"name": "unrecognized_emoji_sequence_four",
"input": "medium-skinned person riding scooter back and forth: 🧑🏽‍🛴‍↩️",
"expected_output": "<p>medium-skinned person riding scooter back and forth: 🧑🏽‍🛴‍↩️</p>"
},
{
"name": "unrecognized_emoji_sequence_skin_tone",
"input": "man in manual wheelchair: dark skin tone 👨🏿‍🦽",
"expected_output": "<p>man in manual wheelchair: dark skin tone 👨🏿‍🦽</p>"
},
{ {
"name": "emojis_without_space", "name": "emojis_without_space",
"input": ":cat:hello:dog::rabbit:", "input": ":cat:hello:dog::rabbit:",
@ -642,7 +677,7 @@
}, },
{ {
"name": "miscellaneous_symbols_and_arrows", "name": "miscellaneous_symbols_and_arrows",
"input": "Black upward arrow \u2b06", "input": "Black upward arrow \u2b06\ufe0f",
"expected_output":"<p>Black upward arrow <span aria-label=\"up\" class=\"emoji emoji-2b06\" role=\"img\" title=\"up\">:up:</span><\/p>" "expected_output":"<p>Black upward arrow <span aria-label=\"up\" class=\"emoji emoji-2b06\" role=\"img\" title=\"up\">:up:</span><\/p>"
}, },
{ {

View File

@ -22,9 +22,11 @@ from zerver.actions.users import change_user_is_active
from zerver.lib.alert_words import get_alert_word_automaton from zerver.lib.alert_words import get_alert_word_automaton
from zerver.lib.camo import get_camo_url from zerver.lib.camo import get_camo_url
from zerver.lib.create_user import create_user from zerver.lib.create_user import create_user
from zerver.lib.emoji import get_emoji_url from zerver.lib.emoji import codepoint_to_name, get_emoji_url
from zerver.lib.emoji_utils import hex_codepoint_to_emoji
from zerver.lib.exceptions import JsonableError, MarkdownRenderingError from zerver.lib.exceptions import JsonableError, MarkdownRenderingError
from zerver.lib.markdown import ( from zerver.lib.markdown import (
POSSIBLE_EMOJI_RE,
InlineInterestingLinkProcessor, InlineInterestingLinkProcessor,
MarkdownListPreprocessor, MarkdownListPreprocessor,
MessageRenderingResult, MessageRenderingResult,
@ -3156,3 +3158,17 @@ class MarkdownErrorTests(ZulipTestCase):
result = processor.run(markdown_input) result = processor.run(markdown_input)
self.assertEqual(result, expected) self.assertEqual(result, expected)
class MarkdownEmojiTest(ZulipTestCase):
    """Checks that POSSIBLE_EMOJI_RE covers the emoji in codepoint_to_name."""

    def test_all_emoji_match_regex(self) -> None:
        """Assert that only the unqualified keycaps fail to fully match.

        Every key of ``codepoint_to_name`` is converted to its emoji
        string and matched in full against ``POSSIBLE_EMOJI_RE``; the
        emoji that do not match are collected in iteration order and
        compared against the expected list of exceptions.
        """
        non_matching_emoji = [
            emoji
            for codepoint in codepoint_to_name
            if not POSSIBLE_EMOJI_RE.fullmatch(emoji := hex_codepoint_to_emoji(codepoint))
        ]
        self.assertEqual(
            non_matching_emoji,
            # Unqualified keycaps (a character directly followed by U+20E3,
            # with no U+FE0F in between) shouldn't be converted to emoji
            # images, so it is fine that the regex does not match them.
            ["#⃣", "*⃣", "0⃣", "1⃣", "2⃣", "3⃣", "4⃣", "5⃣", "6⃣", "7⃣", "8⃣", "9⃣"],
        )