emoji: Finish script to generate emoji_names.py with CLDR data.

This script pulls from our previously custom-written emoji strings and fills in the rest from CLDR. It also removes 4 custom emoji which collide with some of the new CLDR names (they will now just be called by their CLDR name).
2022-06-10 13:40:31 -07:00 · 2022-06-10 13:40:31 -07:00 · 8a9e68e026
parent 44df15e19b
commit 8a9e68e026
7 changed files with 1100 additions and 34 deletions
--- a/frontend_tests/node_tests/emoji_picker.js
+++ b/frontend_tests/node_tests/emoji_picker.js
@ -21,7 +21,7 @@ run_test("initialize", () => {
    const complete_emoji_catalog = _.sortBy(emoji_picker.complete_emoji_catalog, "name");
    assert.equal(complete_emoji_catalog.length, 11);
-    assert.equal(emoji.emojis_by_name.size, 1052);
+    assert.equal(emoji.emojis_by_name.size, 1817);
    let total_emoji_in_categories = 0;
@ -42,17 +42,17 @@ run_test("initialize", () => {
    }
    const popular_emoji_count = 6;
    const zulip_emoji_count = 1;
-    assert_emoji_category(complete_emoji_catalog.pop(), "fa-car", 170);
+    assert_emoji_category(complete_emoji_catalog.pop(), "fa-car", 195);
-    assert_emoji_category(complete_emoji_catalog.pop(), "fa-hashtag", 197);
+    assert_emoji_category(complete_emoji_catalog.pop(), "fa-hashtag", 221);
-    assert_emoji_category(complete_emoji_catalog.pop(), "fa-smile-o", 129);
+    assert_emoji_category(complete_emoji_catalog.pop(), "fa-smile-o", 162);
    assert_emoji_category(complete_emoji_catalog.pop(), "fa-star-o", popular_emoji_count);
-    assert_emoji_category(complete_emoji_catalog.pop(), "fa-thumbs-o-up", 102);
+    assert_emoji_category(complete_emoji_catalog.pop(), "fa-thumbs-o-up", 353);
-    assert_emoji_category(complete_emoji_catalog.pop(), "fa-lightbulb-o", 189);
+    assert_emoji_category(complete_emoji_catalog.pop(), "fa-lightbulb-o", 255);
-    assert_emoji_category(complete_emoji_catalog.pop(), "fa-cutlery", 92);
+    assert_emoji_category(complete_emoji_catalog.pop(), "fa-cutlery", 132);
-    assert_emoji_category(complete_emoji_catalog.pop(), "fa-flag", 5);
+    assert_emoji_category(complete_emoji_catalog.pop(), "fa-flag", 268);
    assert_emoji_category(complete_emoji_catalog.pop(), "fa-cog", 1);
-    assert_emoji_category(complete_emoji_catalog.pop(), "fa-leaf", 104);
+    assert_emoji_category(complete_emoji_catalog.pop(), "fa-leaf", 144);
-    assert_emoji_category(complete_emoji_catalog.pop(), "fa-soccer-ball-o", 63);
+    assert_emoji_category(complete_emoji_catalog.pop(), "fa-soccer-ball-o", 86);
    // The popular emoji appear twice in the picker, and the zulip emoji is special
    assert.equal(
--- a/package.json
+++ b/package.json
@ -98,6 +98,8 @@
    "@typescript-eslint/parser": "^5.0.0",
    "babel-plugin-rewire-ts": "^1.4.0",
    "callsites": "^3.1.0",
    "cldr-annotations-derived-modern": "^41.0.0",
    "cldr-annotations-modern": "^41.0.0",
    "diff": "^5.0.0",
    "difflib": "^0.2.4",
    "enhanced-resolve": "^5.8.2",
--- a/tools/setup/emoji/custom_emoji_names.py
+++ b/tools/setup/emoji/custom_emoji_names.py
@ -28,7 +28,6 @@ CUSTOM_EMOJI_NAME_MAPS: Dict[str, Dict[str, Any]] = {
    "1f60d": {"canonical_name": "heart_eyes", "aliases": ["in_love"]},
    # blow_a_kiss from https://beebom.com/emoji-meanings/
    "1f618": {"canonical_name": "heart_kiss", "aliases": ["blow_a_kiss"]},
    "1f617": {"canonical_name": "kiss", "aliases": []},
    "1f619": {"canonical_name": "kiss_smiling_eyes", "aliases": []},
    "1f61a": {"canonical_name": "kiss_with_blush", "aliases": []},
    "1f60b": {"canonical_name": "yum", "aliases": []},
@ -411,10 +410,6 @@ CUSTOM_EMOJI_NAME_MAPS: Dict[str, Dict[str, Any]] = {
    # spiral_shell from Unicode/gemoji, the others seemed like reasonable
    # additions
    "1f41a": {"canonical_name": "shell", "aliases": ["seashell", "conch", "spiral_shell"]},
    # Unicode/gemoji have lady_beetle; hopefully with ladybug we get both the
    # people that prefer lady_beetle (with beetle) and ladybug. There is also
    # ladybird, but seems a bit much for this to complete for bird.
    "1f41e": {"canonical_name": "beetle", "aliases": ["ladybug"]},
    "1f41c": {"canonical_name": "ant", "aliases": []},
    "1f577": {"canonical_name": "spider", "aliases": []},
    "1f578": {"canonical_name": "web", "aliases": ["spider_web"]},
@ -738,8 +733,6 @@ CUSTOM_EMOJI_NAME_MAPS: Dict[str, Dict[str, Any]] = {
    "1f945": {"canonical_name": "gooooooooal", "aliases": ["goal"]},
    "1f3d2": {"canonical_name": "ice_hockey", "aliases": []},
    "1f3d1": {"canonical_name": "field_hockey", "aliases": []},
    # would say bat, but taken by Nature/30
    "1f3cf": {"canonical_name": "cricket", "aliases": ["cricket_bat"]},
    # hole_in_one seems like a more useful name to have. Sent golf to
    # Activity/39
    "26f3": {"canonical_name": "hole_in_one", "aliases": []},
@ -1210,8 +1203,6 @@ CUSTOM_EMOJI_NAME_MAPS: Dict[str, Dict[str, Any]] = {
    "1f4ee": {"canonical_name": "mail_dropoff", "aliases": []},
    "1f4ef": {"canonical_name": "horn", "aliases": []},
    "1f4dc": {"canonical_name": "scroll", "aliases": []},
    # receipt seems more useful?
    "1f4c3": {"canonical_name": "receipt", "aliases": []},
    "1f4c4": {"canonical_name": "document", "aliases": ["paper", "file", "page"]},
    "1f4d1": {"canonical_name": "place_holder", "aliases": []},
    "1f4ca": {"canonical_name": "bar_chart", "aliases": []},
--- a/tools/setup/emoji/emoji_names.py
+++ b/tools/setup/emoji/emoji_names.py
--- a/tools/setup/emoji/generate_emoji_names
+++ b/tools/setup/emoji/generate_emoji_names
@ -1,7 +1,8 @@
 #!/usr/bin/env python3
 import json
 import os
 import re
 import sys
 from collections import defaultdict
 import orjson
@ -10,11 +11,26 @@ from emoji_setup_utils import get_emoji_code
 ZULIP_PATH = os.path.join(os.path.dirname(os.path.abspath(__file__)), "../../../")
 sys.path.append(ZULIP_PATH)
 CLDR_DATA_FILE = os.path.join(
    ZULIP_PATH, "node_modules", "cldr-annotations-modern", "annotations", "en", "annotations.json"
 )
 CLDR_DERIVED_DATA_FILE = os.path.join(
    ZULIP_PATH,
    "node_modules",
    "cldr-annotations-derived-modern",
    "annotationsDerived",
    "en",
    "annotations.json",
 )
 EMOJI_DATA_FILE = os.path.join(ZULIP_PATH, "node_modules", "emoji-datasource-google", "emoji.json")
 OUT_EMOJI_FILE = os.path.join(ZULIP_PATH, "tools", "setup", "emoji", "emoji_names.py")
 with open(EMOJI_DATA_FILE, "rb") as fp:
    EMOJI_DATA = orjson.loads(fp.read())
 with open(CLDR_DATA_FILE, "rb") as fp:
    CLDR_DATA = orjson.loads(fp.read())["annotations"]["annotations"]
 with open(CLDR_DERIVED_DATA_FILE, "rb") as fp:
    CLDR_DATA.update(orjson.loads(fp.read())["annotationsDerived"]["annotations"])
 # We don't include most clock emojis. See `custom_emoji_names` for more context.
 SKIPPED_CLOCK_EMOJI_CODES = [
@ -53,17 +69,62 @@ SKIN_TONE_EMOJI_CODES = [
 ]
 def cleanup_name(name: str) -> str:
    """Normalize a raw emoji name into its lowercase, underscore-separated form.

    Spaces, hyphens and en dashes become underscores; curly double quotes,
    colons and periods are dropped; an ampersand is spelled out as "and";
    curly single quotes become a plain apostrophe. Runs of two or more
    underscores are then collapsed to one and the result is lowercased.
    """
    # None of the substituted outputs contains a character that is itself
    # substituted, so one translate() pass gives the same result as applying
    # the replacements one after another.
    char_table = str.maketrans(
        {
            " ": "_",
            "-": "_",
            "–": "_",
            "“": "",
            "”": "",
            ":": "",
            ".": "",
            "&": "and",
            "‘": "'",
            "’": "'",
        }
    )
    translated = name.translate(char_table)
    collapsed = re.sub("_{2,}", "_", translated)
    return collapsed.lower()
 def convert_non_ascii_chars(name: str) -> str:
    """Return `name` with the known accented letters swapped for ASCII ones.

    Only the characters listed below are handled. If anything non-ASCII
    remains after the substitutions, the assertion fails with a message
    asking for the missing character to be added to this function.
    """
    accent_pairs = (
        ("è", "e"),
        ("ǐ", "i"),
        ("ó", "o"),
        ("ā", "a"),
        ("ō", "o"),
        ("ñ", "n"),
        ("ô", "o"),
        ("ç", "c"),
        ("é", "e"),
        ("ã", "a"),
        ("í", "i"),
        ("å", "a"),
    )
    ascii_name = name
    for accented, plain in accent_pairs:
        ascii_name = ascii_name.replace(accented, plain)
    # The message reports the partially converted name, so whatever is still
    # non-ASCII in it is exactly what the table above is missing.
    assert (
        ascii_name.isascii()
    ), f"{ascii_name} still contains non-ascii characters. Add them to convert_non_ascii_chars."
    return ascii_name
 def main() -> None:
    all_emojis = {}
    all_canonical_names = set()
    alias_to_emoji_code = defaultdict(list)
    # STEP 1: Generate first draft of all_emojis.
    for emoji_dict in EMOJI_DATA:
        emoji_code = get_emoji_code(emoji_dict)
        if emoji_code in SKIPPED_CLOCK_EMOJI_CODES or emoji_code in SKIN_TONE_EMOJI_CODES:
            continue
        if emoji_code in CUSTOM_EMOJI_NAME_MAPS:
-            canonical_name = CUSTOM_EMOJI_NAME_MAPS[emoji_code]["canonical_name"]
+            canonical_name = cleanup_name(CUSTOM_EMOJI_NAME_MAPS[emoji_code]["canonical_name"])
            if canonical_name in all_canonical_names:
                raise Exception(
                    f"{canonical_name} was already added with a different codepoint. "
@ -72,9 +133,70 @@ def main() -> None:
            all_canonical_names.add(canonical_name)
            all_emojis[emoji_code] = CUSTOM_EMOJI_NAME_MAPS[emoji_code]
        else:
            continue  # this commit doesn't add CLDR data yet.
            # create the unicode character(s) for the emoji, since this is the key into the CLDR data
            emoji = "".join(
                chr(int(h, 16))
                for h in (emoji_dict["non_qualified"] or emoji_dict["unified"]).split("-")
            )
            if emoji not in CLDR_DATA:
                print(
                    f"{emoji} not found in custom emoji name maps, but also not found in CLDR data. Skipping."
                )
                continue
            # CLDR_DATA[emoji] is of the form {'default': [...], 'tts': [...]}
            # * "tts" is what's used for text-to-speech and always has one item, so we use that
            #    as the canonical name.
            # * "default" has several items in it that we use as aliases.
            # See also: https://www.unicode.org/reports/tr35/tr35-general.html#14-annotations-and-labels
            assert len(CLDR_DATA[emoji]["tts"]) == 1
            canonical_name = cleanup_name(CLDR_DATA[emoji]["tts"][0].strip())
            if canonical_name in all_canonical_names:
                raise Exception(
                    f"{canonical_name} was already added with a different codepoint. "
                    f"Rename it in `custom_emoji_names` or add an entry for {emoji_code}."
                )
            aliases = [cleanup_name(alias.strip()) for alias in CLDR_DATA[emoji]["default"]]
            all_emojis[emoji_code] = {"canonical_name": canonical_name, "aliases": aliases}
            all_canonical_names.add(canonical_name)
    # STEP 2: We don't support having aliases that collide with canonical names for emoji, so remove them.
    for (emoji_code, emoji_names) in all_emojis.items():
        # Copy the list to not iterate while elements are being deleted.
        aliases = emoji_names["aliases"][:]
        for alias in aliases:
            if alias in all_canonical_names:
                emoji_names["aliases"].remove(alias)
            else:
                alias_to_emoji_code[alias].append(emoji_code)  # This is used in STEP 3.
    # STEP 3: We don't support multiple emoji sharing the same alias, but the CLDR data
    # doesn't have that same restriction, so we have to fix this up to have unique aliases.
    # If the alias was specifically specified in custom_emoji_names, then we can keep just
    # that one, but otherwise there's no particular emoji that is an obvious candidate
    # for the alias so just remove the alias for all relevant emoji.
    for alias in alias_to_emoji_code.keys():
        if len(alias_to_emoji_code[alias]) > 1:
            for emoji_code in alias_to_emoji_code[alias]:
                if emoji_code not in CUSTOM_EMOJI_NAME_MAPS:
                    all_emojis[emoji_code]["aliases"].remove(alias)
    # STEP 4: We keep non-ascii (non-"English") characters in some emoji names if that's the correct
    # way to spell that word, but always add an alias for an ascii-only version of the word.
    for (emoji_code, emoji_names) in all_emojis.items():
        for name in [emoji_names["canonical_name"]] + emoji_names["aliases"]:
            # These are known names where we don't have an ascii-only version and there are ascii aliases
            # that a user can still enter instead to get the same emoji.
            if name in ["ココ", "サ", "指", "空"]:
                assert any(alias.isascii() for alias in aliases)
                continue
            if not name.isascii():
                ascii_alias = convert_non_ascii_chars(name)
                # Now no other emoji can use this alias.
                for code in alias_to_emoji_code[ascii_alias]:
                    all_emojis[code]["aliases"].remove(ascii_alias)
                all_emojis[emoji_code]["aliases"].append(ascii_alias)
    # STEP 5: Write final dictionary to `emoji_names.py`.
    with open(OUT_EMOJI_FILE, "w") as f:
        f.write(
            "from typing import Any, Dict\n\n"
@ -82,7 +204,7 @@ def main() -> None:
            "EMOJI_NAME_MAPS: Dict[str, Dict[str, Any]] = {\n"
        )
        for (key, emoji_names) in all_emojis.items():
-            f.write(f'    "{key}": {json.dumps(emoji_names)},\n')
+            f.write(f"    {key!r}: {emoji_names!r},\n")
        f.write("}\n")
    print(
--- a/yarn.lock
+++ b/yarn.lock
@ -3073,6 +3073,16 @@ clamp@^1.0.1:
  resolved "https://registry.yarnpkg.com/clamp/-/clamp-1.0.1.tgz#66a0e64011816e37196828fdc8c8c147312c8634"
  integrity sha512-kgMuFyE78OC6Dyu3Dy7vcx4uy97EIbVxJB/B0eJ3bUNAkwdNcxYzgKltnyADiYwsR7SEqkkUPsEUT//OVS6XMA==
 cldr-annotations-derived-modern@^41.0.0:
  version "41.0.0"
  resolved "https://registry.yarnpkg.com/cldr-annotations-derived-modern/-/cldr-annotations-derived-modern-41.0.0.tgz#da55423006c5b9dd742d9395a2318b05d6a6efa2"
  integrity sha512-V9N8CW+DDem3NZlpGKPdvTkoqZV/rNbZq27UZO1d0JXhM+pZo6e8n3+GSqSDgIAXphxn1v9Yd9U+X9NzZ+PCcA==
 cldr-annotations-modern@^41.0.0:
  version "41.0.0"
  resolved "https://registry.yarnpkg.com/cldr-annotations-modern/-/cldr-annotations-modern-41.0.0.tgz#5eeaef3b250e30e0dd7e8babc28a3a70c71a7192"
  integrity sha512-ymcfrliWq6IFB9vnDuT9awT7z9bTkh03g39eeU5RIYK0hFuZAPsYdpgs461GEuUKCL7SmDXXyZJwmf9iGYsCdg==
 clean-css@^5.1.0, clean-css@^5.2.2:
  version "5.3.1"
  resolved "https://registry.yarnpkg.com/clean-css/-/clean-css-5.3.1.tgz#d0610b0b90d125196a2894d35366f734e5d7aa32"
--- a/zerver/tests/test_rocketchat_importer.py
+++ b/zerver/tests/test_rocketchat_importer.py
@ -758,14 +758,15 @@ class RocketChatImporter(ZulipTestCase):
            zerver_realmemoji=zerver_realmemoji,
        )
-        # :grin: and :star_struck: are not present in Zulip's default
+        # :grin: is not present in Zulip's default emoji set,
-        # emoji set, or in Reaction.UNICODE_EMOJI reaction type.
+        # or in Reaction.UNICODE_EMOJI reaction type.
-        self.assert_length(total_reactions, 7)
+        self.assert_length(total_reactions, 8)
        grinning_emoji_code = name_to_codepoint["grinning"]
        innocent_emoji_code = name_to_codepoint["innocent"]
        heart_emoji_code = name_to_codepoint["heart"]
        rocket_emoji_code = name_to_codepoint["rocket"]
        star_struck_emoji_code = name_to_codepoint["star_struck"]
        realmemoji_code = {}
        for emoji in zerver_realmemoji:
@ -777,7 +778,16 @@ class RocketChatImporter(ZulipTestCase):
        )
        self.assertEqual(
            self.get_set(total_reactions, "emoji_name"),
-            {"grinning", "innocent", "heart", "rocket", "check", "zulip", "harry-ron"},
+            {
                "grinning",
                "innocent",
                "star_struck",
                "heart",
                "rocket",
                "check",
                "zulip",
                "harry-ron",
            },
        )
        self.assertEqual(
            self.get_set(total_reactions, "emoji_code"),
@ -786,13 +796,14 @@ class RocketChatImporter(ZulipTestCase):
                innocent_emoji_code,
                heart_emoji_code,
                rocket_emoji_code,
                star_struck_emoji_code,
                realmemoji_code["check"],
                realmemoji_code["zulip"],
                realmemoji_code["harry-ron"],
            },
        )
        self.assertEqual(self.get_set(total_reactions, "user_profile"), {2, 3, 4})
-        self.assert_length(self.get_set(total_reactions, "id"), 7)
+        self.assert_length(self.get_set(total_reactions, "id"), 8)
        self.assert_length(self.get_set(total_reactions, "message"), 1)
    def test_process_message_attachment(self) -> None: