emoji: Finish script to generate emoji_names.py with CLDR data.

This script starts from our previously custom-written emoji names and fills in the rest from CLDR. It also removes the custom name entries for 4 emoji whose custom names collide with some of the new CLDR names (those emoji will now just be called by their CLDR names).
2022-06-10 13:40:31 -07:00 · 2022-06-10 13:40:31 -07:00 · 8a9e68e026
parent 44df15e19b
commit 8a9e68e026
7 changed files with 1100 additions and 34 deletions
--- a/frontend_tests/node_tests/emoji_picker.js
+++ b/frontend_tests/node_tests/emoji_picker.js
@ -21,7 +21,7 @@ run_test("initialize", () => {

    const complete_emoji_catalog = _.sortBy(emoji_picker.complete_emoji_catalog, "name");
    assert.equal(complete_emoji_catalog.length, 11);
-    assert.equal(emoji.emojis_by_name.size, 1052);
+    assert.equal(emoji.emojis_by_name.size, 1817);

    let total_emoji_in_categories = 0;

@ -42,17 +42,17 @@ run_test("initialize", () => {
    }
    const popular_emoji_count = 6;
    const zulip_emoji_count = 1;
-    assert_emoji_category(complete_emoji_catalog.pop(), "fa-car", 170);
-    assert_emoji_category(complete_emoji_catalog.pop(), "fa-hashtag", 197);
-    assert_emoji_category(complete_emoji_catalog.pop(), "fa-smile-o", 129);
+    assert_emoji_category(complete_emoji_catalog.pop(), "fa-car", 195);
+    assert_emoji_category(complete_emoji_catalog.pop(), "fa-hashtag", 221);
+    assert_emoji_category(complete_emoji_catalog.pop(), "fa-smile-o", 162);
    assert_emoji_category(complete_emoji_catalog.pop(), "fa-star-o", popular_emoji_count);
-    assert_emoji_category(complete_emoji_catalog.pop(), "fa-thumbs-o-up", 102);
-    assert_emoji_category(complete_emoji_catalog.pop(), "fa-lightbulb-o", 189);
-    assert_emoji_category(complete_emoji_catalog.pop(), "fa-cutlery", 92);
-    assert_emoji_category(complete_emoji_catalog.pop(), "fa-flag", 5);
+    assert_emoji_category(complete_emoji_catalog.pop(), "fa-thumbs-o-up", 353);
+    assert_emoji_category(complete_emoji_catalog.pop(), "fa-lightbulb-o", 255);
+    assert_emoji_category(complete_emoji_catalog.pop(), "fa-cutlery", 132);
+    assert_emoji_category(complete_emoji_catalog.pop(), "fa-flag", 268);
    assert_emoji_category(complete_emoji_catalog.pop(), "fa-cog", 1);
-    assert_emoji_category(complete_emoji_catalog.pop(), "fa-leaf", 104);
-    assert_emoji_category(complete_emoji_catalog.pop(), "fa-soccer-ball-o", 63);
+    assert_emoji_category(complete_emoji_catalog.pop(), "fa-leaf", 144);
+    assert_emoji_category(complete_emoji_catalog.pop(), "fa-soccer-ball-o", 86);

    // The popular emoji appear twice in the picker, and the zulip emoji is special
    assert.equal(
--- a/package.json
+++ b/package.json
@ -98,6 +98,8 @@
    "@typescript-eslint/parser": "^5.0.0",
    "babel-plugin-rewire-ts": "^1.4.0",
    "callsites": "^3.1.0",
+    "cldr-annotations-derived-modern": "^41.0.0",
+    "cldr-annotations-modern": "^41.0.0",
    "diff": "^5.0.0",
    "difflib": "^0.2.4",
    "enhanced-resolve": "^5.8.2",
--- a/tools/setup/emoji/custom_emoji_names.py
+++ b/tools/setup/emoji/custom_emoji_names.py
@ -28,7 +28,6 @@ CUSTOM_EMOJI_NAME_MAPS: Dict[str, Dict[str, Any]] = {
    "1f60d": {"canonical_name": "heart_eyes", "aliases": ["in_love"]},
    # blow_a_kiss from https://beebom.com/emoji-meanings/
    "1f618": {"canonical_name": "heart_kiss", "aliases": ["blow_a_kiss"]},
-    "1f617": {"canonical_name": "kiss", "aliases": []},
    "1f619": {"canonical_name": "kiss_smiling_eyes", "aliases": []},
    "1f61a": {"canonical_name": "kiss_with_blush", "aliases": []},
    "1f60b": {"canonical_name": "yum", "aliases": []},
@ -411,10 +410,6 @@ CUSTOM_EMOJI_NAME_MAPS: Dict[str, Dict[str, Any]] = {
    # spiral_shell from Unicode/gemoji, the others seemed like reasonable
    # additions
    "1f41a": {"canonical_name": "shell", "aliases": ["seashell", "conch", "spiral_shell"]},
-    # Unicode/gemoji have lady_beetle; hopefully with ladybug we get both the
-    # people that prefer lady_beetle (with beetle) and ladybug. There is also
-    # ladybird, but seems a bit much for this to complete for bird.
-    "1f41e": {"canonical_name": "beetle", "aliases": ["ladybug"]},
    "1f41c": {"canonical_name": "ant", "aliases": []},
    "1f577": {"canonical_name": "spider", "aliases": []},
    "1f578": {"canonical_name": "web", "aliases": ["spider_web"]},
@ -738,8 +733,6 @@ CUSTOM_EMOJI_NAME_MAPS: Dict[str, Dict[str, Any]] = {
    "1f945": {"canonical_name": "gooooooooal", "aliases": ["goal"]},
    "1f3d2": {"canonical_name": "ice_hockey", "aliases": []},
    "1f3d1": {"canonical_name": "field_hockey", "aliases": []},
-    # would say bat, but taken by Nature/30
-    "1f3cf": {"canonical_name": "cricket", "aliases": ["cricket_bat"]},
    # hole_in_one seems like a more useful name to have. Sent golf to
    # Activity/39
    "26f3": {"canonical_name": "hole_in_one", "aliases": []},
@ -1210,8 +1203,6 @@ CUSTOM_EMOJI_NAME_MAPS: Dict[str, Dict[str, Any]] = {
    "1f4ee": {"canonical_name": "mail_dropoff", "aliases": []},
    "1f4ef": {"canonical_name": "horn", "aliases": []},
    "1f4dc": {"canonical_name": "scroll", "aliases": []},
-    # receipt seems more useful?
-    "1f4c3": {"canonical_name": "receipt", "aliases": []},
    "1f4c4": {"canonical_name": "document", "aliases": ["paper", "file", "page"]},
    "1f4d1": {"canonical_name": "place_holder", "aliases": []},
    "1f4ca": {"canonical_name": "bar_chart", "aliases": []},
--- a/tools/setup/emoji/emoji_names.py
+++ b/tools/setup/emoji/emoji_names.py
--- a/tools/setup/emoji/generate_emoji_names
+++ b/tools/setup/emoji/generate_emoji_names
@ -1,7 +1,8 @@
 #!/usr/bin/env python3
-import json
 import os
+import re
 import sys
+from collections import defaultdict

 import orjson

@ -10,11 +11,26 @@ from emoji_setup_utils import get_emoji_code

 ZULIP_PATH = os.path.join(os.path.dirname(os.path.abspath(__file__)), "../../../")
 sys.path.append(ZULIP_PATH)
+CLDR_DATA_FILE = os.path.join(
+    ZULIP_PATH, "node_modules", "cldr-annotations-modern", "annotations", "en", "annotations.json"
+)
+CLDR_DERIVED_DATA_FILE = os.path.join(
+    ZULIP_PATH,
+    "node_modules",
+    "cldr-annotations-derived-modern",
+    "annotationsDerived",
+    "en",
+    "annotations.json",
+)
 EMOJI_DATA_FILE = os.path.join(ZULIP_PATH, "node_modules", "emoji-datasource-google", "emoji.json")
 OUT_EMOJI_FILE = os.path.join(ZULIP_PATH, "tools", "setup", "emoji", "emoji_names.py")

 with open(EMOJI_DATA_FILE, "rb") as fp:
    EMOJI_DATA = orjson.loads(fp.read())
+# Load the base CLDR annotations, then overlay the derived annotations on top.
+# Assuming orjson returns plain dicts here, `update` lets a derived entry replace
+# a base entry whenever both files contain the same key.
+with open(CLDR_DATA_FILE, "rb") as fp:
+    CLDR_DATA = orjson.loads(fp.read())["annotations"]["annotations"]
+with open(CLDR_DERIVED_DATA_FILE, "rb") as fp:
+    CLDR_DATA.update(orjson.loads(fp.read())["annotationsDerived"]["annotations"])

 # We don't include most clock emojis. See `custom_emoji_names` for more context.
 SKIPPED_CLOCK_EMOJI_CODES = [
@ -53,17 +69,62 @@ SKIN_TONE_EMOJI_CODES = [
 ]


+def cleanup_name(name: str) -> str:
+    """Normalize an emoji name into a lowercase, underscore-separated form.
+
+    Spaces, hyphens and en dashes become underscores; curly double quotes,
+    colons and periods are dropped; "&" becomes "and"; curly single quotes
+    become a straight apostrophe. Runs of two or more underscores are then
+    collapsed into one and the result is lowercased.
+
+    Returns the cleaned-up name. Characters not listed in the table (including
+    non-ASCII letters) are passed through unchanged apart from lowercasing.
+    """
+    # Replacements are applied one after another in the order written here.
+    replacements = {
+        " ": "_",
+        "-": "_",
+        "–": "_",
+        "“": "",
+        "”": "",
+        ":": "",
+        ".": "",
+        "&": "and",
+        "‘": "'",
+        "’": "'",
+    }
+    for before, after in replacements.items():
+        name = name.replace(before, after)
+    # Adjacent separators (e.g. " - ") each became "_"; squash them into a single one.
+    name = re.sub("_{2,}", "_", name)
+    return name.lower()
+
+
+def convert_non_ascii_chars(name: str) -> str:
+    """Return `name` with the known accented letters replaced by ASCII letters.
+
+    Only the characters listed in the table below are converted. If anything
+    non-ASCII is left afterwards, an AssertionError is raised whose message asks
+    the maintainer to extend the table.
+    """
+    replacements = {
+        "è": "e",
+        "ǐ": "i",
+        "ó": "o",
+        "ā": "a",
+        "ō": "o",
+        "ñ": "n",
+        "ô": "o",
+        "ç": "c",
+        "é": "e",
+        "ã": "a",
+        "í": "i",
+        "å": "a",
+    }
+    for before, after in replacements.items():
+        name = name.replace(before, after)
+    # Fail loudly rather than emit a supposedly ASCII alias that is not ASCII.
+    assert (
+        name.isascii()
+    ), f"{name} still contains non-ascii characters. Add them to convert_non_ascii_chars."
+    return name
+
+
 def main() -> None:
    all_emojis = {}
    all_canonical_names = set()

+    alias_to_emoji_code = defaultdict(list)
+
+    # STEP 1: Generate first draft of all_emojis.
    for emoji_dict in EMOJI_DATA:
        emoji_code = get_emoji_code(emoji_dict)
        if emoji_code in SKIPPED_CLOCK_EMOJI_CODES or emoji_code in SKIN_TONE_EMOJI_CODES:
            continue

        if emoji_code in CUSTOM_EMOJI_NAME_MAPS:
-            canonical_name = CUSTOM_EMOJI_NAME_MAPS[emoji_code]["canonical_name"]
+            canonical_name = cleanup_name(CUSTOM_EMOJI_NAME_MAPS[emoji_code]["canonical_name"])
            if canonical_name in all_canonical_names:
                raise Exception(
                    f"{canonical_name} was already added with a different codepoint. "
@ -72,9 +133,70 @@ def main() -> None:
            all_canonical_names.add(canonical_name)
            all_emojis[emoji_code] = CUSTOM_EMOJI_NAME_MAPS[emoji_code]
        else:
-            continue  # this commit doesn't add CLDR data yet.
            # create the unicode character(s) for the emoji, since this is the key into the CLDR data
+            emoji = "".join(
+                chr(int(h, 16))
+                for h in (emoji_dict["non_qualified"] or emoji_dict["unified"]).split("-")
+            )
+            if emoji not in CLDR_DATA:
+                print(
+                    f"{emoji} not found in custom emoji name maps, but also not found in CLDR data. Skipping."
+                )
+                continue
+            # CLDR_DATA[emoji] is of the form {'default': [...], 'tts': [...]}
+            # * "tts" is what's used for text-to-speech and always has one item, so we use that
+            #    as the canonical name.
+            # * "default" has several items in it that we use as aliases.
+            # See also: https://www.unicode.org/reports/tr35/tr35-general.html#14-annotations-and-labels
+            assert len(CLDR_DATA[emoji]["tts"]) == 1
+            canonical_name = cleanup_name(CLDR_DATA[emoji]["tts"][0].strip())
+            if canonical_name in all_canonical_names:
+                raise Exception(
+                    f"{canonical_name} was already added with a different codepoint. "
+                    f"Rename it in `custom_emoji_names` or add an entry for {emoji_code}."
+                )
+            aliases = [cleanup_name(alias.strip()) for alias in CLDR_DATA[emoji]["default"]]
+            all_emojis[emoji_code] = {"canonical_name": canonical_name, "aliases": aliases}
+            all_canonical_names.add(canonical_name)

+    # STEP 2: We don't support having aliases that collide with canonical names for emoji, so remove them.
+    for (emoji_code, emoji_names) in all_emojis.items():
+        # Copy the list to not iterate while elements are being deleted.
+        aliases = emoji_names["aliases"][:]
+        for alias in aliases:
+            if alias in all_canonical_names:
+                emoji_names["aliases"].remove(alias)
+            else:
+                alias_to_emoji_code[alias].append(emoji_code)  # This is used in STEP 3.
+
+    # STEP 3: We don't support multiple emoji sharing the same alias, but the CLDR data
+    # doesn't have that same restriction, so we have to fix this up to have unique aliases.
+    # If the alias was specifically specified in custom_emoji_names, then we can keep just
+    # that one, but otherwise there's no particular emoji that is an obvious candidate
+    # for the alias so just remove the alias for all relevant emoji.
+    for alias in alias_to_emoji_code.keys():
+        if len(alias_to_emoji_code[alias]) > 1:
+            for emoji_code in alias_to_emoji_code[alias]:
+                if emoji_code not in CUSTOM_EMOJI_NAME_MAPS:
+                    all_emojis[emoji_code]["aliases"].remove(alias)
+
+    # STEP 4: We keep non-ascii (non-"English") characters in some emoji names if that's the correct
+    # way to spell that word, but always add an alias for an ascii-only version of the word.
+    for (emoji_code, emoji_names) in all_emojis.items():
+        # The concatenation builds a new list, so appending to emoji_names["aliases"]
+        # below does not change what this inner loop iterates over.
+        for name in [emoji_names["canonical_name"]] + emoji_names["aliases"]:
+            # These are known names where we don't have an ascii-only version and there are ascii aliases
+            # that a user can still enter instead to get the same emoji.
+            if name in ["ココ", "サ", "指", "空"]:
+                # Check this emoji's own aliases; the bare `aliases` variable is a
+                # leftover from the earlier steps and refers to a different emoji.
+                assert any(alias.isascii() for alias in emoji_names["aliases"])
+                continue
+            if not name.isascii():
+                ascii_alias = convert_non_ascii_chars(name)
+                # Now no other emoji can use this alias.
+                # `.get` avoids inserting empty entries into the defaultdict, and the
+                # membership check is needed because STEP 3 may already have removed
+                # the alias from some of these emoji without updating alias_to_emoji_code.
+                for code in alias_to_emoji_code.get(ascii_alias, []):
+                    if ascii_alias in all_emojis[code]["aliases"]:
+                        all_emojis[code]["aliases"].remove(ascii_alias)
+                all_emojis[emoji_code]["aliases"].append(ascii_alias)
+
+    # STEP 5: Write final dictionary to `emoji_names.py`.
    with open(OUT_EMOJI_FILE, "w") as f:
        f.write(
            "from typing import Any, Dict\n\n"
@ -82,7 +204,7 @@ def main() -> None:
            "EMOJI_NAME_MAPS: Dict[str, Dict[str, Any]] = {\n"
        )
        for (key, emoji_names) in all_emojis.items():
-            f.write(f'    "{key}": {json.dumps(emoji_names)},\n')
+            f.write(f"    {key!r}: {emoji_names!r},\n")
        f.write("}\n")

    print(
--- a/yarn.lock
+++ b/yarn.lock
@ -3073,6 +3073,16 @@ clamp@^1.0.1:
  resolved "https://registry.yarnpkg.com/clamp/-/clamp-1.0.1.tgz#66a0e64011816e37196828fdc8c8c147312c8634"
  integrity sha512-kgMuFyE78OC6Dyu3Dy7vcx4uy97EIbVxJB/B0eJ3bUNAkwdNcxYzgKltnyADiYwsR7SEqkkUPsEUT//OVS6XMA==

+cldr-annotations-derived-modern@^41.0.0:
+  version "41.0.0"
+  resolved "https://registry.yarnpkg.com/cldr-annotations-derived-modern/-/cldr-annotations-derived-modern-41.0.0.tgz#da55423006c5b9dd742d9395a2318b05d6a6efa2"
+  integrity sha512-V9N8CW+DDem3NZlpGKPdvTkoqZV/rNbZq27UZO1d0JXhM+pZo6e8n3+GSqSDgIAXphxn1v9Yd9U+X9NzZ+PCcA==
+
+cldr-annotations-modern@^41.0.0:
+  version "41.0.0"
+  resolved "https://registry.yarnpkg.com/cldr-annotations-modern/-/cldr-annotations-modern-41.0.0.tgz#5eeaef3b250e30e0dd7e8babc28a3a70c71a7192"
+  integrity sha512-ymcfrliWq6IFB9vnDuT9awT7z9bTkh03g39eeU5RIYK0hFuZAPsYdpgs461GEuUKCL7SmDXXyZJwmf9iGYsCdg==
+
 clean-css@^5.1.0, clean-css@^5.2.2:
  version "5.3.1"
  resolved "https://registry.yarnpkg.com/clean-css/-/clean-css-5.3.1.tgz#d0610b0b90d125196a2894d35366f734e5d7aa32"
--- a/zerver/tests/test_rocketchat_importer.py
+++ b/zerver/tests/test_rocketchat_importer.py
@ -758,14 +758,15 @@ class RocketChatImporter(ZulipTestCase):
            zerver_realmemoji=zerver_realmemoji,
        )

-        # :grin: and :star_struck: are not present in Zulip's default
-        # emoji set, or in Reaction.UNICODE_EMOJI reaction type.
-        self.assert_length(total_reactions, 7)
+        # :grin: is not present in Zulip's default emoji set,
+        # or in Reaction.UNICODE_EMOJI reaction type.
+        self.assert_length(total_reactions, 8)

        grinning_emoji_code = name_to_codepoint["grinning"]
        innocent_emoji_code = name_to_codepoint["innocent"]
        heart_emoji_code = name_to_codepoint["heart"]
        rocket_emoji_code = name_to_codepoint["rocket"]
+        star_struck_emoji_code = name_to_codepoint["star_struck"]

        realmemoji_code = {}
        for emoji in zerver_realmemoji:
@ -777,7 +778,16 @@ class RocketChatImporter(ZulipTestCase):
        )
        self.assertEqual(
            self.get_set(total_reactions, "emoji_name"),
-            {"grinning", "innocent", "heart", "rocket", "check", "zulip", "harry-ron"},
+            {
+                "grinning",
+                "innocent",
+                "star_struck",
+                "heart",
+                "rocket",
+                "check",
+                "zulip",
+                "harry-ron",
+            },
        )
        self.assertEqual(
            self.get_set(total_reactions, "emoji_code"),
@ -786,13 +796,14 @@ class RocketChatImporter(ZulipTestCase):
                innocent_emoji_code,
                heart_emoji_code,
                rocket_emoji_code,
+                star_struck_emoji_code,
                realmemoji_code["check"],
                realmemoji_code["zulip"],
                realmemoji_code["harry-ron"],
            },
        )
        self.assertEqual(self.get_set(total_reactions, "user_profile"), {2, 3, 4})
-        self.assert_length(self.get_set(total_reactions, "id"), 7)
+        self.assert_length(self.get_set(total_reactions, "id"), 8)
        self.assert_length(self.get_set(total_reactions, "message"), 1)

    def test_process_message_attachment(self) -> None: