zulip/tools/setup/emoji/generate_emoji_names

#!/usr/bin/env python3
import os
import re
import sys
from collections import defaultdict

import orjson

# Three directories above the directory that contains this script.
# It must be appended to sys.path *before* the `tools.*` / `zerver.*` imports
# that follow, presumably so they resolve when the script is run directly.
ZULIP_PATH = os.path.join(os.path.dirname(os.path.abspath(__file__)), "../../../")
sys.path.append(ZULIP_PATH)

from tools.setup.emoji.custom_emoji_names import CUSTOM_EMOJI_NAME_MAPS
from tools.setup.emoji.emoji_setup_utils import get_emoji_code
from zerver.lib.emoji_utils import hex_codepoint_to_emoji, unqualify_emoji

# Locations of the input data (JSON files under node_modules) and of the
# Python module this script generates.
_NODE_MODULES = os.path.join(ZULIP_PATH, "node_modules")

# Base CLDR English annotations.
CLDR_DATA_FILE = os.path.join(
    _NODE_MODULES, "cldr-annotations-modern", "annotations", "en", "annotations.json"
)
# Derived CLDR English annotations, merged into the base set when loaded.
CLDR_DERIVED_DATA_FILE = os.path.join(
    _NODE_MODULES, "cldr-annotations-derived-modern", "annotationsDerived", "en", "annotations.json"
)
# Emoji list that drives the generation loop.
EMOJI_DATA_FILE = os.path.join(_NODE_MODULES, "emoji-datasource-google", "emoji.json")
# Generated output.
OUT_EMOJI_FILE = os.path.join(ZULIP_PATH, "tools", "setup", "emoji", "emoji_names.py")

# Emoji metadata: a list with one dict per emoji.
with open(EMOJI_DATA_FILE, "rb") as emoji_file:
    EMOJI_DATA = orjson.loads(emoji_file.read())

# CLDR annotations keyed by the emoji character(s). The derived annotations are
# merged into the base ones; on a key collision the derived entry wins.
with open(CLDR_DATA_FILE, "rb") as cldr_file:
    CLDR_DATA = orjson.loads(cldr_file.read())["annotations"]["annotations"]
with open(CLDR_DERIVED_DATA_FILE, "rb") as cldr_derived_file:
    derived_annotations = orjson.loads(cldr_derived_file.read())["annotationsDerived"][
        "annotations"
    ]
CLDR_DATA.update(derived_annotations)

# We don't include most clock emojis. See `custom_emoji_names` for more context.
# The skipped codes are every codepoint in the contiguous run 1f550..1f567
# except 1f557, which is the one clock face in that run that is not skipped.
SKIPPED_CLOCK_EMOJI_CODES = [
    f"{codepoint:x}" for codepoint in range(0x1F550, 0x1F568) if codepoint != 0x1F557
]

# We don't include the skin tones as emojis that one can search for on their own.
# The five modifier codepoints form the contiguous run 1f3fb..1f3ff.
SKIN_TONE_EMOJI_CODES = [f"{codepoint:x}" for codepoint in range(0x1F3FB, 0x1F400)]


def cleanup_name(name: str) -> str:
    """Turn a human-readable emoji name into a lowercase, underscore-separated name.

    Spaces, hyphens and en dashes become underscores; ``&`` becomes ``and``;
    straight and curly quotes and the punctuation ``: . , ( )`` are dropped.
    Runs of two or more underscores are then collapsed to one and the result
    is lowercased.
    """
    to_underscore = " -–"
    to_drop = "“”:.,()‘’'"
    # Every key is a single character and no replacement text contains a key,
    # so one translate() pass gives the same result as replacing them in turn.
    table = str.maketrans(
        {
            **dict.fromkeys(to_underscore, "_"),
            **dict.fromkeys(to_drop, ""),
            "&": "and",
        }
    )
    translated = name.translate(table)
    collapsed = re.sub(r"_{2,}", "_", translated)
    return collapsed.lower()


def convert_non_ascii_chars(name: str) -> str:
    """Return `name` with the known accented letters replaced by plain ASCII letters.

    Only the characters listed in the table below are handled; the table is
    meant to be extended by hand whenever a new one shows up.

    Raises:
        AssertionError: if the result still contains a non-ASCII character.
    """
    replacements = {
        "è": "e",
        "ǐ": "i",
        "ó": "o",
        "ā": "a",
        "ō": "o",
        "ñ": "n",
        "ô": "o",
        "ç": "c",
        "é": "e",
        "ã": "a",
        "í": "i",
        "å": "a",
        "ü": "u",
    }
    for before, after in replacements.items():
        name = name.replace(before, after)
    if not name.isascii():
        # Raised explicitly instead of via `assert`: assert statements are
        # stripped under `python -O`, which would let unconverted names through.
        raise AssertionError(
            f"{name} still contains non-ascii characters. Add them to convert_non_ascii_chars."
        )
    return name


def main() -> None:
    """Build the emoji name table and write it to `emoji_names.py`.

    For every emoji in EMOJI_DATA (except the skipped clock faces and the bare
    skin-tone modifiers), the canonical name and aliases come from
    CUSTOM_EMOJI_NAME_MAPS when the emoji has an entry there, and from the CLDR
    annotations otherwise. Aliases that collide with a canonical name are
    removed, aliases shared by several emoji are removed from the emoji without
    a custom entry, and an ASCII-only alias is added for names that contain
    non-ASCII letters. The result is written to OUT_EMOJI_FILE as a Python
    dict literal.

    Raises:
        Exception: if two emoji end up with the same canonical name.
        AssertionError: if a CLDR entry does not have exactly one "tts" name,
            if one of the known non-ASCII names has no ASCII alias, or if
            convert_non_ascii_chars cannot fully convert a name.
    """
    all_emojis = {}
    all_canonical_names = set()

    # alias -> codes of the emoji that still list that alias after STEP 2.
    alias_to_emoji_code = defaultdict(list)

    # STEP 1: Generate first draft of all_emojis.
    for emoji_dict in EMOJI_DATA:
        emoji_code = get_emoji_code(emoji_dict)
        if emoji_code in SKIPPED_CLOCK_EMOJI_CODES or emoji_code in SKIN_TONE_EMOJI_CODES:
            continue

        if emoji_code in CUSTOM_EMOJI_NAME_MAPS:
            # The cleaned-up name is only used for the uniqueness check; the entry
            # itself is stored exactly as written in `custom_emoji_names`.
            canonical_name = cleanup_name(CUSTOM_EMOJI_NAME_MAPS[emoji_code]["canonical_name"])
            if canonical_name in all_canonical_names:
                raise Exception(
                    f"{canonical_name} was already added with a different codepoint. "
                    f"Rename it in `custom_emoji_names` or add an entry for {emoji_code}."
                )
            all_canonical_names.add(canonical_name)
            # This stores the custom entry itself, not a copy, so the alias edits in
            # the later steps also modify CUSTOM_EMOJI_NAME_MAPS in memory.
            all_emojis[emoji_code] = CUSTOM_EMOJI_NAME_MAPS[emoji_code]
        else:
            # create the unicode character(s) for the emoji, since this is the key into the CLDR data
            # We can't just use emoji_dict["non_qualified"] because of this upstream bug:
            # https://github.com/iamcal/emoji-data/pull/217
            emoji = unqualify_emoji(hex_codepoint_to_emoji(emoji_dict["unified"]))
            if emoji not in CLDR_DATA:
                print(
                    f"{emoji} not found in custom emoji name maps, but also not found in CLDR data. Skipping."
                )
                continue
            # CLDR_DATA[emoji] is of the form {'default': [...], 'tts': [...]}
            # * "tts" is what's used for text-to-speech and always has one item, so we use that
            #    as the canonical name.
            # * "default" has several items in it that we use as aliases.
            # See also: https://www.unicode.org/reports/tr35/tr35-general.html#14-annotations-and-labels
            assert len(CLDR_DATA[emoji]["tts"]) == 1
            canonical_name = cleanup_name(CLDR_DATA[emoji]["tts"][0].strip())
            if canonical_name in all_canonical_names:
                raise Exception(
                    f"{canonical_name} was already added with a different codepoint. "
                    f"Rename it in `custom_emoji_names` or add an entry for {emoji_code}."
                )
            aliases = [cleanup_name(alias.strip()) for alias in CLDR_DATA[emoji]["default"]]
            all_emojis[emoji_code] = {"canonical_name": canonical_name, "aliases": aliases}
            all_canonical_names.add(canonical_name)

    # STEP 2: We don't support having aliases that collide with canonical names for emoji, so remove them.
    for emoji_code, emoji_names in all_emojis.items():
        # Copy the list to not iterate while elements are being deleted.
        aliases = emoji_names["aliases"][:]
        for alias in aliases:
            if alias in all_canonical_names:
                emoji_names["aliases"].remove(alias)
            else:
                alias_to_emoji_code[alias].append(emoji_code)  # This is used in STEP 3.

    # STEP 3: We don't support multiple emoji sharing the same alias, but the CLDR data
    # doesn't have that same restriction, so we have to fix this up to have unique aliases.
    # If the alias was specifically specified in custom_emoji_names, then we can keep just
    # that one, but otherwise there's no particular emoji that is an obvious candidate
    # for the alias so just remove the alias for all relevant emoji.
    for alias, emoji_codes in alias_to_emoji_code.items():
        if len(emoji_codes) > 1:
            for emoji_code in emoji_codes:
                if emoji_code not in CUSTOM_EMOJI_NAME_MAPS:
                    all_emojis[emoji_code]["aliases"].remove(alias)

    # STEP 4: We keep non-ascii (non-"English") characters in some emoji names if that's the correct
    # way to spell that word, but always add an alias for an ascii-only version of the word.
    for emoji_code, emoji_names in all_emojis.items():
        # The concatenation builds a new list, so appending to "aliases" below is safe.
        for name in [emoji_names["canonical_name"]] + emoji_names["aliases"]:
            # These are known names where we don't have an ascii-only version and there are ascii aliases
            # that a user can still enter instead to get the same emoji.
            if name in ["ココ", "サ", "指", "空"]:
                assert any(alias.isascii() for alias in emoji_names["aliases"])
                continue
            if not name.isascii():
                ascii_alias = convert_non_ascii_chars(name)
                # Now no other emoji can use this alias.
                # alias_to_emoji_code was built in STEP 2 and is not updated when STEP 3
                # strips shared aliases, so the alias may already be gone from an emoji
                # listed here; an unguarded remove() would raise ValueError in that case.
                for code in alias_to_emoji_code.get(ascii_alias, []):
                    other_aliases = all_emojis[code]["aliases"]
                    if ascii_alias in other_aliases:
                        other_aliases.remove(ascii_alias)
                all_emojis[emoji_code]["aliases"].append(ascii_alias)

    # STEP 5: Write final dictionary to `emoji_names.py`.
    # The names can contain non-ASCII characters and the output is Python source,
    # so write UTF-8 explicitly rather than relying on the locale's default encoding.
    with open(OUT_EMOJI_FILE, "w", encoding="utf-8") as f:
        f.write(
            "from typing import Any, Dict\n\n"
            "# Generated with `generate_emoji_names`.\n\n"
            "EMOJI_NAME_MAPS: Dict[str, Dict[str, Any]] = {\n"
        )
        for key, emoji_names in all_emojis.items():
            f.write(f"    {key!r}: {emoji_names!r},\n")
        f.write("}\n")

    print(
        "\n\nDone! You should run the linter to format emoji_names.py with `./tools/lint --fix -m --only black`"
    )


# Only generate emoji_names.py when executed as a script. Note that the
# module-level data loading above still runs if this file is merely imported.
if __name__ == "__main__":
    main()