zulip/tools/setup/emoji/generate_emoji_names

#!/usr/bin/env python3
import os
import re
import sys
from collections import defaultdict

import orjson

from custom_emoji_names import CUSTOM_EMOJI_NAME_MAPS
from emoji_setup_utils import get_emoji_code

# Directory three levels above the one containing this script, made importable below.
# NOTE(review): presumably the repository root, given OUT_EMOJI_FILE's
# "tools/setup/emoji" suffix — confirm.
ZULIP_PATH = os.path.join(os.path.dirname(os.path.abspath(__file__)), "../../../")
sys.path.append(ZULIP_PATH)
# English CLDR annotations, read from the installed node_modules tree.
CLDR_DATA_FILE = os.path.join(
    ZULIP_PATH, "node_modules", "cldr-annotations-modern", "annotations", "en", "annotations.json"
)
# English CLDR *derived* annotations; merged on top of CLDR_DATA_FILE's entries below.
CLDR_DERIVED_DATA_FILE = os.path.join(
    ZULIP_PATH,
    "node_modules",
    "cldr-annotations-derived-modern",
    "annotationsDerived",
    "en",
    "annotations.json",
)
# The emoji list that main() iterates over.
EMOJI_DATA_FILE = os.path.join(ZULIP_PATH, "node_modules", "emoji-datasource-google", "emoji.json")
# Destination of the generated Python module.
OUT_EMOJI_FILE = os.path.join(ZULIP_PATH, "tools", "setup", "emoji", "emoji_names.py")

# All three inputs are loaded at import time; the files are opened in binary
# mode and the raw bytes are handed straight to orjson.
with open(EMOJI_DATA_FILE, "rb") as fp:
    EMOJI_DATA = orjson.loads(fp.read())
with open(CLDR_DATA_FILE, "rb") as fp:
    CLDR_DATA = orjson.loads(fp.read())["annotations"]["annotations"]
with open(CLDR_DERIVED_DATA_FILE, "rb") as fp:
    # dict.update: where a key exists in both files, the derived entry replaces the base one.
    CLDR_DATA.update(orjson.loads(fp.read())["annotationsDerived"]["annotations"])

# We don't include most clock emojis. See `custom_emoji_names` for more context.
# The skipped codes are every code point from 1f550 through 1f567 except 1f557,
# written as lowercase hex without a prefix.
SKIPPED_CLOCK_EMOJI_CODES = [
    f"{code_point:x}" for code_point in range(0x1F550, 0x1F567 + 1) if code_point != 0x1F557
]

# The skin tone modifiers are not offered as standalone, searchable emoji.
# They are the five consecutive code points 1f3fb through 1f3ff.
SKIN_TONE_EMOJI_CODES = [f"{code_point:x}" for code_point in range(0x1F3FB, 0x1F3FF + 1)]


def cleanup_name(name: str) -> str:
    """Normalize a human-readable emoji name into an underscore-separated form.

    Spaces, hyphens and en dashes become underscores; curly double quotes,
    colons, periods and commas are dropped; "&" is spelled out as "and";
    curly single quotes become a plain apostrophe. Runs of two or more
    underscores are then collapsed to one and the result is lowercased.
    """
    # No substitution output contains a character that is itself substituted,
    # so a single translate() pass is equivalent to chaining the replacements.
    char_map = str.maketrans(
        {
            **dict.fromkeys(" -–", "_"),
            **dict.fromkeys("“”:.,"),  # mapped to None, i.e. deleted
            "&": "and",
            **dict.fromkeys("‘’", "'"),
        }
    )
    translated = name.translate(char_map)
    collapsed = re.sub("_{2,}", "_", translated)
    return collapsed.lower()


def convert_non_ascii_chars(name: str) -> str:
    """Return `name` with the known accented letters swapped for ASCII ones.

    Fails an assertion if any non-ASCII character is left afterwards, which
    signals that the table below needs a new entry.
    """
    # Every output is a plain ASCII letter and none of them is a key, so one
    # translate() pass matches applying the substitutions one by one.
    ascii_table = str.maketrans(
        {
            "è": "e",
            "ǐ": "i",
            "ó": "o",
            "ā": "a",
            "ō": "o",
            "ñ": "n",
            "ô": "o",
            "ç": "c",
            "é": "e",
            "ã": "a",
            "í": "i",
            "å": "a",
        }
    )
    name = name.translate(ascii_table)
    assert (
        name.isascii()
    ), f"{name} still contains non-ascii characters. Add them to convert_non_ascii_chars."
    return name


def main() -> None:
    """Build the emoji name map and write it to `OUT_EMOJI_FILE`.

    Each emoji in `EMOJI_DATA` (other than the skipped clock and skin tone
    codes) gets a canonical name and a list of aliases, taken from
    `CUSTOM_EMOJI_NAME_MAPS` when the emoji code is listed there and from the
    CLDR annotations otherwise. Aliases are then pruned so they never collide
    with a canonical name or with another emoji's alias, ASCII-only aliases
    are added for non-ASCII names, and the result is written out as a Python
    module defining `EMOJI_NAME_MAPS`.

    Raises:
        Exception: if two emoji end up with the same canonical name.
    """
    all_emojis = {}
    all_canonical_names = set()

    alias_to_emoji_code = defaultdict(list)

    # STEP 1: Generate first draft of all_emojis.
    for emoji_dict in EMOJI_DATA:
        emoji_code = get_emoji_code(emoji_dict)
        if emoji_code in SKIPPED_CLOCK_EMOJI_CODES or emoji_code in SKIN_TONE_EMOJI_CODES:
            continue

        if emoji_code in CUSTOM_EMOJI_NAME_MAPS:
            # The custom entry is stored as-is (the very same dict object); only
            # the collision check below uses the cleaned-up canonical name.
            entry = CUSTOM_EMOJI_NAME_MAPS[emoji_code]
            canonical_name = cleanup_name(entry["canonical_name"])
        else:
            # create the unicode character(s) for the emoji, since this is the key into the CLDR data
            emoji = "".join(
                chr(int(h, 16))
                for h in (emoji_dict["non_qualified"] or emoji_dict["unified"]).split("-")
            )
            if emoji not in CLDR_DATA:
                print(
                    f"{emoji} not found in custom emoji name maps, but also not found in CLDR data. Skipping."
                )
                continue
            # CLDR_DATA[emoji] is of the form {'default': [...], 'tts': [...]}
            # * "tts" is what's used for text-to-speech and always has one item, so we use that
            #    as the canonical name.
            # * "default" has several items in it that we use as aliases.
            # See also: https://www.unicode.org/reports/tr35/tr35-general.html#14-annotations-and-labels
            annotation = CLDR_DATA[emoji]
            assert len(annotation["tts"]) == 1
            canonical_name = cleanup_name(annotation["tts"][0].strip())
            entry = {
                "canonical_name": canonical_name,
                "aliases": [cleanup_name(alias.strip()) for alias in annotation["default"]],
            }

        # Canonical names must be unique across all emoji, whatever their source.
        if canonical_name in all_canonical_names:
            raise Exception(
                f"{canonical_name} was already added with a different codepoint. "
                f"Rename it in `custom_emoji_names` or add an entry for {emoji_code}."
            )
        all_canonical_names.add(canonical_name)
        all_emojis[emoji_code] = entry

    # STEP 2: We don't support having aliases that collide with canonical names for emoji, so remove them.
    for emoji_code, emoji_names in all_emojis.items():
        # Iterate over a copy, since colliding aliases are removed from the real list.
        for alias in emoji_names["aliases"][:]:
            if alias in all_canonical_names:
                emoji_names["aliases"].remove(alias)
            else:
                alias_to_emoji_code[alias].append(emoji_code)  # This is used in STEP 3.

    # STEP 3: We don't support multiple emoji sharing the same alias, but the CLDR data
    # doesn't have that same restriction, so we have to fix this up to have unique aliases.
    # If the alias was specifically specified in custom_emoji_names, then we can keep just
    # that one, but otherwise there's no particular emoji that is an obvious candidate
    # for the alias so just remove the alias for all relevant emoji.
    for alias, emoji_codes in alias_to_emoji_code.items():
        if len(emoji_codes) > 1:
            for emoji_code in emoji_codes:
                if emoji_code not in CUSTOM_EMOJI_NAME_MAPS:
                    all_emojis[emoji_code]["aliases"].remove(alias)

    # STEP 4: We keep non-ascii (non-"English") characters in some emoji names if that's the correct
    # way to spell that word, but always add an alias for an ascii-only version of the word.
    for emoji_code, emoji_names in all_emojis.items():
        # Iterate over a snapshot, since ascii aliases get appended to the real list below.
        for name in [emoji_names["canonical_name"], *emoji_names["aliases"]]:
            # These are known names where we don't have an ascii-only version and there are ascii aliases
            # that a user can still enter instead to get the same emoji.
            if name in ["ココ", "サ", "指", "空"]:
                # Check *this* emoji's aliases, not a list left over from an earlier loop.
                assert any(alias.isascii() for alias in emoji_names["aliases"])
                continue
            if not name.isascii():
                ascii_alias = convert_non_ascii_chars(name)
                # Now no other emoji can use this alias.
                for code in alias_to_emoji_code.get(ascii_alias, []):
                    # alias_to_emoji_code was filled in STEP 2 and not updated when
                    # STEP 3 dropped shared aliases, so the alias may already be gone.
                    other_aliases = all_emojis[code]["aliases"]
                    if ascii_alias in other_aliases:
                        other_aliases.remove(ascii_alias)
                emoji_names["aliases"].append(ascii_alias)

    # STEP 5: Write final dictionary to `emoji_names.py`.
    # The names can contain non-ascii characters, so pin the encoding instead of
    # relying on the locale's default.
    with open(OUT_EMOJI_FILE, "w", encoding="utf-8") as f:
        f.write(
            "from typing import Any, Dict\n\n"
            "# Generated with `generate_emoji_names`.\n\n"
            "EMOJI_NAME_MAPS: Dict[str, Dict[str, Any]] = {\n"
        )
        for key, emoji_names in all_emojis.items():
            f.write(f"    {key!r}: {emoji_names!r},\n")
        f.write("}\n")

    print(
        "\n\nDone! You should run the linter to format emoji_names.py with `./tools/lint --fix -m --only black`"
    )


# Run the generator only when this file is executed directly as a script.
if __name__ == "__main__":
    main()
-												emoji: Add script to generate emoji_names.py, only with custom names.

This script pulls from our previously custom-written emoji strings
to prepare to fill in the rest from CLDR.

This commit has no user-facing changes.

											
										
										
											2022-06-10 22:43:59 +02:00
+								#!/usr/bin/env python3
 								import os
-												emoji: Finish script to generate emoji_names.py with CLDR data.

This script pulls from our previously custom-written emoji strings
and fills in the rest from CLDR. It also removes 4 custom emoji which
collide with some of the new CLDR names (they will now just be called
by their CLDR name).

											
										
										
											2022-06-10 22:40:31 +02:00
+								import re
-												emoji: Add script to generate emoji_names.py, only with custom names.

This script pulls from our previously custom-written emoji strings
to prepare to fill in the rest from CLDR.

This commit has no user-facing changes.

											
										
										
											2022-06-10 22:43:59 +02:00
+								import sys
-												emoji: Finish script to generate emoji_names.py with CLDR data.

This script pulls from our previously custom-written emoji strings
and fills in the rest from CLDR. It also removes 4 custom emoji which
collide with some of the new CLDR names (they will now just be called
by their CLDR name).

											
										
										
											2022-06-10 22:40:31 +02:00
+								from collections import defaultdict
-												emoji: Add script to generate emoji_names.py, only with custom names.

This script pulls from our previously custom-written emoji strings
to prepare to fill in the rest from CLDR.

This commit has no user-facing changes.

											
										
										
											2022-06-10 22:43:59 +02:00
 								import orjson
 								from custom_emoji_names import CUSTOM_EMOJI_NAME_MAPS
 								from emoji_setup_utils import get_emoji_code
 								ZULIP_PATH = os.path.join(os.path.dirname(os.path.abspath(__file__)), "../../../")
 								sys.path.append(ZULIP_PATH)
-												emoji: Finish script to generate emoji_names.py with CLDR data.

This script pulls from our previously custom-written emoji strings
and fills in the rest from CLDR. It also removes 4 custom emoji which
collide with some of the new CLDR names (they will now just be called
by their CLDR name).

											
										
										
											2022-06-10 22:40:31 +02:00
+								CLDR_DATA_FILE = os.path.join(
 								    ZULIP_PATH, "node_modules", "cldr-annotations-modern", "annotations", "en", "annotations.json"
 								)
 								CLDR_DERIVED_DATA_FILE = os.path.join(
 								    ZULIP_PATH,
 								    "node_modules",
 								    "cldr-annotations-derived-modern",
 								    "annotationsDerived",
 								    "en",
 								    "annotations.json",
 								)
-												emoji: Add script to generate emoji_names.py, only with custom names.

This script pulls from our previously custom-written emoji strings
to prepare to fill in the rest from CLDR.

This commit has no user-facing changes.

											
										
										
											2022-06-10 22:43:59 +02:00
+								EMOJI_DATA_FILE = os.path.join(ZULIP_PATH, "node_modules", "emoji-datasource-google", "emoji.json")
 								OUT_EMOJI_FILE = os.path.join(ZULIP_PATH, "tools", "setup", "emoji", "emoji_names.py")
 								with open(EMOJI_DATA_FILE, "rb") as fp:
 								    EMOJI_DATA = orjson.loads(fp.read())
-												emoji: Finish script to generate emoji_names.py with CLDR data.

This script pulls from our previously custom-written emoji strings
and fills in the rest from CLDR. It also removes 4 custom emoji which
collide with some of the new CLDR names (they will now just be called
by their CLDR name).

											
										
										
											2022-06-10 22:40:31 +02:00
+								with open(CLDR_DATA_FILE, "rb") as fp:
 								    CLDR_DATA = orjson.loads(fp.read())["annotations"]["annotations"]
 								with open(CLDR_DERIVED_DATA_FILE, "rb") as fp:
 								    CLDR_DATA.update(orjson.loads(fp.read())["annotationsDerived"]["annotations"])
-												emoji: Add script to generate emoji_names.py, only with custom names.

This script pulls from our previously custom-written emoji strings
to prepare to fill in the rest from CLDR.

This commit has no user-facing changes.

											
										
										
											2022-06-10 22:43:59 +02:00
 								# We don't include most clock emojis. See `custom_emoji_names` for more context.
 								SKIPPED_CLOCK_EMOJI_CODES = [
 								    "1f550",
 								    "1f551",
 								    "1f552",
 								    "1f553",
 								    "1f554",
 								    "1f555",
 								    "1f556",
 								    "1f558",
 								    "1f559",
 								    "1f55a",
 								    "1f55b",
 								    "1f55c",
 								    "1f55d",
 								    "1f55e",
 								    "1f55f",
 								    "1f560",
 								    "1f561",
 								    "1f562",
 								    "1f563",
 								    "1f564",
 								    "1f565",
 								    "1f566",
 								    "1f567",
 								]
 								# We don't include the skin tones as emojis that one can search for on their own.
 								SKIN_TONE_EMOJI_CODES = [
 								    "1f3fb",
 								    "1f3fc",
 								    "1f3fd",
 								    "1f3fe",
 								    "1f3ff",
 								]
-												emoji: Finish script to generate emoji_names.py with CLDR data.

This script pulls from our previously custom-written emoji strings
and fills in the rest from CLDR. It also removes 4 custom emoji which
collide with some of the new CLDR names (they will now just be called
by their CLDR name).

											
										
										
											2022-06-10 22:40:31 +02:00
+								def cleanup_name(name: str) -> str:
 								    replacements = {
 								        " ": "_",
 								        "-": "_",
 								        "–": "_",
 								        "“": "",
 								        "”": "",
 								        ":": "",
 								        ".": "",
-												emoji: Remove commas from emoji names.

Fixes part of #23626.

This was preventing emoji like `:family_man,_woman,_girl,_boy:`
from displaying.

											
										
										
											2022-11-24 21:51:56 +01:00
+								        ",": "",
-												emoji: Finish script to generate emoji_names.py with CLDR data.

This script pulls from our previously custom-written emoji strings
and fills in the rest from CLDR. It also removes 4 custom emoji which
collide with some of the new CLDR names (they will now just be called
by their CLDR name).

											
										
										
											2022-06-10 22:40:31 +02:00
+								        "&": "and",
 								        "‘": "'",
 								        "’": "'",
 								    }
 								    for before, after in replacements.items():
 								        name = name.replace(before, after)
 								    name = re.sub("_{2,}", "_", name)
 								    return name.lower()
 								def convert_non_ascii_chars(name: str) -> str:
 								    replacements = {
 								        "è": "e",
 								        "ǐ": "i",
 								        "ó": "o",
 								        "ā": "a",
 								        "ō": "o",
 								        "ñ": "n",
 								        "ô": "o",
 								        "ç": "c",
 								        "é": "e",
 								        "ã": "a",
 								        "í": "i",
 								        "å": "a",
 								    }
 								    for before, after in replacements.items():
 								        name = name.replace(before, after)
 								    assert (
 								        name.isascii()
 								    ), f"{name} still contains non-ascii characters. Add them to convert_non_ascii_chars."
 								    return name
-												emoji: Add script to generate emoji_names.py, only with custom names.

This script pulls from our previously custom-written emoji strings
to prepare to fill in the rest from CLDR.

This commit has no user-facing changes.

											
										
										
											2022-06-10 22:43:59 +02:00
+								def main() -> None:
 								    all_emojis = {}
 								    all_canonical_names = set()
-												emoji: Finish script to generate emoji_names.py with CLDR data.

This script pulls from our previously custom-written emoji strings
and fills in the rest from CLDR. It also removes 4 custom emoji which
collide with some of the new CLDR names (they will now just be called
by their CLDR name).

											
										
										
											2022-06-10 22:40:31 +02:00
+								    alias_to_emoji_code = defaultdict(list)
 								    # STEP 1: Generate first draft of all_emojis.
-												emoji: Add script to generate emoji_names.py, only with custom names.

This script pulls from our previously custom-written emoji strings
to prepare to fill in the rest from CLDR.

This commit has no user-facing changes.

											
										
										
											2022-06-10 22:43:59 +02:00
+								    for emoji_dict in EMOJI_DATA:
 								        emoji_code = get_emoji_code(emoji_dict)
 								        if emoji_code in SKIPPED_CLOCK_EMOJI_CODES or emoji_code in SKIN_TONE_EMOJI_CODES:
 								            continue
 								        if emoji_code in CUSTOM_EMOJI_NAME_MAPS:
-												emoji: Finish script to generate emoji_names.py with CLDR data.

This script pulls from our previously custom-written emoji strings
and fills in the rest from CLDR. It also removes 4 custom emoji which
collide with some of the new CLDR names (they will now just be called
by their CLDR name).

											
										
										
											2022-06-10 22:40:31 +02:00
+								            canonical_name = cleanup_name(CUSTOM_EMOJI_NAME_MAPS[emoji_code]["canonical_name"])
-												emoji: Add script to generate emoji_names.py, only with custom names.

This script pulls from our previously custom-written emoji strings
to prepare to fill in the rest from CLDR.

This commit has no user-facing changes.

											
										
										
											2022-06-10 22:43:59 +02:00
+								            if canonical_name in all_canonical_names:
 								                raise Exception(
 								                    f"{canonical_name} was already added with a different codepoint. "
 								                    f"Rename it in `custom_emoji_names` or add an entry for {emoji_code}."
 								                )
 								            all_canonical_names.add(canonical_name)
 								            all_emojis[emoji_code] = CUSTOM_EMOJI_NAME_MAPS[emoji_code]
 								        else:
 								            # create the unicode character(s) for the emoji, since this is the key into the CLDR data
-												emoji: Finish script to generate emoji_names.py with CLDR data.

This script pulls from our previously custom-written emoji strings
and fills in the rest from CLDR. It also removes 4 custom emoji which
collide with some of the new CLDR names (they will now just be called
by their CLDR name).

											
										
										
											2022-06-10 22:40:31 +02:00
+								            emoji = "".join(
 								                chr(int(h, 16))
 								                for h in (emoji_dict["non_qualified"] or emoji_dict["unified"]).split("-")
 								            )
 								            if emoji not in CLDR_DATA:
 								                print(
 								                    f"{emoji} not found in custom emoji name maps, but also not found in CLDR data. Skipping."
 								                )
 								                continue
 								            # CLDR_DATA[emoji] is of the form {'default': [...], 'tts': [...]}
 								            # * "tts" is what's used for text-to-speech and always has one item, so we use that
 								            #    as the canonical name.
 								            # * "default" has several items in it that we use as aliases.
 								            # See also: https://www.unicode.org/reports/tr35/tr35-general.html#14-annotations-and-labels
 								            assert len(CLDR_DATA[emoji]["tts"]) == 1
 								            canonical_name = cleanup_name(CLDR_DATA[emoji]["tts"][0].strip())
 								            if canonical_name in all_canonical_names:
 								                raise Exception(
 								                    f"{canonical_name} was already added with a different codepoint. "
 								                    f"Rename it in `custom_emoji_names` or add an entry for {emoji_code}."
 								                )
 								            aliases = [cleanup_name(alias.strip()) for alias in CLDR_DATA[emoji]["default"]]
 								            all_emojis[emoji_code] = {"canonical_name": canonical_name, "aliases": aliases}
 								            all_canonical_names.add(canonical_name)
 								    # STEP 2: We don't support having aliases that collide with canonical names for emoji, so remove them.
 								    for (emoji_code, emoji_names) in all_emojis.items():
 								        # Copy the list to not iterate while elements are being deleted.
 								        aliases = emoji_names["aliases"][:]
 								        for alias in aliases:
 								            if alias in all_canonical_names:
 								                emoji_names["aliases"].remove(alias)
 								            else:
 								                alias_to_emoji_code[alias].append(emoji_code)  # This is used in STEP 3.
 								    # STEP 3: We don't support multiple emoji sharing the same alias, but the CLDR data
 								    # doesn't have that same restriction, so we have to fix this up to have unique aliases.
 								    # If the alias was specifically specified in custom_emoji_names, then we can keep just
 								    # that one, but otherwise there's no particular emoji that is an obvious candidate
 								    # for the alias so just remove the alias for all relevant emoji.
 								    for alias in alias_to_emoji_code.keys():
 								        if len(alias_to_emoji_code[alias]) > 1:
 								            for emoji_code in alias_to_emoji_code[alias]:
 								                if emoji_code not in CUSTOM_EMOJI_NAME_MAPS:
 								                    all_emojis[emoji_code]["aliases"].remove(alias)
 								    # STEP 4: We keep non-ascii (non-"English") characters in some emoji names if that's the correct
 								    # way to spell that word, but always add an alias for an ascii-only version of the word.
 								    for (emoji_code, emoji_names) in all_emojis.items():
 								        for name in [emoji_names["canonical_name"]] + emoji_names["aliases"]:
 								            # These are known names where we don't have an ascii-only version and there are ascii aliases
 								            # that a user can still enter instead to get the same emoji.
 								            if name in ["ココ", "サ", "指", "空"]:
 								                assert any(alias.isascii() for alias in aliases)
 								                continue
 								            if not name.isascii():
 								                ascii_alias = convert_non_ascii_chars(name)
 								                # Now no other emoji can use this alias.
 								                for code in alias_to_emoji_code[ascii_alias]:
 								                    all_emojis[code]["aliases"].remove(ascii_alias)
 								                all_emojis[emoji_code]["aliases"].append(ascii_alias)
-												emoji: Add script to generate emoji_names.py, only with custom names.

This script pulls from our previously custom-written emoji strings
to prepare to fill in the rest from CLDR.

This commit has no user-facing changes.

											
										
										
											2022-06-10 22:43:59 +02:00
-												emoji: Finish script to generate emoji_names.py with CLDR data.

This script pulls from our previously custom-written emoji strings
and fills in the rest from CLDR. It also removes 4 custom emoji which
collide with some of the new CLDR names (they will now just be called
by their CLDR name).

											
										
										
											2022-06-10 22:40:31 +02:00
+								    # STEP 5: Write final dictionary to `emoji_names.py`.
-												emoji: Add script to generate emoji_names.py, only with custom names.

This script pulls from our previously custom-written emoji strings
to prepare to fill in the rest from CLDR.

This commit has no user-facing changes.

											
										
										
											2022-06-10 22:43:59 +02:00
+								    with open(OUT_EMOJI_FILE, "w") as f:
 								        f.write(
 								            "from typing import Any, Dict\n\n"
 								            "# Generated with `generate_emoji_names`.\n\n"
 								            "EMOJI_NAME_MAPS: Dict[str, Dict[str, Any]] = {\n"
 								        )
 								        for (key, emoji_names) in all_emojis.items():
-												emoji: Finish script to generate emoji_names.py with CLDR data.

This script pulls from our previously custom-written emoji strings
and fills in the rest from CLDR. It also removes 4 custom emoji which
collide with some of the new CLDR names (they will now just be called
by their CLDR name).

											
										
										
											2022-06-10 22:40:31 +02:00
+								            f.write(f"    {key!r}: {emoji_names!r},\n")
-												emoji: Add script to generate emoji_names.py, only with custom names.

This script pulls from our previously custom-written emoji strings
to prepare to fill in the rest from CLDR.

This commit has no user-facing changes.

											
										
										
											2022-06-10 22:43:59 +02:00
+								        f.write("}\n")
 								    print(
 								        "\n\nDone! You should run the linter to format emoji_names.py with `./tools/lint --fix -m --only black`"
 								    )
 								if __name__ == "__main__":
 								    main()