zulip/tools/setup/emoji/generate_emoji_names

223 lines
8.0 KiB
Python
Executable File
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

#!/usr/bin/env python3
import os
import re
import sys
from collections import defaultdict
import orjson
# ZULIP_PATH is the directory three levels above this script's directory. It is
# appended to sys.path before the `tools.` and `zerver.` imports that follow so
# that those packages can be found.
ZULIP_PATH = os.path.join(os.path.dirname(os.path.abspath(__file__)), "../../../")
sys.path.append(ZULIP_PATH)
from tools.setup.emoji.custom_emoji_names import CUSTOM_EMOJI_NAME_MAPS
from tools.setup.emoji.emoji_setup_utils import get_emoji_code
from zerver.lib.emoji_utils import hex_codepoint_to_emoji, unqualify_emoji
# Every input file is read from a package installed under node_modules.
_NODE_MODULES = os.path.join(ZULIP_PATH, "node_modules")

# English CLDR annotations, plus the "derived" annotations that are merged on top.
CLDR_DATA_FILE = os.path.join(
    _NODE_MODULES, "cldr-annotations-modern", "annotations", "en", "annotations.json"
)
CLDR_DERIVED_DATA_FILE = os.path.join(
    _NODE_MODULES,
    "cldr-annotations-derived-modern",
    "annotationsDerived",
    "en",
    "annotations.json",
)

# The list of emoji to name, and the generated module this script writes.
EMOJI_DATA_FILE = os.path.join(_NODE_MODULES, "emoji-datasource-google", "emoji.json")
OUT_EMOJI_FILE = os.path.join(ZULIP_PATH, "tools", "setup", "emoji", "emoji_names.py")

with open(EMOJI_DATA_FILE, "rb") as fp:
    EMOJI_DATA = orjson.loads(fp.read())

# CLDR_DATA is keyed by the emoji character(s). The derived annotations are
# applied with `update`, so on a key collision the derived entry replaces the
# base one.
with open(CLDR_DATA_FILE, "rb") as fp:
    CLDR_DATA = orjson.loads(fp.read())["annotations"]["annotations"]
with open(CLDR_DERIVED_DATA_FILE, "rb") as fp:
    CLDR_DATA.update(orjson.loads(fp.read())["annotationsDerived"]["annotations"])
# We don't include most clock emojis. See `custom_emoji_names` for more context.
# The skipped codes are the contiguous range 1f550-1f567 with 1f557 left out;
# 1f557 is the only code in that range that is not skipped.
SKIPPED_CLOCK_EMOJI_CODES = [
    f"{codepoint:x}" for codepoint in range(0x1F550, 0x1F568) if codepoint != 0x1F557
]
# We don't include the skin tones as emojis that one can search for on their own.
# These are the five consecutive codes 1f3fb through 1f3ff.
SKIN_TONE_EMOJI_CODES = [f"{codepoint:x}" for codepoint in range(0x1F3FB, 0x1F400)]
# Per-character substitutions applied by `cleanup_name`, built once at import.
# Non-ASCII punctuation is written as escapes so that the keys cannot be lost
# or confused with their ASCII look-alikes.
# NOTE(review): the five escaped keys replace entries that had degraded to
# duplicate empty-string keys (which made them no-ops); confirm that en dash and
# curly quotes are the intended characters.
_NAME_CHAR_REPLACEMENTS = str.maketrans(
    {
        " ": "_",
        "-": "_",
        "\u2013": "_",  # en dash
        "\u201c": "",  # left double quotation mark
        "\u201d": "",  # right double quotation mark
        ":": "",
        ".": "",
        ",": "",
        "(": "",
        ")": "",
        "&": "and",
        "\u2018": "",  # left single quotation mark
        "\u2019": "",  # right single quotation mark
        "'": "",
    }
)
_REPEATED_UNDERSCORES = re.compile(r"_{2,}")


def cleanup_name(name: str) -> str:
    """Normalize a human-readable emoji name into an identifier-like name.

    Spaces and dashes become underscores, ``&`` becomes ``and``, and the other
    listed punctuation (colons, periods, commas, parentheses, straight and
    curly quotes) is dropped. Runs of underscores are then collapsed to a
    single underscore and the result is lowercased. Characters not listed in
    the table, including non-ASCII letters, are left as they are.
    """
    # Every key is a single character and no replacement contains a key, so a
    # single translate pass is equivalent to replacing the keys one at a time.
    name = name.translate(_NAME_CHAR_REPLACEMENTS)
    name = _REPEATED_UNDERSCORES.sub("_", name)
    return name.lower()
def convert_non_ascii_chars(name: str) -> str:
    """Return `name` with each known accented letter replaced by its ASCII letter.

    Only the letters listed below are handled. If anything non-ASCII is left
    after the substitutions, the assertion fails with a message asking for the
    missing character to be added here.
    """
    ascii_equivalents = (
        ("è", "e"),
        ("ǐ", "i"),
        ("ó", "o"),
        ("ā", "a"),
        ("ō", "o"),
        ("ñ", "n"),
        ("ô", "o"),
        ("ç", "c"),
        ("é", "e"),
        ("ã", "a"),
        ("í", "i"),
        ("å", "a"),
        ("ü", "u"),
    )
    for accented, plain in ascii_equivalents:
        name = name.replace(accented, plain)
    assert name.isascii(), (
        f"{name} still contains non-ascii characters. Add them to convert_non_ascii_chars."
    )
    return name
def main() -> None:
    """Build the emoji name table and write it to `OUT_EMOJI_FILE`.

    For every entry in `EMOJI_DATA` (except the skipped clock and skin-tone
    codes) a canonical name and a list of aliases are chosen, preferring
    `CUSTOM_EMOJI_NAME_MAPS` and falling back to the CLDR annotations. Aliases
    are then pruned so that none collides with a canonical name and none is
    shared between emoji, ASCII-only aliases are added for non-ASCII names, and
    the result is written out as a Python module defining `EMOJI_NAME_MAPS`.

    Raises:
        Exception: if two emoji codes end up with the same canonical name.
    """
    all_emojis = {}
    all_canonical_names = set()
    alias_to_emoji_code = defaultdict(list)

    # STEP 1: Generate first draft of all_emojis.
    for emoji_dict in EMOJI_DATA:
        emoji_code = get_emoji_code(emoji_dict)
        if emoji_code in SKIPPED_CLOCK_EMOJI_CODES or emoji_code in SKIN_TONE_EMOJI_CODES:
            continue

        if emoji_code in CUSTOM_EMOJI_NAME_MAPS:
            # The custom entry is stored as-is (same object, names not cleaned up);
            # only the uniqueness check uses the cleaned-up canonical name. Later
            # alias edits therefore also mutate the CUSTOM_EMOJI_NAME_MAPS entry.
            entry = CUSTOM_EMOJI_NAME_MAPS[emoji_code]
            canonical_name = cleanup_name(entry["canonical_name"])
        else:
            # create the unicode character(s) for the emoji, since this is the key into the CLDR data
            # We can't just use emoji_dict["non_qualified"] because of this upstream bug:
            # https://github.com/iamcal/emoji-data/pull/217
            emoji = unqualify_emoji(hex_codepoint_to_emoji(emoji_dict["unified"]))
            if emoji not in CLDR_DATA:
                print(
                    f"{emoji} not found in custom emoji name maps, but also not found in CLDR data. Skipping."
                )
                continue
            # CLDR_DATA[emoji] is of the form {'default': [...], 'tts': [...]}
            # * "tts" is what's used for text-to-speech and always has one item, so we use that
            #   as the canonical name.
            # * "default" has several items in it that we use as aliases.
            # See also: https://www.unicode.org/reports/tr35/tr35-general.html#14-annotations-and-labels
            assert len(CLDR_DATA[emoji]["tts"]) == 1
            canonical_name = cleanup_name(CLDR_DATA[emoji]["tts"][0].strip())
            entry = {
                "canonical_name": canonical_name,
                "aliases": [cleanup_name(alias.strip()) for alias in CLDR_DATA[emoji]["default"]],
            }

        # Canonical names must be unique across all emoji, whichever source they came from.
        if canonical_name in all_canonical_names:
            raise Exception(
                f"{canonical_name} was already added with a different codepoint. "
                f"Rename it in `custom_emoji_names` or add an entry for {emoji_code}."
            )
        all_canonical_names.add(canonical_name)
        all_emojis[emoji_code] = entry

    # STEP 2: We don't support having aliases that collide with canonical names for emoji, so remove them.
    for emoji_code, emoji_names in all_emojis.items():
        # Copy the list to not iterate while elements are being deleted.
        for alias in emoji_names["aliases"][:]:
            if alias in all_canonical_names:
                emoji_names["aliases"].remove(alias)
            else:
                alias_to_emoji_code[alias].append(emoji_code)  # This is used in STEP 3.

    # STEP 3: We don't support multiple emoji sharing the same alias, but the CLDR data
    # doesn't have that same restriction, so we have to fix this up to have unique aliases.
    # If the alias was specifically specified in custom_emoji_names, then we can keep just
    # that one, but otherwise there's no particular emoji that is an obvious candidate
    # for the alias so just remove the alias for all relevant emoji.
    for alias, emoji_codes in alias_to_emoji_code.items():
        if len(emoji_codes) > 1:
            for emoji_code in emoji_codes:
                if emoji_code not in CUSTOM_EMOJI_NAME_MAPS:
                    all_emojis[emoji_code]["aliases"].remove(alias)

    # STEP 4: We keep non-ascii (non-"English") characters in some emoji names if that's the correct
    # way to spell that word, but always add an alias for an ascii-only version of the word.
    for emoji_code, emoji_names in all_emojis.items():
        # Iterate over a fresh list: the alias list is appended to inside the loop.
        for name in [emoji_names["canonical_name"]] + emoji_names["aliases"]:
            # These are known names where we don't have an ascii-only version and there are ascii aliases
            # that a user can still enter instead to get the same emoji.
            # NOTE(review): the three empty strings below look like non-ASCII names
            # that were lost in transit; restore them from the canonical copy of
            # this script. As written they only match an empty name.
            if name in ["ココ", "", "", ""]:
                assert any(alias.isascii() for alias in emoji_names["aliases"])
                continue
            if not name.isascii():
                ascii_alias = convert_non_ascii_chars(name)
                # Now no other emoji can use this alias.
                for code in alias_to_emoji_code.get(ascii_alias, []):
                    other_aliases = all_emojis[code]["aliases"]
                    # alias_to_emoji_code is not updated when STEP 3 removes an alias,
                    # so the alias may already be gone; only remove it if still present.
                    if ascii_alias in other_aliases:
                        other_aliases.remove(ascii_alias)
                all_emojis[emoji_code]["aliases"].append(ascii_alias)

    # STEP 5: Write final dictionary to `emoji_names.py`.
    # The names can contain non-ASCII characters and the output is Python source,
    # so write UTF-8 explicitly instead of relying on the locale's default encoding.
    with open(OUT_EMOJI_FILE, "w", encoding="utf-8") as f:
        f.write(
            "from typing import Any\n\n"
            "# Generated with `generate_emoji_names`.\n\n"
            "EMOJI_NAME_MAPS: dict[str, dict[str, Any]] = {\n"
        )
        for key, emoji_names in all_emojis.items():
            f.write(f" {key!r}: {emoji_names!r},\n")
        f.write("}\n")

    print(
        "\n\nDone! You should run the linter to format emoji_names.py with `./tools/lint --fix -m --only ruff-format`"
    )


if __name__ == "__main__":
    main()