zulip/tools/setup/emoji/generate_emoji_names

223 lines
8.0 KiB
Plaintext
Raw Permalink Normal View History

#!/usr/bin/env python3
import os
import re
import sys
from collections import defaultdict
import orjson
ZULIP_PATH = os.path.join(os.path.dirname(os.path.abspath(__file__)), "../../../")
sys.path.append(ZULIP_PATH)
from tools.setup.emoji.custom_emoji_names import CUSTOM_EMOJI_NAME_MAPS
from tools.setup.emoji.emoji_setup_utils import get_emoji_code
from zerver.lib.emoji_utils import hex_codepoint_to_emoji, unqualify_emoji
# English CLDR annotations shipped in the `cldr-annotations-modern` npm package.
CLDR_DATA_FILE = os.path.join(
    ZULIP_PATH,
    "node_modules",
    "cldr-annotations-modern",
    "annotations",
    "en",
    "annotations.json",
)

# English derived CLDR annotations from `cldr-annotations-derived-modern`.
CLDR_DERIVED_DATA_FILE = os.path.join(
    ZULIP_PATH,
    "node_modules",
    "cldr-annotations-derived-modern",
    "annotationsDerived",
    "en",
    "annotations.json",
)

# Emoji list from `emoji-datasource-google`; `main` iterates over its entries.
EMOJI_DATA_FILE = os.path.join(
    ZULIP_PATH,
    "node_modules",
    "emoji-datasource-google",
    "emoji.json",
)

# Generated module, written at the end of `main`.
OUT_EMOJI_FILE = os.path.join(
    ZULIP_PATH,
    "tools",
    "setup",
    "emoji",
    "emoji_names.py",
)

# All three inputs are read eagerly at import time.
with open(EMOJI_DATA_FILE, "rb") as fp:
    EMOJI_DATA = orjson.loads(fp.read())

with open(CLDR_DATA_FILE, "rb") as fp:
    CLDR_DATA = orjson.loads(fp.read())
CLDR_DATA = CLDR_DATA["annotations"]["annotations"]

# Merge the derived annotations into the same mapping; where a key exists in
# both files, the derived entry replaces the base one (`dict.update`).
with open(CLDR_DERIVED_DATA_FILE, "rb") as fp:
    CLDR_DATA.update(orjson.loads(fp.read())["annotationsDerived"]["annotations"])
# We don't include most clock emojis. See `custom_emoji_names` for more context.
# The skipped codes are 1f550 through 1f567, except 1f557, which is not skipped.
SKIPPED_CLOCK_EMOJI_CODES = [
    f"{codepoint:x}" for codepoint in range(0x1F550, 0x1F568) if codepoint != 0x1F557
]
# We don't include the skin tones as emojis that one can search for on their own.
# These are the five consecutive codes 1f3fb through 1f3ff.
SKIN_TONE_EMOJI_CODES = [f"{codepoint:x}" for codepoint in range(0x1F3FB, 0x1F400)]
def cleanup_name(name: str) -> str:
    """Normalize a human-readable emoji name into a lowercase identifier.

    Spaces, hyphens and en dashes become underscores; quotation marks,
    apostrophes and the listed punctuation are dropped; ``&`` becomes ``and``.
    Runs of underscores are then collapsed to a single one and the result is
    lowercased.

    Args:
        name: The raw name, e.g. a CLDR annotation or a custom emoji name.

    Returns:
        The normalized name.
    """
    # Non-ASCII keys are written as escapes so that they survive any tool that
    # mangles or drops typographic characters; a dropped character would turn
    # the key into "" and silently disable the replacement.
    replacements = {
        " ": "_",
        "-": "_",
        "\u2013": "_",  # en dash
        "\u201c": "",  # left double quotation mark
        "\u201d": "",  # right double quotation mark
        ":": "",
        ".": "",
        ",": "",
        "(": "",
        ")": "",
        "&": "and",
        "\u2018": "",  # left single quotation mark
        "\u2019": "",  # right single quotation mark (typographic apostrophe)
        "'": "",
    }
    for before, after in replacements.items():
        name = name.replace(before, after)
    # Adjacent separators (e.g. " - ") each became "_"; collapse them.
    name = re.sub(r"_{2,}", "_", name)
    return name.lower()
def convert_non_ascii_chars(name: str) -> str:
    """Return `name` with the known accented letters replaced by plain ASCII ones.

    Only the letters listed below are handled. If anything non-ASCII is left
    after the substitutions, the assertion fails with a message asking for the
    missing character to be added to the table.
    """
    ascii_equivalents = (
        ("è", "e"),
        ("ǐ", "i"),
        ("ó", "o"),
        ("ā", "a"),
        ("ō", "o"),
        ("ñ", "n"),
        ("ô", "o"),
        ("ç", "c"),
        ("é", "e"),
        ("ã", "a"),
        ("í", "i"),
        ("å", "a"),
        ("ü", "u"),
    )
    converted = name
    for accented, plain in ascii_equivalents:
        converted = converted.replace(accented, plain)
    assert (
        converted.isascii()
    ), f"{converted} still contains non-ascii characters. Add them to convert_non_ascii_chars."
    return converted
def main() -> None:
    """Build the emoji-code -> names mapping and write it to `OUT_EMOJI_FILE`.

    The mapping is assembled in five steps:

    1. Give every emoji in `EMOJI_DATA` a canonical name and aliases, taken
       from `CUSTOM_EMOJI_NAME_MAPS` when present and from `CLDR_DATA` otherwise.
    2. Drop aliases that collide with any canonical name.
    3. Drop aliases shared by several emoji (custom-mapped emoji keep theirs).
    4. Add an ASCII-only alias for every name containing non-ASCII characters.
    5. Write the result as a Python module.

    Raises:
        Exception: If two emoji end up with the same canonical name.
    """
    all_emojis = {}
    all_canonical_names = set()
    alias_to_emoji_code = defaultdict(list)

    # STEP 1: Generate first draft of all_emojis.
    for emoji_dict in EMOJI_DATA:
        emoji_code = get_emoji_code(emoji_dict)
        if emoji_code in SKIPPED_CLOCK_EMOJI_CODES or emoji_code in SKIN_TONE_EMOJI_CODES:
            continue

        if emoji_code in CUSTOM_EMOJI_NAME_MAPS:
            canonical_name = cleanup_name(CUSTOM_EMOJI_NAME_MAPS[emoji_code]["canonical_name"])
            if canonical_name in all_canonical_names:
                raise Exception(
                    f"{canonical_name} was already added with a different codepoint. "
                    f"Rename it in `custom_emoji_names` or add an entry for {emoji_code}."
                )
            all_canonical_names.add(canonical_name)
            # The custom entry is stored as-is (the same dict object), so the later
            # steps edit its alias list in place.
            all_emojis[emoji_code] = CUSTOM_EMOJI_NAME_MAPS[emoji_code]
        else:
            # create the unicode character(s) for the emoji, since this is the key into the CLDR data
            # We can't just use emoji_dict["non_qualified"] because of this upstream bug:
            # https://github.com/iamcal/emoji-data/pull/217
            emoji = unqualify_emoji(hex_codepoint_to_emoji(emoji_dict["unified"]))
            if emoji not in CLDR_DATA:
                print(
                    f"{emoji} not found in custom emoji name maps, but also not found in CLDR data. Skipping."
                )
                continue
            # CLDR_DATA[emoji] is of the form {'default': [...], 'tts': [...]}
            # * "tts" is what's used for text-to-speech and always has one item, so we use that
            #   as the canonical name.
            # * "default" has several items in it that we use as aliases.
            # See also: https://www.unicode.org/reports/tr35/tr35-general.html#14-annotations-and-labels
            assert len(CLDR_DATA[emoji]["tts"]) == 1
            canonical_name = cleanup_name(CLDR_DATA[emoji]["tts"][0].strip())
            if canonical_name in all_canonical_names:
                raise Exception(
                    f"{canonical_name} was already added with a different codepoint. "
                    f"Rename it in `custom_emoji_names` or add an entry for {emoji_code}."
                )
            aliases = [cleanup_name(alias.strip()) for alias in CLDR_DATA[emoji]["default"]]
            all_emojis[emoji_code] = {"canonical_name": canonical_name, "aliases": aliases}
            all_canonical_names.add(canonical_name)

    # STEP 2: We don't support having aliases that collide with canonical names for emoji, so remove them.
    for emoji_code, emoji_names in all_emojis.items():
        # Copy the list to not iterate while elements are being deleted.
        aliases = emoji_names["aliases"][:]
        for alias in aliases:
            if alias in all_canonical_names:
                emoji_names["aliases"].remove(alias)
            else:
                alias_to_emoji_code[alias].append(emoji_code)  # This is used in STEP 3.

    # STEP 3: We don't support multiple emoji sharing the same alias, but the CLDR data
    # doesn't have that same restriction, so we have to fix this up to have unique aliases.
    # If the alias was specifically specified in custom_emoji_names, then we can keep just
    # that one, but otherwise there's no particular emoji that is an obvious candidate
    # for the alias so just remove the alias for all relevant emoji.
    for alias, emoji_codes in alias_to_emoji_code.items():
        if len(emoji_codes) > 1:
            for emoji_code in emoji_codes:
                if emoji_code not in CUSTOM_EMOJI_NAME_MAPS:
                    all_emojis[emoji_code]["aliases"].remove(alias)

    # STEP 4: We keep non-ascii (non-"English") characters in some emoji names if that's the correct
    # way to spell that word, but always add an alias for an ascii-only version of the word.
    for emoji_code, emoji_names in all_emojis.items():
        # Concatenation builds a new list, so appending to the alias list below
        # does not affect this iteration.
        for name in [emoji_names["canonical_name"]] + emoji_names["aliases"]:
            # These are known names where we don't have an ascii-only version and there are ascii aliases
            # that a user can still enter instead to get the same emoji.
            if name in ["ココ", "サ", "指", "空"]:
                assert any(alias.isascii() for alias in emoji_names["aliases"])
                continue
            if not name.isascii():
                ascii_alias = convert_non_ascii_chars(name)
                # Now no other emoji can use this alias.
                # `.get` avoids inserting empty entries into the defaultdict.
                for code in alias_to_emoji_code.get(ascii_alias, ()):
                    other_aliases = all_emojis[code]["aliases"]
                    # STEP 3 removes shared aliases without updating
                    # alias_to_emoji_code, so the alias may already be gone here;
                    # an unguarded remove() would raise ValueError.
                    if ascii_alias in other_aliases:
                        other_aliases.remove(ascii_alias)
                all_emojis[emoji_code]["aliases"].append(ascii_alias)

    # STEP 5: Write final dictionary to `emoji_names.py`.
    # Names may contain non-ascii characters and repr() keeps them verbatim, so
    # the encoding is pinned to UTF-8 (Python's default source encoding) rather
    # than left to the locale.
    with open(OUT_EMOJI_FILE, "w", encoding="utf-8") as f:
        f.write(
            "from typing import Any\n\n"
            "# Generated with `generate_emoji_names`.\n\n"
            "EMOJI_NAME_MAPS: dict[str, dict[str, Any]] = {\n"
        )
        for key, emoji_names in all_emojis.items():
            f.write(f" {key!r}: {emoji_names!r},\n")
        f.write("}\n")

    print(
        "\n\nDone! You should run the linter to format emoji_names.py with `./tools/lint --fix -m --only ruff-format`"
    )
# Run the generator only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()