mirror of https://github.com/zulip/zulip.git
223 lines
8.0 KiB
Python
Executable File
223 lines
8.0 KiB
Python
Executable File
#!/usr/bin/env python3
|
||
import os
|
||
import re
|
||
import sys
|
||
from collections import defaultdict
|
||
|
||
import orjson
|
||
|
||
ZULIP_PATH = os.path.join(os.path.dirname(os.path.abspath(__file__)), "../../../")
|
||
sys.path.append(ZULIP_PATH)
|
||
|
||
from tools.setup.emoji.custom_emoji_names import CUSTOM_EMOJI_NAME_MAPS
|
||
from tools.setup.emoji.emoji_setup_utils import get_emoji_code
|
||
from zerver.lib.emoji_utils import hex_codepoint_to_emoji, unqualify_emoji
|
||
|
||
# Locations of the input data files (all under node_modules) and of the
# Python module that this script regenerates.
_NODE_MODULES_PATH = os.path.join(ZULIP_PATH, "node_modules")

CLDR_DATA_FILE = os.path.join(
    _NODE_MODULES_PATH, "cldr-annotations-modern", "annotations", "en", "annotations.json"
)
CLDR_DERIVED_DATA_FILE = os.path.join(
    _NODE_MODULES_PATH,
    "cldr-annotations-derived-modern",
    "annotationsDerived",
    "en",
    "annotations.json",
)
EMOJI_DATA_FILE = os.path.join(_NODE_MODULES_PATH, "emoji-datasource-google", "emoji.json")
OUT_EMOJI_FILE = os.path.join(ZULIP_PATH, "tools", "setup", "emoji", "emoji_names.py")

# Emoji metadata; main() iterates over this, one entry per emoji.
with open(EMOJI_DATA_FILE, "rb") as fp:
    EMOJI_DATA = orjson.loads(fp.read())

# CLDR annotations, keyed by the emoji character(s).
with open(CLDR_DATA_FILE, "rb") as fp:
    CLDR_DATA = orjson.loads(fp.read())["annotations"]["annotations"]

# Fold the derived annotations into the same mapping; on a key collision the
# derived entry replaces the base one.
with open(CLDR_DERIVED_DATA_FILE, "rb") as fp:
    CLDR_DATA.update(orjson.loads(fp.read())["annotationsDerived"]["annotations"])
|
||
|
||
# We don't include most clock emojis. See `custom_emoji_names` for more context.
# The clock faces occupy the contiguous code points 1f550 through 1f567; every
# one of them is skipped except 1f557, which is deliberately left out of this list.
SKIPPED_CLOCK_EMOJI_CODES = [
    f"{codepoint:x}" for codepoint in range(0x1F550, 0x1F567 + 1) if codepoint != 0x1F557
]
|
||
|
||
# We don't include the skin tones as emojis that one can search for on their own.
# The five skin tone codes are the contiguous code points 1f3fb through 1f3ff.
SKIN_TONE_EMOJI_CODES = [f"{codepoint:x}" for codepoint in range(0x1F3FB, 0x1F3FF + 1)]
|
||
|
||
|
||
def cleanup_name(name: str) -> str:
    """Normalize a human-readable emoji name into an identifier-style name.

    Spaces, hyphens and en dashes become underscores, quotes and the listed
    punctuation are dropped, ``&`` is spelled out as ``and``, runs of
    underscores are collapsed to one, and the result is lowercased.
    """
    # Every key is a single character and no replacement text contains a key,
    # so one translate() pass gives the same result as replacing each
    # character in turn.
    char_table = str.maketrans(
        {
            " ": "_",
            "-": "_",
            "–": "_",
            "“": "",
            "”": "",
            ":": "",
            ".": "",
            ",": "",
            "(": "",
            ")": "",
            "&": "and",
            "‘": "",
            "’": "",
            "'": "",
        }
    )
    translated = name.translate(char_table)
    # Adjacent separators (e.g. " - ") each became "_"; squash them into one.
    collapsed = re.sub(r"_{2,}", "_", translated)
    return collapsed.lower()
|
||
|
||
|
||
def convert_non_ascii_chars(name: str) -> str:
    """Return an ASCII-only spelling of ``name``.

    Each accented letter listed in the table below is replaced by its
    unaccented base letter; all other characters are left untouched.

    Raises:
        AssertionError: if the result still contains a non-ASCII character,
            meaning the table below needs a new entry.
    """
    ascii_table = str.maketrans(
        {
            "è": "e",
            "ǐ": "i",
            "ó": "o",
            "ā": "a",
            "ō": "o",
            "ñ": "n",
            "ô": "o",
            "ç": "c",
            "é": "e",
            "ã": "a",
            "í": "i",
            "å": "a",
            "ü": "u",
        }
    )
    name = name.translate(ascii_table)
    # Raise explicitly rather than via `assert`: assert statements are removed
    # when Python runs with -O, which would let a non-ASCII name slip through
    # unnoticed. The exception type and message are unchanged.
    if not name.isascii():
        raise AssertionError(
            f"{name} still contains non-ascii characters. Add them to convert_non_ascii_chars."
        )
    return name
|
||
|
||
|
||
def main() -> None:
    """Regenerate the emoji name map and write it to ``OUT_EMOJI_FILE``.

    For every emoji in ``EMOJI_DATA`` (except the skipped clock and skin tone
    codes) a canonical name and a list of aliases are taken from
    ``CUSTOM_EMOJI_NAME_MAPS`` when the emoji has an entry there, and from the
    CLDR annotations otherwise. Aliases are then pruned so that no alias
    equals a canonical name or is shared between emoji, ASCII-only aliases are
    added for non-ASCII names, and the result is written out as a Python
    module defining ``EMOJI_NAME_MAPS``.

    Raises:
        Exception: if two emoji end up with the same canonical name.
    """
    all_emojis = {}
    all_canonical_names = set()

    alias_to_emoji_code = defaultdict(list)

    # STEP 1: Generate first draft of all_emojis.
    for emoji_dict in EMOJI_DATA:
        emoji_code = get_emoji_code(emoji_dict)
        if emoji_code in SKIPPED_CLOCK_EMOJI_CODES or emoji_code in SKIN_TONE_EMOJI_CODES:
            continue

        if emoji_code in CUSTOM_EMOJI_NAME_MAPS:
            custom_names = CUSTOM_EMOJI_NAME_MAPS[emoji_code]
            canonical_name = cleanup_name(custom_names["canonical_name"])
            if canonical_name in all_canonical_names:
                raise Exception(
                    f"{canonical_name} was already added with a different codepoint. "
                    f"Rename it in `custom_emoji_names` or add an entry for {emoji_code}."
                )
            all_canonical_names.add(canonical_name)
            # Store a copy with its own alias list: the steps below remove and
            # append aliases in place, and that must not modify the imported
            # CUSTOM_EMOJI_NAME_MAPS itself.
            all_emojis[emoji_code] = {**custom_names, "aliases": list(custom_names["aliases"])}
        else:
            # create the unicode character(s) for the emoji, since this is the key into the CLDR data
            # We can't just use emoji_dict["non_qualified"] because of this upstream bug:
            # https://github.com/iamcal/emoji-data/pull/217
            emoji = unqualify_emoji(hex_codepoint_to_emoji(emoji_dict["unified"]))
            if emoji not in CLDR_DATA:
                print(
                    f"{emoji} not found in custom emoji name maps, but also not found in CLDR data. Skipping."
                )
                continue
            # CLDR_DATA[emoji] is of the form {'default': [...], 'tts': [...]}
            # * "tts" is what's used for text-to-speech and always has one item, so we use that
            #   as the canonical name.
            # * "default" has several items in it that we use as aliases.
            # See also: https://www.unicode.org/reports/tr35/tr35-general.html#14-annotations-and-labels
            assert len(CLDR_DATA[emoji]["tts"]) == 1
            canonical_name = cleanup_name(CLDR_DATA[emoji]["tts"][0].strip())
            if canonical_name in all_canonical_names:
                raise Exception(
                    f"{canonical_name} was already added with a different codepoint. "
                    f"Rename it in `custom_emoji_names` or add an entry for {emoji_code}."
                )
            aliases = [cleanup_name(alias.strip()) for alias in CLDR_DATA[emoji]["default"]]
            all_emojis[emoji_code] = {"canonical_name": canonical_name, "aliases": aliases}
            all_canonical_names.add(canonical_name)

    # STEP 2: We don't support having aliases that collide with canonical names for emoji, so remove them.
    for emoji_code, emoji_names in all_emojis.items():
        # Copy the list to not iterate while elements are being deleted.
        aliases = emoji_names["aliases"][:]
        for alias in aliases:
            if alias in all_canonical_names:
                emoji_names["aliases"].remove(alias)
            else:
                alias_to_emoji_code[alias].append(emoji_code)  # This is used in STEP 3.

    # STEP 3: We don't support multiple emoji sharing the same alias, but the CLDR data
    # doesn't have that same restriction, so we have to fix this up to have unique aliases.
    # If the alias was specifically specified in custom_emoji_names, then we can keep just
    # that one, but otherwise there's no particular emoji that is an obvious candidate
    # for the alias so just remove the alias for all relevant emoji.
    for alias, emoji_codes in alias_to_emoji_code.items():
        if len(emoji_codes) > 1:
            for emoji_code in emoji_codes:
                if emoji_code not in CUSTOM_EMOJI_NAME_MAPS:
                    all_emojis[emoji_code]["aliases"].remove(alias)

    # STEP 4: We keep non-ascii (non-"English") characters in some emoji names if that's the correct
    # way to spell that word, but always add an alias for an ascii-only version of the word.
    for emoji_code, emoji_names in all_emojis.items():
        # The concatenation builds a new list, so appending to
        # emoji_names["aliases"] inside the loop does not affect the iteration.
        for name in [emoji_names["canonical_name"]] + emoji_names["aliases"]:
            # These are known names where we don't have an ascii-only version and there are ascii aliases
            # that a user can still enter instead to get the same emoji.
            if name in ["ココ", "サ", "指", "空"]:
                assert any(alias.isascii() for alias in emoji_names["aliases"])
                continue
            if not name.isascii():
                ascii_alias = convert_non_ascii_chars(name)
                # Now no other emoji can use this alias.
                # alias_to_emoji_code was built in STEP 2 and is not updated by
                # STEP 3, so it can still list emoji whose copy of this alias
                # was already removed; only remove where it is still present.
                for code in alias_to_emoji_code.get(ascii_alias, []):
                    other_aliases = all_emojis[code]["aliases"]
                    if ascii_alias in other_aliases:
                        other_aliases.remove(ascii_alias)
                all_emojis[emoji_code]["aliases"].append(ascii_alias)

    # STEP 5: Write final dictionary to `emoji_names.py`.
    # The written reprs contain non-ascii names, so the encoding is fixed to
    # UTF-8 rather than left to the locale default.
    with open(OUT_EMOJI_FILE, "w", encoding="utf-8") as f:
        f.write(
            "from typing import Any, Dict\n\n"
            "# Generated with `generate_emoji_names`.\n\n"
            "EMOJI_NAME_MAPS: Dict[str, Dict[str, Any]] = {\n"
        )
        for key, emoji_names in all_emojis.items():
            f.write(f" {key!r}: {emoji_names!r},\n")
        f.write("}\n")

    print(
        "\n\nDone! You should run the linter to format emoji_names.py with `./tools/lint --fix -m --only black`"
    )


if __name__ == "__main__":
    main()
|