2022-06-10 22:43:59 +02:00
|
|
|
|
#!/usr/bin/env python3
|
|
|
|
|
import os
|
2022-06-10 22:40:31 +02:00
|
|
|
|
import re
|
2022-06-10 22:43:59 +02:00
|
|
|
|
import sys
|
2022-06-10 22:40:31 +02:00
|
|
|
|
from collections import defaultdict
|
2022-06-10 22:43:59 +02:00
|
|
|
|
|
|
|
|
|
import orjson
|
|
|
|
|
|
|
|
|
|
ZULIP_PATH = os.path.join(os.path.dirname(os.path.abspath(__file__)), "../../../")
|
|
|
|
|
sys.path.append(ZULIP_PATH)
|
2022-12-04 10:59:47 +01:00
|
|
|
|
|
|
|
|
|
from tools.setup.emoji.custom_emoji_names import CUSTOM_EMOJI_NAME_MAPS
|
|
|
|
|
from tools.setup.emoji.emoji_setup_utils import get_emoji_code
|
2023-08-18 21:49:03 +02:00
|
|
|
|
from zerver.lib.emoji_utils import hex_codepoint_to_emoji, unqualify_emoji
|
2022-12-04 10:59:47 +01:00
|
|
|
|
|
2022-06-10 22:40:31 +02:00
|
|
|
|
# Input data sets, all read from packages under node_modules in the checkout.
CLDR_DATA_FILE = os.path.join(
    ZULIP_PATH,
    "node_modules",
    "cldr-annotations-modern",
    "annotations",
    "en",
    "annotations.json",
)
CLDR_DERIVED_DATA_FILE = os.path.join(
    ZULIP_PATH,
    "node_modules",
    "cldr-annotations-derived-modern",
    "annotationsDerived",
    "en",
    "annotations.json",
)
EMOJI_DATA_FILE = os.path.join(
    ZULIP_PATH,
    "node_modules",
    "emoji-datasource-google",
    "emoji.json",
)

# Python module that this script generates.
OUT_EMOJI_FILE = os.path.join(
    ZULIP_PATH,
    "tools",
    "setup",
    "emoji",
    "emoji_names.py",
)

# The emoji datasource is parsed as-is; main() iterates over its entries.
with open(EMOJI_DATA_FILE, "rb") as fp:
    EMOJI_DATA = orjson.loads(fp.read())

# The CLDR annotations live in two files.  Load the base annotations first and
# then merge the derived ones into the same mapping; on a key clash the
# derived entry replaces the base entry.
with open(CLDR_DATA_FILE, "rb") as fp:
    CLDR_DATA = orjson.loads(fp.read())["annotations"]["annotations"]
with open(CLDR_DERIVED_DATA_FILE, "rb") as fp:
    CLDR_DATA.update(orjson.loads(fp.read())["annotationsDerived"]["annotations"])
|
2022-06-10 22:43:59 +02:00
|
|
|
|
|
|
|
|
|
# We don't include most clock emojis. See `custom_emoji_names` for more context.
# The skipped codes are every code point from U+1F550 through U+1F567 except
# U+1F557, which is deliberately left out of this list.
SKIPPED_CLOCK_EMOJI_CODES = [
    format(codepoint, "x")
    for codepoint in range(0x1F550, 0x1F567 + 1)
    if codepoint != 0x1F557
]
|
|
|
|
|
|
|
|
|
|
# We don't include the skin tones as emojis that one can search for on their own.
# These are the five consecutive code points U+1F3FB through U+1F3FF.
SKIN_TONE_EMOJI_CODES = [format(codepoint, "x") for codepoint in range(0x1F3FB, 0x1F3FF + 1)]
|
|
|
|
|
|
|
|
|
|
|
2022-06-10 22:40:31 +02:00
|
|
|
|
def cleanup_name(name: str) -> str:
    """Normalize a human-readable emoji name into a lowercase identifier.

    Spaces, hyphens and en dashes become underscores; quotes, colons, periods,
    commas and parentheses are dropped; "&" is spelled out as "and".  Runs of
    underscores are then collapsed into one and the result is lowercased.
    """
    to_underscore = " -–"
    to_drop = "“”:.,()‘’'"
    # Every key is a single character and no replacement contains a key, so a
    # single translate() pass gives the same result as replacing one by one.
    table = str.maketrans(
        {
            **dict.fromkeys(to_underscore, "_"),
            **dict.fromkeys(to_drop, ""),
            "&": "and",
        }
    )
    collapsed = re.sub("_{2,}", "_", name.translate(table))
    return collapsed.lower()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def convert_non_ascii_chars(name: str) -> str:
    """Return `name` with the known accented letters replaced by ASCII letters.

    Only the characters listed in the table below are converted.  If anything
    non-ASCII is left afterwards the assertion fails, which is the cue to add
    the missing character to the table.
    """
    ascii_equivalents = (
        ("è", "e"),
        ("ǐ", "i"),
        ("ó", "o"),
        ("ā", "a"),
        ("ō", "o"),
        ("ñ", "n"),
        ("ô", "o"),
        ("ç", "c"),
        ("é", "e"),
        ("ã", "a"),
        ("í", "i"),
        ("å", "a"),
        ("ü", "u"),
    )
    for accented, plain in ascii_equivalents:
        name = name.replace(accented, plain)
    assert (
        name.isascii()
    ), f"{name} still contains non-ascii characters. Add them to convert_non_ascii_chars."
    return name
|
|
|
|
|
|
|
|
|
|
|
2022-06-10 22:43:59 +02:00
|
|
|
|
def main() -> None:
    """Build the emoji name table and write it to `OUT_EMOJI_FILE`.

    An emoji's names are taken from `CUSTOM_EMOJI_NAME_MAPS` when its code has
    an entry there and from the CLDR annotations otherwise.  The work is split
    into five steps, each introduced by a `STEP n` comment below.

    Raises:
        Exception: if two emoji codes end up with the same canonical name.
        AssertionError: if a CLDR entry does not have exactly one "tts" name,
            if one of the known non-convertible names has no ASCII alias, or
            if `convert_non_ascii_chars` meets a character it cannot convert.
    """
    all_emojis = {}
    all_canonical_names = set()

    alias_to_emoji_code = defaultdict(list)

    # STEP 1: Generate first draft of all_emojis.
    for emoji_dict in EMOJI_DATA:
        emoji_code = get_emoji_code(emoji_dict)
        if emoji_code in SKIPPED_CLOCK_EMOJI_CODES or emoji_code in SKIN_TONE_EMOJI_CODES:
            continue

        if emoji_code in CUSTOM_EMOJI_NAME_MAPS:
            canonical_name = cleanup_name(CUSTOM_EMOJI_NAME_MAPS[emoji_code]["canonical_name"])
            if canonical_name in all_canonical_names:
                raise Exception(
                    f"{canonical_name} was already added with a different codepoint. "
                    f"Rename it in `custom_emoji_names` or add an entry for {emoji_code}."
                )
            all_canonical_names.add(canonical_name)
            all_emojis[emoji_code] = CUSTOM_EMOJI_NAME_MAPS[emoji_code]
        else:
            # create the unicode character(s) for the emoji, since this is the key into the CLDR data
            # We can't just use emoji_dict["non_qualified"] because of this upstream bug:
            # https://github.com/iamcal/emoji-data/pull/217
            emoji = unqualify_emoji(hex_codepoint_to_emoji(emoji_dict["unified"]))
            if emoji not in CLDR_DATA:
                print(
                    f"{emoji} not found in custom emoji name maps, but also not found in CLDR data. Skipping."
                )
                continue
            # CLDR_DATA[emoji] is of the form {'default': [...], 'tts': [...]}
            # * "tts" is what's used for text-to-speech and always has one item, so we use that
            #   as the canonical name.
            # * "default" has several items in it that we use as aliases.
            # See also: https://www.unicode.org/reports/tr35/tr35-general.html#14-annotations-and-labels
            assert len(CLDR_DATA[emoji]["tts"]) == 1
            canonical_name = cleanup_name(CLDR_DATA[emoji]["tts"][0].strip())
            if canonical_name in all_canonical_names:
                raise Exception(
                    f"{canonical_name} was already added with a different codepoint. "
                    f"Rename it in `custom_emoji_names` or add an entry for {emoji_code}."
                )
            aliases = [cleanup_name(alias.strip()) for alias in CLDR_DATA[emoji]["default"]]
            all_emojis[emoji_code] = {"canonical_name": canonical_name, "aliases": aliases}
            all_canonical_names.add(canonical_name)

    # STEP 2: We don't support having aliases that collide with canonical names for emoji, so remove them.
    for emoji_code, emoji_names in all_emojis.items():
        # Copy the list to not iterate while elements are being deleted.
        aliases = emoji_names["aliases"][:]
        for alias in aliases:
            if alias in all_canonical_names:
                emoji_names["aliases"].remove(alias)
            else:
                alias_to_emoji_code[alias].append(emoji_code)  # This is used in STEP 3.

    # STEP 3: We don't support multiple emoji sharing the same alias, but the CLDR data
    # doesn't have that same restriction, so we have to fix this up to have unique aliases.
    # If the alias was specifically specified in custom_emoji_names, then we can keep just
    # that one, but otherwise there's no particular emoji that is an obvious candidate
    # for the alias so just remove the alias for all relevant emoji.
    for alias, emoji_codes in alias_to_emoji_code.items():
        if len(emoji_codes) > 1:
            for emoji_code in emoji_codes:
                if emoji_code not in CUSTOM_EMOJI_NAME_MAPS:
                    all_emojis[emoji_code]["aliases"].remove(alias)

    # STEP 4: We keep non-ascii (non-"English") characters in some emoji names if that's the correct
    # way to spell that word, but always add an alias for an ascii-only version of the word.
    for emoji_code, emoji_names in all_emojis.items():
        # The concatenation builds a new list, so appending to the aliases
        # below does not disturb this iteration.
        for name in [emoji_names["canonical_name"]] + emoji_names["aliases"]:
            # These are known names where we don't have an ascii-only version and there are ascii aliases
            # that a user can still enter instead to get the same emoji.
            if name in ["ココ", "サ", "指", "空"]:
                assert any(alias.isascii() for alias in emoji_names["aliases"])
                continue
            if not name.isascii():
                ascii_alias = convert_non_ascii_chars(name)
                # Now no other emoji can use this alias.
                # `alias_to_emoji_code` is a snapshot taken in STEP 2 and is
                # never updated afterwards, so STEP 3 or an earlier iteration
                # of this loop may already have removed the alias from some
                # of these emoji.  Check first: list.remove() raises
                # ValueError when the item is missing.
                for code in alias_to_emoji_code.get(ascii_alias, []):
                    other_aliases = all_emojis[code]["aliases"]
                    if ascii_alias in other_aliases:
                        other_aliases.remove(ascii_alias)
                all_emojis[emoji_code]["aliases"].append(ascii_alias)

    # STEP 5: Write final dictionary to `emoji_names.py`.
    # The names can contain non-ASCII characters, so pin the encoding instead
    # of relying on the platform default.
    with open(OUT_EMOJI_FILE, "w", encoding="utf-8") as f:
        f.write(
            "from typing import Any, Dict\n\n"
            "# Generated with `generate_emoji_names`.\n\n"
            "EMOJI_NAME_MAPS: Dict[str, Dict[str, Any]] = {\n"
        )
        for key, emoji_names in all_emojis.items():
            f.write(f" {key!r}: {emoji_names!r},\n")
        f.write("}\n")

    print(
        "\n\nDone! You should run the linter to format emoji_names.py with `./tools/lint --fix -m --only black`"
    )
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
if __name__ == "__main__":
|
|
|
|
|
main()
|