mirror of https://github.com/zulip/zulip.git
emoji: Finish script to generate emoji_names.py with CLDR data.
This script pulls from our previously custom-written emoji strings and fills in the rest from CLDR. It also removes 4 entries from the custom emoji name maps whose names collide with some of the new CLDR names (those emoji will now just be called by their CLDR name).
This commit is contained in:
parent
44df15e19b
commit
8a9e68e026
|
@ -21,7 +21,7 @@ run_test("initialize", () => {
|
|||
|
||||
const complete_emoji_catalog = _.sortBy(emoji_picker.complete_emoji_catalog, "name");
|
||||
assert.equal(complete_emoji_catalog.length, 11);
|
||||
assert.equal(emoji.emojis_by_name.size, 1052);
|
||||
assert.equal(emoji.emojis_by_name.size, 1817);
|
||||
|
||||
let total_emoji_in_categories = 0;
|
||||
|
||||
|
@ -42,17 +42,17 @@ run_test("initialize", () => {
|
|||
}
|
||||
const popular_emoji_count = 6;
|
||||
const zulip_emoji_count = 1;
|
||||
assert_emoji_category(complete_emoji_catalog.pop(), "fa-car", 170);
|
||||
assert_emoji_category(complete_emoji_catalog.pop(), "fa-hashtag", 197);
|
||||
assert_emoji_category(complete_emoji_catalog.pop(), "fa-smile-o", 129);
|
||||
assert_emoji_category(complete_emoji_catalog.pop(), "fa-car", 195);
|
||||
assert_emoji_category(complete_emoji_catalog.pop(), "fa-hashtag", 221);
|
||||
assert_emoji_category(complete_emoji_catalog.pop(), "fa-smile-o", 162);
|
||||
assert_emoji_category(complete_emoji_catalog.pop(), "fa-star-o", popular_emoji_count);
|
||||
assert_emoji_category(complete_emoji_catalog.pop(), "fa-thumbs-o-up", 102);
|
||||
assert_emoji_category(complete_emoji_catalog.pop(), "fa-lightbulb-o", 189);
|
||||
assert_emoji_category(complete_emoji_catalog.pop(), "fa-cutlery", 92);
|
||||
assert_emoji_category(complete_emoji_catalog.pop(), "fa-flag", 5);
|
||||
assert_emoji_category(complete_emoji_catalog.pop(), "fa-thumbs-o-up", 353);
|
||||
assert_emoji_category(complete_emoji_catalog.pop(), "fa-lightbulb-o", 255);
|
||||
assert_emoji_category(complete_emoji_catalog.pop(), "fa-cutlery", 132);
|
||||
assert_emoji_category(complete_emoji_catalog.pop(), "fa-flag", 268);
|
||||
assert_emoji_category(complete_emoji_catalog.pop(), "fa-cog", 1);
|
||||
assert_emoji_category(complete_emoji_catalog.pop(), "fa-leaf", 104);
|
||||
assert_emoji_category(complete_emoji_catalog.pop(), "fa-soccer-ball-o", 63);
|
||||
assert_emoji_category(complete_emoji_catalog.pop(), "fa-leaf", 144);
|
||||
assert_emoji_category(complete_emoji_catalog.pop(), "fa-soccer-ball-o", 86);
|
||||
|
||||
// The popular emoji appear twice in the picker, and the zulip emoji is special
|
||||
assert.equal(
|
||||
|
|
|
@ -98,6 +98,8 @@
|
|||
"@typescript-eslint/parser": "^5.0.0",
|
||||
"babel-plugin-rewire-ts": "^1.4.0",
|
||||
"callsites": "^3.1.0",
|
||||
"cldr-annotations-derived-modern": "^41.0.0",
|
||||
"cldr-annotations-modern": "^41.0.0",
|
||||
"diff": "^5.0.0",
|
||||
"difflib": "^0.2.4",
|
||||
"enhanced-resolve": "^5.8.2",
|
||||
|
|
|
@ -28,7 +28,6 @@ CUSTOM_EMOJI_NAME_MAPS: Dict[str, Dict[str, Any]] = {
|
|||
"1f60d": {"canonical_name": "heart_eyes", "aliases": ["in_love"]},
|
||||
# blow_a_kiss from https://beebom.com/emoji-meanings/
|
||||
"1f618": {"canonical_name": "heart_kiss", "aliases": ["blow_a_kiss"]},
|
||||
"1f617": {"canonical_name": "kiss", "aliases": []},
|
||||
"1f619": {"canonical_name": "kiss_smiling_eyes", "aliases": []},
|
||||
"1f61a": {"canonical_name": "kiss_with_blush", "aliases": []},
|
||||
"1f60b": {"canonical_name": "yum", "aliases": []},
|
||||
|
@ -411,10 +410,6 @@ CUSTOM_EMOJI_NAME_MAPS: Dict[str, Dict[str, Any]] = {
|
|||
# spiral_shell from Unicode/gemoji, the others seemed like reasonable
|
||||
# additions
|
||||
"1f41a": {"canonical_name": "shell", "aliases": ["seashell", "conch", "spiral_shell"]},
|
||||
# Unicode/gemoji have lady_beetle; hopefully with ladybug we get both the
|
||||
# people that prefer lady_beetle (with beetle) and ladybug. There is also
|
||||
# ladybird, but seems a bit much for this to complete for bird.
|
||||
"1f41e": {"canonical_name": "beetle", "aliases": ["ladybug"]},
|
||||
"1f41c": {"canonical_name": "ant", "aliases": []},
|
||||
"1f577": {"canonical_name": "spider", "aliases": []},
|
||||
"1f578": {"canonical_name": "web", "aliases": ["spider_web"]},
|
||||
|
@ -738,8 +733,6 @@ CUSTOM_EMOJI_NAME_MAPS: Dict[str, Dict[str, Any]] = {
|
|||
"1f945": {"canonical_name": "gooooooooal", "aliases": ["goal"]},
|
||||
"1f3d2": {"canonical_name": "ice_hockey", "aliases": []},
|
||||
"1f3d1": {"canonical_name": "field_hockey", "aliases": []},
|
||||
# would say bat, but taken by Nature/30
|
||||
"1f3cf": {"canonical_name": "cricket", "aliases": ["cricket_bat"]},
|
||||
# hole_in_one seems like a more useful name to have. Sent golf to
|
||||
# Activity/39
|
||||
"26f3": {"canonical_name": "hole_in_one", "aliases": []},
|
||||
|
@ -1210,8 +1203,6 @@ CUSTOM_EMOJI_NAME_MAPS: Dict[str, Dict[str, Any]] = {
|
|||
"1f4ee": {"canonical_name": "mail_dropoff", "aliases": []},
|
||||
"1f4ef": {"canonical_name": "horn", "aliases": []},
|
||||
"1f4dc": {"canonical_name": "scroll", "aliases": []},
|
||||
# receipt seems more useful?
|
||||
"1f4c3": {"canonical_name": "receipt", "aliases": []},
|
||||
"1f4c4": {"canonical_name": "document", "aliases": ["paper", "file", "page"]},
|
||||
"1f4d1": {"canonical_name": "place_holder", "aliases": []},
|
||||
"1f4ca": {"canonical_name": "bar_chart", "aliases": []},
|
||||
|
|
File diff suppressed because it is too large
Load Diff
|
@ -1,7 +1,8 @@
|
|||
#!/usr/bin/env python3
|
||||
import json
|
||||
import os
|
||||
import re
|
||||
import sys
|
||||
from collections import defaultdict
|
||||
|
||||
import orjson
|
||||
|
||||
|
@ -10,11 +11,26 @@ from emoji_setup_utils import get_emoji_code
|
|||
|
||||
ZULIP_PATH = os.path.join(os.path.dirname(os.path.abspath(__file__)), "../../../")
|
||||
sys.path.append(ZULIP_PATH)
|
||||
CLDR_DATA_FILE = os.path.join(
|
||||
ZULIP_PATH, "node_modules", "cldr-annotations-modern", "annotations", "en", "annotations.json"
|
||||
)
|
||||
CLDR_DERIVED_DATA_FILE = os.path.join(
|
||||
ZULIP_PATH,
|
||||
"node_modules",
|
||||
"cldr-annotations-derived-modern",
|
||||
"annotationsDerived",
|
||||
"en",
|
||||
"annotations.json",
|
||||
)
|
||||
EMOJI_DATA_FILE = os.path.join(ZULIP_PATH, "node_modules", "emoji-datasource-google", "emoji.json")
|
||||
OUT_EMOJI_FILE = os.path.join(ZULIP_PATH, "tools", "setup", "emoji", "emoji_names.py")
|
||||
|
||||
with open(EMOJI_DATA_FILE, "rb") as fp:
|
||||
EMOJI_DATA = orjson.loads(fp.read())
|
||||
with open(CLDR_DATA_FILE, "rb") as fp:
|
||||
CLDR_DATA = orjson.loads(fp.read())["annotations"]["annotations"]
|
||||
with open(CLDR_DERIVED_DATA_FILE, "rb") as fp:
|
||||
CLDR_DATA.update(orjson.loads(fp.read())["annotationsDerived"]["annotations"])
|
||||
|
||||
# We don't include most clock emojis. See `custom_emoji_names` for more context.
|
||||
SKIPPED_CLOCK_EMOJI_CODES = [
|
||||
|
@ -53,17 +69,62 @@ SKIN_TONE_EMOJI_CODES = [
|
|||
]
|
||||
|
||||
|
||||
def cleanup_name(name: str) -> str:
    """Normalize a human-readable emoji name into a lowercase identifier.

    Spaces, hyphens and en dashes become underscores; curly double quotes,
    colons and periods are dropped; "&" is spelled out as "and"; curly
    single quotes become straight apostrophes. Runs of two or more
    underscores are then collapsed into one and the result is lowercased.
    """
    # Every key is a single character and no replacement text contains
    # another key, so one translate() pass gives the same result as applying
    # the substitutions one after another.
    translation = str.maketrans(
        {
            " ": "_",
            "-": "_",
            "–": "_",
            "“": "",
            "”": "",
            ":": "",
            ".": "",
            "&": "and",
            "‘": "'",
            "’": "'",
        }
    )
    collapsed = re.sub("_{2,}", "_", name.translate(translation))
    return collapsed.lower()
|
||||
|
||||
|
||||
def convert_non_ascii_chars(name: str) -> str:
    """Return `name` with known accented letters swapped for ASCII ones.

    Only the accented letters listed below are handled. If anything
    non-ASCII remains after the substitutions, the assertion fails with a
    message asking for the missing character to be added here.
    """
    # Grouped by the plain ASCII letter each accented form maps to.
    ascii_equivalents = {
        "a": ("ā", "ã", "å"),
        "c": ("ç",),
        "e": ("è", "é"),
        "i": ("ǐ", "í"),
        "n": ("ñ",),
        "o": ("ó", "ō", "ô"),
    }
    for plain, accented_forms in ascii_equivalents.items():
        for accented in accented_forms:
            name = name.replace(accented, plain)
    assert (
        name.isascii()
    ), f"{name} still contains non-ascii characters. Add them to convert_non_ascii_chars."
    return name
|
||||
|
||||
|
||||
def main() -> None:
|
||||
all_emojis = {}
|
||||
all_canonical_names = set()
|
||||
|
||||
alias_to_emoji_code = defaultdict(list)
|
||||
|
||||
# STEP 1: Generate first draft of all_emojis.
|
||||
for emoji_dict in EMOJI_DATA:
|
||||
emoji_code = get_emoji_code(emoji_dict)
|
||||
if emoji_code in SKIPPED_CLOCK_EMOJI_CODES or emoji_code in SKIN_TONE_EMOJI_CODES:
|
||||
continue
|
||||
|
||||
if emoji_code in CUSTOM_EMOJI_NAME_MAPS:
|
||||
canonical_name = CUSTOM_EMOJI_NAME_MAPS[emoji_code]["canonical_name"]
|
||||
canonical_name = cleanup_name(CUSTOM_EMOJI_NAME_MAPS[emoji_code]["canonical_name"])
|
||||
if canonical_name in all_canonical_names:
|
||||
raise Exception(
|
||||
f"{canonical_name} was already added with a different codepoint. "
|
||||
|
@ -72,9 +133,70 @@ def main() -> None:
|
|||
all_canonical_names.add(canonical_name)
|
||||
all_emojis[emoji_code] = CUSTOM_EMOJI_NAME_MAPS[emoji_code]
|
||||
else:
|
||||
continue # this commit doesn't add CLDR data yet.
|
||||
# create the unicode character(s) for the emoji, since this is the key into the CLDR data
|
||||
emoji = "".join(
|
||||
chr(int(h, 16))
|
||||
for h in (emoji_dict["non_qualified"] or emoji_dict["unified"]).split("-")
|
||||
)
|
||||
if emoji not in CLDR_DATA:
|
||||
print(
|
||||
f"{emoji} not found in custom emoji name maps, but also not found in CLDR data. Skipping."
|
||||
)
|
||||
continue
|
||||
# CLDR_DATA[emoji] is of the form {'default': [...], 'tts': [...]}
|
||||
# * "tts" is what's used for text-to-speech and always has one item, so we use that
|
||||
# as the canonical name.
|
||||
# * "default" has several items in it that we use as aliases.
|
||||
# See also: https://www.unicode.org/reports/tr35/tr35-general.html#14-annotations-and-labels
|
||||
assert len(CLDR_DATA[emoji]["tts"]) == 1
|
||||
canonical_name = cleanup_name(CLDR_DATA[emoji]["tts"][0].strip())
|
||||
if canonical_name in all_canonical_names:
|
||||
raise Exception(
|
||||
f"{canonical_name} was already added with a different codepoint. "
|
||||
f"Rename it in `custom_emoji_names` or add an entry for {emoji_code}."
|
||||
)
|
||||
aliases = [cleanup_name(alias.strip()) for alias in CLDR_DATA[emoji]["default"]]
|
||||
all_emojis[emoji_code] = {"canonical_name": canonical_name, "aliases": aliases}
|
||||
all_canonical_names.add(canonical_name)
|
||||
|
||||
# STEP 2: We don't support having aliases that collide with canonical names for emoji, so remove them.
|
||||
for (emoji_code, emoji_names) in all_emojis.items():
|
||||
# Copy the list to not iterate while elements are being deleted.
|
||||
aliases = emoji_names["aliases"][:]
|
||||
for alias in aliases:
|
||||
if alias in all_canonical_names:
|
||||
emoji_names["aliases"].remove(alias)
|
||||
else:
|
||||
alias_to_emoji_code[alias].append(emoji_code) # This is used in STEP 3.
|
||||
|
||||
# STEP 3: We don't support multiple emoji sharing the same alias, but the CLDR data
|
||||
# doesn't have that same restriction, so we have to fix this up to have unique aliases.
|
||||
# If the alias was specifically specified in custom_emoji_names, then we can keep just
|
||||
# that one, but otherwise there's no particular emoji that is an obvious candidate
|
||||
# for the alias so just remove the alias for all relevant emoji.
|
||||
for alias in alias_to_emoji_code.keys():
|
||||
if len(alias_to_emoji_code[alias]) > 1:
|
||||
for emoji_code in alias_to_emoji_code[alias]:
|
||||
if emoji_code not in CUSTOM_EMOJI_NAME_MAPS:
|
||||
all_emojis[emoji_code]["aliases"].remove(alias)
|
||||
|
||||
# STEP 4: We keep non-ascii (non-"English") characters in some emoji names if that's the correct
|
||||
# way to spell that word, but always add an alias for an ascii-only version of the word.
|
||||
for (emoji_code, emoji_names) in all_emojis.items():
|
||||
for name in [emoji_names["canonical_name"]] + emoji_names["aliases"]:
|
||||
# These are known names where we don't have an ascii-only version and there are ascii aliases
|
||||
# that a user can still enter instead to get the same emoji.
|
||||
if name in ["ココ", "サ", "指", "空"]:
|
||||
assert any(alias.isascii() for alias in aliases)
|
||||
continue
|
||||
if not name.isascii():
|
||||
ascii_alias = convert_non_ascii_chars(name)
|
||||
# Now no other emoji can use this alias.
|
||||
for code in alias_to_emoji_code[ascii_alias]:
|
||||
all_emojis[code]["aliases"].remove(ascii_alias)
|
||||
all_emojis[emoji_code]["aliases"].append(ascii_alias)
|
||||
|
||||
# STEP 5: Write final dictionary to `emoji_names.py`.
|
||||
with open(OUT_EMOJI_FILE, "w") as f:
|
||||
f.write(
|
||||
"from typing import Any, Dict\n\n"
|
||||
|
@ -82,7 +204,7 @@ def main() -> None:
|
|||
"EMOJI_NAME_MAPS: Dict[str, Dict[str, Any]] = {\n"
|
||||
)
|
||||
for (key, emoji_names) in all_emojis.items():
|
||||
f.write(f' "{key}": {json.dumps(emoji_names)},\n')
|
||||
f.write(f" {key!r}: {emoji_names!r},\n")
|
||||
f.write("}\n")
|
||||
|
||||
print(
|
||||
|
|
10
yarn.lock
10
yarn.lock
|
@ -3073,6 +3073,16 @@ clamp@^1.0.1:
|
|||
resolved "https://registry.yarnpkg.com/clamp/-/clamp-1.0.1.tgz#66a0e64011816e37196828fdc8c8c147312c8634"
|
||||
integrity sha512-kgMuFyE78OC6Dyu3Dy7vcx4uy97EIbVxJB/B0eJ3bUNAkwdNcxYzgKltnyADiYwsR7SEqkkUPsEUT//OVS6XMA==
|
||||
|
||||
cldr-annotations-derived-modern@^41.0.0:
|
||||
version "41.0.0"
|
||||
resolved "https://registry.yarnpkg.com/cldr-annotations-derived-modern/-/cldr-annotations-derived-modern-41.0.0.tgz#da55423006c5b9dd742d9395a2318b05d6a6efa2"
|
||||
integrity sha512-V9N8CW+DDem3NZlpGKPdvTkoqZV/rNbZq27UZO1d0JXhM+pZo6e8n3+GSqSDgIAXphxn1v9Yd9U+X9NzZ+PCcA==
|
||||
|
||||
cldr-annotations-modern@^41.0.0:
|
||||
version "41.0.0"
|
||||
resolved "https://registry.yarnpkg.com/cldr-annotations-modern/-/cldr-annotations-modern-41.0.0.tgz#5eeaef3b250e30e0dd7e8babc28a3a70c71a7192"
|
||||
integrity sha512-ymcfrliWq6IFB9vnDuT9awT7z9bTkh03g39eeU5RIYK0hFuZAPsYdpgs461GEuUKCL7SmDXXyZJwmf9iGYsCdg==
|
||||
|
||||
clean-css@^5.1.0, clean-css@^5.2.2:
|
||||
version "5.3.1"
|
||||
resolved "https://registry.yarnpkg.com/clean-css/-/clean-css-5.3.1.tgz#d0610b0b90d125196a2894d35366f734e5d7aa32"
|
||||
|
|
|
@ -758,14 +758,15 @@ class RocketChatImporter(ZulipTestCase):
|
|||
zerver_realmemoji=zerver_realmemoji,
|
||||
)
|
||||
|
||||
# :grin: and :star_struck: are not present in Zulip's default
|
||||
# emoji set, or in Reaction.UNICODE_EMOJI reaction type.
|
||||
self.assert_length(total_reactions, 7)
|
||||
# :grin: is not present in Zulip's default emoji set,
|
||||
# or in Reaction.UNICODE_EMOJI reaction type.
|
||||
self.assert_length(total_reactions, 8)
|
||||
|
||||
grinning_emoji_code = name_to_codepoint["grinning"]
|
||||
innocent_emoji_code = name_to_codepoint["innocent"]
|
||||
heart_emoji_code = name_to_codepoint["heart"]
|
||||
rocket_emoji_code = name_to_codepoint["rocket"]
|
||||
star_struck_emoji_code = name_to_codepoint["star_struck"]
|
||||
|
||||
realmemoji_code = {}
|
||||
for emoji in zerver_realmemoji:
|
||||
|
@ -777,7 +778,16 @@ class RocketChatImporter(ZulipTestCase):
|
|||
)
|
||||
self.assertEqual(
|
||||
self.get_set(total_reactions, "emoji_name"),
|
||||
{"grinning", "innocent", "heart", "rocket", "check", "zulip", "harry-ron"},
|
||||
{
|
||||
"grinning",
|
||||
"innocent",
|
||||
"star_struck",
|
||||
"heart",
|
||||
"rocket",
|
||||
"check",
|
||||
"zulip",
|
||||
"harry-ron",
|
||||
},
|
||||
)
|
||||
self.assertEqual(
|
||||
self.get_set(total_reactions, "emoji_code"),
|
||||
|
@ -786,13 +796,14 @@ class RocketChatImporter(ZulipTestCase):
|
|||
innocent_emoji_code,
|
||||
heart_emoji_code,
|
||||
rocket_emoji_code,
|
||||
star_struck_emoji_code,
|
||||
realmemoji_code["check"],
|
||||
realmemoji_code["zulip"],
|
||||
realmemoji_code["harry-ron"],
|
||||
},
|
||||
)
|
||||
self.assertEqual(self.get_set(total_reactions, "user_profile"), {2, 3, 4})
|
||||
self.assert_length(self.get_set(total_reactions, "id"), 7)
|
||||
self.assert_length(self.get_set(total_reactions, "id"), 8)
|
||||
self.assert_length(self.get_set(total_reactions, "message"), 1)
|
||||
|
||||
def test_process_message_attachment(self) -> None:
|
||||
|
|
Loading…
Reference in New Issue