emoji: Finish script to generate emoji_names.py with CLDR data.

This script takes names from our previously custom-written emoji strings
and fills in the rest from CLDR. This commit also removes four custom emoji
name entries that collide with some of the new CLDR names (those emoji will
now just be called by their CLDR names).
This commit is contained in:
evykassirer 2022-06-10 13:40:31 -07:00 committed by Tim Abbott
parent 44df15e19b
commit 8a9e68e026
7 changed files with 1100 additions and 34 deletions

View File

@ -21,7 +21,7 @@ run_test("initialize", () => {
const complete_emoji_catalog = _.sortBy(emoji_picker.complete_emoji_catalog, "name");
assert.equal(complete_emoji_catalog.length, 11);
assert.equal(emoji.emojis_by_name.size, 1052);
assert.equal(emoji.emojis_by_name.size, 1817);
let total_emoji_in_categories = 0;
@ -42,17 +42,17 @@ run_test("initialize", () => {
}
const popular_emoji_count = 6;
const zulip_emoji_count = 1;
assert_emoji_category(complete_emoji_catalog.pop(), "fa-car", 170);
assert_emoji_category(complete_emoji_catalog.pop(), "fa-hashtag", 197);
assert_emoji_category(complete_emoji_catalog.pop(), "fa-smile-o", 129);
assert_emoji_category(complete_emoji_catalog.pop(), "fa-car", 195);
assert_emoji_category(complete_emoji_catalog.pop(), "fa-hashtag", 221);
assert_emoji_category(complete_emoji_catalog.pop(), "fa-smile-o", 162);
assert_emoji_category(complete_emoji_catalog.pop(), "fa-star-o", popular_emoji_count);
assert_emoji_category(complete_emoji_catalog.pop(), "fa-thumbs-o-up", 102);
assert_emoji_category(complete_emoji_catalog.pop(), "fa-lightbulb-o", 189);
assert_emoji_category(complete_emoji_catalog.pop(), "fa-cutlery", 92);
assert_emoji_category(complete_emoji_catalog.pop(), "fa-flag", 5);
assert_emoji_category(complete_emoji_catalog.pop(), "fa-thumbs-o-up", 353);
assert_emoji_category(complete_emoji_catalog.pop(), "fa-lightbulb-o", 255);
assert_emoji_category(complete_emoji_catalog.pop(), "fa-cutlery", 132);
assert_emoji_category(complete_emoji_catalog.pop(), "fa-flag", 268);
assert_emoji_category(complete_emoji_catalog.pop(), "fa-cog", 1);
assert_emoji_category(complete_emoji_catalog.pop(), "fa-leaf", 104);
assert_emoji_category(complete_emoji_catalog.pop(), "fa-soccer-ball-o", 63);
assert_emoji_category(complete_emoji_catalog.pop(), "fa-leaf", 144);
assert_emoji_category(complete_emoji_catalog.pop(), "fa-soccer-ball-o", 86);
// The popular emoji appear twice in the picker, and the zulip emoji is special
assert.equal(

View File

@ -98,6 +98,8 @@
"@typescript-eslint/parser": "^5.0.0",
"babel-plugin-rewire-ts": "^1.4.0",
"callsites": "^3.1.0",
"cldr-annotations-derived-modern": "^41.0.0",
"cldr-annotations-modern": "^41.0.0",
"diff": "^5.0.0",
"difflib": "^0.2.4",
"enhanced-resolve": "^5.8.2",

View File

@ -28,7 +28,6 @@ CUSTOM_EMOJI_NAME_MAPS: Dict[str, Dict[str, Any]] = {
"1f60d": {"canonical_name": "heart_eyes", "aliases": ["in_love"]},
# blow_a_kiss from https://beebom.com/emoji-meanings/
"1f618": {"canonical_name": "heart_kiss", "aliases": ["blow_a_kiss"]},
"1f617": {"canonical_name": "kiss", "aliases": []},
"1f619": {"canonical_name": "kiss_smiling_eyes", "aliases": []},
"1f61a": {"canonical_name": "kiss_with_blush", "aliases": []},
"1f60b": {"canonical_name": "yum", "aliases": []},
@ -411,10 +410,6 @@ CUSTOM_EMOJI_NAME_MAPS: Dict[str, Dict[str, Any]] = {
# spiral_shell from Unicode/gemoji, the others seemed like reasonable
# additions
"1f41a": {"canonical_name": "shell", "aliases": ["seashell", "conch", "spiral_shell"]},
# Unicode/gemoji have lady_beetle; hopefully with ladybug we get both the
# people that prefer lady_beetle (with beetle) and ladybug. There is also
# ladybird, but seems a bit much for this to complete for bird.
"1f41e": {"canonical_name": "beetle", "aliases": ["ladybug"]},
"1f41c": {"canonical_name": "ant", "aliases": []},
"1f577": {"canonical_name": "spider", "aliases": []},
"1f578": {"canonical_name": "web", "aliases": ["spider_web"]},
@ -738,8 +733,6 @@ CUSTOM_EMOJI_NAME_MAPS: Dict[str, Dict[str, Any]] = {
"1f945": {"canonical_name": "gooooooooal", "aliases": ["goal"]},
"1f3d2": {"canonical_name": "ice_hockey", "aliases": []},
"1f3d1": {"canonical_name": "field_hockey", "aliases": []},
# would say bat, but taken by Nature/30
"1f3cf": {"canonical_name": "cricket", "aliases": ["cricket_bat"]},
# hole_in_one seems like a more useful name to have. Sent golf to
# Activity/39
"26f3": {"canonical_name": "hole_in_one", "aliases": []},
@ -1210,8 +1203,6 @@ CUSTOM_EMOJI_NAME_MAPS: Dict[str, Dict[str, Any]] = {
"1f4ee": {"canonical_name": "mail_dropoff", "aliases": []},
"1f4ef": {"canonical_name": "horn", "aliases": []},
"1f4dc": {"canonical_name": "scroll", "aliases": []},
# receipt seems more useful?
"1f4c3": {"canonical_name": "receipt", "aliases": []},
"1f4c4": {"canonical_name": "document", "aliases": ["paper", "file", "page"]},
"1f4d1": {"canonical_name": "place_holder", "aliases": []},
"1f4ca": {"canonical_name": "bar_chart", "aliases": []},

File diff suppressed because it is too large Load Diff

View File

@ -1,7 +1,8 @@
#!/usr/bin/env python3
import json
import os
import re
import sys
from collections import defaultdict
import orjson
@ -10,11 +11,26 @@ from emoji_setup_utils import get_emoji_code
# Repository root, computed as three directory levels above the directory
# containing this script; appended to sys.path for later imports.
ZULIP_PATH = os.path.join(os.path.dirname(os.path.abspath(__file__)), "../../../")
sys.path.append(ZULIP_PATH)
# English CLDR annotations file from the `cldr-annotations-modern` package
# under node_modules.
CLDR_DATA_FILE = os.path.join(
ZULIP_PATH, "node_modules", "cldr-annotations-modern", "annotations", "en", "annotations.json"
)
# English derived CLDR annotations file from the
# `cldr-annotations-derived-modern` package under node_modules.
CLDR_DERIVED_DATA_FILE = os.path.join(
ZULIP_PATH,
"node_modules",
"cldr-annotations-derived-modern",
"annotationsDerived",
"en",
"annotations.json",
)
# Emoji list from the `emoji-datasource-google` package under node_modules.
EMOJI_DATA_FILE = os.path.join(ZULIP_PATH, "node_modules", "emoji-datasource-google", "emoji.json")
# Output path of the generated module: tools/setup/emoji/emoji_names.py.
OUT_EMOJI_FILE = os.path.join(ZULIP_PATH, "tools", "setup", "emoji", "emoji_names.py")
# All three inputs are read as bytes and parsed with orjson at import time.
with open(EMOJI_DATA_FILE, "rb") as fp:
EMOJI_DATA = orjson.loads(fp.read())
# Only the nested ["annotations"]["annotations"] mapping of the base file is kept.
with open(CLDR_DATA_FILE, "rb") as fp:
CLDR_DATA = orjson.loads(fp.read())["annotations"]["annotations"]
# Merge the derived annotations into the same dict; dict.update means a key
# present in both files ends up with the derived file's value.
with open(CLDR_DERIVED_DATA_FILE, "rb") as fp:
CLDR_DATA.update(orjson.loads(fp.read())["annotationsDerived"]["annotations"])
# We don't include most clock emojis. See `custom_emoji_names` for more context.
SKIPPED_CLOCK_EMOJI_CODES = [
@ -53,17 +69,62 @@ SKIN_TONE_EMOJI_CODES = [
]
def cleanup_name(name: str) -> str:
    """Normalize a human-readable emoji name into an identifier-style name.

    Spaces and dashes become underscores, curly double quotes, colons and
    periods are dropped, "&" is spelled out as "and", curly single quotes
    become a plain apostrophe, runs of underscores are collapsed to one, and
    the result is lowercased.

    Args:
        name: The raw name, e.g. a CLDR annotation or a custom name.

    Returns:
        The cleaned-up, lowercased name.
    """
    # Non-ASCII keys are written as escapes so they cannot be silently lost
    # or confused with look-alike characters. An empty-string key must never
    # appear here: str.replace("", x) inserts x between every character.
    # NOTE(review): the en dash and the two single-quote keys appeared as
    # empty strings in the reviewed copy; they are restored here as the
    # typographic counterparts of the neighbouring entries -- confirm
    # against the characters that actually occur in the CLDR names.
    replacements = {
        " ": "_",
        "-": "_",
        "\u2013": "_",  # en dash
        "\u201c": "",  # left double quotation mark
        "\u201d": "",  # right double quotation mark
        ":": "",
        ".": "",
        "&": "and",
        "\u2018": "'",  # left single quotation mark
        "\u2019": "'",  # right single quotation mark
    }
    for before, after in replacements.items():
        name = name.replace(before, after)
    # Adjacent separators (e.g. " - ") would otherwise leave "___".
    name = re.sub("_{2,}", "_", name)
    return name.lower()
def convert_non_ascii_chars(name: str) -> str:
    """Return an ASCII-only spelling of ``name``.

    Each known accented letter is replaced by its unaccented ASCII letter.
    The table is deliberately explicit: an unknown non-ASCII character is an
    error, so that a maintainer decides how it should be transliterated.

    Args:
        name: An emoji name that may contain accented letters.

    Returns:
        ``name`` with every listed accented letter replaced.

    Raises:
        AssertionError: If the result still contains a non-ASCII character.
    """
    replacements = {
        "è": "e",
        "ǐ": "i",
        "ó": "o",
        "ā": "a",
        "ō": "o",
        "ñ": "n",
        "ô": "o",
        "ç": "c",
        "é": "e",
        "ã": "a",
        "í": "i",
        "å": "a",
    }
    for before, after in replacements.items():
        name = name.replace(before, after)
    # Raised explicitly rather than via `assert`, so the check is not
    # stripped when Python runs with -O; the exception type is unchanged.
    if not name.isascii():
        raise AssertionError(
            f"{name} still contains non-ascii characters. Add them to convert_non_ascii_chars."
        )
    return name
def main() -> None:
all_emojis = {}
all_canonical_names = set()
alias_to_emoji_code = defaultdict(list)
# STEP 1: Generate first draft of all_emojis.
for emoji_dict in EMOJI_DATA:
emoji_code = get_emoji_code(emoji_dict)
if emoji_code in SKIPPED_CLOCK_EMOJI_CODES or emoji_code in SKIN_TONE_EMOJI_CODES:
continue
if emoji_code in CUSTOM_EMOJI_NAME_MAPS:
canonical_name = CUSTOM_EMOJI_NAME_MAPS[emoji_code]["canonical_name"]
canonical_name = cleanup_name(CUSTOM_EMOJI_NAME_MAPS[emoji_code]["canonical_name"])
if canonical_name in all_canonical_names:
raise Exception(
f"{canonical_name} was already added with a different codepoint. "
@ -72,9 +133,70 @@ def main() -> None:
all_canonical_names.add(canonical_name)
all_emojis[emoji_code] = CUSTOM_EMOJI_NAME_MAPS[emoji_code]
else:
continue # this commit doesn't add CLDR data yet.
# create the unicode character(s) for the emoji, since this is the key into the CLDR data
emoji = "".join(
chr(int(h, 16))
for h in (emoji_dict["non_qualified"] or emoji_dict["unified"]).split("-")
)
if emoji not in CLDR_DATA:
print(
f"{emoji} not found in custom emoji name maps, but also not found in CLDR data. Skipping."
)
continue
# CLDR_DATA[emoji] is of the form {'default': [...], 'tts': [...]}
# * "tts" is what's used for text-to-speech and always has one item, so we use that
# as the canonical name.
# * "default" has several items in it that we use as aliases.
# See also: https://www.unicode.org/reports/tr35/tr35-general.html#14-annotations-and-labels
assert len(CLDR_DATA[emoji]["tts"]) == 1
canonical_name = cleanup_name(CLDR_DATA[emoji]["tts"][0].strip())
if canonical_name in all_canonical_names:
raise Exception(
f"{canonical_name} was already added with a different codepoint. "
f"Rename it in `custom_emoji_names` or add an entry for {emoji_code}."
)
aliases = [cleanup_name(alias.strip()) for alias in CLDR_DATA[emoji]["default"]]
all_emojis[emoji_code] = {"canonical_name": canonical_name, "aliases": aliases}
all_canonical_names.add(canonical_name)
# STEP 2: We don't support having aliases that collide with canonical names for emoji, so remove them.
for (emoji_code, emoji_names) in all_emojis.items():
# Copy the list to not iterate while elements are being deleted.
aliases = emoji_names["aliases"][:]
for alias in aliases:
if alias in all_canonical_names:
emoji_names["aliases"].remove(alias)
else:
alias_to_emoji_code[alias].append(emoji_code) # This is used in STEP 3.
# STEP 3: We don't support multiple emoji sharing the same alias, but the CLDR data
# doesn't have that same restriction, so we have to fix this up to have unique aliases.
# If the alias was specifically specified in custom_emoji_names, then we can keep just
# that one, but otherwise there's no particular emoji that is an obvious candidate
# for the alias so just remove the alias for all relevant emoji.
for alias in alias_to_emoji_code.keys():
if len(alias_to_emoji_code[alias]) > 1:
for emoji_code in alias_to_emoji_code[alias]:
if emoji_code not in CUSTOM_EMOJI_NAME_MAPS:
all_emojis[emoji_code]["aliases"].remove(alias)
# STEP 4: We keep non-ascii (non-"English") characters in some emoji names if that's the correct
# way to spell that word, but always add an alias for an ascii-only version of the word.
for (emoji_code, emoji_names) in all_emojis.items():
for name in [emoji_names["canonical_name"]] + emoji_names["aliases"]:
# These are known names where we don't have an ascii-only version and there are ascii aliases
# that a user can still enter instead to get the same emoji.
if name in ["ココ", "サ", "指", "空"]:
assert any(alias.isascii() for alias in aliases)
continue
if not name.isascii():
ascii_alias = convert_non_ascii_chars(name)
# Now no other emoji can use this alias.
for code in alias_to_emoji_code[ascii_alias]:
all_emojis[code]["aliases"].remove(ascii_alias)
all_emojis[emoji_code]["aliases"].append(ascii_alias)
# STEP 5: Write final dictionary to `emoji_names.py`.
with open(OUT_EMOJI_FILE, "w") as f:
f.write(
"from typing import Any, Dict\n\n"
@ -82,7 +204,7 @@ def main() -> None:
"EMOJI_NAME_MAPS: Dict[str, Dict[str, Any]] = {\n"
)
for (key, emoji_names) in all_emojis.items():
f.write(f' "{key}": {json.dumps(emoji_names)},\n')
f.write(f" {key!r}: {emoji_names!r},\n")
f.write("}\n")
print(

View File

@ -3073,6 +3073,16 @@ clamp@^1.0.1:
resolved "https://registry.yarnpkg.com/clamp/-/clamp-1.0.1.tgz#66a0e64011816e37196828fdc8c8c147312c8634"
integrity sha512-kgMuFyE78OC6Dyu3Dy7vcx4uy97EIbVxJB/B0eJ3bUNAkwdNcxYzgKltnyADiYwsR7SEqkkUPsEUT//OVS6XMA==
cldr-annotations-derived-modern@^41.0.0:
version "41.0.0"
resolved "https://registry.yarnpkg.com/cldr-annotations-derived-modern/-/cldr-annotations-derived-modern-41.0.0.tgz#da55423006c5b9dd742d9395a2318b05d6a6efa2"
integrity sha512-V9N8CW+DDem3NZlpGKPdvTkoqZV/rNbZq27UZO1d0JXhM+pZo6e8n3+GSqSDgIAXphxn1v9Yd9U+X9NzZ+PCcA==
cldr-annotations-modern@^41.0.0:
version "41.0.0"
resolved "https://registry.yarnpkg.com/cldr-annotations-modern/-/cldr-annotations-modern-41.0.0.tgz#5eeaef3b250e30e0dd7e8babc28a3a70c71a7192"
integrity sha512-ymcfrliWq6IFB9vnDuT9awT7z9bTkh03g39eeU5RIYK0hFuZAPsYdpgs461GEuUKCL7SmDXXyZJwmf9iGYsCdg==
clean-css@^5.1.0, clean-css@^5.2.2:
version "5.3.1"
resolved "https://registry.yarnpkg.com/clean-css/-/clean-css-5.3.1.tgz#d0610b0b90d125196a2894d35366f734e5d7aa32"

View File

@ -758,14 +758,15 @@ class RocketChatImporter(ZulipTestCase):
zerver_realmemoji=zerver_realmemoji,
)
# :grin: and :star_struck: are not present in Zulip's default
# emoji set, or in Reaction.UNICODE_EMOJI reaction type.
self.assert_length(total_reactions, 7)
# :grin: is not present in Zulip's default emoji set,
# or in Reaction.UNICODE_EMOJI reaction type.
self.assert_length(total_reactions, 8)
grinning_emoji_code = name_to_codepoint["grinning"]
innocent_emoji_code = name_to_codepoint["innocent"]
heart_emoji_code = name_to_codepoint["heart"]
rocket_emoji_code = name_to_codepoint["rocket"]
star_struck_emoji_code = name_to_codepoint["star_struck"]
realmemoji_code = {}
for emoji in zerver_realmemoji:
@ -777,7 +778,16 @@ class RocketChatImporter(ZulipTestCase):
)
self.assertEqual(
self.get_set(total_reactions, "emoji_name"),
{"grinning", "innocent", "heart", "rocket", "check", "zulip", "harry-ron"},
{
"grinning",
"innocent",
"star_struck",
"heart",
"rocket",
"check",
"zulip",
"harry-ron",
},
)
self.assertEqual(
self.get_set(total_reactions, "emoji_code"),
@ -786,13 +796,14 @@ class RocketChatImporter(ZulipTestCase):
innocent_emoji_code,
heart_emoji_code,
rocket_emoji_code,
star_struck_emoji_code,
realmemoji_code["check"],
realmemoji_code["zulip"],
realmemoji_code["harry-ron"],
},
)
self.assertEqual(self.get_set(total_reactions, "user_profile"), {2, 3, 4})
self.assert_length(self.get_set(total_reactions, "id"), 7)
self.assert_length(self.get_set(total_reactions, "id"), 8)
self.assert_length(self.get_set(total_reactions, "message"), 1)
def test_process_message_attachment(self) -> None: