zulip/tools/setup/emoji/build_emoji

484 lines
20 KiB
Plaintext
Raw Permalink Normal View History

#!/usr/bin/env python3
#
# See docs/subsystems/emoji.md for a high-level explanation of how this system
# works.
import os
import shutil
import subprocess
import sys
from collections.abc import Iterator, Sequence
from typing import Any
import orjson
ZULIP_PATH = os.path.join(os.path.dirname(os.path.abspath(__file__)), "../../../")
sys.path.append(ZULIP_PATH)
from scripts.lib.zulip_tools import generate_sha1sum_emoji, run_as_root
from tools.setup.emoji.emoji_names import EMOJI_NAME_MAPS
from tools.setup.emoji.emoji_setup_utils import (
EMOTICON_CONVERSIONS,
REMAPPED_EMOJIS,
emoji_is_supported,
emoji_names_for_picker,
generate_codepoint_to_name_map,
generate_codepoint_to_names_map,
generate_emoji_catalog,
generate_name_to_codepoint_map,
get_emoji_code,
)
# Base directory of the emoji build cache. main() creates one subdirectory
# here per digest returned by generate_sha1sum_emoji.
EMOJI_CACHE_BASE_PATH = "/srv/zulip-emoji-cache"
# The tools/setup/emoji directory inside the Zulip checkout.
EMOJI_SCRIPT_DIR_PATH = os.path.join(ZULIP_PATH, "tools", "setup", "emoji")
# Where the emoji-datasource-* npm packages are read from.
NODE_MODULES_PATH = os.path.join(ZULIP_PATH, "node_modules")
# The CSS for the emoji spritesheet has somewhat tricky requirements. One
# is that we want to be able to use the same emoji CSS classes for
# different display sizes of our emoji (e.g. reactions are smaller
# than inline message emoji, which are smaller than those in the emoji
# picker) while only downloading 1 copy of the spritesheet, having
# good browser rendering performance, and reusing as much common CSS
# as is possible.
# Our solution to those problems is to use the `background-size` (which
# is e.g. 5700%) and `background-position` attributes to select the
# region of the spritesheet corresponding to the target sprite and
# display it properly scaled in an emoji span.
#
# Template for a whole `<emojiset>-sprite.css` file, filled in with
# str.format by generate_sprite_css_files. Doubled braces are literal
# CSS braces; single-brace names are placeholders.
SPRITE_CSS_FILE_TEMPLATE = """\
div.emoji,
span.emoji
{{
display: inline-block;
background-image: url(../emoji/{emojiset}.webp);
background-size: {background_size};
background-repeat: no-repeat;
/* Hide the text. */
text-indent: 100%;
white-space: nowrap;
overflow: hidden;
}}
.emoji-1f419
{{
background-image: url(../../../static/generated/emoji/images-google-64/1f419.png) !important;
background-position: 0% 0% !important;
background-size: contain !important;
}}
{emoji_positions}
"""
# One rule per emoji, selecting that emoji's cell in the spritesheet.
EMOJI_POS_INFO_TEMPLATE = """\
.emoji-{codepoint} {{
background-position: {pos_x} {pos_y};
}}
"""
# One rule per emoji that should bypass the spritesheet and use the
# individual image from the images-google-64 farm instead.
EMOJI_OVERRIDE_TEMPLATE = """\
.emoji-{codepoint} {{
background-image: url(../../../static/generated/emoji/images-google-64/{codepoint}.png) !important;
background-position: 0% 0% !important;
background-size: contain !important;
}}
"""
# Make the script's own directory the working directory, so that any
# relative path opened below resolves against it. Note that this runs at
# import time, not just when the file is executed as a script.
os.chdir(EMOJI_SCRIPT_DIR_PATH)
def main() -> None:
    """Build the emoji cache if necessary and publish it into the source tree.

    Steps:

    1. Make sure EMOJI_CACHE_BASE_PATH exists and is writable by the
       current user (creating it via `run_as_root` if it is not).
    2. Compute the cache key with `generate_sha1sum_emoji` and, unless a
       `.success-stamp` file already exists for that key, regenerate the
       cache with `dump_emojis`.
    3. Point `static/generated/emoji` at the cache's `static` tree with a
       symlink, and copy the cache's `web/emoji` and `web/emoji-styles`
       trees into `web/generated`, deleting stale files there.
    """
    if not os.access(EMOJI_CACHE_BASE_PATH, os.W_OK):
        # Note: In production, this block will fail, since we don't
        # assume sudo access; but it should never run in production
        # anyway, because EMOJI_CACHE_BASE_PATH is created by Puppet before
        # build_emoji would be run.
        run_as_root(["mkdir", "-p", EMOJI_CACHE_BASE_PATH])
        run_as_root(["chown", f"{os.getuid()}:{os.getgid()}", EMOJI_CACHE_BASE_PATH])

    sha1_hexdigest = generate_sha1sum_emoji(ZULIP_PATH)
    emoji_cache_path = os.path.join(EMOJI_CACHE_BASE_PATH, sha1_hexdigest)
    success_stamp = os.path.join(emoji_cache_path, ".success-stamp")

    if not os.path.exists(success_stamp):
        print("Dumping emojis ...")
        dump_emojis(emoji_cache_path)
        # The stamp is an empty marker file; it is only written once the
        # dump above has finished without raising.
        with open(success_stamp, "w"):
            pass

    print(f"build_emoji: Using cached emojis from {emoji_cache_path}")

    # /srv/zulip-emoji-cache/*/static gets symlinked to ZULIP_PATH/static/generated/emoji
    TARGET_STATIC_EMOJI = os.path.join(ZULIP_PATH, "static", "generated", "emoji")
    # lexists (rather than exists) so that a dangling symlink left over
    # from a deleted cache entry is also replaced.
    if os.path.lexists(TARGET_STATIC_EMOJI):
        os.remove(TARGET_STATIC_EMOJI)
    os.symlink(os.path.join(emoji_cache_path, "static"), TARGET_STATIC_EMOJI)

    # /srv/zulip-emoji-cache/*/web gets copied to ZULIP_PATH/web/generated
    # These must not be symlinked so webpack can resolve module references.
    for subdir in ("emoji", "emoji-styles"):
        target_dir = os.path.join(ZULIP_PATH, "web", "generated", subdir)
        os.makedirs(target_dir, exist_ok=True)
        # Start by assuming every existing file is stale; anything we
        # copy over below is taken off the list.
        to_remove = set(os.listdir(target_dir))
        source_emoji_dump = os.path.join(emoji_cache_path, "web", subdir)
        for filename in os.listdir(source_emoji_dump):
            shutil.copy2(os.path.join(source_emoji_dump, filename), target_dir)
            to_remove.discard(filename)
        for filename in to_remove:
            os.remove(os.path.join(target_dir, filename))
def percent(f: float) -> str:
    """Render the ratio `f` as a CSS percentage with three decimal places."""
    scaled = f * 100
    return format(scaled, "0.3f") + "%"
def get_square_size(emoji_data: Sequence[dict[str, Any]]) -> int:
    """
    Return N for an NxN spritesheet.

    N is not stored anywhere, so it is inferred as one more than the
    largest sheet_x/sheet_y offset found on any emoji or on any of its
    skin variations.
    """

    def sheet_entries() -> Iterator[dict[str, Any]]:
        # Each emoji first, then its skin variations (if it has any).
        for entry in emoji_data:
            yield entry
            if "skin_variations" in entry:
                yield from entry["skin_variations"].values()

    largest_offset = max(
        entry[axis] for entry in sheet_entries() for axis in ("sheet_x", "sheet_y")
    )
    return largest_offset + 1
def generate_sprite_css_files(
    cache_path: str,
    emoji_data: list[dict[str, Any]],
    emojiset: str,
    alt_name: str,
    fallback_emoji_data: Sequence[dict[str, Any]],
) -> None:
    """Write `<emojiset>-sprite.css` into `<cache_path>/web/emoji-styles`.

    The stylesheet consists of SPRITE_CSS_FILE_TEMPLATE filled in with one
    `background-position` rule per supported emoji in `emoji_data`. For
    the "google-blob" and "twitter" sets it is followed by override rules
    that point emoji not covered by that set at the individual Google
    images.

    Parameters:
        cache_path: root of the cache tree being generated.
        emoji_data: emoji records for this set; each needs `sheet_x` and
            `sheet_y`, and `has_img_google` / `has_img_twitter` for the
            sets that use fallbacks.
        emojiset: name of the set; used in the file name and sheet URL.
        alt_name: forwarded to the template formatter only.
        fallback_emoji_data: records scanned for Google fallbacks when
            `emojiset` is "google-blob"; unused otherwise.
    """
    # Spritesheets are usually NxN squares.
    n = get_square_size(emoji_data)

    # Each single emoji is 64x64, with 1px gutters on every border.
    # We just consider the gutters to be part of the image for
    # simplicity reasons, so you can think of the spritesheet as
    # an NxN square of 66x66 pre-padded emojis. The CSS
    # background-size parameter below says to size the background
    # element as N times the size of the element that you're drawing.
    # Note that we use percentages here, instead of absolute
    # pixel values, because when we render emojis as actual elements,
    # their size will vary depending on which part of the UI we're
    # in (message emojis, emoji reactions, emoji popup, emoji
    # popup showcase, etc.).
    background_size = percent(n)

    # For background-position we need to use percentages.
    # Absolute pixel values won't work, because the size
    # of the background sprite image is proportional to
    # the size of the element we're rendering, and we render
    # elements in multiple sizes.
    #
    # The way that CSS background-position works is linear
    # interpolation. When you tell CSS background-position
    # is "42% 37%", then in the `x` dimension it will align
    # the image such that 42% of the background image is to
    # the left of the 42% mark in the element itself.
    #
    # For simplicity assume we render the emoji as 66px
    # (and everything will scale appropriately for other
    # size images as long as we use percentages).
    #
    # The image size will be 66n.
    # The left offset of the x-th emoji (including its
    # padding) will be 66x. And the element's width
    # will be 66.
    #
    # So, solve this equation for `p`, where p is
    # the ratio that we'll later express as a
    # percentage:
    #
    #     <image offset> = <offset of p% mark of element>
    #     (p * 66n) = 66x + p66
    #     p * n = x + p
    #     p * n - p = x
    #     p * (n - 1) = x
    #     p = x / (n - 1)
    #
    # If you ever want to change the code so that the
    # gutters don't show up in the element, the algebra
    # will get more complicated.
    #
    # A 1x1 sheet only has offset 0, so clamp the divisor to avoid
    # dividing by zero; 0 / 1 still yields the correct 0%.
    divisor = max(n - 1, 1)
    emoji_positions = "".join(
        EMOJI_POS_INFO_TEMPLATE.format(
            codepoint=get_emoji_code(emoji),
            pos_x=percent(emoji["sheet_x"] / divisor),
            pos_y=percent(emoji["sheet_y"] / divisor),
        )
        for emoji in emoji_data
        if emoji_is_supported(emoji)
    )

    def google_overrides(candidates: Sequence[dict[str, Any]], covered_codes: set[str]) -> str:
        # Override rules for every candidate that Google has an image for
        # and that the current emoji set does not cover itself.
        rules = []
        for emoji in candidates:
            code = get_emoji_code(emoji)
            if emoji["has_img_google"] and code not in covered_codes:
                rules.append(EMOJI_OVERRIDE_TEMPLATE.format(codepoint=code))
        return "".join(rules)

    css = SPRITE_CSS_FILE_TEMPLATE.format(
        emojiset=emojiset,
        alt_name=alt_name,
        emoji_positions=emoji_positions,
        background_size=background_size,
    )

    # Google Classic stopped being supported in 2017. To be able to use other emoji, we
    # fallback to Google Modern for any emoji not covered by Google Classic.
    if emojiset == "google-blob":
        blob_covered_emoji_codes = {
            get_emoji_code(emoji) for emoji in emoji_data if emoji["has_img_google"]
        }
        css += google_overrides(fallback_emoji_data, blob_covered_emoji_codes)

    # The Twitter emoji team was laid off in 2022, so new emoji aren't supported.
    # https://github.com/twitter/twemoji/issues/570#issuecomment-1303422143.
    # The "twitter" sprite sheet we're using does have images in those locations,
    # but they're fallback images that emoji-datasource fills in from the Apple
    # sprite sheet, which has unclear licensing implications.
    # To be able to support newer emoji, we fallback to Google Modern for any emoji
    # not covered by Twemoji.
    if emojiset == "twitter":
        twitter_covered_emoji_codes = {
            get_emoji_code(emoji) for emoji in emoji_data if emoji["has_img_twitter"]
        }
        css += google_overrides(emoji_data, twitter_covered_emoji_codes)

    SPRITE_STYLES_DIRECTORY = os.path.join(cache_path, "web", "emoji-styles")
    os.makedirs(SPRITE_STYLES_DIRECTORY, exist_ok=True)
    SPRITE_CSS_PATH = os.path.join(SPRITE_STYLES_DIRECTORY, f"{emojiset}-sprite.css")
    with open(SPRITE_CSS_PATH, "w") as f:
        f.write(css)
def setup_emoji_farms(cache_path: str, emoji_data: list[dict[str, Any]]) -> None:
    """Populate the cache with per-emoji images, sprite CSS and webp sheets.

    Builds the "google" and "twitter" sets from `emoji_data`, then the
    "google-blob" set from the `emoji.json` shipped in the
    emoji-datasource-google-blob package (with `emoji_data` as its
    fallback data).
    """

    def ensure_emoji_image(
        emoji_dict: dict[str, Any], src_emoji_farm: str, target_emoji_farm: str
    ) -> None:
        # We use individual images from emoji farm for rendering emojis
        # in notification messages. We have a custom emoji formatter in
        # notifications processing code that converts `span` tags to
        # `img` and that logic requires us to have non-qualified
        # `emoji_code` as file name for emoji.
        emoji_code = get_emoji_code(emoji_dict)
        img_file_name = emoji_code + ".png"
        src_file = os.path.join(src_emoji_farm, emoji_dict["image"])
        dst_file = os.path.join(target_emoji_farm, img_file_name)
        shutil.copy2(src_file, dst_file)

    def setup_emoji_farm(
        emojiset: str,
        emoji_data: list[dict[str, Any]],
        alt_name: str | None = None,
        # Immutable default: a shared `[]` default is a classic trap even
        # when, as here, nothing is expected to mutate it.
        fallback_emoji_data: Sequence[dict[str, Any]] = (),
    ) -> None:
        # `alt_name` is an optional parameter that we use to avoid duplicating below
        # code. It is only used while setting up google-blob emoji set as it is just
        # a wrapper for an older version of emoji-datasource package due to which we
        # need to use 'google' at some places in this code. It has no meaning for other
        # emoji sets and is just equivalent to `emojiset`.
        alt_name = alt_name or emojiset

        # Copy individual emoji images from npm packages.
        src_emoji_farm = os.path.join(
            NODE_MODULES_PATH, "emoji-datasource-" + emojiset, "img", alt_name, "64"
        )
        target_emoji_farm = os.path.join(cache_path, "static", "images-" + emojiset + "-64")
        os.makedirs(target_emoji_farm, exist_ok=True)
        print(f"Copying individual {emojiset} image files...")
        has_img_key = "has_img_" + alt_name
        for emoji_dict in emoji_data:
            if emoji_dict[has_img_key]:
                ensure_emoji_image(emoji_dict, src_emoji_farm, target_emoji_farm)
                skin_variations = emoji_dict.get("skin_variations", {})
                for img_info in skin_variations.values():
                    if img_info[has_img_key]:
                        ensure_emoji_image(img_info, src_emoji_farm, target_emoji_farm)

        # Copy zulip.png to the emoji farm.
        zulip_image = os.path.join(ZULIP_PATH, "web", "images", "zulip-emoji")
        for f in os.listdir(zulip_image):
            shutil.copy2(os.path.join(zulip_image, f), target_emoji_farm, follow_symlinks=False)

        # We hardcode octopus emoji image to Google emoji set's old
        # "cute octopus" image. Copy it to the emoji farms.
        input_img_file = os.path.join(EMOJI_SCRIPT_DIR_PATH, "1f419.png")
        output_img_file = os.path.join(target_emoji_farm, "1f419.png")
        shutil.copyfile(input_img_file, output_img_file)

        generate_sprite_css_files(cache_path, emoji_data, emojiset, alt_name, fallback_emoji_data)

        print(f"Converting {emojiset} sheet to webp...")
        TARGET_EMOJI_SHEETS = os.path.join(cache_path, "web", "emoji")
        os.makedirs(TARGET_EMOJI_SHEETS, exist_ok=True)
        sheet_src = os.path.join(
            NODE_MODULES_PATH,
            f"emoji-datasource-{emojiset}",
            "img",
            alt_name,
            "sheets-256",
            "64.png",
        )
        sheet_dst = os.path.join(TARGET_EMOJI_SHEETS, f"{emojiset}.webp")
        # From libwebp: [Q is] between 0 and 100. For lossy, 0 gives
        # the smallest size and 100 the largest. For lossless, this
        # parameter is the amount of effort put into the
        # compression: 0 is the fastest but gives larger files
        # compared to the slowest, but best, 100.
        subprocess.check_call(["vips", "copy", sheet_src, f"{sheet_dst}[lossless=true,Q=100]"])

    # Set up standard emoji sets.
    for emojiset in ["google", "twitter"]:
        setup_emoji_farm(emojiset, emoji_data)

    # Set up old Google "blobs" emoji set.
    GOOGLE_BLOB_EMOJI_DATA_PATH = os.path.join(
        NODE_MODULES_PATH, "emoji-datasource-google-blob", "emoji.json"
    )
    with open(GOOGLE_BLOB_EMOJI_DATA_PATH, "rb") as fp:
        blob_emoji_data = orjson.loads(fp.read())
    setup_emoji_farm("google-blob", blob_emoji_data, "google", emoji_data)
def setup_old_emoji_farm(
    cache_path: str, emoji_map: dict[str, str], emoji_data: list[dict[str, Any]]
) -> None:
    """Create the old-style emoji farm as symlinks into the Google image farm.

    Under `<cache_path>/static/images/emoji` every entry of `emoji_map`
    gets a `<name>.png` link, and under its `unicode` subdirectory a
    `<codepoint>.png` link. Both directories also get a `zulip.png` link.
    `emoji_data` is accepted but not used here.
    """
    static_root = os.path.join(cache_path, "static")
    google_farm = os.path.join(static_root, "images-google-64")
    by_name_dir = os.path.join(static_root, "images", "emoji")
    by_codepoint_dir = os.path.join(by_name_dir, "unicode")
    link_dirs = (by_name_dir, by_codepoint_dir)

    for directory in link_dirs:
        os.makedirs(directory, exist_ok=True)

    # Symlink zulip.png image file into both directories.
    zulip_png = os.path.join(google_farm, "zulip.png")
    for directory in link_dirs:
        os.symlink(zulip_png, os.path.join(directory, "zulip.png"))

    for name, codepoint in emoji_map.items():
        # The link target honours REMAPPED_EMOJIS; the link names do not.
        target_codepoint = REMAPPED_EMOJIS.get(codepoint, codepoint)
        target_png = os.path.join(google_farm, f"{target_codepoint}.png")

        os.symlink(target_png, os.path.join(by_name_dir, f"{name}.png"))

        codepoint_link = os.path.join(by_codepoint_dir, f"{codepoint}.png")
        try:
            # `emoji_map` contains duplicate entries for the same codepoint with
            # different names, so the <codepoint>.png link may already exist.
            os.symlink(target_png, codepoint_link)
        except FileExistsError:
            pass
def generate_map_files(cache_path: str, emoji_catalog: dict[str, list[str]]) -> None:
    """Write `emoji_codes.json` and `emoji_api.json` into `<cache_path>/static`.

    These are the main data files about emoji that are consumed by the
    web app, mobile apps, Markdown processor, etc.
    """
    static_dir = os.path.join(cache_path, "static")

    picker_names = emoji_names_for_picker(EMOJI_NAME_MAPS)
    names_by_codepoint = generate_codepoint_to_name_map(EMOJI_NAME_MAPS)
    codepoints_by_name = generate_name_to_codepoint_map(EMOJI_NAME_MAPS)

    emoji_codes_payload = {
        "names": picker_names,
        "name_to_codepoint": codepoints_by_name,
        "codepoint_to_name": names_by_codepoint,
        "emoji_catalog": emoji_catalog,
        "emoticon_conversions": EMOTICON_CONVERSIONS,
    }
    with open(os.path.join(static_dir, "emoji_codes.json"), "wb") as codes_out:
        codes_out.write(orjson.dumps(emoji_codes_payload))

    # This is the more official API for mobile to fetch data about emoji.
    # emoji_codes.json has a lot of goo, and we're creating this new file
    # as a cleaner data format to move towards. We could add the rest of
    # the data into this API-described data, and then the web client could
    # switch to that which would allow us to drop the existing file. But
    # we'll probably instead do #18121 which will make this file obsolete.
    # So this is a temporary solution. CZO discussion:
    # https://chat.zulip.org/#narrow/channel/378-api-design/topic/currently.20supported.20emoji/near/1394598
    with open(os.path.join(static_dir, "emoji_api.json"), "wb") as api_out:
        names_per_code = generate_codepoint_to_names_map(EMOJI_NAME_MAPS)
        api_out.write(orjson.dumps({"code_to_names": names_per_code}))
def dump_emojis(cache_path: str) -> None:
    """Regenerate the complete emoji cache tree at `cache_path`.

    Any existing directory at `cache_path` is deleted first. The tree is
    then filled by `setup_emoji_farms`, `setup_old_emoji_farm`, a copy of
    the Google `emoji.json`, and `generate_map_files`.

    Raises:
        AssertionError: if a code in EMOJI_NAME_MAPS is not among the
            codes that `emoji_is_supported` accepts in the Google data.
    """
    # Anchor on the script directory instead of relying on the process's
    # current working directory.
    emoji_map_path = os.path.join(EMOJI_SCRIPT_DIR_PATH, "emoji_map.json")
    with open(emoji_map_path, "rb") as emoji_map_file:
        emoji_map = orjson.loads(emoji_map_file.read())

    # `emoji.json` or any other data file can be sourced from any of the supported
    # emoji set packages, they all contain the same data files.
    EMOJI_DATA_FILE_PATH = os.path.join(NODE_MODULES_PATH, "emoji-datasource-google", "emoji.json")
    with open(EMOJI_DATA_FILE_PATH, "rb") as emoji_data_file:
        emoji_data = orjson.loads(emoji_data_file.read())

    # These are the codes that we'll be able to show Google Modern
    # emoji for. (For other emoji sets, we fall back to Google Modern.)
    supported_codes = {get_emoji_code(emoji) for emoji in emoji_data if emoji_is_supported(emoji)}

    # These are in the emoji dropdown so we should make sure they're supported.
    # If this check fails, we either need to find an image for Google Modern
    # to display for that emoji, or we need to remove that emoji as an option for users,
    # by removing it from emoji_names.py through a change to `generate_emoji_names`.
    #
    # Raised explicitly (rather than with `assert`) so the check still runs
    # under `python -O` and reports every offending code at once.
    unsupported_codes = [code for code in EMOJI_NAME_MAPS if code not in supported_codes]
    if unsupported_codes:
        raise AssertionError(
            "Emoji codes in EMOJI_NAME_MAPS without a supported Google image: "
            + ", ".join(unsupported_codes)
        )

    emoji_catalog = generate_emoji_catalog(emoji_data, EMOJI_NAME_MAPS)

    # Set up emoji farms. Start from a clean directory so nothing from a
    # previous, possibly partial, dump survives.
    if os.path.exists(cache_path):
        shutil.rmtree(cache_path)
    setup_emoji_farms(cache_path, emoji_data)
    setup_old_emoji_farm(cache_path, emoji_map, emoji_data)

    # This file is needed to translate emoji when importing data from Slack.
    shutil.copyfile(
        EMOJI_DATA_FILE_PATH,
        os.path.join(cache_path, "static", "emoji-datasource-google-emoji.json"),
    )

    # Generate various map files.
    generate_map_files(cache_path, emoji_catalog)
# Run the build only when this file is executed as a script, not when it
# is imported.
if __name__ == "__main__":
main()