From 2de648df021116c94faafd0c1ea6296821125e52 Mon Sep 17 00:00:00 2001 From: Anders Kaseorg Date: Wed, 13 Nov 2024 16:30:36 -0800 Subject: [PATCH] makemessages: Write with orjson. MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit orjson’s use of Unicode is more consistent with what we get from Transifex. (We could alternatively use json’s ensure_ascii=False flag.) Signed-off-by: Anders Kaseorg --- tools/i18n/process-mobile-i18n | 17 ++++++++----- tools/i18n/unescape-contents | 25 +++++++++++-------- tools/i18n/update-for-legacy-translations | 17 ++++++++----- zerver/management/commands/compilemessages.py | 11 +++++--- zerver/management/commands/makemessages.py | 19 +++++++++----- 5 files changed, 56 insertions(+), 33 deletions(-) diff --git a/tools/i18n/process-mobile-i18n b/tools/i18n/process-mobile-i18n index f09bc8ba10..2b19f57754 100755 --- a/tools/i18n/process-mobile-i18n +++ b/tools/i18n/process-mobile-i18n @@ -1,9 +1,10 @@ #!/usr/bin/env python3 -import json import os import re from subprocess import check_output +import orjson + def get_json_filename(locale: str) -> str: return f"locale/{locale}/mobile.json" @@ -23,8 +24,8 @@ def get_locales() -> list[str]: def get_translation_stats(resource_path: str) -> dict[str, int]: - with open(resource_path) as raw_resource_file: - raw_info = json.load(raw_resource_file) + with open(resource_path, "rb") as raw_resource_file: + raw_info = orjson.loads(raw_resource_file.read()) total = len(raw_info) not_translated = len([i for i in raw_info.items() if i[1] == ""]) @@ -41,8 +42,12 @@ for locale in get_locales(): locale_paths.append(path) stats_path = os.path.join("locale", "mobile_info.json") -with open(stats_path, "w") as f: - json.dump(translation_stats, f, indent=2, sort_keys=True) - f.write("\n") +with open(stats_path, "wb") as f: + f.write( + orjson.dumps( + translation_stats, + option=orjson.OPT_APPEND_NEWLINE | orjson.OPT_INDENT_2 | orjson.OPT_SORT_KEYS, + ) + ) print("Mobile stats file created at: " + stats_path) diff --git a/tools/i18n/unescape-contents b/tools/i18n/unescape-contents index 1fd3436275..d080086c90 100755 --- a/tools/i18n/unescape-contents +++ b/tools/i18n/unescape-contents @@ -2,10 +2,11 @@ import argparse import html -import json import sys from typing import NamedTuple +import orjson + class CLIArgs(NamedTuple): unescape_html: bool @@ -38,8 +39,8 @@ if __name__ == "__main__": json_data: dict[str, str] = {} - with open(args.filename) as source: - json_data = json.load(source) + with open(args.filename, "rb") as source: + json_data = orjson.loads(source.read()) if args.unescape_html: for key, value in json_data.items(): @@ -51,11 +52,13 @@ if __name__ == "__main__": file=sys.stderr, ) - with open(args.filename, mode="w") as dest: - # At least on Linux systems with LANG=en_US.UTF-8, ensure_ascii=False - # ensures our output uses real UTF-8 codepoints for human readability, - # rather than \u0000 style escape sequences, providing us a - # somewhat-implicit JSON unescape. This may behave in unexpected ways - # on other OSes or system encodings. - json.dump(json_data, dest, ensure_ascii=False, indent=args.indent_level) - dest.write("\n") + with open(args.filename, mode="wb") as dest: + # orjson ensures our output uses real UTF-8 codepoints for + # human readability, rather than \u0000 style escape + # sequences, providing us a somewhat-implicit JSON unescape. + dest.write( + orjson.dumps( + json_data, + option=orjson.OPT_APPEND_NEWLINE | orjson.OPT_INDENT_2 | orjson.OPT_SORT_KEYS, + ) + ) diff --git a/tools/i18n/update-for-legacy-translations b/tools/i18n/update-for-legacy-translations index cb9406bca7..652fe28d73 100755 --- a/tools/i18n/update-for-legacy-translations +++ b/tools/i18n/update-for-legacy-translations @@ -1,9 +1,10 @@ #!/usr/bin/env python3 -import json import os import re from subprocess import check_output +import orjson + LEGACY_STRINGS_MAP = { "

You are searching for messages that belong to more than one channel, which is not possible.

": "

You are searching for messages that belong to more than one stream, which is not possible.

", "{name} (guest) is not subscribed to this channel. They will not be notified if you mention them.": "{name} (guest) is not subscribed to this stream. They will not be notified if you mention them.", @@ -200,8 +201,8 @@ def get_locales() -> list[str]: def get_translations(path: str) -> dict[str, str]: - with open(path) as raw_resource_file: - translations = json.load(raw_resource_file) + with open(path, "rb") as raw_resource_file: + translations = orjson.loads(raw_resource_file.read()) return translations @@ -226,9 +227,13 @@ def update_for_legacy_stream_translations( # Only replace file content if we've made any updates for legacy # translated strings. if number_of_updates > 0: - with open(path, "w") as f: - json.dump(updated_translations, f, ensure_ascii=False, indent=2, sort_keys=True) - f.write("\n") + with open(path, "wb") as f: + f.write( + orjson.dumps( + updated_translations, + option=orjson.OPT_APPEND_NEWLINE | orjson.OPT_INDENT_2 | orjson.OPT_SORT_KEYS, + ) + ) print(f"Updated {number_of_updates} strings in: {path}") diff --git a/zerver/management/commands/compilemessages.py b/zerver/management/commands/compilemessages.py index 84262074e0..9314c238a9 100644 --- a/zerver/management/commands/compilemessages.py +++ b/zerver/management/commands/compilemessages.py @@ -1,4 +1,3 @@ -import json import os import re import unicodedata @@ -137,9 +136,13 @@ class Command(compilemessages.Command): info["percent_translated"] = percentage data["languages"].append(info) - with open(output_path, "w") as writer: - json.dump(data, writer, indent=2, sort_keys=True) - writer.write("\n") + with open(output_path, "wb") as writer: + writer.write( + orjson.dumps( + data, + option=orjson.OPT_APPEND_NEWLINE | orjson.OPT_INDENT_2 | orjson.OPT_SORT_KEYS, + ) + ) def get_translation_percentage(self, locale_path: str, locale: str) -> int: # backend stats diff --git a/zerver/management/commands/makemessages.py b/zerver/management/commands/makemessages.py index 5e6c941fdd..f5b2b0c423 100644 --- a/zerver/management/commands/makemessages.py +++ b/zerver/management/commands/makemessages.py @@ -34,13 +34,13 @@ https://stackoverflow.com/questions/2090717 import glob import itertools -import json import os import re import subprocess from collections.abc import Collection, Iterator, Mapping from typing import Any +import orjson from django.core.management.base import CommandParser from django.core.management.commands import makemessages from django.template.base import BLOCK_TAG_END, BLOCK_TAG_START @@ -215,7 +215,7 @@ class Command(makemessages.Command): "web/src/**/*.ts", ] ) - translation_strings.extend(json.loads(extracted).values()) + translation_strings.extend(orjson.loads(extracted).values()) return list(set(translation_strings)) @@ -276,11 +276,18 @@ class Command(makemessages.Command): for locale, output_path in zip(self.get_locales(), self.get_output_paths(), strict=False): self.stdout.write(f"[frontend] processing locale {locale}") try: - with open(output_path) as reader: - old_strings = json.load(reader) + with open(output_path, "rb") as reader: + old_strings = orjson.loads(reader.read()) except (OSError, ValueError): old_strings = {} new_strings = self.get_new_strings(old_strings, translation_strings, locale) - with open(output_path, "w") as writer: - json.dump(new_strings, writer, indent=2, sort_keys=True) + with open(output_path, "wb") as writer: + writer.write( + orjson.dumps( + new_strings, + option=orjson.OPT_APPEND_NEWLINE + | orjson.OPT_INDENT_2 + | orjson.OPT_SORT_KEYS, + ) + )