From 2de648df021116c94faafd0c1ea6296821125e52 Mon Sep 17 00:00:00 2001
From: Anders Kaseorg <anders@zulip.com>
Date: Wed, 13 Nov 2024 16:30:36 -0800
Subject: [PATCH] makemessages: Write with orjson.
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

orjson writes non-ASCII characters as literal UTF-8 rather than as
\uXXXX escapes, which is more consistent with what we get from
Transifex.  (We could alternatively use json’s ensure_ascii=False
flag.)

Signed-off-by: Anders Kaseorg <anders@zulip.com>
---
 tools/i18n/process-mobile-i18n                | 17 ++++++++-----
 tools/i18n/unescape-contents                  | 25 +++++++++++--------
 tools/i18n/update-for-legacy-translations     | 17 ++++++++-----
 zerver/management/commands/compilemessages.py | 11 +++++---
 zerver/management/commands/makemessages.py    | 19 +++++++++-----
 5 files changed, 56 insertions(+), 33 deletions(-)

diff --git a/tools/i18n/process-mobile-i18n b/tools/i18n/process-mobile-i18n
index f09bc8ba10..2b19f57754 100755
--- a/tools/i18n/process-mobile-i18n
+++ b/tools/i18n/process-mobile-i18n
@@ -1,9 +1,10 @@
 #!/usr/bin/env python3
-import json
 import os
 import re
 from subprocess import check_output
 
+import orjson
+
 
 def get_json_filename(locale: str) -> str:
     return f"locale/{locale}/mobile.json"
@@ -23,8 +24,8 @@ def get_locales() -> list[str]:
 
 
 def get_translation_stats(resource_path: str) -> dict[str, int]:
-    with open(resource_path) as raw_resource_file:
-        raw_info = json.load(raw_resource_file)
+    with open(resource_path, "rb") as raw_resource_file:
+        raw_info = orjson.loads(raw_resource_file.read())
 
     total = len(raw_info)
     not_translated = len([i for i in raw_info.items() if i[1] == ""])
@@ -41,8 +42,12 @@ for locale in get_locales():
         locale_paths.append(path)
 
 stats_path = os.path.join("locale", "mobile_info.json")
-with open(stats_path, "w") as f:
-    json.dump(translation_stats, f, indent=2, sort_keys=True)
-    f.write("\n")
+with open(stats_path, "wb") as f:
+    f.write(
+        orjson.dumps(
+            translation_stats,
+            option=orjson.OPT_APPEND_NEWLINE | orjson.OPT_INDENT_2 | orjson.OPT_SORT_KEYS,
+        )
+    )
 
 print("Mobile stats file created at: " + stats_path)
diff --git a/tools/i18n/unescape-contents b/tools/i18n/unescape-contents
index 1fd3436275..d080086c90 100755
--- a/tools/i18n/unescape-contents
+++ b/tools/i18n/unescape-contents
@@ -2,10 +2,11 @@
 
 import argparse
 import html
-import json
 import sys
 from typing import NamedTuple
 
+import orjson
+
 
 class CLIArgs(NamedTuple):
     unescape_html: bool
@@ -38,8 +39,8 @@ if __name__ == "__main__":
 
     json_data: dict[str, str] = {}
 
-    with open(args.filename) as source:
-        json_data = json.load(source)
+    with open(args.filename, "rb") as source:
+        json_data = orjson.loads(source.read())
 
         if args.unescape_html:
             for key, value in json_data.items():
@@ -51,11 +52,13 @@ if __name__ == "__main__":
                         file=sys.stderr,
                     )
 
-    with open(args.filename, mode="w") as dest:
-        # At least on Linux systems with LANG=en_US.UTF-8, ensure_ascii=False
-        # ensures our output uses real UTF-8 codepoints for human readability,
-        # rather than \u0000 style escape sequences, providing us a
-        # somewhat-implicit JSON unescape. This may behave in unexpected ways
-        # on other OSes or system encodings.
-        json.dump(json_data, dest, ensure_ascii=False, indent=args.indent_level)
-        dest.write("\n")
+    with open(args.filename, mode="wb") as dest:
+        # orjson writes non-ASCII characters as literal UTF-8 for
+        # human readability, rather than as \uXXXX-style escape
+        # sequences, providing us a somewhat-implicit JSON unescape.
+        dest.write(
+            orjson.dumps(
+                json_data,
+                option=orjson.OPT_APPEND_NEWLINE | orjson.OPT_INDENT_2 | orjson.OPT_SORT_KEYS,
+            )
+        )
diff --git a/tools/i18n/update-for-legacy-translations b/tools/i18n/update-for-legacy-translations
index cb9406bca7..652fe28d73 100755
--- a/tools/i18n/update-for-legacy-translations
+++ b/tools/i18n/update-for-legacy-translations
@@ -1,9 +1,10 @@
 #!/usr/bin/env python3
-import json
 import os
 import re
 from subprocess import check_output
 
+import orjson
+
 LEGACY_STRINGS_MAP = {
     "<p>You are searching for messages that belong to more than one channel, which is not possible.</p>": "<p>You are searching for messages that belong to more than one stream, which is not possible.</p>",
     "<strong>{name}</strong> <i>(guest)</i> is not subscribed to this channel. They will not be notified if you mention them.": "<strong>{name}</strong> <i>(guest)</i> is not subscribed to this stream. They will not be notified if you mention them.",
@@ -200,8 +201,8 @@ def get_locales() -> list[str]:
 
 
 def get_translations(path: str) -> dict[str, str]:
-    with open(path) as raw_resource_file:
-        translations = json.load(raw_resource_file)
+    with open(path, "rb") as raw_resource_file:
+        translations = orjson.loads(raw_resource_file.read())
 
     return translations
 
@@ -226,9 +227,13 @@ def update_for_legacy_stream_translations(
     # Only replace file content if we've made any updates for legacy
     # translated strings.
     if number_of_updates > 0:
-        with open(path, "w") as f:
-            json.dump(updated_translations, f, ensure_ascii=False, indent=2, sort_keys=True)
-            f.write("\n")
+        with open(path, "wb") as f:
+            f.write(
+                orjson.dumps(
+                    updated_translations,
+                    option=orjson.OPT_APPEND_NEWLINE | orjson.OPT_INDENT_2 | orjson.OPT_SORT_KEYS,
+                )
+            )
         print(f"Updated {number_of_updates} strings in: {path}")
 
 
diff --git a/zerver/management/commands/compilemessages.py b/zerver/management/commands/compilemessages.py
index 84262074e0..9314c238a9 100644
--- a/zerver/management/commands/compilemessages.py
+++ b/zerver/management/commands/compilemessages.py
@@ -1,4 +1,3 @@
-import json
 import os
 import re
 import unicodedata
@@ -137,9 +136,13 @@ class Command(compilemessages.Command):
             info["percent_translated"] = percentage
             data["languages"].append(info)
 
-        with open(output_path, "w") as writer:
-            json.dump(data, writer, indent=2, sort_keys=True)
-            writer.write("\n")
+        with open(output_path, "wb") as writer:
+            writer.write(
+                orjson.dumps(
+                    data,
+                    option=orjson.OPT_APPEND_NEWLINE | orjson.OPT_INDENT_2 | orjson.OPT_SORT_KEYS,
+                )
+            )
 
     def get_translation_percentage(self, locale_path: str, locale: str) -> int:
         # backend stats
diff --git a/zerver/management/commands/makemessages.py b/zerver/management/commands/makemessages.py
index 5e6c941fdd..f5b2b0c423 100644
--- a/zerver/management/commands/makemessages.py
+++ b/zerver/management/commands/makemessages.py
@@ -34,13 +34,13 @@ https://stackoverflow.com/questions/2090717
 
 import glob
 import itertools
-import json
 import os
 import re
 import subprocess
 from collections.abc import Collection, Iterator, Mapping
 from typing import Any
 
+import orjson
 from django.core.management.base import CommandParser
 from django.core.management.commands import makemessages
 from django.template.base import BLOCK_TAG_END, BLOCK_TAG_START
@@ -215,7 +215,7 @@ class Command(makemessages.Command):
                 "web/src/**/*.ts",
             ]
         )
-        translation_strings.extend(json.loads(extracted).values())
+        translation_strings.extend(orjson.loads(extracted).values())
 
         return list(set(translation_strings))
 
@@ -276,11 +276,18 @@ class Command(makemessages.Command):
         for locale, output_path in zip(self.get_locales(), self.get_output_paths(), strict=False):
             self.stdout.write(f"[frontend] processing locale {locale}")
             try:
-                with open(output_path) as reader:
-                    old_strings = json.load(reader)
+                with open(output_path, "rb") as reader:
+                    old_strings = orjson.loads(reader.read())
             except (OSError, ValueError):
                 old_strings = {}
 
             new_strings = self.get_new_strings(old_strings, translation_strings, locale)
-            with open(output_path, "w") as writer:
-                json.dump(new_strings, writer, indent=2, sort_keys=True)
+            with open(output_path, "wb") as writer:
+                writer.write(
+                    orjson.dumps(
+                        new_strings,
+                        option=orjson.OPT_APPEND_NEWLINE
+                        | orjson.OPT_INDENT_2
+                        | orjson.OPT_SORT_KEYS,
+                    )
+                )