From 42e15172550955ada485ba384d7148422a4dc4b6 Mon Sep 17 00:00:00 2001 From: Anders Kaseorg Date: Tue, 29 Oct 2024 16:40:20 -0700 Subject: [PATCH] email_notifications: Prevent html2text from mangling Unicode. MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit html2text mangles Unicode by default, with a --unicode-snob option to disable it. If I have to get called a “snob” for wanting to correctly support non-English languages, then uh, I’ll take one for the team. https://github.com/Alir3z4/html2text/blob/2024.2.26/html2text/config.py#L111-L150 Signed-off-by: Anders Kaseorg --- zerver/data_import/mattermost.py | 2 +- zerver/lib/email_notifications.py | 2 +- zerver/tests/test_email_notifications.py | 8 ++++++++ 3 files changed, 10 insertions(+), 2 deletions(-) diff --git a/zerver/data_import/mattermost.py b/zerver/data_import/mattermost.py index 9dcf0feeb0..af5a271a33 100644 --- a/zerver/data_import/mattermost.py +++ b/zerver/data_import/mattermost.py @@ -440,7 +440,7 @@ def process_raw_message_batch( ) # html2text is GPL licensed, so run it as a subprocess. - content = subprocess.check_output(["html2text"], input=content, text=True) + content = subprocess.check_output(["html2text", "--unicode-snob"], input=content, text=True) if len(content) > 10000: # nocoverage logging.info("skipping too-long message of length %s", len(content)) diff --git a/zerver/lib/email_notifications.py b/zerver/lib/email_notifications.py index f487605421..d870d7c13c 100644 --- a/zerver/lib/email_notifications.py +++ b/zerver/lib/email_notifications.py @@ -938,7 +938,7 @@ def enqueue_welcome_emails(user: UserProfile, realm_creation: bool = False) -> N def convert_html_to_markdown(html: str) -> str: # html2text is GPL licensed, so run it as a subprocess. markdown = subprocess.check_output( - [os.path.join(sys.prefix, "bin", "html2text")], input=html, text=True + [os.path.join(sys.prefix, "bin", "html2text"), "--unicode-snob"], input=html, text=True ).strip() # We want images to get linked and inline previewed, but html2text will turn diff --git a/zerver/tests/test_email_notifications.py b/zerver/tests/test_email_notifications.py index 68c59a0a47..9ce37b5352 100644 --- a/zerver/tests/test_email_notifications.py +++ b/zerver/tests/test_email_notifications.py @@ -11,6 +11,7 @@ from django.utils.timezone import now as timezone_now from django_auth_ldap.config import LDAPSearch from zerver.lib.email_notifications import ( + convert_html_to_markdown, enqueue_welcome_emails, get_onboarding_email_schedule, send_account_registered_email, @@ -671,3 +672,10 @@ class TestCustomWelcomeEmailSender(ZulipTestCase): email_data = orjson.loads(scheduled_emails[0].data) self.assertEqual(email_data["from_name"], name) self.assertEqual(email_data["from_address"], email) + + +class TestHtmlToMarkdown(ZulipTestCase): + def test_html_to_markdown_unicode(self) -> None: + self.assertEqual( + convert_html_to_markdown("a rose is not a rosé"), "a rose is not a rosé" + )