email_notifications: Prevent html2text from mangling Unicode.

By default, html2text substitutes ASCII approximations for many Unicode
characters (for example, &eacute; becomes “e” rather than “é”), with a
--unicode-snob option to disable that.  If I have to get called a “snob”
for wanting to correctly support non-English languages, then uh, I’ll
take one for the team.

https://github.com/Alir3z4/html2text/blob/2024.2.26/html2text/config.py#L111-L150

Signed-off-by: Anders Kaseorg <anders@zulip.com>
This commit is contained in:
Anders Kaseorg 2024-10-29 16:40:20 -07:00 committed by Tim Abbott
parent fc50736f4e
commit 42e1517255
3 changed files with 10 additions and 2 deletions

View File

@ -440,7 +440,7 @@ def process_raw_message_batch(
)
# html2text is GPL licensed, so run it as a subprocess.
content = subprocess.check_output(["html2text"], input=content, text=True)
content = subprocess.check_output(["html2text", "--unicode-snob"], input=content, text=True)
if len(content) > 10000: # nocoverage
logging.info("skipping too-long message of length %s", len(content))

View File

@ -938,7 +938,7 @@ def enqueue_welcome_emails(user: UserProfile, realm_creation: bool = False) -> N
def convert_html_to_markdown(html: str) -> str:
# html2text is GPL licensed, so run it as a subprocess.
markdown = subprocess.check_output(
[os.path.join(sys.prefix, "bin", "html2text")], input=html, text=True
[os.path.join(sys.prefix, "bin", "html2text"), "--unicode-snob"], input=html, text=True
).strip()
# We want images to get linked and inline previewed, but html2text will turn

View File

@ -11,6 +11,7 @@ from django.utils.timezone import now as timezone_now
from django_auth_ldap.config import LDAPSearch
from zerver.lib.email_notifications import (
convert_html_to_markdown,
enqueue_welcome_emails,
get_onboarding_email_schedule,
send_account_registered_email,
@ -671,3 +672,10 @@ class TestCustomWelcomeEmailSender(ZulipTestCase):
email_data = orjson.loads(scheduled_emails[0].data)
self.assertEqual(email_data["from_name"], name)
self.assertEqual(email_data["from_address"], email)
class TestHtmlToMarkdown(ZulipTestCase):
    # Exercises convert_html_to_markdown on input containing an HTML
    # character entity for an accented letter.
    def test_html_to_markdown_unicode(self) -> None:
        html = "a rose is not a ros&eacute;"
        expected_markdown = "a rose is not a rosé"
        # The entity must come out as the accented character itself.
        self.assertEqual(convert_html_to_markdown(html), expected_markdown)