diff --git a/zerver/data_import/mattermost.py b/zerver/data_import/mattermost.py index 9dcf0feeb0..af5a271a33 100644 --- a/zerver/data_import/mattermost.py +++ b/zerver/data_import/mattermost.py @@ -440,7 +440,7 @@ def process_raw_message_batch( ) # html2text is GPL licensed, so run it as a subprocess. - content = subprocess.check_output(["html2text"], input=content, text=True) + content = subprocess.check_output(["html2text", "--unicode-snob"], input=content, text=True) if len(content) > 10000: # nocoverage logging.info("skipping too-long message of length %s", len(content)) diff --git a/zerver/lib/email_notifications.py b/zerver/lib/email_notifications.py index f487605421..d870d7c13c 100644 --- a/zerver/lib/email_notifications.py +++ b/zerver/lib/email_notifications.py @@ -938,7 +938,7 @@ def enqueue_welcome_emails(user: UserProfile, realm_creation: bool = False) -> N def convert_html_to_markdown(html: str) -> str: # html2text is GPL licensed, so run it as a subprocess. markdown = subprocess.check_output( - [os.path.join(sys.prefix, "bin", "html2text")], input=html, text=True + [os.path.join(sys.prefix, "bin", "html2text"), "--unicode-snob"], input=html, text=True ).strip() # We want images to get linked and inline previewed, but html2text will turn diff --git a/zerver/tests/test_email_notifications.py b/zerver/tests/test_email_notifications.py index 68c59a0a47..9ce37b5352 100644 --- a/zerver/tests/test_email_notifications.py +++ b/zerver/tests/test_email_notifications.py @@ -11,6 +11,7 @@ from django.utils.timezone import now as timezone_now from django_auth_ldap.config import LDAPSearch from zerver.lib.email_notifications import ( + convert_html_to_markdown, enqueue_welcome_emails, get_onboarding_email_schedule, send_account_registered_email, @@ -671,3 +672,10 @@ class TestCustomWelcomeEmailSender(ZulipTestCase): email_data = orjson.loads(scheduled_emails[0].data) self.assertEqual(email_data["from_name"], name) self.assertEqual(email_data["from_address"], email) + + +class TestHtmlToMarkdown(ZulipTestCase): + def test_html_to_markdown_unicode(self) -> None: + self.assertEqual( + convert_html_to_markdown("a rose is not a rosé"), "a rose is not a rosé" + )