thumbnail: Fix corrupted email notifications due to HTML5 entities.

BeautifulSoup with formatter="html5" unnecessarily escapes many
characters with HTML5-specific entities that cannot be correctly
parsed by lxml during generation of email notifications.

Signed-off-by: Anders Kaseorg <anders@zulip.com>
(cherry picked from commit e3abd09e67)
This commit is contained in:
Anders Kaseorg 2024-09-05 13:21:51 -07:00 committed by Tim Abbott
parent e827b2acf8
commit 1c05d19c3e
2 changed files with 17 additions and 4 deletions

View File

@ -8,6 +8,7 @@ from typing import TypeVar
import pyvips
from bs4 import BeautifulSoup
from bs4.formatter import EntitySubstitution, HTMLFormatter
from django.utils.translation import gettext as _
from typing_extensions import override
@ -396,6 +397,17 @@ def get_default_thumbnail_url(image_attachment: ImageAttachment) -> tuple[str, b
)
# Formatter used when serializing rewritten message HTML.  It matches
# HTMLFormatter.REGISTRY["html5"] in not emitting self-closing tags, but
# it swaps in XML-style entity substitution so that characters are not
# needlessly escaped with HTML5-specific entities, which lxml and libxml2
# cannot parse (https://bugs.launchpad.net/lxml/+bug/2031045).
html_formatter = HTMLFormatter(
    empty_attributes_are_booleans=True,
    void_element_close_prefix="",
    # Deliberately substitute_xml rather than substitute_html.
    entity_substitution=EntitySubstitution.substitute_xml,
)
def rewrite_thumbnailed_images(
rendered_content: str,
images: dict[str, MarkdownImageMetadata | None],
@ -453,7 +465,8 @@ def rewrite_thumbnailed_images(
image_tag["data-animated"] = "true"
if changed:
# Serializing with html_formatter (like formatter="html5") means we do not produce self-closing tags
return parsed_message.encode(formatter="html5").decode().strip(), remaining_thumbnails
return parsed_message.encode(
formatter=html_formatter
).decode().strip(), remaining_thumbnails
else:
return None, remaining_thumbnails

View File

@ -150,8 +150,8 @@ class MarkdownThumbnailTest(ZulipTestCase):
self.assertTrue(ImageAttachment.objects.filter(path_id=path_id).exists())
message_id = self.send_message_content(f"[I am 95% ± 5% certain!](/user_uploads/{path_id})")
expected = (
f'<p><a href="/user_uploads/{path_id}">I am 95% &plusmn; 5% certain!</a></p>\n'
f'<div class="message_inline_image"><a href="/user_uploads/{path_id}" title="I am 95% &plusmn; 5% certain!">'
f'<p><a href="/user_uploads/{path_id}">I am 95% ± 5% certain!</a></p>\n'
f'<div class="message_inline_image"><a href="/user_uploads/{path_id}" title="I am 95% ± 5% certain!">'
f'<img data-original-dimensions="128x128" src="/user_uploads/thumbnail/{path_id}/840x560.webp"></a></div>'
)
self.assert_message_content_is(message_id, expected)