email_notifications: Handle empty rendered_messages.

The transforms called from `build_message_payload` use
`lxml.html.fromstring` to parse (and stringify, and re-parse) the HTML
generated by Markdown.  However, this function fails if it is passed
an empty document.  "empty" is broader than just the empty string; it
also includes any document made entirely out of control characters,
spaces, unpaired surrogates, U+FFFE, or U+FFFF, and so forth.  These
documents would fail to parse, and raise a ParserError.

Using `lxml.html.fragment_fromstring` handles these cases, but does by
wrapping the contents in a <div> every time it is called.  As such,
replacing each `fromstring` with `fragment_fromstring` would nest
another layer of `<div>`.

Instead of each of the helper functions re-parsing, modifying, and
stringifying the HTML, parse it once with `fragment_fromstring` and
pass around the parsed document to each helper, which modifies it
in-place.  This adds one outer `<div>`, requiring minor changes to
tests and the prepend-sender functions.

The modification to add the sender is left using BeautifulSoup, as
that sort of transform is much less readable, and more fiddly, in raw
lxml.

Partial fix for #19559.
This commit is contained in:
Alex Vandiver 2021-08-19 18:10:35 +00:00 committed by Anders Kaseorg
parent 42e3c4e6ec
commit fe25517295
2 changed files with 50 additions and 35 deletions

View File

@ -44,10 +44,7 @@ from zerver.models import (
) )
def relative_to_full_url(base_url: str, content: str) -> str: def relative_to_full_url(fragment: lxml.html.HtmlElement, base_url: str) -> None:
# Convert relative URLs to absolute URLs.
fragment = lxml.html.fromstring(content)
# We handle narrow URLs separately because of two reasons: # We handle narrow URLs separately because of two reasons:
# 1: 'lxml' seems to be having an issue in dealing with URLs that begin # 1: 'lxml' seems to be having an issue in dealing with URLs that begin
# `#` due to which it doesn't add a `/` before joining the base_url to # `#` due to which it doesn't add a `/` before joining the base_url to
@ -81,15 +78,14 @@ def relative_to_full_url(base_url: str, content: str) -> str:
image_link = fragment.find("a").get("href") image_link = fragment.find("a").get("href")
image_title = fragment.find("a").get("title") image_title = fragment.find("a").get("title")
title_attr = {} if image_title is None else {"title": image_title} title_attr = {} if image_title is None else {"title": image_title}
fragment = E.P(E.A(image_link, href=image_link, target="_blank", **title_attr)) fragment.clear()
fragment.tag = "p"
fragment.append(E.A(image_link, href=image_link, target="_blank", **title_attr))
fragment.make_links_absolute(base_url) fragment.make_links_absolute(base_url)
content = lxml.html.tostring(fragment, encoding="unicode")
return content
def fix_emojis(content: str, base_url: str, emojiset: str) -> str: def fix_emojis(fragment: lxml.html.HtmlElement, base_url: str, emojiset: str) -> None:
def make_emoji_img_elem(emoji_span_elem: lxml.html.HtmlElement) -> Dict[str, Any]: def make_emoji_img_elem(emoji_span_elem: lxml.html.HtmlElement) -> Dict[str, Any]:
# Convert the emoji spans to img tags. # Convert the emoji spans to img tags.
classes = emoji_span_elem.get("class") classes = emoji_span_elem.get("class")
@ -106,7 +102,6 @@ def fix_emojis(content: str, base_url: str, emojiset: str) -> str:
img_elem.tail = emoji_span_elem.tail img_elem.tail = emoji_span_elem.tail
return img_elem return img_elem
fragment = lxml.html.fromstring(content)
for elem in fragment.cssselect("span.emoji"): for elem in fragment.cssselect("span.emoji"):
parent = elem.getparent() parent = elem.getparent()
img_elem = make_emoji_img_elem(elem) img_elem = make_emoji_img_elem(elem)
@ -116,14 +111,10 @@ def fix_emojis(content: str, base_url: str, emojiset: str) -> str:
del realm_emoji.attrib["class"] del realm_emoji.attrib["class"]
realm_emoji.set("style", "height: 20px;") realm_emoji.set("style", "height: 20px;")
content = lxml.html.tostring(fragment, encoding="unicode")
return content
def fix_spoilers_in_html(fragment: lxml.html.HtmlElement, language: str) -> None:
def fix_spoilers_in_html(content: str, language: str) -> str:
with override_language(language): with override_language(language):
spoiler_title: str = _("Open Zulip to see the spoiler content") spoiler_title: str = _("Open Zulip to see the spoiler content")
fragment = lxml.html.fromstring(content)
spoilers = fragment.find_class("spoiler-block") spoilers = fragment.find_class("spoiler-block")
for spoiler in spoilers: for spoiler in spoilers:
header = spoiler.find_class("spoiler-header")[0] header = spoiler.find_class("spoiler-header")[0]
@ -141,8 +132,6 @@ def fix_spoilers_in_html(content: str, language: str) -> str:
header_content.append(span_elem) header_content.append(span_elem)
header.drop_tag() header.drop_tag()
spoiler_content.drop_tree() spoiler_content.drop_tree()
content = lxml.html.tostring(fragment, encoding="unicode")
return content
def fix_spoilers_in_text(content: str, language: str) -> str: def fix_spoilers_in_text(content: str, language: str) -> str:
@ -199,7 +188,9 @@ def build_message_list(
message_soup = BeautifulSoup(message_html, "html.parser") message_soup = BeautifulSoup(message_html, "html.parser")
sender_name_soup = BeautifulSoup(f"<b>{sender}</b>: ", "html.parser") sender_name_soup = BeautifulSoup(f"<b>{sender}</b>: ", "html.parser")
first_tag = message_soup.find() first_tag = message_soup.find()
if first_tag.name == "p": if first_tag and first_tag.name == "div":
first_tag = first_tag.find()
if first_tag and first_tag.name == "p":
first_tag.insert(0, sender_name_soup) first_tag.insert(0, sender_name_soup)
else: else:
message_soup.insert(0, sender_name_soup) message_soup.insert(0, sender_name_soup)
@ -217,10 +208,11 @@ def build_message_list(
plain = fix_spoilers_in_text(plain, user.default_language) plain = fix_spoilers_in_text(plain, user.default_language)
assert message.rendered_content is not None assert message.rendered_content is not None
html = message.rendered_content fragment = lxml.html.fragment_fromstring(message.rendered_content, create_parent=True)
html = relative_to_full_url(user.realm.uri, html) relative_to_full_url(fragment, user.realm.uri)
html = fix_emojis(html, user.realm.uri, user.emojiset) fix_emojis(fragment, user.realm.uri, user.emojiset)
html = fix_spoilers_in_html(html, user.default_language) fix_spoilers_in_html(fragment, user.default_language)
html = lxml.html.tostring(fragment, encoding="unicode")
if sender: if sender:
plain, html = prepend_sender_to_message(plain, html, sender) plain, html = prepend_sender_to_message(plain, html, sender)
return {"plain": plain, "html": html} return {"plain": plain, "html": html}

View File

@ -6,6 +6,7 @@ from unittest import mock
from unittest.mock import patch from unittest.mock import patch
import ldap import ldap
import lxml.html
import orjson import orjson
from django.conf import settings from django.conf import settings
from django.core import mail from django.core import mail
@ -1098,14 +1099,14 @@ class TestMissedMessages(ZulipTestCase):
self.assertIn("Iago: * 1\n *2\n\n--\nYou are receiving", mail.outbox[1].body) self.assertIn("Iago: * 1\n *2\n\n--\nYou are receiving", mail.outbox[1].body)
# If message content does not starts with <p> tag sender name is appended before the <p> tag # If message content does not starts with <p> tag sender name is appended before the <p> tag
self.assertIn( self.assertIn(
" <b>Iago</b>: <ul>\n<li>1<br/>\n *2</li>\n</ul>\n", " <b>Iago</b>: <div><ul>\n<li>1<br/>\n *2</li>\n</ul></div>\n",
mail.outbox[1].alternatives[0][0], mail.outbox[1].alternatives[0][0],
) )
self.assertEqual("Hello\n\n--\n\nReply", mail.outbox[2].body[:16]) self.assertEqual("Hello\n\n--\n\nReply", mail.outbox[2].body[:16])
# Sender name is not appended to message for PM missed messages # Sender name is not appended to message for PM missed messages
self.assertIn( self.assertIn(
">\n \n <p>Hello</p>\n", ">\n \n <div><p>Hello</p></div>\n",
mail.outbox[2].alternatives[0][0], mail.outbox[2].alternatives[0][0],
) )
@ -1243,6 +1244,11 @@ class TestMissedMessages(ZulipTestCase):
self.assertEqual(email_subjects, valid_email_subjects) self.assertEqual(email_subjects, valid_email_subjects)
def test_relative_to_full_url(self) -> None: def test_relative_to_full_url(self) -> None:
def convert(test_data: str) -> str:
fragment = lxml.html.fromstring(test_data)
relative_to_full_url(fragment, "http://example.com")
return lxml.html.tostring(fragment, encoding="unicode")
zulip_realm = get_realm("zulip") zulip_realm = get_realm("zulip")
zephyr_realm = get_realm("zephyr") zephyr_realm = get_realm("zephyr")
# Run `relative_to_full_url()` function over test fixtures present in # Run `relative_to_full_url()` function over test fixtures present in
@ -1254,7 +1260,7 @@ class TestMissedMessages(ZulipTestCase):
test_fixtures[test["name"]] = test test_fixtures[test["name"]] = test
for test_name in test_fixtures: for test_name in test_fixtures:
test_data = test_fixtures[test_name]["expected_output"] test_data = test_fixtures[test_name]["expected_output"]
output_data = relative_to_full_url("http://example.com", test_data) output_data = convert(test_data)
if re.search(r"""(?<=\=['"])/(?=[^<]+>)""", output_data) is not None: if re.search(r"""(?<=\=['"])/(?=[^<]+>)""", output_data) is not None:
raise AssertionError( raise AssertionError(
"Relative URL present in email: " "Relative URL present in email: "
@ -1268,14 +1274,14 @@ class TestMissedMessages(ZulipTestCase):
# A path similar to our emoji path, but not in a link: # A path similar to our emoji path, but not in a link:
test_data = "<p>Check out the file at: '/static/generated/emoji/images/emoji/'</p>" test_data = "<p>Check out the file at: '/static/generated/emoji/images/emoji/'</p>"
actual_output = relative_to_full_url("http://example.com", test_data) actual_output = convert(test_data)
expected_output = "<p>Check out the file at: '/static/generated/emoji/images/emoji/'</p>" expected_output = "<p>Check out the file at: '/static/generated/emoji/images/emoji/'</p>"
self.assertEqual(actual_output, expected_output) self.assertEqual(actual_output, expected_output)
# An uploaded file # An uploaded file
test_data = '<a href="/user_uploads/{realm_id}/1f/some_random_value">/user_uploads/{realm_id}/1f/some_random_value</a>' test_data = '<a href="/user_uploads/{realm_id}/1f/some_random_value">/user_uploads/{realm_id}/1f/some_random_value</a>'
test_data = test_data.format(realm_id=zephyr_realm.id) test_data = test_data.format(realm_id=zephyr_realm.id)
actual_output = relative_to_full_url("http://example.com", test_data) actual_output = convert(test_data)
expected_output = ( expected_output = (
'<a href="http://example.com/user_uploads/{realm_id}/1f/some_random_value">' '<a href="http://example.com/user_uploads/{realm_id}/1f/some_random_value">'
+ "/user_uploads/{realm_id}/1f/some_random_value</a>" + "/user_uploads/{realm_id}/1f/some_random_value</a>"
@ -1285,7 +1291,7 @@ class TestMissedMessages(ZulipTestCase):
# A profile picture like syntax, but not actually in an HTML tag # A profile picture like syntax, but not actually in an HTML tag
test_data = '<p>Set src="/avatar/username@example.com?s=30"</p>' test_data = '<p>Set src="/avatar/username@example.com?s=30"</p>'
actual_output = relative_to_full_url("http://example.com", test_data) actual_output = convert(test_data)
expected_output = '<p>Set src="/avatar/username@example.com?s=30"</p>' expected_output = '<p>Set src="/avatar/username@example.com?s=30"</p>'
self.assertEqual(actual_output, expected_output) self.assertEqual(actual_output, expected_output)
@ -1294,7 +1300,7 @@ class TestMissedMessages(ZulipTestCase):
'<p><a href="#narrow/stream/test/topic/test.20topic/near/142"' '<p><a href="#narrow/stream/test/topic/test.20topic/near/142"'
+ 'title="#narrow/stream/test/topic/test.20topic/near/142">Conversation</a></p>' + 'title="#narrow/stream/test/topic/test.20topic/near/142">Conversation</a></p>'
) )
actual_output = relative_to_full_url("http://example.com", test_data) actual_output = convert(test_data)
expected_output = ( expected_output = (
'<p><a href="http://example.com/#narrow/stream/test/topic/test.20topic/near/142" ' '<p><a href="http://example.com/#narrow/stream/test/topic/test.20topic/near/142" '
+ 'title="http://example.com/#narrow/stream/test/topic/test.20topic/near/142">Conversation</a></p>' + 'title="http://example.com/#narrow/stream/test/topic/test.20topic/near/142">Conversation</a></p>'
@ -1309,7 +1315,7 @@ class TestMissedMessages(ZulipTestCase):
+ 'target="_blank" title="avatar_103.jpeg"><img src="/user_uploads/{realm_id}/52/fG7GM9e3afz_qsiUcSce2tl_/avatar_103.jpeg"></a></div>' + 'target="_blank" title="avatar_103.jpeg"><img src="/user_uploads/{realm_id}/52/fG7GM9e3afz_qsiUcSce2tl_/avatar_103.jpeg"></a></div>'
) )
test_data = test_data.format(realm_id=zulip_realm.id) test_data = test_data.format(realm_id=zulip_realm.id)
actual_output = relative_to_full_url("http://example.com", test_data) actual_output = convert(test_data)
expected_output = ( expected_output = (
'<div><p>See this <a href="http://example.com/user_uploads/{realm_id}/52/fG7GM9e3afz_qsiUcSce2tl_/avatar_103.jpeg" target="_blank" ' '<div><p>See this <a href="http://example.com/user_uploads/{realm_id}/52/fG7GM9e3afz_qsiUcSce2tl_/avatar_103.jpeg" target="_blank" '
+ 'title="avatar_103.jpeg">avatar_103.jpeg</a>.</p></div>' + 'title="avatar_103.jpeg">avatar_103.jpeg</a>.</p></div>'
@ -1325,7 +1331,7 @@ class TestMissedMessages(ZulipTestCase):
+ '<img data-src-fullsize="/thumbnail/https%3A//www.google.com/images/srpr/logo4w.png?size=0x0" ' + '<img data-src-fullsize="/thumbnail/https%3A//www.google.com/images/srpr/logo4w.png?size=0x0" '
+ 'src="/thumbnail/https%3A//www.google.com/images/srpr/logo4w.png?size=0x100"></a></div>' + 'src="/thumbnail/https%3A//www.google.com/images/srpr/logo4w.png?size=0x100"></a></div>'
) )
actual_output = relative_to_full_url("http://example.com", test_data) actual_output = convert(test_data)
expected_output = ( expected_output = (
'<p><a href="https://www.google.com/images/srpr/logo4w.png" ' '<p><a href="https://www.google.com/images/srpr/logo4w.png" '
+ 'target="_blank" title="https://www.google.com/images/srpr/logo4w.png">' + 'target="_blank" title="https://www.google.com/images/srpr/logo4w.png">'
@ -1335,7 +1341,9 @@ class TestMissedMessages(ZulipTestCase):
def test_spoilers_in_html_emails(self) -> None: def test_spoilers_in_html_emails(self) -> None:
test_data = '<div class="spoiler-block"><div class="spoiler-header">\n\n<p><a>header</a> text</p>\n</div><div class="spoiler-content" aria-hidden="true">\n\n<p>content</p>\n</div></div>\n\n<p>outside spoiler</p>' test_data = '<div class="spoiler-block"><div class="spoiler-header">\n\n<p><a>header</a> text</p>\n</div><div class="spoiler-content" aria-hidden="true">\n\n<p>content</p>\n</div></div>\n\n<p>outside spoiler</p>'
actual_output = fix_spoilers_in_html(test_data, "en") fragment = lxml.html.fromstring(test_data)
fix_spoilers_in_html(fragment, "en")
actual_output = lxml.html.tostring(fragment, encoding="unicode")
expected_output = '<div><div class="spoiler-block">\n\n<p><a>header</a> text <span class="spoiler-title" title="Open Zulip to see the spoiler content">(Open Zulip to see the spoiler content)</span></p>\n</div>\n\n<p>outside spoiler</p></div>' expected_output = '<div><div class="spoiler-block">\n\n<p><a>header</a> text <span class="spoiler-title" title="Open Zulip to see the spoiler content">(Open Zulip to see the spoiler content)</span></p>\n</div>\n\n<p>outside spoiler</p></div>'
self.assertEqual(actual_output, expected_output) self.assertEqual(actual_output, expected_output)
@ -1346,8 +1354,9 @@ class TestMissedMessages(ZulipTestCase):
if "spoiler" in test["name"]: if "spoiler" in test["name"]:
test_fixtures[test["name"]] = test test_fixtures[test["name"]] = test
for test_name in test_fixtures: for test_name in test_fixtures:
test_data = test_fixtures[test_name]["expected_output"] fragment = lxml.html.fromstring(test_fixtures[test_name]["expected_output"])
output_data = fix_spoilers_in_html(test_data, "en") fix_spoilers_in_html(fragment, "en")
output_data = lxml.html.tostring(fragment, encoding="unicode")
assert "spoiler-header" not in output_data assert "spoiler-header" not in output_data
assert "spoiler-content" not in output_data assert "spoiler-content" not in output_data
assert "spoiler-block" in output_data assert "spoiler-block" in output_data
@ -1375,9 +1384,23 @@ class TestMissedMessages(ZulipTestCase):
'<p>See <span aria-label="cloud with lightning and rain" class="emoji emoji-26c8" role="img" title="cloud with lightning and rain">' '<p>See <span aria-label="cloud with lightning and rain" class="emoji emoji-26c8" role="img" title="cloud with lightning and rain">'
+ ":cloud_with_lightning_and_rain:</span>.</p>" + ":cloud_with_lightning_and_rain:</span>.</p>"
) )
actual_output = fix_emojis(test_data, "http://example.com", "google") fragment = lxml.html.fromstring(test_data)
fix_emojis(fragment, "http://example.com", "google")
actual_output = lxml.html.tostring(fragment, encoding="unicode")
expected_output = ( expected_output = (
'<p>See <img alt=":cloud_with_lightning_and_rain:" src="http://example.com/static/generated/emoji/images-google-64/26c8.png" ' '<p>See <img alt=":cloud_with_lightning_and_rain:" src="http://example.com/static/generated/emoji/images-google-64/26c8.png" '
+ 'title="cloud with lightning and rain" style="height: 20px;">.</p>' + 'title="cloud with lightning and rain" style="height: 20px;">.</p>'
) )
self.assertEqual(actual_output, expected_output) self.assertEqual(actual_output, expected_output)
def test_empty_backticks_in_missed_message(self) -> None:
msg_id = self.send_personal_message(
self.example_user("othello"),
self.example_user("hamlet"),
"```\n```",
)
verify_body_include = ["view it in Zulip"]
email_subject = "PMs with Othello, the Moor of Venice"
self._test_cases(
msg_id, verify_body_include, email_subject, send_as_user=False, verify_html_body=True
)