email_notifications: Handle empty rendered_messages.

The transforms called from `build_message_payload` use
`lxml.html.fromstring` to parse (and stringify, and re-parse) the HTML
generated by Markdown.  However, this function fails if it is passed
an empty document.  "empty" is broader than just the empty string; it
also includes any document made entirely out of control characters,
spaces, unpaired surrogates, U+FFFE, or U+FFFF, and so forth.  These
documents would fail to parse, and raise a ParserError.

Using `lxml.html.fragment_fromstring` handles these cases, but does by
wrapping the contents in a <div> every time it is called.  As such,
replacing each `fromstring` with `fragment_fromstring` would nest
another layer of `<div>`.

Instead of each of the helper functions re-parsing, modifying, and
stringifying the HTML, parse it once with `fragment_fromstring` and
pass around the parsed document to each helper, which modifies it
in-place.  This adds one outer `<div>`, requiring minor changes to
tests and the prepend-sender functions.

The modification to add the sender is left using BeautifulSoup, as
that sort of transform is much less readable, and more fiddly, in raw
lxml.

Partial fix for #19559.
This commit is contained in:
Alex Vandiver 2021-08-19 18:10:35 +00:00 committed by Anders Kaseorg
parent 42e3c4e6ec
commit fe25517295
2 changed files with 50 additions and 35 deletions

View File

@ -44,10 +44,7 @@ from zerver.models import (
)
def relative_to_full_url(base_url: str, content: str) -> str:
# Convert relative URLs to absolute URLs.
fragment = lxml.html.fromstring(content)
def relative_to_full_url(fragment: lxml.html.HtmlElement, base_url: str) -> None:
# We handle narrow URLs separately because of two reasons:
# 1: 'lxml' seems to be having an issue in dealing with URLs that begin
# `#` due to which it doesn't add a `/` before joining the base_url to
@ -81,15 +78,14 @@ def relative_to_full_url(base_url: str, content: str) -> str:
image_link = fragment.find("a").get("href")
image_title = fragment.find("a").get("title")
title_attr = {} if image_title is None else {"title": image_title}
fragment = E.P(E.A(image_link, href=image_link, target="_blank", **title_attr))
fragment.clear()
fragment.tag = "p"
fragment.append(E.A(image_link, href=image_link, target="_blank", **title_attr))
fragment.make_links_absolute(base_url)
content = lxml.html.tostring(fragment, encoding="unicode")
return content
def fix_emojis(content: str, base_url: str, emojiset: str) -> str:
def fix_emojis(fragment: lxml.html.HtmlElement, base_url: str, emojiset: str) -> None:
def make_emoji_img_elem(emoji_span_elem: lxml.html.HtmlElement) -> Dict[str, Any]:
# Convert the emoji spans to img tags.
classes = emoji_span_elem.get("class")
@ -106,7 +102,6 @@ def fix_emojis(content: str, base_url: str, emojiset: str) -> str:
img_elem.tail = emoji_span_elem.tail
return img_elem
fragment = lxml.html.fromstring(content)
for elem in fragment.cssselect("span.emoji"):
parent = elem.getparent()
img_elem = make_emoji_img_elem(elem)
@ -116,14 +111,10 @@ def fix_emojis(content: str, base_url: str, emojiset: str) -> str:
del realm_emoji.attrib["class"]
realm_emoji.set("style", "height: 20px;")
content = lxml.html.tostring(fragment, encoding="unicode")
return content
def fix_spoilers_in_html(content: str, language: str) -> str:
def fix_spoilers_in_html(fragment: lxml.html.HtmlElement, language: str) -> None:
with override_language(language):
spoiler_title: str = _("Open Zulip to see the spoiler content")
fragment = lxml.html.fromstring(content)
spoilers = fragment.find_class("spoiler-block")
for spoiler in spoilers:
header = spoiler.find_class("spoiler-header")[0]
@ -141,8 +132,6 @@ def fix_spoilers_in_html(content: str, language: str) -> str:
header_content.append(span_elem)
header.drop_tag()
spoiler_content.drop_tree()
content = lxml.html.tostring(fragment, encoding="unicode")
return content
def fix_spoilers_in_text(content: str, language: str) -> str:
@ -199,7 +188,9 @@ def build_message_list(
message_soup = BeautifulSoup(message_html, "html.parser")
sender_name_soup = BeautifulSoup(f"<b>{sender}</b>: ", "html.parser")
first_tag = message_soup.find()
if first_tag.name == "p":
if first_tag and first_tag.name == "div":
first_tag = first_tag.find()
if first_tag and first_tag.name == "p":
first_tag.insert(0, sender_name_soup)
else:
message_soup.insert(0, sender_name_soup)
@ -217,10 +208,11 @@ def build_message_list(
plain = fix_spoilers_in_text(plain, user.default_language)
assert message.rendered_content is not None
html = message.rendered_content
html = relative_to_full_url(user.realm.uri, html)
html = fix_emojis(html, user.realm.uri, user.emojiset)
html = fix_spoilers_in_html(html, user.default_language)
fragment = lxml.html.fragment_fromstring(message.rendered_content, create_parent=True)
relative_to_full_url(fragment, user.realm.uri)
fix_emojis(fragment, user.realm.uri, user.emojiset)
fix_spoilers_in_html(fragment, user.default_language)
html = lxml.html.tostring(fragment, encoding="unicode")
if sender:
plain, html = prepend_sender_to_message(plain, html, sender)
return {"plain": plain, "html": html}

View File

@ -6,6 +6,7 @@ from unittest import mock
from unittest.mock import patch
import ldap
import lxml.html
import orjson
from django.conf import settings
from django.core import mail
@ -1098,14 +1099,14 @@ class TestMissedMessages(ZulipTestCase):
self.assertIn("Iago: * 1\n *2\n\n--\nYou are receiving", mail.outbox[1].body)
# If message content does not starts with <p> tag sender name is appended before the <p> tag
self.assertIn(
" <b>Iago</b>: <ul>\n<li>1<br/>\n *2</li>\n</ul>\n",
" <b>Iago</b>: <div><ul>\n<li>1<br/>\n *2</li>\n</ul></div>\n",
mail.outbox[1].alternatives[0][0],
)
self.assertEqual("Hello\n\n--\n\nReply", mail.outbox[2].body[:16])
# Sender name is not appended to message for PM missed messages
self.assertIn(
">\n \n <p>Hello</p>\n",
">\n \n <div><p>Hello</p></div>\n",
mail.outbox[2].alternatives[0][0],
)
@ -1243,6 +1244,11 @@ class TestMissedMessages(ZulipTestCase):
self.assertEqual(email_subjects, valid_email_subjects)
def test_relative_to_full_url(self) -> None:
def convert(test_data: str) -> str:
fragment = lxml.html.fromstring(test_data)
relative_to_full_url(fragment, "http://example.com")
return lxml.html.tostring(fragment, encoding="unicode")
zulip_realm = get_realm("zulip")
zephyr_realm = get_realm("zephyr")
# Run `relative_to_full_url()` function over test fixtures present in
@ -1254,7 +1260,7 @@ class TestMissedMessages(ZulipTestCase):
test_fixtures[test["name"]] = test
for test_name in test_fixtures:
test_data = test_fixtures[test_name]["expected_output"]
output_data = relative_to_full_url("http://example.com", test_data)
output_data = convert(test_data)
if re.search(r"""(?<=\=['"])/(?=[^<]+>)""", output_data) is not None:
raise AssertionError(
"Relative URL present in email: "
@ -1268,14 +1274,14 @@ class TestMissedMessages(ZulipTestCase):
# A path similar to our emoji path, but not in a link:
test_data = "<p>Check out the file at: '/static/generated/emoji/images/emoji/'</p>"
actual_output = relative_to_full_url("http://example.com", test_data)
actual_output = convert(test_data)
expected_output = "<p>Check out the file at: '/static/generated/emoji/images/emoji/'</p>"
self.assertEqual(actual_output, expected_output)
# An uploaded file
test_data = '<a href="/user_uploads/{realm_id}/1f/some_random_value">/user_uploads/{realm_id}/1f/some_random_value</a>'
test_data = test_data.format(realm_id=zephyr_realm.id)
actual_output = relative_to_full_url("http://example.com", test_data)
actual_output = convert(test_data)
expected_output = (
'<a href="http://example.com/user_uploads/{realm_id}/1f/some_random_value">'
+ "/user_uploads/{realm_id}/1f/some_random_value</a>"
@ -1285,7 +1291,7 @@ class TestMissedMessages(ZulipTestCase):
# A profile picture like syntax, but not actually in an HTML tag
test_data = '<p>Set src="/avatar/username@example.com?s=30"</p>'
actual_output = relative_to_full_url("http://example.com", test_data)
actual_output = convert(test_data)
expected_output = '<p>Set src="/avatar/username@example.com?s=30"</p>'
self.assertEqual(actual_output, expected_output)
@ -1294,7 +1300,7 @@ class TestMissedMessages(ZulipTestCase):
'<p><a href="#narrow/stream/test/topic/test.20topic/near/142"'
+ 'title="#narrow/stream/test/topic/test.20topic/near/142">Conversation</a></p>'
)
actual_output = relative_to_full_url("http://example.com", test_data)
actual_output = convert(test_data)
expected_output = (
'<p><a href="http://example.com/#narrow/stream/test/topic/test.20topic/near/142" '
+ 'title="http://example.com/#narrow/stream/test/topic/test.20topic/near/142">Conversation</a></p>'
@ -1309,7 +1315,7 @@ class TestMissedMessages(ZulipTestCase):
+ 'target="_blank" title="avatar_103.jpeg"><img src="/user_uploads/{realm_id}/52/fG7GM9e3afz_qsiUcSce2tl_/avatar_103.jpeg"></a></div>'
)
test_data = test_data.format(realm_id=zulip_realm.id)
actual_output = relative_to_full_url("http://example.com", test_data)
actual_output = convert(test_data)
expected_output = (
'<div><p>See this <a href="http://example.com/user_uploads/{realm_id}/52/fG7GM9e3afz_qsiUcSce2tl_/avatar_103.jpeg" target="_blank" '
+ 'title="avatar_103.jpeg">avatar_103.jpeg</a>.</p></div>'
@ -1325,7 +1331,7 @@ class TestMissedMessages(ZulipTestCase):
+ '<img data-src-fullsize="/thumbnail/https%3A//www.google.com/images/srpr/logo4w.png?size=0x0" '
+ 'src="/thumbnail/https%3A//www.google.com/images/srpr/logo4w.png?size=0x100"></a></div>'
)
actual_output = relative_to_full_url("http://example.com", test_data)
actual_output = convert(test_data)
expected_output = (
'<p><a href="https://www.google.com/images/srpr/logo4w.png" '
+ 'target="_blank" title="https://www.google.com/images/srpr/logo4w.png">'
@ -1335,7 +1341,9 @@ class TestMissedMessages(ZulipTestCase):
def test_spoilers_in_html_emails(self) -> None:
test_data = '<div class="spoiler-block"><div class="spoiler-header">\n\n<p><a>header</a> text</p>\n</div><div class="spoiler-content" aria-hidden="true">\n\n<p>content</p>\n</div></div>\n\n<p>outside spoiler</p>'
actual_output = fix_spoilers_in_html(test_data, "en")
fragment = lxml.html.fromstring(test_data)
fix_spoilers_in_html(fragment, "en")
actual_output = lxml.html.tostring(fragment, encoding="unicode")
expected_output = '<div><div class="spoiler-block">\n\n<p><a>header</a> text <span class="spoiler-title" title="Open Zulip to see the spoiler content">(Open Zulip to see the spoiler content)</span></p>\n</div>\n\n<p>outside spoiler</p></div>'
self.assertEqual(actual_output, expected_output)
@ -1346,8 +1354,9 @@ class TestMissedMessages(ZulipTestCase):
if "spoiler" in test["name"]:
test_fixtures[test["name"]] = test
for test_name in test_fixtures:
test_data = test_fixtures[test_name]["expected_output"]
output_data = fix_spoilers_in_html(test_data, "en")
fragment = lxml.html.fromstring(test_fixtures[test_name]["expected_output"])
fix_spoilers_in_html(fragment, "en")
output_data = lxml.html.tostring(fragment, encoding="unicode")
assert "spoiler-header" not in output_data
assert "spoiler-content" not in output_data
assert "spoiler-block" in output_data
@ -1375,9 +1384,23 @@ class TestMissedMessages(ZulipTestCase):
'<p>See <span aria-label="cloud with lightning and rain" class="emoji emoji-26c8" role="img" title="cloud with lightning and rain">'
+ ":cloud_with_lightning_and_rain:</span>.</p>"
)
actual_output = fix_emojis(test_data, "http://example.com", "google")
fragment = lxml.html.fromstring(test_data)
fix_emojis(fragment, "http://example.com", "google")
actual_output = lxml.html.tostring(fragment, encoding="unicode")
expected_output = (
'<p>See <img alt=":cloud_with_lightning_and_rain:" src="http://example.com/static/generated/emoji/images-google-64/26c8.png" '
+ 'title="cloud with lightning and rain" style="height: 20px;">.</p>'
)
self.assertEqual(actual_output, expected_output)
def test_empty_backticks_in_missed_message(self) -> None:
msg_id = self.send_personal_message(
self.example_user("othello"),
self.example_user("hamlet"),
"```\n```",
)
verify_body_include = ["view it in Zulip"]
email_subject = "PMs with Othello, the Moor of Venice"
self._test_cases(
msg_id, verify_body_include, email_subject, send_as_user=False, verify_html_body=True
)