email_notifications: Handle empty rendered_messages.

The transforms called from `build_message_payload` use `lxml.html.fromstring` to parse (and stringify, and re-parse) the HTML generated by Markdown. However, this function fails if it is passed an empty document. "empty" is broader than just the empty string; it also includes any document made entirely out of control characters, spaces, unpaired surrogates, U+FFFE, or U+FFFF, and so forth. These documents would fail to parse, and raise a ParserError. Using `lxml.html.fragment_fromstring` handles these cases, but does by wrapping the contents in a <div> every time it is called. As such, replacing each `fromstring` with `fragment_fromstring` would nest another layer of `<div>`. Instead of each of the helper functions re-parsing, modifying, and stringifying the HTML, parse it once with `fragment_fromstring` and pass around the parsed document to each helper, which modifies it in-place. This adds one outer `<div>`, requiring minor changes to tests and the prepend-sender functions. The modification to add the sender is left using BeautifulSoup, as that sort of transform is much less readable, and more fiddly, in raw lxml. Partial fix for #19559.
2021-08-19 18:10:35 +00:00 · 2021-08-19 18:10:35 +00:00 · fe25517295
parent 42e3c4e6ec
commit fe25517295
2 changed files with 50 additions and 35 deletions
--- a/zerver/lib/email_notifications.py
+++ b/zerver/lib/email_notifications.py
@ -44,10 +44,7 @@ from zerver.models import (
 )
-def relative_to_full_url(base_url: str, content: str) -> str:
+def relative_to_full_url(fragment: lxml.html.HtmlElement, base_url: str) -> None:
    # Convert relative URLs to absolute URLs.
    fragment = lxml.html.fromstring(content)
    # We handle narrow URLs separately because of two reasons:
    # 1: 'lxml' seems to be having an issue in dealing with URLs that begin
    # `#` due to which it doesn't add a `/` before joining the base_url to
@ -81,15 +78,14 @@ def relative_to_full_url(base_url: str, content: str) -> str:
        image_link = fragment.find("a").get("href")
        image_title = fragment.find("a").get("title")
        title_attr = {} if image_title is None else {"title": image_title}
-        fragment = E.P(E.A(image_link, href=image_link, target="_blank", **title_attr))
+        fragment.clear()
        fragment.tag = "p"
        fragment.append(E.A(image_link, href=image_link, target="_blank", **title_attr))
    fragment.make_links_absolute(base_url)
    content = lxml.html.tostring(fragment, encoding="unicode")
    return content
-def fix_emojis(content: str, base_url: str, emojiset: str) -> str:
+def fix_emojis(fragment: lxml.html.HtmlElement, base_url: str, emojiset: str) -> None:
    def make_emoji_img_elem(emoji_span_elem: lxml.html.HtmlElement) -> Dict[str, Any]:
        # Convert the emoji spans to img tags.
        classes = emoji_span_elem.get("class")
@ -106,7 +102,6 @@ def fix_emojis(content: str, base_url: str, emojiset: str) -> str:
        img_elem.tail = emoji_span_elem.tail
        return img_elem
    fragment = lxml.html.fromstring(content)
    for elem in fragment.cssselect("span.emoji"):
        parent = elem.getparent()
        img_elem = make_emoji_img_elem(elem)
@ -116,14 +111,10 @@ def fix_emojis(content: str, base_url: str, emojiset: str) -> str:
        del realm_emoji.attrib["class"]
        realm_emoji.set("style", "height: 20px;")
    content = lxml.html.tostring(fragment, encoding="unicode")
    return content
-
+def fix_spoilers_in_html(fragment: lxml.html.HtmlElement, language: str) -> None:
 def fix_spoilers_in_html(content: str, language: str) -> str:
    with override_language(language):
        spoiler_title: str = _("Open Zulip to see the spoiler content")
    fragment = lxml.html.fromstring(content)
    spoilers = fragment.find_class("spoiler-block")
    for spoiler in spoilers:
        header = spoiler.find_class("spoiler-header")[0]
@ -141,8 +132,6 @@ def fix_spoilers_in_html(content: str, language: str) -> str:
        header_content.append(span_elem)
        header.drop_tag()
        spoiler_content.drop_tree()
    content = lxml.html.tostring(fragment, encoding="unicode")
    return content
 def fix_spoilers_in_text(content: str, language: str) -> str:
@ -199,7 +188,9 @@ def build_message_list(
        message_soup = BeautifulSoup(message_html, "html.parser")
        sender_name_soup = BeautifulSoup(f"<b>{sender}</b>: ", "html.parser")
        first_tag = message_soup.find()
-        if first_tag.name == "p":
+        if first_tag and first_tag.name == "div":
            first_tag = first_tag.find()
        if first_tag and first_tag.name == "p":
            first_tag.insert(0, sender_name_soup)
        else:
            message_soup.insert(0, sender_name_soup)
@ -217,10 +208,11 @@ def build_message_list(
        plain = fix_spoilers_in_text(plain, user.default_language)
        assert message.rendered_content is not None
-        html = message.rendered_content
+        fragment = lxml.html.fragment_fromstring(message.rendered_content, create_parent=True)
-        html = relative_to_full_url(user.realm.uri, html)
+        relative_to_full_url(fragment, user.realm.uri)
-        html = fix_emojis(html, user.realm.uri, user.emojiset)
+        fix_emojis(fragment, user.realm.uri, user.emojiset)
-        html = fix_spoilers_in_html(html, user.default_language)
+        fix_spoilers_in_html(fragment, user.default_language)
        html = lxml.html.tostring(fragment, encoding="unicode")
        if sender:
            plain, html = prepend_sender_to_message(plain, html, sender)
        return {"plain": plain, "html": html}
--- a/zerver/tests/test_email_notifications.py
+++ b/zerver/tests/test_email_notifications.py
@ -6,6 +6,7 @@ from unittest import mock
 from unittest.mock import patch
 import ldap
 import lxml.html
 import orjson
 from django.conf import settings
 from django.core import mail
@ -1098,14 +1099,14 @@ class TestMissedMessages(ZulipTestCase):
        self.assertIn("Iago: * 1\n *2\n\n--\nYou are receiving", mail.outbox[1].body)
        # If message content does not starts with <p> tag sender name is appended before the <p> tag
        self.assertIn(
-            "       <b>Iago</b>: <ul>\n<li>1<br/>\n *2</li>\n</ul>\n",
+            "       <b>Iago</b>: <div><ul>\n<li>1<br/>\n *2</li>\n</ul></div>\n",
            mail.outbox[1].alternatives[0][0],
        )
        self.assertEqual("Hello\n\n--\n\nReply", mail.outbox[2].body[:16])
        # Sender name is not appended to message for PM missed messages
        self.assertIn(
-            ">\n                    \n                        <p>Hello</p>\n",
+            ">\n                    \n                        <div><p>Hello</p></div>\n",
            mail.outbox[2].alternatives[0][0],
        )
@ -1243,6 +1244,11 @@ class TestMissedMessages(ZulipTestCase):
        self.assertEqual(email_subjects, valid_email_subjects)
    def test_relative_to_full_url(self) -> None:
        def convert(test_data: str) -> str:
            fragment = lxml.html.fromstring(test_data)
            relative_to_full_url(fragment, "http://example.com")
            return lxml.html.tostring(fragment, encoding="unicode")
        zulip_realm = get_realm("zulip")
        zephyr_realm = get_realm("zephyr")
        # Run `relative_to_full_url()` function over test fixtures present in
@ -1254,7 +1260,7 @@ class TestMissedMessages(ZulipTestCase):
            test_fixtures[test["name"]] = test
        for test_name in test_fixtures:
            test_data = test_fixtures[test_name]["expected_output"]
-            output_data = relative_to_full_url("http://example.com", test_data)
+            output_data = convert(test_data)
            if re.search(r"""(?<=\=['"])/(?=[^<]+>)""", output_data) is not None:
                raise AssertionError(
                    "Relative URL present in email: "
@ -1268,14 +1274,14 @@ class TestMissedMessages(ZulipTestCase):
        # A path similar to our emoji path, but not in a link:
        test_data = "<p>Check out the file at: '/static/generated/emoji/images/emoji/'</p>"
-        actual_output = relative_to_full_url("http://example.com", test_data)
+        actual_output = convert(test_data)
        expected_output = "<p>Check out the file at: '/static/generated/emoji/images/emoji/'</p>"
        self.assertEqual(actual_output, expected_output)
        # An uploaded file
        test_data = '<a href="/user_uploads/{realm_id}/1f/some_random_value">/user_uploads/{realm_id}/1f/some_random_value</a>'
        test_data = test_data.format(realm_id=zephyr_realm.id)
-        actual_output = relative_to_full_url("http://example.com", test_data)
+        actual_output = convert(test_data)
        expected_output = (
            '<a href="http://example.com/user_uploads/{realm_id}/1f/some_random_value">'
            + "/user_uploads/{realm_id}/1f/some_random_value</a>"
@ -1285,7 +1291,7 @@ class TestMissedMessages(ZulipTestCase):
        # A profile picture like syntax, but not actually in an HTML tag
        test_data = '<p>Set src="/avatar/username@example.com?s=30"</p>'
-        actual_output = relative_to_full_url("http://example.com", test_data)
+        actual_output = convert(test_data)
        expected_output = '<p>Set src="/avatar/username@example.com?s=30"</p>'
        self.assertEqual(actual_output, expected_output)
@ -1294,7 +1300,7 @@ class TestMissedMessages(ZulipTestCase):
            '<p><a href="#narrow/stream/test/topic/test.20topic/near/142"'
            + 'title="#narrow/stream/test/topic/test.20topic/near/142">Conversation</a></p>'
        )
-        actual_output = relative_to_full_url("http://example.com", test_data)
+        actual_output = convert(test_data)
        expected_output = (
            '<p><a href="http://example.com/#narrow/stream/test/topic/test.20topic/near/142" '
            + 'title="http://example.com/#narrow/stream/test/topic/test.20topic/near/142">Conversation</a></p>'
@ -1309,7 +1315,7 @@ class TestMissedMessages(ZulipTestCase):
            + 'target="_blank" title="avatar_103.jpeg"><img src="/user_uploads/{realm_id}/52/fG7GM9e3afz_qsiUcSce2tl_/avatar_103.jpeg"></a></div>'
        )
        test_data = test_data.format(realm_id=zulip_realm.id)
-        actual_output = relative_to_full_url("http://example.com", test_data)
+        actual_output = convert(test_data)
        expected_output = (
            '<div><p>See this <a href="http://example.com/user_uploads/{realm_id}/52/fG7GM9e3afz_qsiUcSce2tl_/avatar_103.jpeg" target="_blank" '
            + 'title="avatar_103.jpeg">avatar_103.jpeg</a>.</p></div>'
@ -1325,7 +1331,7 @@ class TestMissedMessages(ZulipTestCase):
            + '<img data-src-fullsize="/thumbnail/https%3A//www.google.com/images/srpr/logo4w.png?size=0x0" '
            + 'src="/thumbnail/https%3A//www.google.com/images/srpr/logo4w.png?size=0x100"></a></div>'
        )
-        actual_output = relative_to_full_url("http://example.com", test_data)
+        actual_output = convert(test_data)
        expected_output = (
            '<p><a href="https://www.google.com/images/srpr/logo4w.png" '
            + 'target="_blank" title="https://www.google.com/images/srpr/logo4w.png">'
@ -1335,7 +1341,9 @@ class TestMissedMessages(ZulipTestCase):
    def test_spoilers_in_html_emails(self) -> None:
        test_data = '<div class="spoiler-block"><div class="spoiler-header">\n\n<p><a>header</a> text</p>\n</div><div class="spoiler-content" aria-hidden="true">\n\n<p>content</p>\n</div></div>\n\n<p>outside spoiler</p>'
-        actual_output = fix_spoilers_in_html(test_data, "en")
+        fragment = lxml.html.fromstring(test_data)
        fix_spoilers_in_html(fragment, "en")
        actual_output = lxml.html.tostring(fragment, encoding="unicode")
        expected_output = '<div><div class="spoiler-block">\n\n<p><a>header</a> text <span class="spoiler-title" title="Open Zulip to see the spoiler content">(Open Zulip to see the spoiler content)</span></p>\n</div>\n\n<p>outside spoiler</p></div>'
        self.assertEqual(actual_output, expected_output)
@ -1346,8 +1354,9 @@ class TestMissedMessages(ZulipTestCase):
            if "spoiler" in test["name"]:
                test_fixtures[test["name"]] = test
        for test_name in test_fixtures:
-            test_data = test_fixtures[test_name]["expected_output"]
+            fragment = lxml.html.fromstring(test_fixtures[test_name]["expected_output"])
-            output_data = fix_spoilers_in_html(test_data, "en")
+            fix_spoilers_in_html(fragment, "en")
            output_data = lxml.html.tostring(fragment, encoding="unicode")
            assert "spoiler-header" not in output_data
            assert "spoiler-content" not in output_data
            assert "spoiler-block" in output_data
@ -1375,9 +1384,23 @@ class TestMissedMessages(ZulipTestCase):
            '<p>See <span aria-label="cloud with lightning and rain" class="emoji emoji-26c8" role="img" title="cloud with lightning and rain">'
            + ":cloud_with_lightning_and_rain:</span>.</p>"
        )
-        actual_output = fix_emojis(test_data, "http://example.com", "google")
+        fragment = lxml.html.fromstring(test_data)
        fix_emojis(fragment, "http://example.com", "google")
        actual_output = lxml.html.tostring(fragment, encoding="unicode")
        expected_output = (
            '<p>See <img alt=":cloud_with_lightning_and_rain:" src="http://example.com/static/generated/emoji/images-google-64/26c8.png" '
            + 'title="cloud with lightning and rain" style="height: 20px;">.</p>'
        )
        self.assertEqual(actual_output, expected_output)
    def test_empty_backticks_in_missed_message(self) -> None:
        msg_id = self.send_personal_message(
            self.example_user("othello"),
            self.example_user("hamlet"),
            "```\n```",
        )
        verify_body_include = ["view it in Zulip"]
        email_subject = "PMs with Othello, the Moor of Venice"
        self._test_cases(
            msg_id, verify_body_include, email_subject, send_as_user=False, verify_html_body=True
        )