url_preview: Only return image URLs that validate as URLs.

This commit is contained in:
Alex Vandiver 2022-02-18 13:48:53 -08:00 committed by Alex Vandiver
parent 6afdf2410d
commit e53f9fad29
3 changed files with 73 additions and 0 deletions

View File

@ -1,4 +1,5 @@
from typing import Dict, Optional
from urllib.parse import urlparse
from bs4.element import Tag
@ -48,5 +49,11 @@ class GenericParser(BaseParser):
first_image = first_h1.find_next_sibling("img", src=True)
if isinstance(first_image, Tag) and first_image["src"] != "":
assert isinstance(first_image["src"], str)
try:
# We use urlparse and not URLValidator because we
# need to support relative URLs.
urlparse(first_image["src"])
except ValueError:
return None
return first_image["src"]
return None

View File

@ -1,4 +1,5 @@
from typing import Dict
from urllib.parse import urlparse
from .base import BaseParser
@ -23,6 +24,14 @@ class OpenGraphParser(BaseParser):
if not tag.has_attr("content"):
continue
if og_property_name == "image":
try:
# We use urlparse and not URLValidator because we
# need to support relative URLs.
urlparse(tag["content"])
except ValueError:
continue
result[og_property_name] = tag["content"]
return result

View File

@ -290,6 +290,25 @@ class GenericParserTestCase(ZulipTestCase):
self.assertEqual(result.get("description"), "Description text")
self.assertEqual(result.get("image"), "http://test.com/test.jpg")
def test_extract_bad_image(self) -> None:
html = b"""
<html>
<body>
<h1>Main header</h1>
<img data-src="Not an image">
<img src="http://[bad url/test.jpg">
<div>
<p>Description text</p>
</div>
</body>
</html>
"""
parser = GenericParser(html, "text/html; charset=UTF-8")
result = parser.extract_data()
self.assertEqual(result.get("title"), "Main header")
self.assertEqual(result.get("description"), "Description text")
self.assertIsNone(result.get("image"))
def test_extract_description(self) -> None:
html = b"""
<html>
@ -721,6 +740,44 @@ class PreviewTestCase(ZulipTestCase):
msg.rendered_content,
)
@responses.activate
@override_settings(INLINE_URL_EMBED_PREVIEW=True)
def test_link_preview_open_graph_image_bad_url(self) -> None:
user = self.example_user("hamlet")
self.login_user(user)
url = "http://test.org/foo.html"
with mock_queue_publish("zerver.lib.actions.queue_json_publish") as patched:
msg_id = self.send_stream_message(user, "Denmark", topic_name="foo", content=url)
patched.assert_called_once()
queue = patched.call_args[0][0]
self.assertEqual(queue, "embed_links")
event = patched.call_args[0][1]
# HTML with a bad og:image metadata
html = "\n".join(
line
if "og:image" not in line
else '<meta property="og:image" content="http://[bad url/" />'
for line in self.open_graph_html.splitlines()
)
self.create_mock_response(url, body=html)
with self.settings(TEST_SUITE=False, CACHES=TEST_CACHES):
with self.assertLogs(level="INFO") as info_logs:
FetchLinksEmbedData().consume(event)
cached_data = link_embed_data_from_cache(url)
self.assertTrue(
"INFO:root:Time spent on get_link_embed_data for http://test.org/foo.html: "
in info_logs.output[0]
)
self.assertIn("title", cached_data)
self.assertNotIn("image", cached_data)
msg = Message.objects.select_related("sender").get(id=msg_id)
self.assertEqual(
('<p><a href="http://test.org/foo.html">' "http://test.org/foo.html</a></p>"),
msg.rendered_content,
)
@responses.activate
@override_settings(INLINE_URL_EMBED_PREVIEW=True)
def test_link_preview_open_graph_image_missing_content(self) -> None: