markdown: Rewrite YouTube URL parser without regex spaghetti.

This also adds support for the new YouTube Shorts URLs (of the form youtube.com/shorts/<id>).

Signed-off-by: Anders Kaseorg <anders@zulip.com>
This commit is contained in:
Anders Kaseorg 2023-02-09 15:45:11 -08:00 committed by Tim Abbott
parent 53aa3f6c71
commit 0a1904a6a7
2 changed files with 27 additions and 27 deletions

View File

@ -26,7 +26,7 @@ from typing import (
TypeVar, TypeVar,
Union, Union,
) )
from urllib.parse import urlencode, urljoin, urlsplit from urllib.parse import parse_qs, urlencode, urljoin, urlsplit
from xml.etree.ElementTree import Element, SubElement from xml.etree.ElementTree import Element, SubElement
import ahocorasick import ahocorasick
@ -811,28 +811,30 @@ class InlineInterestingLinkProcessor(markdown.treeprocessors.Treeprocessor):
def youtube_id(self, url: str) -> Optional[str]: def youtube_id(self, url: str) -> Optional[str]:
if not self.zmd.image_preview_enabled: if not self.zmd.image_preview_enabled:
return None return None
# YouTube video id extraction regular expression from https://pastebin.com/KyKAFv1s
# Slightly modified to support URLs of the forms id = None
# - youtu.be/<id> split_url = urlsplit(url)
# - youtube.com/playlist?v=<id>&list=<list-id> if split_url.scheme in ("http", "https"):
# - youtube.com/watch_videos?video_ids=<id1>,<id2>,<id3> if split_url.hostname in (
# If it matches, match.group(2) is the video id. "m.youtube.com",
schema_re = r"(?:https?://)" "www.youtube.com",
host_re = r"(?:youtu\.be/|(?:\w+\.)?youtube(?:-nocookie)?\.com/)" "www.youtube-nocookie.com",
param_re = ( "youtube.com",
r"(?:(?:(?:v|embed)/)" "youtube-nocookie.com",
r"|(?:(?:(?:watch|playlist)(?:_popup|_videos)?(?:\.php)?)?(?:\?|#!?)(?:.+&)?v(?:ideo_ids)?=))" ):
) query = parse_qs(split_url.query)
id_re = r"([0-9A-Za-z_-]+)" if split_url.path in ("/watch", "/watch_popup") and "v" in query:
youtube_re = r"^({schema_re}?{host_re}{param_re}?)?{id_re}(?(1).+)?$" id = query["v"][0]
youtube_re = youtube_re.format( elif split_url.path == "/watch_videos" and "video_ids" in query:
schema_re=schema_re, host_re=host_re, id_re=id_re, param_re=param_re id = query["video_ids"][0].split(",", 1)[0]
) elif split_url.path.startswith(("/embed/", "/shorts/", "/v/")):
match = re.match(youtube_re, url) id = split_url.path.split("/", 3)[2]
# URLs of the form youtube.com/playlist?list=<list-id> are incorrectly matched elif split_url.hostname == "youtu.be" and split_url.path.startswith("/"):
if match is None or match.group(2) == "playlist": id = split_url.path[len("/") :]
return None
return match.group(2) if id is not None and re.fullmatch(r"[0-9A-Za-z_-]+", id):
return id
return None
def youtube_title(self, extracted_data: UrlEmbedData) -> Optional[str]: def youtube_title(self, extracted_data: UrlEmbedData) -> Optional[str]:
if extracted_data.title is not None: if extracted_data.title is not None:

View File

@ -537,14 +537,12 @@ class MarkdownTest(ZulipTestCase):
'<p><a href="https://www.youtube.com/playlist?list=PL8dPuuaLjXtNlUrzyH5r6jN9ulIgZBpdo">https://www.youtube.com/playlist?list=PL8dPuuaLjXtNlUrzyH5r6jN9ulIgZBpdo</a></p>', '<p><a href="https://www.youtube.com/playlist?list=PL8dPuuaLjXtNlUrzyH5r6jN9ulIgZBpdo">https://www.youtube.com/playlist?list=PL8dPuuaLjXtNlUrzyH5r6jN9ulIgZBpdo</a></p>',
) )
msg = ( msg = "https://www.youtube.com/watch?v=O5nskjZ_GoI&list=PL8dPuuaLjXtNlUrzyH5r6jN9ulIgZBpdo"
"https://www.youtube.com/playlist?v=O5nskjZ_GoI&list=PL8dPuuaLjXtNlUrzyH5r6jN9ulIgZBpdo"
)
converted = markdown_convert_wrapper(msg) converted = markdown_convert_wrapper(msg)
self.assertEqual( self.assertEqual(
converted, converted,
f"""<p><a href="https://www.youtube.com/playlist?v=O5nskjZ_GoI&amp;list=PL8dPuuaLjXtNlUrzyH5r6jN9ulIgZBpdo">https://www.youtube.com/playlist?v=O5nskjZ_GoI&amp;list=PL8dPuuaLjXtNlUrzyH5r6jN9ulIgZBpdo</a></p>\n<div class="youtube-video message_inline_image"><a data-id="O5nskjZ_GoI" href="https://www.youtube.com/playlist?v=O5nskjZ_GoI&amp;list=PL8dPuuaLjXtNlUrzyH5r6jN9ulIgZBpdo"><img src="{get_camo_url("https://i.ytimg.com/vi/O5nskjZ_GoI/default.jpg")}"></a></div>""", f"""<p><a href="https://www.youtube.com/watch?v=O5nskjZ_GoI&amp;list=PL8dPuuaLjXtNlUrzyH5r6jN9ulIgZBpdo">https://www.youtube.com/watch?v=O5nskjZ_GoI&amp;list=PL8dPuuaLjXtNlUrzyH5r6jN9ulIgZBpdo</a></p>\n<div class="youtube-video message_inline_image"><a data-id="O5nskjZ_GoI" href="https://www.youtube.com/watch?v=O5nskjZ_GoI&amp;list=PL8dPuuaLjXtNlUrzyH5r6jN9ulIgZBpdo"><img src="{get_camo_url("https://i.ytimg.com/vi/O5nskjZ_GoI/default.jpg")}"></a></div>""",
) )
msg = "http://www.youtube.com/watch_videos?video_ids=nOJgD4fcZhI,i96UO8-GFvw" msg = "http://www.youtube.com/watch_videos?video_ids=nOJgD4fcZhI,i96UO8-GFvw"