markdown: Allow whitespace overlaps in topic linkifiers.

`prepare_linkifier_pattern`, as of db934be064, adds a match to the
end of the regex, of either the end of string, or a non-word character
-- this is in place of a negative look-ahead, which is no longer
possible in re2.  This causes the regex to consume trailing
whitespace, and thus not be able to match twice in succession with
`pattern.finditer` -- in "#1234 #5678", the second linkifier fails to
match because the space before it is consumed by the first match.

Rather than use `pattern.finditer`, write our own version, which
rewinds over the non-word character consumed after the match, if any.
This allows the same "after" non-word character to also satisfy the
"before" of the next match.

Fixes #21502.
This commit is contained in:
Alex Vandiver 2022-03-21 20:11:23 -04:00 committed by Tim Abbott
parent d89b5042a9
commit 1ac0035f8c
2 changed files with 27 additions and 4 deletions

View File

@ -2365,9 +2365,21 @@ def topic_links(linkifiers_key: int, topic_name: str) -> List[Dict[str, str]]:
# here on an invalid regex would spam the logs with every
# message sent; simply move on.
continue
for m in pattern.finditer(topic_name):
pos = 0
while pos < len(topic_name):
m = pattern.search(topic_name, pos)
if m is None:
break
match_details = m.groupdict()
match_text = match_details[OUTER_CAPTURE_GROUP]
# Adjust the start point of the match for the next
# iteration -- we rewind the non-word character at the
# end, if there was one, so a potential next match can
# also use it.
pos = m.end() - len(match_details[AFTER_CAPTURE_GROUP])
# We format the linkifier's url string using the matched text.
# Also, we include the matched text in the response, so that our clients
# don't have to implement any logic of their own to get back the text.

View File

@ -1322,13 +1322,13 @@ class MarkdownTest(ZulipTestCase):
flush_per_request_caches()
content = "We should fix #224 and #115, but not issue#124 or #1124z or [trac #15](https://trac.example.com/ticket/16) today."
content = "We should fix #224 #336 #446 and #115, but not issue#124 or #1124z or [trac #15](https://trac.example.com/ticket/16) today."
converted = markdown_convert(content, message_realm=realm, message=msg)
converted_topic = topic_links(realm.id, msg.topic_name())
self.assertEqual(
converted.rendered_content,
'<p>We should fix <a href="https://trac.example.com/ticket/224">#224</a> and <a href="https://trac.example.com/ticket/115">#115</a>, but not issue#124 or #1124z or <a href="https://trac.example.com/ticket/16">trac #15</a> today.</p>',
'<p>We should fix <a href="https://trac.example.com/ticket/224">#224</a> <a href="https://trac.example.com/ticket/336">#336</a> <a href="https://trac.example.com/ticket/446">#446</a> and <a href="https://trac.example.com/ticket/115">#115</a>, but not issue#124 or #1124z or <a href="https://trac.example.com/ticket/16">trac #15</a> today.</p>',
)
self.assertEqual(
converted_topic, [{"url": "https://trac.example.com/ticket/444", "text": "#444"}]
@ -1344,6 +1344,17 @@ class MarkdownTest(ZulipTestCase):
],
)
msg.set_topic_name("#444 #555 #666")
converted_topic = topic_links(realm.id, msg.topic_name())
self.assertEqual(
converted_topic,
[
{"url": "https://trac.example.com/ticket/444", "text": "#444"},
{"url": "https://trac.example.com/ticket/555", "text": "#555"},
{"url": "https://trac.example.com/ticket/666", "text": "#666"},
],
)
RealmFilter(
realm=realm,
pattern=r"#(?P<id>[a-zA-Z]+-[0-9]+)",
@ -1474,7 +1485,7 @@ class MarkdownTest(ZulipTestCase):
converted.rendered_content,
'<p>We should fix <a href="https://trac.example.com/ticket/ABC-123">ABC-123</a> or <a href="https://trac.example.com/ticket/16">trac ABC-123</a> today.</p>',
)
# Both the links should be generated in topics.
# But both the links should be generated in topics.
self.assertEqual(
converted_topic,
[