markdown: Calculate linkifier precedence in topics.

This uses the linkifier index among the list of linkifiers in the replacement as the priority to order the replacement order for patterns in the topic. This avoids having multiple overlapping matches that each produce a link. The linkifier with the lowest id will be prioritized when its pattern overlaps with another. Linkifiers are prioritized over raw URLs. Note that the same algorithm is used for local echoing and the backend markdown processor. Fixes #23715. Signed-off-by: Zixuan James Li <p359101898@gmail.com>
2022-12-01 19:39:06 -05:00 · 2022-12-01 19:39:06 -05:00 · a3a0103d86
parent 5f4d857d3c
commit a3a0103d86
4 changed files with 378 additions and 52 deletions
--- a/frontend_tests/node_tests/markdown_parse.js
+++ b/frontend_tests/node_tests/markdown_parse.js
@ -6,6 +6,7 @@ const {zrequire} = require("../zjsunit/namespace");
 const {run_test} = require("../zjsunit/test");

 const markdown = zrequire("markdown");
+const linkifiers = zrequire("linkifiers");

 const my_id = 101;

@ -216,10 +217,17 @@ run_test("linkifiers", () => {
    );
 });

+function assert_topic_links(topic, expected_links) {
+    const topic_links = markdown.get_topic_links({
+        topic,
+        get_linkifier_map: linkifiers.get_linkifier_map,
+    });
+    assert.deepEqual(topic_links, expected_links);
+}
+
 run_test("topic links", () => {
-    const topic = "progress on #foo101 and #foo102";
-    const topic_links = markdown.get_topic_links({topic, get_linkifier_map});
-    assert.deepEqual(topic_links, [
+    linkifiers.initialize([{pattern: "#foo(?P<id>\\d+)", url_format: "http://foo.com/%(id)s"}]);
+    assert_topic_links("progress on #foo101 and #foo102", [
        {
            text: "#foo101",
            url: "http://foo.com/101",
@ -235,8 +243,8 @@ run_test("topic links repeated", () => {
    // Links generated from repeated patterns should preserve the order.
    const topic =
        "#foo101 https://google.com #foo102 #foo103 https://google.com #foo101 #foo102 #foo103";
-    const topic_links = markdown.get_topic_links({topic, get_linkifier_map});
-    assert.deepEqual(topic_links, [
+    linkifiers.initialize([{pattern: "#foo(?P<id>\\d+)", url_format: "http://foo.com/%(id)s"}]);
+    assert_topic_links(topic, [
        {
            text: "#foo101",
            url: "http://foo.com/101",
@ -271,3 +279,100 @@ run_test("topic links repeated", () => {
        },
    ]);
 });
+
+run_test("topic links overlapping", () => {
+    linkifiers.initialize([
+        {pattern: "[a-z]+(?P<id>1\\d+) #[a-z]+", url_format: "http://a.com/%(id)s"},
+        {pattern: "[a-z]+(?P<id>1\\d+)", url_format: "http://b.com/%(id)s"},
+        {pattern: ".+#(?P<id>[a-z]+)", url_format: "http://wildcard.com/%(id)s"},
+        {pattern: "#(?P<id>[a-z]+)", url_format: "http://c.com/%(id)s"},
+    ]);
+    // b.com's pattern should be matched while it overlaps with c.com's.
+    assert_topic_links("#foo100", [
+        {
+            text: "foo100",
+            url: "http://b.com/100",
+        },
+    ]);
+    // a.com's pattern should be matched while it overlaps with b.com's, wildcard.com's and c.com's.
+    assert_topic_links("#asd123 #asd", [
+        {
+            text: "asd123 #asd",
+            url: "http://a.com/123",
+        },
+    ]);
+    // a.com's pattern do not match, wildcard.com's and b.com's patterns should match
+    // and the links are ordered by the matched index.
+    assert_topic_links("/#asd #foo100", [
+        {
+            text: "/#asd",
+            url: "http://wildcard.com/asd",
+        },
+        {
+            text: "foo100",
+            url: "http://b.com/100",
+        },
+    ]);
+    assert_topic_links("foo.anything/#asd", [
+        {
+            text: "foo.anything/#asd",
+            url: "http://wildcard.com/asd",
+        },
+    ]);
+
+    // While the raw URL "http://foo.com/foo100" appears before b.com's match "foo100",
+    // we prioritize the linkifier match first.
+    assert_topic_links("http://foo.com/foo100", [
+        {
+            text: "foo100",
+            url: "http://b.com/100",
+        },
+    ]);
+
+    // Here the raw URL "https://foo.com/#asd" appears after wildcard.com's match "something https://foo.com/#asd".
+    // The latter is prioritized and the raw URL does not get included.
+    assert_topic_links("something https://foo.com/#asd", [
+        {
+            text: "something https://foo.com/#asd",
+            url: "http://wildcard.com/asd",
+        },
+    ]);
+});
+
+run_test("topic links ordering by priority", () => {
+    // The same test case is also implemented in zerver/tests/test_markdown.py
+    linkifiers.initialize([
+        {pattern: "http", url_format: "http://example.com/"},
+        {pattern: "b#(?P<id>[a-z]+)", url_format: "http://example.com/b/%(id)s"},
+        {
+            pattern: "a#(?P<aid>[a-z]+) b#(?P<bid>[a-z]+)",
+            url_format: "http://example.com/a/%(aid)s/b/%(bid)",
+        },
+        {pattern: "a#(?P<id>[a-z]+)", url_format: "http://example.com/a/%(id)s"},
+    ]);
+
+    // There should be 5 link matches in the topic, if ordered from the most priortized to the least:
+    // 1. "http" (linkifier)
+    // 2. "b#bar" (linkifier)
+    // 3. "a#asd b#bar" (linkifier)
+    // 4. "a#asd" (linkifier)
+    // 5. "http://foo.com" (raw URL)
+    // When there are overlapping matches, the one that appears earlier in the list should
+    // have a topic link generated.
+    // For this test case, while "a#asd" and "a#asd b#bar" both match and they overlap,
+    // there is a match "b#bar" with a higher priority, preventing "a#asd b#bar" from being matched.
+    assert_topic_links("http://foo.com a#asd b#bar", [
+        {
+            text: "http",
+            url: "http://example.com/",
+        },
+        {
+            text: "a#asd",
+            url: "http://example.com/a/asd",
+        },
+        {
+            text: "b#bar",
+            url: "http://example.com/b/bar",
+        },
+    ]);
+});
--- a/static/js/markdown.js
+++ b/static/js/markdown.js
@ -276,10 +276,23 @@ function parse_with_options({raw_content, helper_config, options}) {
    return {content, flags};
 }

+function is_x_between(x, start, length) {
+    return start <= x && x < start + length;
+}
+
+function is_overlapping(match_a, match_b) {
+    return (
+        is_x_between(match_a.index, match_b.index, match_b.text.length) ||
+        is_x_between(match_b.index, match_a.index, match_a.text.length)
+    );
+}
+
 export function get_topic_links({topic, get_linkifier_map}) {
    // We export this for testing purposes, and mobile may want to
    // use this as well in the future.
    const links = [];
+    // The lower the precedence is, the more prioritized the pattern is.
+    let precedence = 0;

    for (const [pattern, url] of get_linkifier_map().entries()) {
        let match;
@ -296,22 +309,54 @@ export function get_topic_links({topic, get_linkifier_map}) {
            }
            // We store the starting index as well, to sort the order of occurrence of the links
            // in the topic, similar to the logic implemented in zerver/lib/markdown/__init__.py
-            links.push({url: link_url, text: match[0], index: match.index});
+            links.push({url: link_url, text: match[0], index: match.index, precedence});
        }
+        precedence += 1;
    }

+    // Sort the matches beforehand so we favor the match with a higher priority and tie-break with the starting index.
+    // Note that we sort it before processing the raw URLs so that linkifiers will be prioritized over them.
+    links.sort((a, b) => {
+        if (a.precedence !== null && b.precedence !== null) {
+            // When both of the links have precedence set, find the one that comes first.
+            const diff = a.precedence - b.precedence;
+            if (diff !== 0) {
+                return diff;
+            }
+        }
+        // Fallback to the index when there is either a tie in precedence or at least one of the links is a raw URL.
+        return a.index - b.index;
+    });
+
    // Also make raw URLs navigable
    const url_re = /\b(https?:\/\/[^\s<]+[^\s"'),.:;<\]])/g; // Slightly modified from third/marked.js
    let match;
    while ((match = url_re.exec(topic)) !== null) {
-        links.push({url: match[0], text: match[0], index: match.index});
-    }
-    links.sort((a, b) => a.index - b.index);
-    for (const match of links) {
-        delete match.index;
+        links.push({url: match[0], text: match[0], index: match.index, precedence: null});
    }
+    // The following removes overlapping intervals depending on the precedence of linkifier patterns.
+    // This uses the same algorithm implemented in zerver/lib/markdown/__init__.py.
+    // To avoid mutating links while processing links, the final output gets pushed to another list.
+    const applied_matches = [];

-    return links;
+    // To avoid mutating matches inside the loop, the final output gets appended to another list.
+    for (const new_match of links) {
+        // When the current match does not overlap with all existing matches,
+        // we are confident that the link should present in the final output because
+        //  1. Given that the links are sorted by precedence, the current match has the highest priority
+        //     among the matches to be checked.
+        //  2. None of the matches with higher priority overlaps with the current match.
+        // This might be optimized to search for overlapping matches in O(logn) time,
+        // but it is kept as-is since performance is not critical for this codepath and for simplicity.
+        if (applied_matches.every((applied_match) => !is_overlapping(applied_match, new_match))) {
+            applied_matches.push(new_match);
+        }
+    }
+    // We need to sort applied_matches again because the links were previously ordered by precedence,
+    // so that the links are displayed in the order their patterns are matched.
+    return applied_matches
+        .sort((a, b) => a.index - b.index)
+        .map((match) => ({url: match.url, text: match.text}));
 }

 export function is_status_message(raw_content) {
--- a/zerver/lib/markdown/init.py
+++ b/zerver/lib/markdown/init.py
@ -2372,13 +2372,22 @@ def percent_escape_format_string(format_string: str) -> str:
    return re.sub(r"(?<!%)(%%)*%([a-fA-F0-9][a-fA-F0-9])", r"\1%%\2", format_string)


+@dataclass
+class TopicLinkMatch:
+    url: str
+    text: str
+    index: int
+    precedence: Optional[int]
+
+
 # Security note: We don't do any HTML escaping in this
 # function on the URLs; they are expected to be HTML-escaped when
 # rendered by clients (just as links rendered into message bodies
 # are validated and escaped inside `url_to_a`).
 def topic_links(linkifiers_key: int, topic_name: str) -> List[Dict[str, str]]:
-    matches: List[Dict[str, Union[str, int]]] = []
+    matches: List[TopicLinkMatch] = []
    linkifiers = linkifiers_for_realm(linkifiers_key)
+    precedence = 0

    options = re2.Options()
    options.log_errors = False
@ -2411,12 +2420,18 @@ def topic_links(linkifiers_key: int, topic_name: str) -> List[Dict[str, str]]:
            # Also, we include the matched text in the response, so that our clients
            # don't have to implement any logic of their own to get back the text.
            matches += [
-                dict(
+                TopicLinkMatch(
                    url=url_format_string % match_details,
                    text=match_text,
                    index=m.start(),
+                    precedence=precedence,
                )
            ]
+        precedence += 1
+
+    # Sort the matches beforehand so we favor the match with a higher priority and tie-break with the starting index.
+    # Note that we sort it before processing the raw URLs so that linkifiers will be prioritized over them.
+    matches.sort(key=lambda k: (k.precedence, k.index))

    pos = 0
    # Also make raw URLs navigable.
@ -2438,14 +2453,41 @@ def topic_links(linkifiers_key: int, topic_name: str) -> List[Dict[str, str]]:
                url = result._replace(scheme="https").geturl()
            else:
                url = actual_match_url
-            matches.append(dict(url=url, text=actual_match_url, index=pos))
+            matches.append(
+                TopicLinkMatch(
+                    url=url,
+                    text=actual_match_url,
+                    index=pos,
+                    precedence=None,
+                )
+            )
        # Move pass the next split point, and start matching the URL from there
        pos = end + 1

-    # In order to preserve the order in which the links occur, we sort the matched text
-    # based on its starting index in the topic. We pop the index field before returning.
-    matches = sorted(matches, key=lambda k: k["index"])
-    return [{k: str(v) for k, v in match.items() if k != "index"} for match in matches]
+    def are_matches_overlapping(match_a: TopicLinkMatch, match_b: TopicLinkMatch) -> bool:
+        return (match_b.index <= match_a.index < match_b.index + len(match_b.text)) or (
+            match_a.index <= match_b.index < match_a.index + len(match_a.text)
+        )
+
+    # The following removes overlapping intervals depending on the precedence of linkifier patterns.
+    # This uses the same algorithm implemented in static/js/markdown.js.
+    # To avoid mutating matches inside the loop, the final output gets appended to another list.
+    applied_matches: List[TopicLinkMatch] = []
+    for current_match in matches:
+        # When the current match does not overlap with all existing matches,
+        # we are confident that the link should present in the final output because
+        #  1. Given that the links are sorted by precedence, the current match has the highest priority
+        #     among the matches to be checked.
+        #  2. None of the matches with higher priority overlaps with the current match.
+        # This might be optimized to search for overlapping matches in O(logn) time,
+        # but it is kept as-is since performance is not critical for this codepath and for simplicity.
+        if all(
+            not are_matches_overlapping(old_match, current_match) for old_match in applied_matches
+        ):
+            applied_matches.append(current_match)
+    # We need to sort applied_matches again because the links were previously ordered by precedence.
+    applied_matches.sort(key=lambda v: v.index)
+    return [{"url": match.url, "text": match.text} for match in applied_matches]


 def maybe_update_markdown_engines(linkifiers_key: int, email_gateway: bool) -> None:
--- a/zerver/tests/test_markdown.py
+++ b/zerver/tests/test_markdown.py
@ -1301,16 +1301,26 @@ class MarkdownTest(ZulipTestCase):
            ],
        )

+    def check_add_linkifiers(
+        self, linkifiers: List[RealmFilter], expected_linkifier_strs: List[str]
+    ) -> None:
+        self.assert_length(linkifiers, len(expected_linkifier_strs))
+        for linkifier, expected_linkifier_str in zip(linkifiers, expected_linkifier_strs):
+            linkifier.clean()
+            linkifier.save()
+            self.assertEqual(str(linkifier), expected_linkifier_str)
+
    def test_realm_patterns(self) -> None:
        realm = get_realm("zulip")
-        url_format_string = r"https://trac.example.com/ticket/%(id)s"
-        linkifier = RealmFilter(
-            realm=realm, pattern=r"#(?P<id>[0-9]{2,8})", url_format_string=url_format_string
-        )
-        linkifier.save()
-        self.assertEqual(
-            str(linkifier),
-            "<RealmFilter(zulip): #(?P<id>[0-9]{2,8}) https://trac.example.com/ticket/%(id)s>",
+        self.check_add_linkifiers(
+            [
+                RealmFilter(
+                    realm=realm,
+                    pattern=r"#(?P<id>[0-9]{2,8})",
+                    url_format_string=r"https://trac.example.com/ticket/%(id)s",
+                )
+            ],
+            ["<RealmFilter(zulip): #(?P<id>[0-9]{2,8}) https://trac.example.com/ticket/%(id)s>"],
        )

        msg = Message(sender=self.example_user("othello"))
@ -1458,29 +1468,29 @@ class MarkdownTest(ZulipTestCase):

    def test_multiple_matching_realm_patterns(self) -> None:
        realm = get_realm("zulip")
-        url_format_string = r"https://trac.example.com/ticket/%(id)s"
-        linkifier_1 = RealmFilter(
-            realm=realm,
-            pattern=r"(?P<id>ABC\-[0-9]+)",
-            url_format_string=url_format_string,
-        )
-        linkifier_1.save()
-        self.assertEqual(
-            str(linkifier_1),
-            r"<RealmFilter(zulip): (?P<id>ABC\-[0-9]+) https://trac.example.com/ticket/%(id)s>",
-        )
-
-        url_format_string = r"https://other-trac.example.com/ticket/%(id)s"
-        linkifier_2 = RealmFilter(
-            realm=realm,
-            pattern=r"(?P<id>[A-Z][A-Z0-9]*\-[0-9]+)",
-            url_format_string=url_format_string,
-        )
-        linkifier_2.save()
-        self.assertEqual(
-            str(linkifier_2),
-            r"<RealmFilter(zulip): (?P<id>[A-Z][A-Z0-9]*\-[0-9]+)"
-            " https://other-trac.example.com/ticket/%(id)s>",
+        self.check_add_linkifiers(
+            [
+                RealmFilter(
+                    realm=realm,
+                    pattern="(?P<id>ABC-[0-9]+)",
+                    url_format_string="https://trac.example.com/ticket/%(id)s",
+                ),
+                RealmFilter(
+                    realm=realm,
+                    pattern="(?P<id>[A-Z][A-Z0-9]*-[0-9]+)",
+                    url_format_string="https://other-trac.example.com/ticket/%(id)s",
+                ),
+                RealmFilter(
+                    realm=realm,
+                    pattern="(?P<id>[A-Z][A-Z0-9]+)",
+                    url_format_string="https://yet-another-trac.example.com/ticket/%(id)s",
+                ),
+            ],
+            [
+                "<RealmFilter(zulip): (?P<id>ABC-[0-9]+) https://trac.example.com/ticket/%(id)s>",
+                "<RealmFilter(zulip): (?P<id>[A-Z][A-Z0-9]*-[0-9]+) https://other-trac.example.com/ticket/%(id)s>",
+                "<RealmFilter(zulip): (?P<id>[A-Z][A-Z0-9]+) https://yet-another-trac.example.com/ticket/%(id)s>",
+            ],
        )

        msg = Message(sender=self.example_user("othello"))
@ -1500,12 +1510,136 @@ class MarkdownTest(ZulipTestCase):
            converted.rendered_content,
            '<p>We should fix <a href="https://trac.example.com/ticket/ABC-123">ABC-123</a> or <a href="https://trac.example.com/ticket/16">trac ABC-123</a> today.</p>',
        )
-        # But both the links should be generated in topics.
+        # Only the older linkifier should be used in the topic, because the two patterns overlap.
        self.assertEqual(
            converted_topic,
            [
                {"url": "https://trac.example.com/ticket/ABC-123", "text": "ABC-123"},
-                {"url": "https://other-trac.example.com/ticket/ABC-123", "text": "ABC-123"},
+            ],
+        )
+
+        # linkifier 3 matches ASD, ABC and QWE, but because it has lower priority
+        # than linkifier 1 and linkifier 2 because it is created last, the former
+        # two matches will not be chosen.
+        # Both linkifier 1 and linkifier 2 matches ABC-123, similarly, as linkifier 2
+        # has a lower priority, only linkifier 1's URL will be generated.
+        converted_topic = topic_links(realm.id, "ASD-123 ABC-123 QWE")
+        self.assertEqual(
+            converted_topic,
+            [
+                {"url": "https://other-trac.example.com/ticket/ASD-123", "text": "ASD-123"},
+                {"url": "https://trac.example.com/ticket/ABC-123", "text": "ABC-123"},
+                {"url": "https://yet-another-trac.example.com/ticket/QWE", "text": "QWE"},
+            ],
+        )
+
+    def test_links_and_linkifiers_in_topic_name(self) -> None:
+        realm = get_realm("zulip")
+        self.check_add_linkifiers(
+            [
+                RealmFilter(
+                    realm=realm,
+                    pattern="ABC-42",
+                    url_format_string="https://google.com",
+                ),
+                RealmFilter(
+                    realm=realm,
+                    pattern=r"com.+(?P<id>ABC\-[0-9]+)",
+                    url_format_string="https://trac.example.com/ticket/%(id)s",
+                ),
+            ],
+            [
+                "<RealmFilter(zulip): ABC-42 https://google.com>",
+                r"<RealmFilter(zulip): com.+(?P<id>ABC\-[0-9]+) https://trac.example.com/ticket/%(id)s>",
+            ],
+        )
+
+        # This verifies that second linkifier has a lower priority than the first one.
+        # It helps us to later ensure that even with a low priority, the linkifier can take effect
+        # when it appears earlier than a raw URL.
+        converted_topic = topic_links(realm.id, "com ABC-42")
+        self.assertEqual(
+            converted_topic,
+            [{"url": "https://google.com", "text": "ABC-42"}],
+        )
+        # The linkifier matches "com/ABC-123", which is after where the raw URL starts
+        converted_topic = topic_links(realm.id, "https://foo.com/ABC-123")
+        self.assertEqual(
+            converted_topic,
+            [{"url": "https://foo.com/ABC-123", "text": "https://foo.com/ABC-123"}],
+        )
+
+        # The linkifier matches "com https://foo.com/ABC-123", which is before where the raw URL starts
+        converted_topic = topic_links(realm.id, "com https://foo.com/ABC-123")
+        self.assertEqual(
+            converted_topic,
+            [
+                {
+                    "url": "https://trac.example.com/ticket/ABC-123",
+                    "text": "com https://foo.com/ABC-123",
+                }
+            ],
+        )
+
+    def test_topic_links_ordering_by_priority(self) -> None:
+        # The same test case is also implemented in frontend_tests/node_tests/markdown_parse.js
+        realm = get_realm("zulip")
+        self.check_add_linkifiers(
+            [
+                RealmFilter(
+                    realm=realm,
+                    pattern="http",
+                    url_format_string="http://example.com/",
+                ),
+                RealmFilter(
+                    realm=realm,
+                    pattern="b#(?P<id>[a-z]+)",
+                    url_format_string="http://example.com/b/%(id)s",
+                ),
+                RealmFilter(
+                    realm=realm,
+                    pattern="a#(?P<aid>[a-z]+) b#(?P<bid>[a-z]+)",
+                    url_format_string="http://example.com/a/%(aid)s/b/%(bid)s",
+                ),
+                RealmFilter(
+                    realm=realm,
+                    pattern="a#(?P<id>[a-z]+)",
+                    url_format_string="http://example.com/a/%(id)s",
+                ),
+            ],
+            [
+                "<RealmFilter(zulip): http http://example.com/>",
+                "<RealmFilter(zulip): b#(?P<id>[a-z]+) http://example.com/b/%(id)s>",
+                "<RealmFilter(zulip): a#(?P<aid>[a-z]+) b#(?P<bid>[a-z]+) http://example.com/a/%(aid)s/b/%(bid)s>",
+                "<RealmFilter(zulip): a#(?P<id>[a-z]+) http://example.com/a/%(id)s>",
+            ],
+        )
+        # There should be 5 link matches in the topic, if ordered from the most priortized to the least:
+        # 1. "http" (linkifier)
+        # 2. "b#bar" (linkifier)
+        # 3. "a#asd b#bar" (linkifier)
+        # 4. "a#asd" (linkifier)
+        # 5. "http://foo.com" (raw URL)
+        # When there are overlapping matches, the one that appears earlier in the list should
+        # have a topic link generated.
+        # For this test case, while "a#asd" and "a#asd b#bar" both match and they overlap,
+        # there is a match "b#bar" with a higher priority, preventing "a#asd b#bar" from being matched.
+        converted_topic = topic_links(realm.id, "http://foo.com a#asd b#bar")
+        self.assertEqual(
+            converted_topic,
+            [
+                {
+                    "text": "http",
+                    "url": "http://example.com/",
+                },
+                {
+                    "text": "a#asd",
+                    "url": "http://example.com/a/asd",
+                },
+                {
+                    "text": "b#bar",
+                    "url": "http://example.com/b/bar",
+                },
            ],
        )