Revert "linkifiers: Match JS implementation to server implementation."

This reverts commit 091e2f177b.

This version of python_to_js_linkifier fails for at least some real
linkifiers. We'll likely re-introduce this after a bit more debugging.
This commit is contained in:
Tim Abbott 2023-11-16 14:59:18 -08:00
parent 43e9bbe31c
commit 3cfe4b720c
7 changed files with 79 additions and 68 deletions

View File

@ -42,7 +42,7 @@ function python_to_js_linkifier(
current_group += 1;
}
// Convert any python in-regex flags to RegExp flags
let js_flags = "gu";
let js_flags = "g";
const inline_flag_re = /\(\?([Limsux]+)\)/;
match = inline_flag_re.exec(pattern);
@ -59,11 +59,15 @@ function python_to_js_linkifier(
pattern = pattern.replace(inline_flag_re, "");
}
// This boundary-matching must be kept in sync with prepare_linkifier_pattern in
// zerver.lib.markdown. It does not use look-ahead or look-behind, because re2
// does not support either.
pattern =
/(^|\s|\u0085|\p{Z}|['"(,:<])/u.source + "(" + pattern + ")" + /($|[^\p{L}\p{N}])/u.source;
// Ideally we should have been checking that linkifiers
// begin with certain characters but since there is no
// support for negative lookbehind in javascript, we check
// for this condition in `contains_backend_only_syntax()`
// function. If the condition is satisfied then the message
// is rendered locally, otherwise, we return false there and
// message is rendered on the backend which has proper support
// for negative lookbehind.
pattern = pattern + /(?!\w)/.source;
let final_regex = null;
try {
final_regex = new RegExp(pattern, js_flags);

View File

@ -70,6 +70,31 @@ export function translate_emoticons_to_names({src, get_emoticon_translations}) {
return translated;
}
function contains_problematic_linkifier({content, get_linkifier_map}) {
    // Report whether any configured linkifier would match starting
    // mid-word in `content`. Such matches cannot be rendered
    // correctly client-side, because the JavaScript regexes used
    // here lack the lookbehind guard the backend relies on, so the
    // caller should fall back to server-side rendering.
    for (const linkifier_regex of get_linkifier_map().keys()) {
        // Require a non-boundary character before the linkifier body
        // and no word character after it.
        const guarded_source =
            /[^\s"'(,:<]/.source + linkifier_regex.source + /(?!\w)/.source;
        if (new RegExp(guarded_source).test(content)) {
            return true;
        }
    }
    return false;
}
function content_contains_backend_only_syntax({content, get_linkifier_map}) {
    // Heuristic: decide whether this message uses syntax that only
    // the backend Markdown processor handles correctly. When it does
    // not, the message can be rendered client-side immediately for
    // local echo.
    if (contains_preview_link(content)) {
        return true;
    }
    return contains_problematic_linkifier({content, get_linkifier_map});
}
function parse_with_options({raw_content, helper_config, options}) {
// Given the raw markdown content of a message (raw_content)
// we return the HTML content (content) and flags.
@ -292,27 +317,13 @@ export function get_topic_links({topic, get_linkifier_map}) {
let precedence = 0;
for (const [pattern, {url_template, group_number_to_name}] of get_linkifier_map().entries()) {
// Strip off the "g" modifier
const non_global_pattern = new RegExp(pattern.source, pattern.flags.replace("g", ""));
let pos = 0;
while (pos < topic.length) {
const match = non_global_pattern.exec(topic.slice(pos));
if (match === null) {
break;
}
// Advance position to the start of the body of the match, after the index
// offset and any leading whitespace (captured in match[1])
pos += match.index + match[1].length;
// match[0] is the entire match, which may have leading or trailing boundary
// characters. match[1] is the leading characters, match[2] is the main body of the
// match, and user-provided groups start in match[3]. Slice those user-provided groups
// off.
const user_groups = match.slice(3, -1);
let match;
while ((match = pattern.exec(topic)) !== null) {
const matched_groups = match.slice(1);
let i = 0;
const template_context = {};
while (i < user_groups.length) {
const matched_group = user_groups[i];
while (i < matched_groups.length) {
const matched_group = matched_groups[i];
const current_group = i + 1;
template_context[group_number_to_name[current_group]] = matched_group;
i += 1;
@ -320,13 +331,7 @@ export function get_topic_links({topic, get_linkifier_map}) {
const link_url = url_template.expand(template_context);
// We store the starting index as well, to sort the order of occurrence of the links
// in the topic, similar to the logic implemented in zerver/lib/markdown/__init__.py
links.push({url: link_url, text: match[2], index: pos, precedence});
// Adjust the start point of the match for the next iteration -- we already advanced
// to the start of the user match, so now only advance by match[2], so that we don't
// include the length of any trailing characters. This means that patterns can overlap
// their whitespace.
pos += match[2].length;
links.push({url: link_url, text: match[0], index: match.index, precedence});
}
precedence += 1;
}
@ -443,10 +448,10 @@ function handleEmoji({emoji_name, get_realm_emoji_url, get_emoji_codepoint}) {
function handleLinkifier({pattern, matches, get_linkifier_map}) {
const {url_template, group_number_to_name} = get_linkifier_map().get(pattern);
const user_groups = matches.slice(1);
let current_group = 1;
const template_context = {};
for (const match of user_groups) {
for (const match of matches) {
template_context[group_number_to_name[current_group]] = match;
current_group += 1;
}
@ -683,7 +688,10 @@ export function add_topic_links(message) {
}
export function contains_backend_only_syntax(content) {
return contains_preview_link(content);
return content_contains_backend_only_syntax({
content,
get_linkifier_map: web_app_helpers.get_linkifier_map,
});
}
export function parse_non_message(raw_content) {

View File

@ -30,10 +30,7 @@ run_test("python_to_js_linkifier", () => {
},
]);
let actual_value = get_linkifier_regexes();
let expected_value = [
/(^|\s|\u0085|\p{Z}|['"(,:<])(\/aa\/g)($|[^\p{L}\p{N}])/gimu,
/(^|\s|\u0085|\p{Z}|['"(,:<])(\/aa\/g)($|[^\p{L}\p{N}])/gu,
];
let expected_value = [/\/aa\/g(?!\w)/gim, /\/aa\/g(?!\w)/g];
assert.deepEqual(actual_value, expected_value);
// Test case with multiple replacements.
linkifiers.update_linkifier_rules([
@ -44,7 +41,7 @@ run_test("python_to_js_linkifier", () => {
},
]);
actual_value = get_linkifier_regexes();
expected_value = [/(^|\s|\u0085|\p{Z}|['"(,:<])(#cf(\d+)([A-Z][\dA-Z]*))($|[^\p{L}\p{N}])/gu];
expected_value = [/#cf(\d+)([A-Z][\dA-Z]*)(?!\w)/g];
assert.deepEqual(actual_value, expected_value);
// Test incorrect syntax.
blueslip.expect("error", "python_to_js_linkifier failure!");

View File

@ -456,6 +456,9 @@ test("marked", () => {
expected:
'<blockquote>\n<p>User group mention in quote: <span class="user-group-mention silent" data-user-group-id="2">Backend</span></p>\n</blockquote>\n<blockquote>\n<p>Another user group mention in quote: <span class="user-group-mention silent" data-user-group-id="1">hamletcharacters</span></p>\n</blockquote>',
},
// Test only those linkifiers which don't return true for
// `contains_backend_only_syntax()`. Those which return true
// are tested separately.
{
input: "This is a linkifier #1234 with text after it",
expected:
@ -476,14 +479,6 @@ test("marked", () => {
expected:
'<p>This is a linkifier with <a href="https://zone_45.zulip.net/ticket/123" title="https://zone_45.zulip.net/ticket/123">ZGROUP_123:45</a> groups</p>',
},
{
input: "Here is the PR-#123.",
expected: `<p>Here is the PR-<a href="https://trac.example.com/ticket/123" title="https://trac.example.com/ticket/123">#123</a>.</p>`,
},
{
input: "Function abc() was introduced in (PR)#123.",
expected: `<p>Function abc() was introduced in (PR)<a href="https://trac.example.com/ticket/123" title="https://trac.example.com/ticket/123">#123</a>.</p>`,
},
{input: "Test *italic*", expected: "<p>Test <em>italic</em></p>"},
{
input: "T\n#**Denmark**",
@ -850,6 +845,16 @@ test("message_flags", () => {
assert.equal(message.flags.includes("mentioned"), false);
});
test("backend_only_linkifiers", () => {
    // These messages place a linkifier match directly after a word or
    // punctuation character; detecting that boundary correctly needs
    // the backend's lookbehind support, so the client must report the
    // content as backend-only syntax.
    const messages_needing_backend = [
        "Here is the PR-#123.",
        "Function abc() was introduced in (PR)#123.",
    ];
    for (const message of messages_needing_backend) {
        assert.equal(markdown.contains_backend_only_syntax(message), true);
    }
});
test("translate_emoticons_to_names", () => {
const get_emoticon_translations = emoji.get_emoticon_translations;

View File

@ -114,7 +114,7 @@ function get_realm_emoji_url(emoji_name) {
return realm_emoji_map.get(emoji_name);
}
const regex = /(^|\s|\u0085|\p{Z}|['"(,:<])(#foo(\d+))($|[^\p{L}\p{N}])/gu;
const regex = /#foo(\d+)(?!\w)/g;
const linkifier_map = new Map();
linkifier_map.set(regex, {
url_template: url_template_lib.parse("http://foo.com/{id}"),
@ -287,30 +287,30 @@ run_test("topic links repeated", () => {
run_test("topic links overlapping", () => {
linkifiers.initialize([
{pattern: "[a-z]+(?P<id>1\\d+) :[a-z]+", url_template: "http://a.com/{id}"},
{pattern: "[a-z]+(?P<id>1\\d+) #[a-z]+", url_template: "http://a.com/{id}"},
{pattern: "[a-z]+(?P<id>1\\d+)", url_template: "http://b.com/{id}"},
{pattern: ".+:(?P<id>[a-z]+)", url_template: "http://wildcard.com/{id}"},
{pattern: ":(?P<id>[a-z]+)", url_template: "http://c.com/{id}"},
{pattern: ".+#(?P<id>[a-z]+)", url_template: "http://wildcard.com/{id}"},
{pattern: "#(?P<id>[a-z]+)", url_template: "http://c.com/{id}"},
]);
// b.com's pattern should be matched while it overlaps with c.com's.
assert_topic_links(":foo100", [
assert_topic_links("#foo100", [
{
text: "foo100",
url: "http://b.com/100",
},
]);
// a.com's pattern should be matched while it overlaps with b.com's, wildcard.com's and c.com's.
assert_topic_links(":asd123 :asd", [
assert_topic_links("#asd123 #asd", [
{
text: "asd123 :asd",
text: "asd123 #asd",
url: "http://a.com/123",
},
]);
// a.com's pattern do not match, wildcard.com's and b.com's patterns should match
// and the links are ordered by the matched index.
assert_topic_links("/:asd :foo100", [
assert_topic_links("/#asd #foo100", [
{
text: "/:asd",
text: "/#asd",
url: "http://wildcard.com/asd",
},
{
@ -318,27 +318,27 @@ run_test("topic links overlapping", () => {
url: "http://b.com/100",
},
]);
assert_topic_links("foo.anything/:asd", [
assert_topic_links("foo.anything/#asd", [
{
text: "foo.anything/:asd",
text: "foo.anything/#asd",
url: "http://wildcard.com/asd",
},
]);
// While the raw URL "http://foo.com/foo100" appears before b.com's match "foo100",
// we prioritize the linkifier match first.
assert_topic_links("http://foo.com/:foo100", [
assert_topic_links("http://foo.com/foo100", [
{
text: "foo100",
url: "http://b.com/100",
},
]);
// Here the raw URL "https://foo.com/:asd" appears after wildcard.com's match "something https://foo.com/:asd".
// Here the raw URL "https://foo.com/#asd" appears after wildcard.com's match "something https://foo.com/#asd".
// The latter is prioritized and the raw URL does not get included.
assert_topic_links("something https://foo.com/:asd", [
assert_topic_links("something https://foo.com/#asd", [
{
text: "something https://foo.com/:asd",
text: "something https://foo.com/#asd",
url: "http://wildcard.com/asd",
},
]);

View File

@ -650,10 +650,10 @@ InlineLexer.prototype.output = function(src) {
regexes.forEach(function (regex) {
var ret = self.inlineReplacement(regex, src, function(regex, groups, match) {
// Insert the created URL
href = self.linkifier(regex, groups.slice(1, -1), match);
href = self.linkifier(regex, groups, match);
if (href !== undefined) {
href = escape(href);
return groups[0] + self.renderer.link(href, href, groups[1]) + groups.slice(-1)[0]
return self.renderer.link(href, href, match);
} else {
return match;
}

View File

@ -1821,9 +1821,6 @@ def prepare_linkifier_pattern(source: str) -> str:
# We use an extended definition of 'whitespace' which is
# equivalent to \p{White_Space} -- since \s in re2 only matches
# ASCII spaces, and re2 does not support \p{White_Space}.
#
# This implementation should be kept in sync with the one in
# web/src/linkifiers.js
regex = rf"""
(?P<{BEFORE_CAPTURE_GROUP}>
^ |