Revert "linkifiers: Match JS implementation to server implementation."

This reverts commit 091e2f177b.

This version of python_to_js_linkifier fails for at least some real
linkifiers. We'll likely re-introduce this after a bit more debugging.
Tim Abbott 2023-11-16 14:59:18 -08:00
parent 43e9bbe31c
commit 3cfe4b720c
7 changed files with 79 additions and 68 deletions
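
For orientation on what changes, here is a brief illustrative sketch (not part of the diff) contrasting what the two translation strategies produce for the `#cf…` linkifier pattern exercised in the node tests below; both regex literals are taken verbatim from the test expectations in this commit.

```js
// Output of the version being reverted: mirrors the server's boundary groups
// and uses Unicode-aware flags.
const reverted = /(^|\s|\u0085|\p{Z}|['"(,:<])(#cf(\d+)([A-Z][\dA-Z]*))($|[^\p{L}\p{N}])/gu;

// Output of the restored version: only forbids a trailing word character and
// relies on contains_backend_only_syntax() to send boundary-sensitive
// messages to the backend renderer instead of rendering them locally.
const restored = /#cf(\d+)([A-Z][\dA-Z]*)(?!\w)/g;

console.log("see #cf123AB here".match(restored)); // ["#cf123AB"]
console.log("see #cf123AB here".match(reverted)); // [" #cf123AB "]
console.log("PR-#cf123AB".match(restored)); // ["#cf123AB"] (no check on the preceding character)
console.log("PR-#cf123AB".match(reverted)); // null ("-" is not in the boundary class)
```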


@@ -42,7 +42,7 @@ function python_to_js_linkifier(
         current_group += 1;
     }
     // Convert any python in-regex flags to RegExp flags
-    let js_flags = "gu";
+    let js_flags = "g";
     const inline_flag_re = /\(\?([Limsux]+)\)/;
     match = inline_flag_re.exec(pattern);
@@ -59,11 +59,15 @@
         pattern = pattern.replace(inline_flag_re, "");
     }
-    // This boundary-matching must be kept in sync with prepare_linkifier_pattern in
-    // zerver.lib.markdown. It does not use look-ahead or look-behind, because re2
-    // does not support either.
-    pattern =
-        /(^|\s|\u0085|\p{Z}|['"(,:<])/u.source + "(" + pattern + ")" + /($|[^\p{L}\p{N}])/u.source;
+    // Ideally we should have been checking that linkifiers
+    // begin with certain characters but since there is no
+    // support for negative lookbehind in javascript, we check
+    // for this condition in `contains_backend_only_syntax()`
+    // function. If the condition is satisfied then the message
+    // is rendered locally, otherwise, we return false there and
+    // message is rendered on the backend which has proper support
+    // for negative lookbehind.
+    pattern = pattern + /(?!\w)/.source;
     let final_regex = null;
     try {
         final_regex = new RegExp(pattern, js_flags);
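
The surrounding context lines show python_to_js_linkifier() lifting Python inline regex flags into JavaScript RegExp flags. Below is a simplified, hypothetical sketch of just that step; the exact set of flags the real helper keeps may differ, and the named-group conversion shown at the top of the hunk is omitted.

```js
// Simplified sketch of the inline-flag handling; keeping only i, m and s is
// an assumption rather than the real helper's exact behavior.
function extract_js_flags(pattern, js_flags = "g") {
    const inline_flag_re = /\(\?([Limsux]+)\)/;
    const match = inline_flag_re.exec(pattern);
    if (match) {
        // Keep the inline flags that JavaScript RegExp also understands,
        // then strip the "(?...)" group out of the pattern itself.
        js_flags += [...match[1]].filter((flag) => "ims".includes(flag)).join("");
        pattern = pattern.replace(inline_flag_re, "");
    }
    return {pattern, js_flags};
}

console.log(extract_js_flags("(?im)#cf(\\d+)")); // {pattern: "#cf(\\d+)", js_flags: "gim"}
```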


@@ -70,6 +70,31 @@ export function translate_emoticons_to_names({src, get_emoticon_translations}) {
     return translated;
 }
+
+function contains_problematic_linkifier({content, get_linkifier_map}) {
+    // If a linkifier doesn't start with some specified characters
+    // then don't render it locally. It is workaround for the fact that
+    // javascript regex doesn't support lookbehind.
+    for (const re of get_linkifier_map().keys()) {
+        const pattern = /[^\s"'(,:<]/.source + re.source + /(?!\w)/.source;
+        const regex = new RegExp(pattern);
+        if (regex.test(content)) {
+            return true;
+        }
+    }
+    return false;
+}
+
+function content_contains_backend_only_syntax({content, get_linkifier_map}) {
+    // Try to guess whether or not a message contains syntax that only the
+    // backend Markdown processor can correctly handle.
+    // If it doesn't, we can immediately render it client-side for local echo.
+    return (
+        contains_preview_link(content) ||
+        contains_problematic_linkifier({content, get_linkifier_map})
+    );
+}
+
 function parse_with_options({raw_content, helper_config, options}) {
     // Given the raw markdown content of a message (raw_content)
     // we return the HTML content (content) and flags.
@@ -292,27 +317,13 @@ export function get_topic_links({topic, get_linkifier_map}) {
     let precedence = 0;
     for (const [pattern, {url_template, group_number_to_name}] of get_linkifier_map().entries()) {
-        // Strip off the "g" modifier
-        const non_global_pattern = new RegExp(pattern.source, pattern.flags.replace("g", ""));
-        let pos = 0;
-        while (pos < topic.length) {
-            const match = non_global_pattern.exec(topic.slice(pos));
-            if (match === null) {
-                break;
-            }
-            // Advance position to the start of the body of the match, after the index
-            // offset and any leading whitespace (captured in match[1])
-            pos += match.index + match[1].length;
-            // match[0] is the entire match, which may have leading or trailing boundary
-            // characters. match[1] is the leading characters, match[2] is the main body of the
-            // match, and user-provided groups start in match[3]. Slice those user-provided groups
-            // off.
-            const user_groups = match.slice(3, -1);
+        let match;
+        while ((match = pattern.exec(topic)) !== null) {
+            const matched_groups = match.slice(1);
             let i = 0;
             const template_context = {};
-            while (i < user_groups.length) {
-                const matched_group = user_groups[i];
+            while (i < matched_groups.length) {
+                const matched_group = matched_groups[i];
                 const current_group = i + 1;
                 template_context[group_number_to_name[current_group]] = matched_group;
                 i += 1;
@@ -320,13 +331,7 @@ export function get_topic_links({topic, get_linkifier_map}) {
             const link_url = url_template.expand(template_context);
             // We store the starting index as well, to sort the order of occurrence of the links
             // in the topic, similar to the logic implemented in zerver/lib/markdown/__init__.py
-            links.push({url: link_url, text: match[2], index: pos, precedence});
-            // Adjust the start point of the match for the next iteration -- we already advanced
-            // to the start of the user match, so now only advance by match[2], so that we don't
-            // include the length of any trailing characters. This means that patterns can overlap
-            // their whitespace.
-            pos += match[2].length;
+            links.push({url: link_url, text: match[0], index: match.index, precedence});
         }
         precedence += 1;
     }
@@ -443,10 +448,10 @@ function handleEmoji({emoji_name, get_realm_emoji_url, get_emoji_codepoint}) {
 function handleLinkifier({pattern, matches, get_linkifier_map}) {
     const {url_template, group_number_to_name} = get_linkifier_map().get(pattern);
-    const user_groups = matches.slice(1);
     let current_group = 1;
     const template_context = {};
-    for (const match of user_groups) {
+    for (const match of matches) {
         template_context[group_number_to_name[current_group]] = match;
         current_group += 1;
     }
@@ -683,7 +688,10 @@ export function add_topic_links(message) {
 }
 
 export function contains_backend_only_syntax(content) {
-    return contains_preview_link(content);
+    return content_contains_backend_only_syntax({
+        content,
+        get_linkifier_map: web_app_helpers.get_linkifier_map,
+    });
 }
 
 export function parse_non_message(raw_content) {
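
To make the restored local-echo routing concrete, here is a small runnable sketch (my own illustration, not code from the commit); the linkifier regex and map shape are borrowed from the test setup further below, and the helper name carries a `_sketch` suffix to mark it as hypothetical.

```js
const sketch_linkifier_map = new Map();
sketch_linkifier_map.set(/#foo(\d+)(?!\w)/g, {
    url_template: "http://foo.com/{id}", // placeholder; the real value is a parsed URL template
    group_number_to_name: {1: "id"},
});
const get_linkifier_map = () => sketch_linkifier_map;

function contains_problematic_linkifier_sketch({content, get_linkifier_map}) {
    // A linkifier immediately preceded by a character outside the allowed set
    // (for example "PR-#foo123") is deferred to the backend renderer, which
    // handles the boundary as described in the hunks above.
    for (const re of get_linkifier_map().keys()) {
        const pattern = /[^\s"'(,:<]/.source + re.source + /(?!\w)/.source;
        if (new RegExp(pattern).test(content)) {
            return true;
        }
    }
    return false;
}

console.log(contains_problematic_linkifier_sketch({content: "PR-#foo123", get_linkifier_map})); // true
console.log(contains_problematic_linkifier_sketch({content: "see #foo123", get_linkifier_map})); // false
```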


@@ -30,10 +30,7 @@ run_test("python_to_js_linkifier", () => {
         },
     ]);
     let actual_value = get_linkifier_regexes();
-    let expected_value = [
-        /(^|\s|\u0085|\p{Z}|['"(,:<])(\/aa\/g)($|[^\p{L}\p{N}])/gimu,
-        /(^|\s|\u0085|\p{Z}|['"(,:<])(\/aa\/g)($|[^\p{L}\p{N}])/gu,
-    ];
+    let expected_value = [/\/aa\/g(?!\w)/gim, /\/aa\/g(?!\w)/g];
     assert.deepEqual(actual_value, expected_value);
     // Test case with multiple replacements.
     linkifiers.update_linkifier_rules([
@@ -44,7 +41,7 @@ run_test("python_to_js_linkifier", () => {
         },
     ]);
     actual_value = get_linkifier_regexes();
-    expected_value = [/(^|\s|\u0085|\p{Z}|['"(,:<])(#cf(\d+)([A-Z][\dA-Z]*))($|[^\p{L}\p{N}])/gu];
+    expected_value = [/#cf(\d+)([A-Z][\dA-Z]*)(?!\w)/g];
     assert.deepEqual(actual_value, expected_value);
     // Test incorrect syntax.
     blueslip.expect("error", "python_to_js_linkifier failure!");


@@ -456,6 +456,9 @@ test("marked", () => {
             expected:
                 '<blockquote>\n<p>User group mention in quote: <span class="user-group-mention silent" data-user-group-id="2">Backend</span></p>\n</blockquote>\n<blockquote>\n<p>Another user group mention in quote: <span class="user-group-mention silent" data-user-group-id="1">hamletcharacters</span></p>\n</blockquote>',
         },
+        // Test only those linkifiers which don't return True for
+        // `contains_backend_only_syntax()`. Those which return True
+        // are tested separately.
         {
             input: "This is a linkifier #1234 with text after it",
             expected:
@@ -476,14 +479,6 @@ test("marked", () => {
             expected:
                 '<p>This is a linkifier with <a href="https://zone_45.zulip.net/ticket/123" title="https://zone_45.zulip.net/ticket/123">ZGROUP_123:45</a> groups</p>',
         },
-        {
-            input: "Here is the PR-#123.",
-            expected: `<p>Here is the PR-<a href="https://trac.example.com/ticket/123" title="https://trac.example.com/ticket/123">#123</a>.</p>`,
-        },
-        {
-            input: "Function abc() was introduced in (PR)#123.",
-            expected: `<p>Function abc() was introduced in (PR)<a href="https://trac.example.com/ticket/123" title="https://trac.example.com/ticket/123">#123</a>.</p>`,
-        },
         {input: "Test *italic*", expected: "<p>Test <em>italic</em></p>"},
         {
             input: "T\n#**Denmark**",
@@ -850,6 +845,16 @@ test("message_flags", () => {
     assert.equal(message.flags.includes("mentioned"), false);
 });
 
+test("backend_only_linkifiers", () => {
+    const backend_only_linkifiers = [
+        "Here is the PR-#123.",
+        "Function abc() was introduced in (PR)#123.",
+    ];
+    for (const content of backend_only_linkifiers) {
+        assert.equal(markdown.contains_backend_only_syntax(content), true);
+    }
+});
+
 test("translate_emoticons_to_names", () => {
     const get_emoticon_translations = emoji.get_emoticon_translations;


@@ -114,7 +114,7 @@ function get_realm_emoji_url(emoji_name) {
     return realm_emoji_map.get(emoji_name);
 }
 
-const regex = /(^|\s|\u0085|\p{Z}|['"(,:<])(#foo(\d+))($|[^\p{L}\p{N}])/gu;
+const regex = /#foo(\d+)(?!\w)/g;
 const linkifier_map = new Map();
 linkifier_map.set(regex, {
     url_template: url_template_lib.parse("http://foo.com/{id}"),
@@ -287,30 +287,30 @@ run_test("topic links repeated", () => {
 run_test("topic links overlapping", () => {
     linkifiers.initialize([
-        {pattern: "[a-z]+(?P<id>1\\d+) :[a-z]+", url_template: "http://a.com/{id}"},
+        {pattern: "[a-z]+(?P<id>1\\d+) #[a-z]+", url_template: "http://a.com/{id}"},
         {pattern: "[a-z]+(?P<id>1\\d+)", url_template: "http://b.com/{id}"},
-        {pattern: ".+:(?P<id>[a-z]+)", url_template: "http://wildcard.com/{id}"},
-        {pattern: ":(?P<id>[a-z]+)", url_template: "http://c.com/{id}"},
+        {pattern: ".+#(?P<id>[a-z]+)", url_template: "http://wildcard.com/{id}"},
+        {pattern: "#(?P<id>[a-z]+)", url_template: "http://c.com/{id}"},
     ]);
     // b.com's pattern should be matched while it overlaps with c.com's.
-    assert_topic_links(":foo100", [
+    assert_topic_links("#foo100", [
         {
             text: "foo100",
             url: "http://b.com/100",
         },
     ]);
     // a.com's pattern should be matched while it overlaps with b.com's, wildcard.com's and c.com's.
-    assert_topic_links(":asd123 :asd", [
+    assert_topic_links("#asd123 #asd", [
         {
-            text: "asd123 :asd",
+            text: "asd123 #asd",
             url: "http://a.com/123",
         },
     ]);
     // a.com's pattern do not match, wildcard.com's and b.com's patterns should match
     // and the links are ordered by the matched index.
-    assert_topic_links("/:asd :foo100", [
+    assert_topic_links("/#asd #foo100", [
         {
-            text: "/:asd",
+            text: "/#asd",
             url: "http://wildcard.com/asd",
         },
         {
@@ -318,27 +318,27 @@ run_test("topic links overlapping", () => {
             url: "http://b.com/100",
         },
     ]);
-    assert_topic_links("foo.anything/:asd", [
+    assert_topic_links("foo.anything/#asd", [
        {
-            text: "foo.anything/:asd",
+            text: "foo.anything/#asd",
             url: "http://wildcard.com/asd",
         },
     ]);
     // While the raw URL "http://foo.com/foo100" appears before b.com's match "foo100",
     // we prioritize the linkifier match first.
-    assert_topic_links("http://foo.com/:foo100", [
+    assert_topic_links("http://foo.com/foo100", [
        {
             text: "foo100",
             url: "http://b.com/100",
         },
     ]);
-    // Here the raw URL "https://foo.com/:asd" appears after wildcard.com's match "something https://foo.com/:asd".
+    // Here the raw URL "https://foo.com/#asd" appears after wildcard.com's match "something https://foo.com/#asd".
     // The latter is prioritized and the raw URL does not get included.
-    assert_topic_links("something https://foo.com/:asd", [
+    assert_topic_links("something https://foo.com/#asd", [
        {
-            text: "something https://foo.com/:asd",
+            text: "something https://foo.com/#asd",
             url: "http://wildcard.com/asd",
         },
     ]);
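
As a usage illustration of the restored get_topic_links() loop shown earlier, here is a self-contained sketch (my own, not part of the commit); the real code expands a parsed URL template object, while this sketch treats url_template as a plain string with {name} placeholders.

```js
function get_topic_links_sketch(topic, linkifier_map) {
    const links = [];
    let precedence = 0;
    for (const [pattern, {url_template, group_number_to_name}] of linkifier_map.entries()) {
        let match;
        while ((match = pattern.exec(topic)) !== null) {
            const template_context = {};
            match.slice(1).forEach((group, i) => {
                template_context[group_number_to_name[i + 1]] = group;
            });
            const url = url_template.replace(/\{(\w+)\}/g, (_, name) => template_context[name]);
            // index and precedence are stored so later code can order links by
            // occurrence and resolve overlapping matches, as the comments in
            // the diff describe.
            links.push({url, text: match[0], index: match.index, precedence});
        }
        precedence += 1;
    }
    return links;
}

const sketch_map = new Map([
    [/#foo(\d+)(?!\w)/g, {url_template: "http://foo.com/{id}", group_number_to_name: {1: "id"}}],
]);
console.log(get_topic_links_sketch("#foo101 and #foo102", sketch_map));
// [ {url: "http://foo.com/101", text: "#foo101", index: 0, precedence: 0},
//   {url: "http://foo.com/102", text: "#foo102", index: 12, precedence: 0} ]
```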


@@ -650,10 +650,10 @@ InlineLexer.prototype.output = function(src) {
     regexes.forEach(function (regex) {
       var ret = self.inlineReplacement(regex, src, function(regex, groups, match) {
         // Insert the created URL
-        href = self.linkifier(regex, groups.slice(1, -1), match);
+        href = self.linkifier(regex, groups, match);
         if (href !== undefined) {
           href = escape(href);
-          return groups[0] + self.renderer.link(href, href, groups[1]) + groups.slice(-1)[0]
+          return self.renderer.link(href, href, match);
         } else {
           return match;
         }


@@ -1821,9 +1821,6 @@ def prepare_linkifier_pattern(source: str) -> str:
     # We use an extended definition of 'whitespace' which is
     # equivalent to \p{White_Space} -- since \s in re2 only matches
     # ASCII spaces, and re2 does not support \p{White_Space}.
-    #
-    # This implementation should be kept in sync with the one in
-    # web/src/linkifiers.js
     regex = rf"""
         (?P<{BEFORE_CAPTURE_GROUP}>
             ^ |