linkifiers: Loosen regex that validates URLs.

User-supplied custom realm filter has had some sort of regex-based
validation of the format URL since their introduction in
d7e1e4a2c0 -- and this has always been
in addition to the URLValidator.  The URLValidator is the one which
does the security-relevant work of validating that the schema is
reasonable, and that the overall shape of the URL is well-formed.  The
regex has served primarily to arbitrary limit the characters that can
appear in the URL, in the mistaken name of safety.

Adjust the regex, such that its only purpose is to verify that the
usages of `%` characters in the URL are reasonable, and leave the URL
validation to the URLValidator, which can do a far better job.  This
includes broadening the support to include `%%` as an escape
character; this is likely such a niche case as to be unnecessary, but
costs little.

Fixes #16013.
This commit is contained in:
Alex Vandiver 2021-10-20 01:08:44 +00:00 committed by Tim Abbott
parent adb612a0b4
commit 8dd9b4e812
3 changed files with 71 additions and 12 deletions

View File

@ -105,10 +105,7 @@ async function test_edit_invalid_linkifier(page: Page): Promise<void> {
page,
edit_linkifier_format_status_selector,
);
assert.strictEqual(
edit_linkifier_format_status,
"Failed: Enter a valid URL.,Invalid URL format string.",
);
assert.strictEqual(edit_linkifier_format_status, "Failed: Enter a valid URL.");
await page.click(".close-modal-btn");
await page.waitForSelector("#dialog_widget_modal", {hidden: true});

View File

@ -1120,7 +1120,28 @@ def filter_pattern_validator(value: str) -> Pattern[str]:
def filter_format_validator(value: str) -> None:
regex = re.compile(r"^([\.\/:a-zA-Z0-9#_?=&;~-]+%\(([a-zA-Z0-9_-]+)\)s)+[/a-zA-Z0-9#_?=&;~-]*$")
"""Does not attempt to verify URL-ness, but rather %(foo)s.
URLValidator is assumed to have caught anything which is malformed
as a URL.
"""
regex = re.compile(
r"""
^
(
[^%] # Any non-percent,
| # OR...
% ( # A %, which can mean:
\( [a-zA-Z0-9_-]+ \) s # Interpolation group
| # OR
% # %%, which is an escaped %
)
)+ # Those happen one or more times
$
""",
re.VERBOSE,
)
if not regex.match(value):
raise ValidationError(_("Invalid URL format string."))

View File

@ -1,7 +1,9 @@
import re
from django.core.exceptions import ValidationError
from zerver.lib.test_classes import ZulipTestCase
from zerver.models import RealmFilter
from zerver.models import RealmFilter, filter_format_validator
class RealmFilterTest(ZulipTestCase):
@ -43,7 +45,9 @@ class RealmFilterTest(ZulipTestCase):
data["pattern"] = r"ZUL-(?P<id>\d+)"
data["url_format_string"] = "https://realm.com/my_realm_filter/"
result = self.client_post("/json/realm/filters", info=data)
self.assert_json_error(result, "Invalid URL format string.")
self.assert_json_error(
result, "Group 'id' in linkifier pattern is not present in URL format string."
)
data["url_format_string"] = "https://realm.com/my_realm_filter/#hashtag/%(id)s"
result = self.client_post("/json/realm/filters", info=data)
@ -117,13 +121,10 @@ class RealmFilterTest(ZulipTestCase):
result, "Group 'id' in linkifier pattern is not present in URL format string."
)
# BUG: In theory, this should be valid, since %% should be a
# valid escaping method. It's unlikely someone actually wants
# to do this, though.
data["pattern"] = r"ZUL-(?P<id>\d+)"
data["pattern"] = r"ZUL-ESCAPE-(?P<id>\d+)"
data["url_format_string"] = r"https://realm.com/my_realm_filter/%%(ignored)s/%(id)s"
result = self.client_post("/json/realm/filters", info=data)
self.assert_json_error(result, "Invalid URL format string.")
self.assert_json_success(result)
data["pattern"] = r"(?P<org>[a-zA-Z0-9_-]+)/(?P<repo>[a-zA-Z0-9_-]+)#(?P<id>[0-9]+)"
data["url_format_string"] = "https://github.com/%(org)s/%(repo)s/issue/%(id)s"
@ -200,3 +201,43 @@ class RealmFilterTest(ZulipTestCase):
data["url_format_string"] = "https://realm.com/my_realm_filter/%(id)s"
result = self.client_patch(f"/json/realm/filters/{linkifier_id + 1}", info=data)
self.assert_json_error(result, "Linkifier not found.")
def test_valid_urls(self) -> None:
valid_urls = [
"https://example.com/",
"https://user:password@example.com/",
"https://example.com/@user/thing",
"https://example.com/!path",
"https://example.com/foo.bar",
"https://example.com/foo[bar]",
"https://example.com/%(foo)s",
"https://example.com/%(foo)s%(bars)s",
"https://example.com/%(foo)s/and/%(bar)s",
"https://example.com/?foo=%(foo)s",
"https://example.com/%%",
"https://example.com/%%(",
"https://example.com/%%()",
"https://example.com/%%(foo",
"https://example.com/%%(foo)",
"https://example.com/%%(foo)s",
]
for url in valid_urls:
filter_format_validator(url)
invalid_urls = [
"https://example.com/%ab",
"https://example.com/%ba",
"https://example.com/%21",
"https://example.com/words%20with%20spaces",
"https://example.com/back%20to%20%(back)s",
"https://example.com/encoded%2fwith%2fletters",
"https://example.com/encoded%2Fwith%2Fupper%2Fcase%2Fletters",
"https://example.com/%(foo)",
"https://example.com/%()s",
"https://example.com/%4!",
"https://example.com/%(foo",
"https://example.com/%2(foo)s",
]
for url in invalid_urls:
with self.assertRaises(ValidationError):
filter_format_validator(url)