mirror of https://github.com/zulip/zulip.git
bugdown: Flatten get_web_link_regex().
We use an early exit to flatten the code. I also tweaked the comments a bit based on some recent profiling findings (e.g. reading the TLD file isn't actually a big bottleneck; the cost is mostly in the regex itself).
This commit is contained in:
parent
852756aeb3
commit
eea711a805
|
@ -92,58 +92,60 @@ LINK_REGEX = None # type: Pattern
|
||||||
def get_web_link_regex() -> str:
|
def get_web_link_regex() -> str:
|
||||||
# We create this one time, but not at startup. So the
|
# We create this one time, but not at startup. So the
|
||||||
# first message rendered in any process will have some
|
# first message rendered in any process will have some
|
||||||
# extra costs.
|
# extra costs. It's roughly 75ms to run this code, so
|
||||||
|
# caching the value in LINK_REGEX is super important here.
|
||||||
global LINK_REGEX
|
global LINK_REGEX
|
||||||
if LINK_REGEX is None:
|
if LINK_REGEX is not None:
|
||||||
# NOTE: this is a very expensive step, it reads a file of tlds!
|
return LINK_REGEX
|
||||||
tlds = '|'.join(list_of_tlds())
|
|
||||||
|
|
||||||
# A link starts at a word boundary, and ends at space, punctuation, or end-of-input.
|
tlds = '|'.join(list_of_tlds())
|
||||||
#
|
|
||||||
# We detect a url either by the `https?://` or by building around the TLD.
|
|
||||||
|
|
||||||
# In lieu of having a recursive regex (which python doesn't support) to match
|
# A link starts at a word boundary, and ends at space, punctuation, or end-of-input.
|
||||||
# arbitrary numbers of nested matching parenthesis, we manually build a regexp that
|
#
|
||||||
# can match up to six
|
# We detect a url either by the `https?://` or by building around the TLD.
|
||||||
# The inner_paren_contents chunk matches the innermore non-parenthesis-holding text,
|
|
||||||
# and the paren_group matches text with, optionally, a matching set of parens
|
|
||||||
inner_paren_contents = r"[^\s()\"]*"
|
|
||||||
paren_group = r"""
|
|
||||||
[^\s()\"]*? # Containing characters that won't end the URL
|
|
||||||
(?: \( %s \) # and more characters in matched parens
|
|
||||||
[^\s()\"]*? # followed by more characters
|
|
||||||
)* # zero-or-more sets of paired parens
|
|
||||||
"""
|
|
||||||
nested_paren_chunk = paren_group
|
|
||||||
for i in range(6):
|
|
||||||
nested_paren_chunk = nested_paren_chunk % (paren_group,)
|
|
||||||
nested_paren_chunk = nested_paren_chunk % (inner_paren_contents,)
|
|
||||||
|
|
||||||
file_links = r"| (?:file://(/[^/ ]*)+/?)" if settings.ENABLE_FILE_LINKS else r""
|
# In lieu of having a recursive regex (which python doesn't support) to match
|
||||||
regex = r"""
|
# arbitrary numbers of nested matching parenthesis, we manually build a regexp that
|
||||||
(?<![^\s'"\(,:<]) # Start after whitespace or specified chars
|
# can match up to six
|
||||||
# (Double-negative lookbehind to allow start-of-string)
|
# The inner_paren_contents chunk matches the innermore non-parenthesis-holding text,
|
||||||
(?P<url> # Main group
|
# and the paren_group matches text with, optionally, a matching set of parens
|
||||||
(?:(?: # Domain part
|
inner_paren_contents = r"[^\s()\"]*"
|
||||||
https?://[\w.:@-]+? # If it has a protocol, anything goes.
|
paren_group = r"""
|
||||||
|(?: # Or, if not, be more strict to avoid false-positives
|
[^\s()\"]*? # Containing characters that won't end the URL
|
||||||
(?:[\w-]+\.)+ # One or more domain components, separated by dots
|
(?: \( %s \) # and more characters in matched parens
|
||||||
(?:%s) # TLDs (filled in via format from tlds-alpha-by-domain.txt)
|
[^\s()\"]*? # followed by more characters
|
||||||
)
|
)* # zero-or-more sets of paired parens
|
||||||
|
"""
|
||||||
|
nested_paren_chunk = paren_group
|
||||||
|
for i in range(6):
|
||||||
|
nested_paren_chunk = nested_paren_chunk % (paren_group,)
|
||||||
|
nested_paren_chunk = nested_paren_chunk % (inner_paren_contents,)
|
||||||
|
|
||||||
|
file_links = r"| (?:file://(/[^/ ]*)+/?)" if settings.ENABLE_FILE_LINKS else r""
|
||||||
|
regex = r"""
|
||||||
|
(?<![^\s'"\(,:<]) # Start after whitespace or specified chars
|
||||||
|
# (Double-negative lookbehind to allow start-of-string)
|
||||||
|
(?P<url> # Main group
|
||||||
|
(?:(?: # Domain part
|
||||||
|
https?://[\w.:@-]+? # If it has a protocol, anything goes.
|
||||||
|
|(?: # Or, if not, be more strict to avoid false-positives
|
||||||
|
(?:[\w-]+\.)+ # One or more domain components, separated by dots
|
||||||
|
(?:%s) # TLDs (filled in via format from tlds-alpha-by-domain.txt)
|
||||||
)
|
)
|
||||||
(?:/ # A path, beginning with /
|
|
||||||
%s # zero-to-6 sets of paired parens
|
|
||||||
)?) # Path is optional
|
|
||||||
| (?:[\w.-]+\@[\w.-]+\.[\w]+) # Email is separate, since it can't have a path
|
|
||||||
%s # File path start with file:///, enable by setting ENABLE_FILE_LINKS=True
|
|
||||||
| (?:bitcoin:[13][a-km-zA-HJ-NP-Z1-9]{25,34}) # Bitcoin address pattern, see https://mokagio.github.io/tech-journal/2014/11/21/regex-bitcoin.html
|
|
||||||
)
|
)
|
||||||
(?= # URL must be followed by (not included in group)
|
(?:/ # A path, beginning with /
|
||||||
[!:;\?\),\.\'\"\>]* # Optional punctuation characters
|
%s # zero-to-6 sets of paired parens
|
||||||
(?:\Z|\s) # followed by whitespace or end of string
|
)?) # Path is optional
|
||||||
)
|
| (?:[\w.-]+\@[\w.-]+\.[\w]+) # Email is separate, since it can't have a path
|
||||||
""" % (tlds, nested_paren_chunk, file_links)
|
%s # File path start with file:///, enable by setting ENABLE_FILE_LINKS=True
|
||||||
LINK_REGEX = verbose_compile(regex)
|
| (?:bitcoin:[13][a-km-zA-HJ-NP-Z1-9]{25,34}) # Bitcoin address pattern, see https://mokagio.github.io/tech-journal/2014/11/21/regex-bitcoin.html
|
||||||
|
)
|
||||||
|
(?= # URL must be followed by (not included in group)
|
||||||
|
[!:;\?\),\.\'\"\>]* # Optional punctuation characters
|
||||||
|
(?:\Z|\s) # followed by whitespace or end of string
|
||||||
|
)
|
||||||
|
""" % (tlds, nested_paren_chunk, file_links)
|
||||||
|
LINK_REGEX = verbose_compile(regex)
|
||||||
return LINK_REGEX
|
return LINK_REGEX
|
||||||
|
|
||||||
def clear_state_for_testing() -> None:
|
def clear_state_for_testing() -> None:
|
||||||
|
|
Loading…
Reference in New Issue