bugdown: Flatten get_web_link_regex().

We use early-exit to flatten the code.

I also tweaked the comments a bit based on some recent
profiling findings (e.g., reading the file isn't actually
a big bottleneck; the cost is mostly in the regex itself).
This commit is contained in:
Steve Howell 2019-01-22 18:35:41 +00:00 committed by Tim Abbott
parent 852756aeb3
commit eea711a805
1 changed files with 48 additions and 46 deletions

View File

@ -92,58 +92,60 @@ LINK_REGEX = None # type: Pattern
def get_web_link_regex() -> str: def get_web_link_regex() -> str:
# We create this one time, but not at startup. So the # We create this one time, but not at startup. So the
# first message rendered in any process will have some # first message rendered in any process will have some
# extra costs. # extra costs. It's roughly 75ms to run this code, so
# caching the value in LINK_REGEX is super important here.
global LINK_REGEX global LINK_REGEX
if LINK_REGEX is None: if LINK_REGEX is not None:
# NOTE: this is a very expensive step, it reads a file of tlds! return LINK_REGEX
tlds = '|'.join(list_of_tlds())
# A link starts at a word boundary, and ends at space, punctuation, or end-of-input. tlds = '|'.join(list_of_tlds())
#
# We detect a url either by the `https?://` or by building around the TLD.
# In lieu of having a recursive regex (which python doesn't support) to match # A link starts at a word boundary, and ends at space, punctuation, or end-of-input.
# arbitrary numbers of nested matching parenthesis, we manually build a regexp that #
# can match up to six # We detect a url either by the `https?://` or by building around the TLD.
# The inner_paren_contents chunk matches the innermore non-parenthesis-holding text,
# and the paren_group matches text with, optionally, a matching set of parens
inner_paren_contents = r"[^\s()\"]*"
paren_group = r"""
[^\s()\"]*? # Containing characters that won't end the URL
(?: \( %s \) # and more characters in matched parens
[^\s()\"]*? # followed by more characters
)* # zero-or-more sets of paired parens
"""
nested_paren_chunk = paren_group
for i in range(6):
nested_paren_chunk = nested_paren_chunk % (paren_group,)
nested_paren_chunk = nested_paren_chunk % (inner_paren_contents,)
file_links = r"| (?:file://(/[^/ ]*)+/?)" if settings.ENABLE_FILE_LINKS else r"" # In lieu of having a recursive regex (which python doesn't support) to match
regex = r""" # arbitrary numbers of nested matching parenthesis, we manually build a regexp that
(?<![^\s'"\(,:<]) # Start after whitespace or specified chars # can match up to six
# (Double-negative lookbehind to allow start-of-string) # The inner_paren_contents chunk matches the innermore non-parenthesis-holding text,
(?P<url> # Main group # and the paren_group matches text with, optionally, a matching set of parens
(?:(?: # Domain part inner_paren_contents = r"[^\s()\"]*"
https?://[\w.:@-]+? # If it has a protocol, anything goes. paren_group = r"""
|(?: # Or, if not, be more strict to avoid false-positives [^\s()\"]*? # Containing characters that won't end the URL
(?:[\w-]+\.)+ # One or more domain components, separated by dots (?: \( %s \) # and more characters in matched parens
(?:%s) # TLDs (filled in via format from tlds-alpha-by-domain.txt) [^\s()\"]*? # followed by more characters
) )* # zero-or-more sets of paired parens
"""
nested_paren_chunk = paren_group
for i in range(6):
nested_paren_chunk = nested_paren_chunk % (paren_group,)
nested_paren_chunk = nested_paren_chunk % (inner_paren_contents,)
file_links = r"| (?:file://(/[^/ ]*)+/?)" if settings.ENABLE_FILE_LINKS else r""
regex = r"""
(?<![^\s'"\(,:<]) # Start after whitespace or specified chars
# (Double-negative lookbehind to allow start-of-string)
(?P<url> # Main group
(?:(?: # Domain part
https?://[\w.:@-]+? # If it has a protocol, anything goes.
|(?: # Or, if not, be more strict to avoid false-positives
(?:[\w-]+\.)+ # One or more domain components, separated by dots
(?:%s) # TLDs (filled in via format from tlds-alpha-by-domain.txt)
) )
(?:/ # A path, beginning with /
%s # zero-to-6 sets of paired parens
)?) # Path is optional
| (?:[\w.-]+\@[\w.-]+\.[\w]+) # Email is separate, since it can't have a path
%s # File path start with file:///, enable by setting ENABLE_FILE_LINKS=True
| (?:bitcoin:[13][a-km-zA-HJ-NP-Z1-9]{25,34}) # Bitcoin address pattern, see https://mokagio.github.io/tech-journal/2014/11/21/regex-bitcoin.html
) )
(?= # URL must be followed by (not included in group) (?:/ # A path, beginning with /
[!:;\?\),\.\'\"\>]* # Optional punctuation characters %s # zero-to-6 sets of paired parens
(?:\Z|\s) # followed by whitespace or end of string )?) # Path is optional
) | (?:[\w.-]+\@[\w.-]+\.[\w]+) # Email is separate, since it can't have a path
""" % (tlds, nested_paren_chunk, file_links) %s # File path start with file:///, enable by setting ENABLE_FILE_LINKS=True
LINK_REGEX = verbose_compile(regex) | (?:bitcoin:[13][a-km-zA-HJ-NP-Z1-9]{25,34}) # Bitcoin address pattern, see https://mokagio.github.io/tech-journal/2014/11/21/regex-bitcoin.html
)
(?= # URL must be followed by (not included in group)
[!:;\?\),\.\'\"\>]* # Optional punctuation characters
(?:\Z|\s) # followed by whitespace or end of string
)
""" % (tlds, nested_paren_chunk, file_links)
LINK_REGEX = verbose_compile(regex)
return LINK_REGEX return LINK_REGEX
def clear_state_for_testing() -> None: def clear_state_for_testing() -> None: