bugdown: Flatten get_web_link_regex().

We use an early exit to flatten the code. I also tweaked the comments a bit based on some recent profiling findings (e.g., reading the file isn't actually a big bottleneck; it's more the regex itself).
2019-01-22 18:35:41 +00:00 · 2019-01-22 18:35:41 +00:00 · eea711a805
parent 852756aeb3
commit eea711a805
1 changed files with 48 additions and 46 deletions
--- a/zerver/lib/bugdown/__init__.py
+++ b/zerver/lib/bugdown/__init__.py
@@ -92,58 +92,60 @@ LINK_REGEX = None  # type: Pattern
 def get_web_link_regex() -> str:
    # We create this one time, but not at startup.  So the
    # first message rendered in any process will have some
-    # extra costs.
+    # extra costs.  It's roughly 75ms to run this code, so
    # caching the value in LINK_REGEX is super important here.
    global LINK_REGEX
-    if LINK_REGEX is None:
+    if LINK_REGEX is not None:
-        # NOTE: this is a very expensive step, it reads a file of tlds!
+        return LINK_REGEX
        tlds = '|'.join(list_of_tlds())
-        # A link starts at a word boundary, and ends at space, punctuation, or end-of-input.
+    tlds = '|'.join(list_of_tlds())
        #
        # We detect a url either by the `https?://` or by building around the TLD.
-        # In lieu of having a recursive regex (which python doesn't support) to match
+    # A link starts at a word boundary, and ends at space, punctuation, or end-of-input.
-        # arbitrary numbers of nested matching parenthesis, we manually build a regexp that
+    #
-        # can match up to six
+    # We detect a url either by the `https?://` or by building around the TLD.
        # The inner_paren_contents chunk matches the innermore non-parenthesis-holding text,
        # and the paren_group matches text with, optionally, a matching set of parens
        inner_paren_contents = r"[^\s()\"]*"
        paren_group = r"""
                        [^\s()\"]*?            # Containing characters that won't end the URL
                        (?: \( %s \)           # and more characters in matched parens
                            [^\s()\"]*?        # followed by more characters
                        )*                     # zero-or-more sets of paired parens
                       """
        nested_paren_chunk = paren_group
        for i in range(6):
            nested_paren_chunk = nested_paren_chunk % (paren_group,)
        nested_paren_chunk = nested_paren_chunk % (inner_paren_contents,)
-        file_links = r"| (?:file://(/[^/ ]*)+/?)" if settings.ENABLE_FILE_LINKS else r""
+    # In lieu of having a recursive regex (which python doesn't support) to match
-        regex = r"""
+    # arbitrary numbers of nested matching parenthesis, we manually build a regexp that
-            (?<![^\s'"\(,:<])    # Start after whitespace or specified chars
+    # can match up to six
-                                 # (Double-negative lookbehind to allow start-of-string)
+    # The inner_paren_contents chunk matches the innermore non-parenthesis-holding text,
-            (?P<url>             # Main group
+    # and the paren_group matches text with, optionally, a matching set of parens
-                (?:(?:           # Domain part
+    inner_paren_contents = r"[^\s()\"]*"
-                    https?://[\w.:@-]+?   # If it has a protocol, anything goes.
+    paren_group = r"""
-                   |(?:                   # Or, if not, be more strict to avoid false-positives
+                    [^\s()\"]*?            # Containing characters that won't end the URL
-                        (?:[\w-]+\.)+     # One or more domain components, separated by dots
+                    (?: \( %s \)           # and more characters in matched parens
-                        (?:%s)            # TLDs (filled in via format from tlds-alpha-by-domain.txt)
+                        [^\s()\"]*?        # followed by more characters
-                    )
+                    )*                     # zero-or-more sets of paired parens
                   """
    nested_paren_chunk = paren_group
    for i in range(6):
        nested_paren_chunk = nested_paren_chunk % (paren_group,)
    nested_paren_chunk = nested_paren_chunk % (inner_paren_contents,)
    file_links = r"| (?:file://(/[^/ ]*)+/?)" if settings.ENABLE_FILE_LINKS else r""
    regex = r"""
        (?<![^\s'"\(,:<])    # Start after whitespace or specified chars
                             # (Double-negative lookbehind to allow start-of-string)
        (?P<url>             # Main group
            (?:(?:           # Domain part
                https?://[\w.:@-]+?   # If it has a protocol, anything goes.
               |(?:                   # Or, if not, be more strict to avoid false-positives
                    (?:[\w-]+\.)+     # One or more domain components, separated by dots
                    (?:%s)            # TLDs (filled in via format from tlds-alpha-by-domain.txt)
                )
                (?:/             # A path, beginning with /
                    %s           # zero-to-6 sets of paired parens
                )?)              # Path is optional
                | (?:[\w.-]+\@[\w.-]+\.[\w]+) # Email is separate, since it can't have a path
                %s               # File path start with file:///, enable by setting ENABLE_FILE_LINKS=True
                | (?:bitcoin:[13][a-km-zA-HJ-NP-Z1-9]{25,34})  # Bitcoin address pattern, see https://mokagio.github.io/tech-journal/2014/11/21/regex-bitcoin.html
            )
-            (?=                            # URL must be followed by (not included in group)
+            (?:/             # A path, beginning with /
-                [!:;\?\),\.\'\"\>]*         # Optional punctuation characters
+                %s           # zero-to-6 sets of paired parens
-                (?:\Z|\s)                  # followed by whitespace or end of string
+            )?)              # Path is optional
-            )
+            | (?:[\w.-]+\@[\w.-]+\.[\w]+) # Email is separate, since it can't have a path
-            """ % (tlds, nested_paren_chunk, file_links)
+            %s               # File path start with file:///, enable by setting ENABLE_FILE_LINKS=True
-        LINK_REGEX = verbose_compile(regex)
+            | (?:bitcoin:[13][a-km-zA-HJ-NP-Z1-9]{25,34})  # Bitcoin address pattern, see https://mokagio.github.io/tech-journal/2014/11/21/regex-bitcoin.html
        )
        (?=                            # URL must be followed by (not included in group)
            [!:;\?\),\.\'\"\>]*         # Optional punctuation characters
            (?:\Z|\s)                  # followed by whitespace or end of string
        )
        """ % (tlds, nested_paren_chunk, file_links)
    LINK_REGEX = verbose_compile(regex)
    return LINK_REGEX
 def clear_state_for_testing() -> None: