check-templates: Make parser more thorough (and faster).

We now create tokens for whitespace and text, such that you could rebuild the template file with "".join(token.s for token in tokens). I also fixed a few bugs related to not parsing whitespace-control tokens. We no longer ignore template variables, although we could do a lot better at validating them. The most immediate use case for the more thorough parser is to simplify the pretty printer, but it should also make it less likely for us to skip over new template constructs (i.e. the tool will fail hard rather than acting strangely). Note that this speeds up the tool by almost 3x, which may be slightly surprising considering we are building more tokens. The reason is that we are now munching efficiently through big chunks of whitespace and text at a time, rather than checking each individual character to see if it starts one of the N other token types. The changes to the pretty_print module here are a bit ugly, but they should mostly be made irrelevant in subsequent commits.
2021-12-01 17:03:31 +00:00 · 2021-12-01 17:03:31 +00:00 · a744e38e67
parent 2eac0560b2
commit a744e38e67
3 changed files with 87 additions and 15 deletions
--- a/tools/lib/pretty_print.py
+++ b/tools/lib/pretty_print.py
@ -45,11 +45,12 @@ def else_token(token: Token) -> bool:


 def pop_unused_tokens(tokens: List[Token], row: int) -> bool:
+    was_closed = False
    while tokens and tokens[-1].line <= row:
        token = tokens.pop()
        if close_token(token):
-            return True
-    return False
+            was_closed = True
+    return was_closed


 def indent_pref(row: int, tokens: List[Token], line: str) -> str:
@ -146,10 +147,19 @@ def pretty_print_html(html: str) -> str:
                next_offset = open_offsets.pop()
            return tag_continuation_offset

+        while tokens and tokens[-1].line < row:
+            token = tokens.pop()
+
        offset = next_offset
        if tokens:
            token = tokens[-1]
-            if token.line == row and token.line_span > 1:
+            if token.kind == "indent":
+                token = tokens[-2]
+            if (
+                token.line == row
+                and token.line_span > 1
+                and token.kind not in ("template_var", "text")
+            ):
                if token.kind in ("django_comment", "handlebar_comment", "html_comment"):
                    tag_continuation_offset = offset
                else:
--- a/tools/lib/template_parser.py
+++ b/tools/lib/template_parser.py
@ -68,13 +68,16 @@ def tokenize(text: str) -> List[Token]:
        return looking_at("</")

    def looking_at_handlebars_start() -> bool:
-        return looking_at("{{#") or looking_at("{{^")
+        return looking_at("{{#") or looking_at("{{^") or looking_at("{{~#")

    def looking_at_handlebars_else() -> bool:
        return looking_at("{{else")

+    def looking_at_template_var() -> bool:
+        return looking_at("{")
+
    def looking_at_handlebars_end() -> bool:
-        return looking_at("{{/")
+        return looking_at("{{/") or looking_at("{{~/")

    def looking_at_django_start() -> bool:
        return looking_at("{% ")
@ -92,8 +95,11 @@ def tokenize(text: str) -> List[Token]:
        # This function detects tag like {%- if foo -%}...{% endif %}
        return looking_at("{%-") and not looking_at("{%- end")

+    def looking_at_whitespace() -> bool:
+        return looking_at("\n") or looking_at(" ")
+
    state = TokenizerState()
-    tokens = []
+    tokens: List[Token] = []

    while state.i < len(text):
        try:
@ -142,13 +148,13 @@ def tokenize(text: str) -> List[Token]:
                kind = "handlebars_else"
            elif looking_at_handlebars_start():
                s = get_handlebars_tag(text, state.i)
-                tag = s[3:-2].split()[0]
+                tag = s[3:-2].split()[0].strip("#")
                if tag.startswith("*"):
                    tag = tag[1:]
                kind = "handlebars_start"
            elif looking_at_handlebars_end():
                s = get_handlebars_tag(text, state.i)
-                tag = s[3:-2]
+                tag = s[3:-2].strip("/#~")
                kind = "handlebars_end"
            elif looking_at_django_else():
                s = get_django_tag(text, state.i)
@ -174,15 +180,37 @@ def tokenize(text: str) -> List[Token]:
                s = get_django_tag(text, state.i, stripped=True)
                tag = s[3:-3].split()[0]
                kind = "jinja2_whitespace_stripped_type2_start"
+            elif looking_at_template_var():
+                # order is important here
+                s = get_template_var(text, state.i)
+                tag = "var"
+                kind = "template_var"
+            elif looking_at("\n"):
+                s = "\n"
+                tag = "newline"
+                kind = "newline"
+            elif looking_at(" "):
+                s = get_spaces(text, state.i)
+                tag = ""
+                if not tokens or tokens[-1].kind == "newline":
+                    kind = "indent"
+                else:
+                    kind = "whitespace"
+            elif text[state.i] in "{<":
+                snippet = text[state.i :][:15]
+                raise AssertionError(f"tool cannot parse {snippet}")
            else:
-                advance(1)
-                continue
+                s = get_text(text, state.i)
+                if s == "":
+                    continue
+                tag = ""
+                kind = "text"
        except TokenizationException as e:
            raise FormattedException(
                f'''{e.message} at line {state.line} col {state.col}:"{e.line_content}"''',
            )

-        line_span = len(s.split("\n"))
+        line_span = len(s.strip("\n").split("\n"))
        token = Token(
            kind=kind,
            s=s,
@ -359,8 +387,13 @@ def validate(fn: Optional[str] = None, text: Optional[str] = None) -> None:
            "django_comment",
            "handlebar_comment",
            "handlebars_singleton",
+            "indent",
+            "template_var",
            "html_comment",
            "html_doctype",
+            "newline",
+            "text",
+            "whitespace",
        ):
            continue

@ -471,6 +504,22 @@ def get_handlebars_tag(text: str, i: int) -> str:
    return s


+def get_spaces(text: str, i: int) -> str:
+    s = ""
+    while i < len(text) and text[i] in " ":
+        s += text[i]
+        i += 1
+    return s
+
+
+def get_text(text: str, i: int) -> str:
+    s = ""
+    while i < len(text) and text[i] not in "{<":
+        s += text[i]
+        i += 1
+    return s.strip()
+
+
 def get_django_tag(text: str, i: int, stripped: bool = False) -> str:
    end = i + 2
    if stripped:
@ -528,6 +577,20 @@ def get_handlebar_comment(text: str, i: int) -> str:
    raise TokenizationException("Unclosed comment", text[i:unclosed_end])


+def get_template_var(text: str, i: int) -> str:
+    end = i + 3
+    unclosed_end = 0
+    while end <= len(text):
+        if text[end - 1] == "}":
+            if end < len(text) and text[end] == "}":
+                end += 1
+            return text[i:end]
+        if not unclosed_end and text[end] == "<":
+            unclosed_end = end
+        end += 1
+    raise TokenizationException("Unclosed var", text[i:unclosed_end])
+
+
 def get_django_comment(text: str, i: int) -> str:
    end = i + 4
    unclosed_end = 0
--- a/tools/tests/test_html_branches.py
+++ b/tools/tests/test_html_branches.py
@ -17,17 +17,16 @@ TEST_TEMPLATES_DIR = os.path.join(os.path.dirname(os.path.abspath(__file__)), "t

 class TestHtmlBranches(unittest.TestCase):
    def test_get_tag_info(self) -> None:
-        html = """
-            <p id="test" class="test1 test2">foo</p>
-        """
+        html = """<p id="test" class="test1 test2">foo</p>"""

-        start_tag, end_tag = tools.lib.template_parser.tokenize(html)
+        start_tag, text, end_tag = tools.lib.template_parser.tokenize(html)

        start_tag_info = get_tag_info(start_tag)
        end_tag_info = get_tag_info(end_tag)

        self.assertEqual(start_tag_info.text(), "p.test1.test2#test")
        self.assertEqual(end_tag_info.text(), "p")
+        self.assertEqual(text.s, "foo")

    def test_html_tag_tree(self) -> None:
        html = """