mirror of https://github.com/zulip/zulip.git
check-templates: Make parser more thorough (and faster).
We now create tokens for whitespace and text, such that you could rebuild the template file with "".join(token.s for token in tokens). I also fixed a few bugs related to not parsing whitespace-control tokens. We no longer ignore template variables, although we could do a lot better at validating them. The most immediate use case for the more thorough parser is to simplify the pretty printer, but it should also make it less likely for us to skip over new template constructs (i.e. the tool will fail hard rather than behaving strangely). Note that this speeds up the tool by almost 3x, which may be slightly surprising considering we are building more tokens. The reason is that we are now munching efficiently through big chunks of whitespace and text at a time, rather than checking each individual character to see if it starts one of the N other token types. The changes to the pretty_print module here are a bit ugly, but they should mostly be made irrelevant in subsequent commits.
This commit is contained in:
parent
2eac0560b2
commit
a744e38e67
|
@ -45,11 +45,12 @@ def else_token(token: Token) -> bool:
|
||||||
|
|
||||||
|
|
||||||
def pop_unused_tokens(tokens: List[Token], row: int) -> bool:
    """Pop every trailing token whose ``line`` is at or before ``row``.

    ``tokens`` is mutated in place: tokens are removed from the end of the
    list for as long as the last one satisfies ``token.line <= row``.

    Returns True if at least one of the popped tokens satisfied
    ``close_token``, otherwise False.
    """
    was_closed = False
    while tokens and tokens[-1].line <= row:
        token = tokens.pop()
        # Keep popping after a close token is seen; returning early here
        # would leave the remaining tokens for this row on the list.
        if close_token(token):
            was_closed = True
    return was_closed
|
||||||
|
|
||||||
|
|
||||||
def indent_pref(row: int, tokens: List[Token], line: str) -> str:
|
def indent_pref(row: int, tokens: List[Token], line: str) -> str:
|
||||||
|
@ -146,10 +147,19 @@ def pretty_print_html(html: str) -> str:
|
||||||
next_offset = open_offsets.pop()
|
next_offset = open_offsets.pop()
|
||||||
return tag_continuation_offset
|
return tag_continuation_offset
|
||||||
|
|
||||||
|
while tokens and tokens[-1].line < row:
|
||||||
|
token = tokens.pop()
|
||||||
|
|
||||||
offset = next_offset
|
offset = next_offset
|
||||||
if tokens:
|
if tokens:
|
||||||
token = tokens[-1]
|
token = tokens[-1]
|
||||||
if token.line == row and token.line_span > 1:
|
if token.kind == "indent":
|
||||||
|
token = tokens[-2]
|
||||||
|
if (
|
||||||
|
token.line == row
|
||||||
|
and token.line_span > 1
|
||||||
|
and token.kind not in ("template_var", "text")
|
||||||
|
):
|
||||||
if token.kind in ("django_comment", "handlebar_comment", "html_comment"):
|
if token.kind in ("django_comment", "handlebar_comment", "html_comment"):
|
||||||
tag_continuation_offset = offset
|
tag_continuation_offset = offset
|
||||||
else:
|
else:
|
||||||
|
|
|
@ -68,13 +68,16 @@ def tokenize(text: str) -> List[Token]:
|
||||||
return looking_at("</")
|
return looking_at("</")
|
||||||
|
|
||||||
def looking_at_handlebars_start() -> bool:
|
def looking_at_handlebars_start() -> bool:
|
||||||
return looking_at("{{#") or looking_at("{{^")
|
return looking_at("{{#") or looking_at("{{^") or looking_at("{{~#")
|
||||||
|
|
||||||
def looking_at_handlebars_else() -> bool:
|
def looking_at_handlebars_else() -> bool:
|
||||||
return looking_at("{{else")
|
return looking_at("{{else")
|
||||||
|
|
||||||
|
def looking_at_template_var() -> bool:
|
||||||
|
return looking_at("{")
|
||||||
|
|
||||||
def looking_at_handlebars_end() -> bool:
|
def looking_at_handlebars_end() -> bool:
|
||||||
return looking_at("{{/")
|
return looking_at("{{/") or looking_at("{{~/")
|
||||||
|
|
||||||
def looking_at_django_start() -> bool:
|
def looking_at_django_start() -> bool:
|
||||||
return looking_at("{% ")
|
return looking_at("{% ")
|
||||||
|
@ -92,8 +95,11 @@ def tokenize(text: str) -> List[Token]:
|
||||||
# This function detects tag like {%- if foo -%}...{% endif %}
|
# This function detects tag like {%- if foo -%}...{% endif %}
|
||||||
return looking_at("{%-") and not looking_at("{%- end")
|
return looking_at("{%-") and not looking_at("{%- end")
|
||||||
|
|
||||||
|
def looking_at_whitespace() -> bool:
|
||||||
|
return looking_at("\n") or looking_at(" ")
|
||||||
|
|
||||||
state = TokenizerState()
|
state = TokenizerState()
|
||||||
tokens = []
|
tokens: List[Token] = []
|
||||||
|
|
||||||
while state.i < len(text):
|
while state.i < len(text):
|
||||||
try:
|
try:
|
||||||
|
@ -142,13 +148,13 @@ def tokenize(text: str) -> List[Token]:
|
||||||
kind = "handlebars_else"
|
kind = "handlebars_else"
|
||||||
elif looking_at_handlebars_start():
|
elif looking_at_handlebars_start():
|
||||||
s = get_handlebars_tag(text, state.i)
|
s = get_handlebars_tag(text, state.i)
|
||||||
tag = s[3:-2].split()[0]
|
tag = s[3:-2].split()[0].strip("#")
|
||||||
if tag.startswith("*"):
|
if tag.startswith("*"):
|
||||||
tag = tag[1:]
|
tag = tag[1:]
|
||||||
kind = "handlebars_start"
|
kind = "handlebars_start"
|
||||||
elif looking_at_handlebars_end():
|
elif looking_at_handlebars_end():
|
||||||
s = get_handlebars_tag(text, state.i)
|
s = get_handlebars_tag(text, state.i)
|
||||||
tag = s[3:-2]
|
tag = s[3:-2].strip("/#~")
|
||||||
kind = "handlebars_end"
|
kind = "handlebars_end"
|
||||||
elif looking_at_django_else():
|
elif looking_at_django_else():
|
||||||
s = get_django_tag(text, state.i)
|
s = get_django_tag(text, state.i)
|
||||||
|
@ -174,15 +180,37 @@ def tokenize(text: str) -> List[Token]:
|
||||||
s = get_django_tag(text, state.i, stripped=True)
|
s = get_django_tag(text, state.i, stripped=True)
|
||||||
tag = s[3:-3].split()[0]
|
tag = s[3:-3].split()[0]
|
||||||
kind = "jinja2_whitespace_stripped_type2_start"
|
kind = "jinja2_whitespace_stripped_type2_start"
|
||||||
|
elif looking_at_template_var():
|
||||||
|
# order is important here
|
||||||
|
s = get_template_var(text, state.i)
|
||||||
|
tag = "var"
|
||||||
|
kind = "template_var"
|
||||||
|
elif looking_at("\n"):
|
||||||
|
s = "\n"
|
||||||
|
tag = "newline"
|
||||||
|
kind = "newline"
|
||||||
|
elif looking_at(" "):
|
||||||
|
s = get_spaces(text, state.i)
|
||||||
|
tag = ""
|
||||||
|
if not tokens or tokens[-1].kind == "newline":
|
||||||
|
kind = "indent"
|
||||||
else:
|
else:
|
||||||
advance(1)
|
kind = "whitespace"
|
||||||
|
elif text[state.i] in "{<":
|
||||||
|
snippet = text[state.i :][:15]
|
||||||
|
raise AssertionError(f"tool cannot parse {snippet}")
|
||||||
|
else:
|
||||||
|
s = get_text(text, state.i)
|
||||||
|
if s == "":
|
||||||
continue
|
continue
|
||||||
|
tag = ""
|
||||||
|
kind = "text"
|
||||||
except TokenizationException as e:
|
except TokenizationException as e:
|
||||||
raise FormattedException(
|
raise FormattedException(
|
||||||
f'''{e.message} at line {state.line} col {state.col}:"{e.line_content}"''',
|
f'''{e.message} at line {state.line} col {state.col}:"{e.line_content}"''',
|
||||||
)
|
)
|
||||||
|
|
||||||
line_span = len(s.split("\n"))
|
line_span = len(s.strip("\n").split("\n"))
|
||||||
token = Token(
|
token = Token(
|
||||||
kind=kind,
|
kind=kind,
|
||||||
s=s,
|
s=s,
|
||||||
|
@ -359,8 +387,13 @@ def validate(fn: Optional[str] = None, text: Optional[str] = None) -> None:
|
||||||
"django_comment",
|
"django_comment",
|
||||||
"handlebar_comment",
|
"handlebar_comment",
|
||||||
"handlebars_singleton",
|
"handlebars_singleton",
|
||||||
|
"indent",
|
||||||
|
"template_var",
|
||||||
"html_comment",
|
"html_comment",
|
||||||
"html_doctype",
|
"html_doctype",
|
||||||
|
"newline",
|
||||||
|
"text",
|
||||||
|
"whitespace",
|
||||||
):
|
):
|
||||||
continue
|
continue
|
||||||
|
|
||||||
|
@ -471,6 +504,22 @@ def get_handlebars_tag(text: str, i: int) -> str:
|
||||||
return s
|
return s
|
||||||
|
|
||||||
|
|
||||||
|
def get_spaces(text: str, i: int) -> str:
    """Return the run of consecutive space characters starting at ``text[i]``.

    Only the literal space character counts; newlines and tabs end the run.
    Returns the empty string when ``text[i]`` is not a space or ``i`` is at
    or past the end of ``text``.
    """
    end = i
    # Find where the run stops, then slice once instead of growing a
    # string one character at a time.
    while end < len(text) and text[end] == " ":
        end += 1
    return text[i:end]
|
||||||
|
|
||||||
|
|
||||||
|
def get_text(text: str, i: int) -> str:
    """Return the plain text starting at ``text[i]``, stripped of whitespace.

    The text runs up to (not including) the next ``{`` or ``<``, or to the
    end of ``text``. Leading and trailing whitespace is removed from the
    result, so the returned string can be shorter than the span scanned and
    may be empty.
    """
    end = i
    # Find where the text stops, then slice once instead of growing a
    # string one character at a time.
    while end < len(text) and text[end] not in "{<":
        end += 1
    return text[i:end].strip()
|
||||||
|
|
||||||
|
|
||||||
def get_django_tag(text: str, i: int, stripped: bool = False) -> str:
|
def get_django_tag(text: str, i: int, stripped: bool = False) -> str:
|
||||||
end = i + 2
|
end = i + 2
|
||||||
if stripped:
|
if stripped:
|
||||||
|
@ -528,6 +577,20 @@ def get_handlebar_comment(text: str, i: int) -> str:
|
||||||
raise TokenizationException("Unclosed comment", text[i:unclosed_end])
|
raise TokenizationException("Unclosed comment", text[i:unclosed_end])
|
||||||
|
|
||||||
|
|
||||||
|
def get_template_var(text: str, i: int) -> str:
    """Return the template-variable token that starts at ``text[i]``.

    Scanning begins at ``text[i + 2]`` and stops at the first ``}``. If that
    brace is immediately followed by another ``}``, the second brace is
    included as well, so both ``{x}`` and ``{{ x }}`` come back whole.

    Raises:
        TokenizationException: if no closing ``}`` is found before the end
            of ``text``. The excerpt passed along runs from ``i`` to the
            first ``<`` seen while scanning, and is empty if none was seen.
    """
    end = i + 3
    unclosed_end = 0
    while end <= len(text):
        if text[end - 1] == "}":
            if end < len(text) and text[end] == "}":
                end += 1
            return text[i:end]
        # Guard the lookahead: on the final iteration end == len(text), and
        # indexing text[end] would raise IndexError instead of letting the
        # loop finish and report the unclosed variable below.
        if not unclosed_end and end < len(text) and text[end] == "<":
            unclosed_end = end
        end += 1
    raise TokenizationException("Unclosed var", text[i:unclosed_end])
|
||||||
|
|
||||||
|
|
||||||
def get_django_comment(text: str, i: int) -> str:
|
def get_django_comment(text: str, i: int) -> str:
|
||||||
end = i + 4
|
end = i + 4
|
||||||
unclosed_end = 0
|
unclosed_end = 0
|
||||||
|
|
|
@ -17,17 +17,16 @@ TEST_TEMPLATES_DIR = os.path.join(os.path.dirname(os.path.abspath(__file__)), "t
|
||||||
|
|
||||||
class TestHtmlBranches(unittest.TestCase):
|
class TestHtmlBranches(unittest.TestCase):
|
||||||
def test_get_tag_info(self) -> None:
|
def test_get_tag_info(self) -> None:
|
||||||
html = """
|
html = """<p id="test" class="test1 test2">foo</p>"""
|
||||||
<p id="test" class="test1 test2">foo</p>
|
|
||||||
"""
|
|
||||||
|
|
||||||
start_tag, end_tag = tools.lib.template_parser.tokenize(html)
|
start_tag, text, end_tag = tools.lib.template_parser.tokenize(html)
|
||||||
|
|
||||||
start_tag_info = get_tag_info(start_tag)
|
start_tag_info = get_tag_info(start_tag)
|
||||||
end_tag_info = get_tag_info(end_tag)
|
end_tag_info = get_tag_info(end_tag)
|
||||||
|
|
||||||
self.assertEqual(start_tag_info.text(), "p.test1.test2#test")
|
self.assertEqual(start_tag_info.text(), "p.test1.test2#test")
|
||||||
self.assertEqual(end_tag_info.text(), "p")
|
self.assertEqual(end_tag_info.text(), "p")
|
||||||
|
self.assertEqual(text.s, "foo")
|
||||||
|
|
||||||
def test_html_tag_tree(self) -> None:
|
def test_html_tag_tree(self) -> None:
|
||||||
html = """
|
html = """
|
||||||
|
|
Loading…
Reference in New Issue