zulip/tools/lib/template_parser.py

753 lines
22 KiB
Python

from collections.abc import Callable
from typing_extensions import override
class FormattedError(Exception):
pass
class TemplateParserError(Exception):
def __init__(self, message: str) -> None:
self.message = message
@override
def __str__(self) -> str:
return self.message
class TokenizationError(Exception):
def __init__(self, message: str, line_content: str | None = None) -> None:
self.message = message
self.line_content = line_content
class TokenizerState:
def __init__(self) -> None:
self.i = 0
self.line = 1
self.col = 1
class Token:
def __init__(self, kind: str, s: str, tag: str, line: int, col: int, line_span: int) -> None:
self.kind = kind
self.s = s
self.tag = tag
self.line = line
self.col = col
self.line_span = line_span
# These get set during the validation pass.
self.start_token: Token | None = None
self.end_token: Token | None = None
# These get set during the pretty-print phase.
self.new_s = ""
self.indent: str | None = None
self.orig_indent: str | None = None
self.child_indent: str | None = None
self.indent_is_final = False
self.parent_token: Token | None = None
def tokenize(text: str, template_format: str | None = None) -> list[Token]:
in_code_block = False
def advance(n: int) -> None:
for _ in range(n):
state.i += 1
if state.i >= 0 and text[state.i - 1] == "\n":
state.line += 1
state.col = 1
else:
state.col += 1
def looking_at(s: str) -> bool:
return text[state.i : state.i + len(s)] == s
def looking_at_htmlcomment() -> bool:
return looking_at("<!--")
def looking_at_handlebars_comment() -> bool:
return looking_at("{{!")
def looking_at_djangocomment() -> bool:
return template_format == "django" and looking_at("{#")
def looking_at_handlebars_partial() -> bool:
return template_format == "handlebars" and looking_at("{{>")
def looking_at_handlebars_partial_block() -> bool:
return template_format == "handlebars" and looking_at("{{#>")
def looking_at_html_start() -> bool:
return looking_at("<") and not looking_at("</")
def looking_at_html_end() -> bool:
return looking_at("</")
def looking_at_handlebars_start() -> bool:
return looking_at("{{#") or looking_at("{{^") or looking_at("{{~#")
def looking_at_handlebars_else() -> bool:
return template_format == "handlebars" and looking_at("{{else")
def looking_at_template_var() -> bool:
return looking_at("{")
def looking_at_handlebars_end() -> bool:
return template_format == "handlebars" and (looking_at("{{/") or looking_at("{{~/"))
def looking_at_django_start() -> bool:
return template_format == "django" and looking_at("{% ")
def looking_at_django_else() -> bool:
return template_format == "django" and (
looking_at("{% else")
or looking_at("{% elif")
or looking_at("{%- else")
or looking_at("{%- elif")
)
def looking_at_django_end() -> bool:
return template_format == "django" and looking_at("{% end")
def looking_at_jinja2_end_whitespace_stripped() -> bool:
return template_format == "django" and looking_at("{%- end")
def looking_at_jinja2_start_whitespace_stripped_type2() -> bool:
# This function detects tag like {%- if foo -%}...{% endif %}
return template_format == "django" and looking_at("{%-") and not looking_at("{%- end")
def looking_at_whitespace() -> bool:
return looking_at("\n") or looking_at(" ")
state = TokenizerState()
tokens: list[Token] = []
while state.i < len(text):
try:
if in_code_block:
in_code_block = False
s = get_code(text, state.i)
if s == "":
continue
tag = ""
kind = "code"
elif looking_at_htmlcomment():
s = get_html_comment(text, state.i)
tag = s[4:-3]
kind = "html_comment"
elif looking_at_handlebars_comment():
s = get_handlebars_comment(text, state.i)
tag = s[3:-2]
kind = "handlebars_comment"
elif looking_at_djangocomment():
s = get_django_comment(text, state.i)
tag = s[2:-2]
kind = "django_comment"
elif looking_at_handlebars_partial():
s = get_handlebars_partial(text, state.i)
tag = s[9:-2]
kind = "handlebars_partial"
elif looking_at_handlebars_partial_block():
s = get_handlebars_partial(text, state.i)
tag = s[5:-2]
kind = "handlebars_partial_block"
elif looking_at_html_start():
s = get_html_tag(text, state.i)
if s.endswith("/>"):
end_offset = -2
else:
end_offset = -1
tag_parts = s[1:end_offset].split()
if not tag_parts:
raise TemplateParserError("Tag name missing")
tag = tag_parts[0]
if tag == "!DOCTYPE":
kind = "html_doctype"
elif s.endswith("/>"):
kind = "html_singleton"
else:
kind = "html_start"
if tag in ("code", "pre", "script"):
in_code_block = True
elif looking_at_html_end():
s = get_html_tag(text, state.i)
tag = s[2:-1]
kind = "html_end"
elif looking_at_handlebars_else():
s = get_handlebars_tag(text, state.i)
tag = "else"
kind = "handlebars_else"
elif looking_at_handlebars_start():
s = get_handlebars_tag(text, state.i)
tag = s[3:-2].split()[0].strip("#").removeprefix("*")
kind = "handlebars_start"
elif looking_at_handlebars_end():
s = get_handlebars_tag(text, state.i)
tag = s[3:-2].strip("/#~")
kind = "handlebars_end"
elif looking_at_django_else():
s = get_django_tag(text, state.i)
tag = "else"
kind = "django_else"
elif looking_at_django_end():
s = get_django_tag(text, state.i)
tag = s[6:-3]
kind = "django_end"
elif looking_at_django_start():
# must check this after end/else
s = get_django_tag(text, state.i)
tag = s[3:-2].split()[0]
kind = "django_start"
if s[-3] == "-":
kind = "jinja2_whitespace_stripped_start"
elif looking_at_jinja2_end_whitespace_stripped():
s = get_django_tag(text, state.i)
tag = s[7:-3]
kind = "jinja2_whitespace_stripped_end"
elif looking_at_jinja2_start_whitespace_stripped_type2():
s = get_django_tag(text, state.i, stripped=True)
tag = s[3:-3].split()[0]
kind = "jinja2_whitespace_stripped_type2_start"
elif looking_at_template_var():
# order is important here
s = get_template_var(text, state.i)
tag = "var"
kind = "template_var"
elif looking_at("\n"):
s = "\n"
tag = "newline"
kind = "newline"
elif looking_at(" "):
s = get_spaces(text, state.i)
tag = ""
if not tokens or tokens[-1].kind == "newline":
kind = "indent"
else:
kind = "whitespace"
elif text[state.i] in "{<":
snippet = text[state.i :][:15]
raise AssertionError(f"tool cannot parse {snippet}")
else:
s = get_text(text, state.i)
if s == "":
continue
tag = ""
kind = "text"
except TokenizationError as e:
raise FormattedError(
f'''{e.message} at line {state.line} col {state.col}:"{e.line_content}"''',
)
line_span = len(s.strip("\n").split("\n"))
token = Token(
kind=kind,
s=s,
tag=tag.strip(),
line=state.line,
col=state.col,
line_span=line_span,
)
tokens.append(token)
advance(len(s))
return tokens
HTML_VOID_TAGS = {
"area",
"base",
"br",
"col",
"command",
"embed",
"hr",
"img",
"input",
"keygen",
"link",
"meta",
"param",
"source",
"track",
"wbr",
}
# The following excludes some obscure tags that are never used
# in Zulip code.
HTML_INLINE_TAGS = {
"a",
"b",
"br",
"button",
"cite",
"code",
"em",
"i",
"img",
"input",
"kbd",
"label",
"object",
"script",
"select",
"small",
"span",
"strong",
"textarea",
}
def tag_flavor(token: Token) -> str | None:
kind = token.kind
tag = token.tag
if kind in (
"code",
"django_comment",
"handlebars_comment",
"handlebars_partial",
"html_comment",
"html_doctype",
"html_singleton",
"indent",
"newline",
"template_var",
"text",
"whitespace",
):
return None
if kind in ("handlebars_start", "handlebars_partial_block", "html_start"):
return "start"
elif kind in (
"django_else",
"django_end",
"handlebars_else",
"handlebars_end",
"html_end",
"jinja2_whitespace_stripped_end",
):
return "end"
elif kind in {
"django_start",
"django_else",
"jinja2_whitespace_stripped_start",
"jinja2_whitespace_stripped_type2_start",
}:
if is_django_block_tag(tag):
return "start"
else:
return None
else:
raise AssertionError(f"tools programmer neglected to handle {kind} tokens")
def validate(
fn: str | None = None,
text: str | None = None,
template_format: str | None = None,
) -> list[Token]:
assert fn or text
if fn is None:
fn = "<in memory file>"
if text is None:
with open(fn) as f:
text = f.read()
lines = text.split("\n")
try:
tokens = tokenize(text, template_format=template_format)
except FormattedError as e:
raise TemplateParserError(
f"""
fn: {fn}
{e}"""
)
prevent_whitespace_violations(fn, tokens)
class State:
def __init__(self, func: Callable[[Token | None], None]) -> None:
self.depth = 0
self.foreign = False
self.matcher = func
def no_start_tag(token: Token | None) -> None:
assert token
raise TemplateParserError(
f"""
No start tag
fn: {fn}
end tag:
{token.tag}
line {token.line}, col {token.col}
"""
)
state = State(no_start_tag)
def start_tag_matcher(start_token: Token) -> None:
state.depth += 1
start_tag = start_token.tag.strip("~")
start_line = start_token.line
start_col = start_token.col
old_matcher = state.matcher
old_foreign = state.foreign
if start_tag in ["math", "svg"]:
state.foreign = True
def f(end_token: Token | None) -> None:
if end_token is None:
raise TemplateParserError(
f"""
Problem with {fn}
Missing end tag for the token at row {start_line} {start_col}!
{start_token.s}
It's possible you have a typo in a token that you think is
matching this tag.
"""
)
is_else_tag = end_token.tag == "else"
end_tag = end_token.tag.strip("~")
end_line = end_token.line
end_col = end_token.col
def report_problem() -> str | None:
if (start_tag == "code") and (end_line == start_line + 1):
return "Code tag is split across two lines."
if is_else_tag:
# We are not completely rigorous about having a sensible
# order of if/elif/elif/else, but we catch obviously
# mismatching else tags.
if start_tag not in ("if", "else", "unless"):
return f"Unexpected else/elif tag encountered after {start_tag} tag."
elif start_tag != end_tag:
return f"Mismatched tags: ({start_tag} != {end_tag})"
return None
problem = report_problem()
if problem:
raise TemplateParserError(
f"""
fn: {fn}
{problem}
start:
{start_token.s}
line {start_line}, col {start_col}
end tag:
{end_tag}
line {end_line}, col {end_col}
"""
)
if not is_else_tag:
state.matcher = old_matcher
state.foreign = old_foreign
state.depth -= 1
# TODO: refine this for the else/elif use cases
end_token.start_token = start_token
start_token.end_token = end_token
state.matcher = f
for token in tokens:
kind = token.kind
tag = token.tag
if not state.foreign:
if kind == "html_start" and tag in HTML_VOID_TAGS:
raise TemplateParserError(
f"Tag must be self-closing: {tag} at {fn} line {token.line}, col {token.col}"
)
elif kind == "html_singleton" and tag not in HTML_VOID_TAGS:
raise TemplateParserError(
f"Tag must not be self-closing: {tag} at {fn} line {token.line}, col {token.col}"
)
flavor = tag_flavor(token)
if flavor == "start":
start_tag_matcher(token)
elif flavor == "end":
state.matcher(token)
if state.depth != 0:
state.matcher(None)
ensure_matching_indentation(fn, tokens, lines)
return tokens
def ensure_matching_indentation(fn: str, tokens: list[Token], lines: list[str]) -> None:
def has_bad_indentation() -> bool:
is_inline_tag = start_tag in HTML_INLINE_TAGS and start_token.kind == "html_start"
if end_line > start_line + 1:
if is_inline_tag:
end_row_text = lines[end_line - 1]
if end_row_text.lstrip().startswith(end_token.s) and end_col != start_col:
return True
else:
if end_col != start_col:
return True
return False
for token in tokens:
if token.start_token is None:
continue
end_token = token
start_token = token.start_token
start_line = start_token.line
start_col = start_token.col
start_tag = start_token.tag
end_tag = end_token.tag.strip("~")
end_line = end_token.line
end_col = end_token.col
if has_bad_indentation():
raise TemplateParserError(
f"""
fn: {fn}
Indentation for start/end tags does not match.
start tag: {start_token.s}
start:
line {start_line}, col {start_col}
end:
{end_tag}
line {end_line}, col {end_col}
"""
)
def prevent_extra_newlines(fn: str, tokens: list[Token]) -> None:
count = 0
for token in tokens:
if token.kind != "newline":
count = 0
continue
count += 1
if count >= 4:
raise TemplateParserError(
f"""Please avoid so many blank lines near row {token.line} in {fn}."""
)
def prevent_whitespace_violations(fn: str, tokens: list[Token]) -> None:
if tokens[0].kind in ("indent", "whitespace"):
raise TemplateParserError(f" Please remove the whitespace at the beginning of {fn}.")
prevent_extra_newlines(fn, tokens)
for i in range(1, len(tokens) - 1):
token = tokens[i]
next_token = tokens[i + 1]
if token.kind == "indent":
if next_token.kind in ("indent", "whitespace"):
raise AssertionError("programming error parsing indents")
if next_token.kind == "newline":
raise TemplateParserError(
f"""Please just make row {token.line} in {fn} a truly blank line (no spaces)."""
)
if len(token.s) % 4 != 0:
raise TemplateParserError(
f"""
Please use 4-space indents for template files. Most of our
codebase (including Python and JavaScript) uses 4-space indents,
so it's worth investing in configuring your editor to use
4-space indents for files like
{fn}
The line at row {token.line} is indented with {len(token.s)} spaces.
"""
)
if token.kind == "whitespace":
if len(token.s) > 1:
raise TemplateParserError(
f"""
We did not expect this much whitespace at row {token.line} column {token.col} in {fn}.
"""
)
if next_token.kind == "newline":
raise TemplateParserError(
f"""
Unexpected trailing whitespace at row {token.line} column {token.col} in {fn}.
"""
)
def is_django_block_tag(tag: str) -> bool:
return tag in [
"autoescape",
"block",
"comment",
"for",
"if",
"ifequal",
"macro",
"verbatim",
"blocktrans",
"trans",
"raw",
"with",
]
def get_handlebars_tag(text: str, i: int) -> str:
end = i + 2
while end < len(text) - 1 and text[end] != "}":
end += 1
if text[end] != "}" or text[end + 1] != "}":
raise TokenizationError('Tag missing "}}"', text[i : end + 2])
s = text[i : end + 2]
return s
def get_spaces(text: str, i: int) -> str:
s = ""
while i < len(text) and text[i] in " ":
s += text[i]
i += 1
return s
def get_code(text: str, i: int) -> str:
s = ""
while i < len(text) and text[i] not in "<":
s += text[i]
i += 1
return s
def get_text(text: str, i: int) -> str:
s = ""
while i < len(text) and text[i] not in "{<":
s += text[i]
i += 1
return s.strip()
def get_django_tag(text: str, i: int, stripped: bool = False) -> str:
end = i + 2
if stripped:
end += 1
while end < len(text) - 1 and text[end] != "%":
end += 1
if text[end] != "%" or text[end + 1] != "}":
raise TokenizationError('Tag missing "%}"', text[i : end + 2])
s = text[i : end + 2]
return s
def get_html_tag(text: str, i: int) -> str:
quote_count = 0
end = i + 1
unclosed_end = 0
while end < len(text) and (text[end] != ">" or (quote_count % 2 != 0 and text[end] != "<")):
if text[end] == '"':
quote_count += 1
if not unclosed_end and text[end] == "<":
unclosed_end = end
end += 1
if quote_count % 2 != 0:
if unclosed_end:
raise TokenizationError("Unbalanced quotes", text[i:unclosed_end])
else:
raise TokenizationError("Unbalanced quotes", text[i : end + 1])
if end == len(text) or text[end] != ">":
raise TokenizationError('Tag missing ">"', text[i : end + 1])
s = text[i : end + 1]
return s
def get_html_comment(text: str, i: int) -> str:
end = i + 7
unclosed_end = 0
while end <= len(text):
if text[end - 3 : end] == "-->":
return text[i:end]
if not unclosed_end and text[end] == "<":
unclosed_end = end
end += 1
raise TokenizationError("Unclosed comment", text[i:unclosed_end])
def get_handlebars_comment(text: str, i: int) -> str:
end = i + 5
unclosed_end = 0
while end <= len(text):
if text[end - 2 : end] == "}}":
return text[i:end]
if not unclosed_end and text[end] == "<":
unclosed_end = end
end += 1
raise TokenizationError("Unclosed comment", text[i:unclosed_end])
def get_template_var(text: str, i: int) -> str:
end = i + 3
unclosed_end = 0
while end <= len(text):
if text[end - 1] == "}":
if end < len(text) and text[end] == "}":
end += 1
return text[i:end]
if not unclosed_end and text[end] == "<":
unclosed_end = end
end += 1
raise TokenizationError("Unclosed var", text[i:unclosed_end])
def get_django_comment(text: str, i: int) -> str:
end = i + 4
unclosed_end = 0
while end <= len(text):
if text[end - 2 : end] == "#}":
return text[i:end]
if not unclosed_end and text[end] == "<":
unclosed_end = end
end += 1
raise TokenizationError("Unclosed comment", text[i:unclosed_end])
def get_handlebars_partial(text: str, i: int) -> str:
"""Works for both partials and partial blocks."""
end = i + 10
unclosed_end = 0
while end <= len(text):
if text[end - 2 : end] == "}}":
return text[i:end]
if not unclosed_end and text[end] == "<":
unclosed_end = end
end += 1
raise TokenizationError("Unclosed partial", text[i:unclosed_end])