2016-08-02 00:14:01 +02:00
|
|
|
from __future__ import absolute_import
|
|
|
|
from __future__ import print_function
|
2016-08-02 03:00:12 +02:00
|
|
|
from typing import Callable, Optional
|
2016-08-02 00:14:01 +02:00
|
|
|
from six.moves import range
|
2016-08-02 03:00:12 +02:00
|
|
|
import re
|
2016-08-02 00:14:01 +02:00
|
|
|
|
|
|
|
class TokenizerState(object):
    '''
    Mutable cursor used while scanning template text.

    i is the 0-based offset into the text; line and col are the
    1-based position reported on emitted tokens.
    '''

    def __init__(self):
        # type: () -> None
        # Offset starts at the first character; line/col are 1-based.
        self.i, self.line, self.col = 0, 1, 1
|
|
|
|
|
|
|
|
class Token(object):
    '''
    One tag found by the tokenizer.

    kind -- category string such as 'html_start' or 'django_end'
    s    -- the full text of the tag as it appears in the template
    tag  -- the tag name extracted from s
    line -- line on which the tag starts
    col  -- column at which the tag starts
    '''

    def __init__(self, kind, s, tag, line, col):
        # type: (str, str, str, int, int) -> None
        # What was matched.
        self.kind = kind
        self.s = s
        self.tag = tag
        # Where it was matched.
        self.line = line
        self.col = col
|
|
|
|
|
|
|
|
def tokenize(text):
    '''
    Scan template text and return a list of Token objects, one for each
    HTML, Handlebars, or Django tag found.  Characters that do not begin
    a recognized tag are skipped.  Each token records the line and column
    at which its tag starts.
    '''
    state = TokenizerState()
    tokens = []

    def advance(n):
        # type: (int) -> None
        # Consume n characters, keeping line/col in step with the offset.
        for _ in range(n):
            consumed = text[state.i]
            state.i += 1
            if consumed == '\n':
                state.line += 1
                state.col = 1
            else:
                state.col += 1

    def at(prefix):
        # type: (str) -> bool
        # True when the text at the current offset begins with prefix.
        return text.startswith(prefix, state.i)

    while state.i < len(text):
        if at('</'):
            raw = get_html_tag(text, state.i)
            name = raw[2:-1]
            kind = 'html_end'
        elif at('<'):
            raw = get_html_tag(text, state.i)
            name = raw[1:-1].split()[0]
            if is_special_html_tag(raw, name):
                kind = 'html_special'
            elif raw.endswith('/>'):
                kind = 'html_singleton'
            else:
                kind = 'html_start'
        elif at('{{#') or at('{{^'):
            raw = get_handlebars_tag(text, state.i)
            name = raw[3:-2].split()[0]
            kind = 'handlebars_start'
        elif at('{{/'):
            raw = get_handlebars_tag(text, state.i)
            name = raw[3:-2]
            kind = 'handlebars_end'
        elif at('{% end'):
            raw = get_django_tag(text, state.i)
            name = raw[6:-3]
            kind = 'django_end'
        elif at('{% '):
            raw = get_django_tag(text, state.i)
            name = raw[3:-2].split()[0]
            kind = 'django_start'
        else:
            # Not the start of any tag we track; skip one character.
            advance(1)
            continue

        # Record the position before moving past the tag.
        tokens.append(Token(
            kind=kind,
            s=raw,
            tag=name,
            line=state.line,
            col=state.col,
        ))
        advance(len(raw))

    return tokens
|
|
|
|
|
2016-08-04 01:44:15 +02:00
|
|
|
def validate(fn=None, text=None, check_indent=True):
    # type: (Optional[str], Optional[str], bool) -> None
    '''
    Check that the start and end tags in a template are balanced.

    fn           -- path of the template; used in error messages and, when
                    text is not supplied, read to obtain the text
    text         -- template text; when given, fn is not opened
    check_indent -- when true, also require an end tag to sit in the same
                    column as its start tag once the two are more than
                    max_lines apart (3 lines for <a>, 1 otherwise)

    Raises Exception describing the first problem found: an end tag with
    no start tag, a mismatched tag, a <code> tag split across two lines,
    bad indentation, or a start tag that is never closed.
    '''
    assert fn or text

    if fn is None:
        fn = '<in memory file>'

    if text is None:
        # Use a context manager so the file handle is closed promptly.
        with open(fn) as template_file:
            text = template_file.read()

    tokens = tokenize(text)

    class State(object):
        def __init__(self, func):
            # type: (Callable[[Token], None]) -> None
            # depth counts currently open start tags; matcher is the
            # function that will handle the next end tag.
            self.depth = 0
            self.matcher = func

    def no_start_tag(token):
        # type: (Token) -> None
        # Matcher in effect when no start tag is open.
        raise Exception('''
            No start tag
            fn: %s
            end tag:
                %s
                line %d, col %d
            ''' % (fn, token.tag, token.line, token.col))

    state = State(no_start_tag)

    def start_tag_matcher(start_token):
        # type: (Token) -> None
        # Open a new nesting level: install a matcher that checks the next
        # end tag against start_token and then restores the previous one.
        state.depth += 1
        start_tag = start_token.tag
        start_line = start_token.line
        start_col = start_token.col

        old_matcher = state.matcher

        def f(end_token):
            # type: (Token) -> None
            end_tag = end_token.tag
            end_line = end_token.line
            end_col = end_token.col

            max_lines = 3 if start_tag == 'a' else 1

            problem = None
            if (start_tag == 'code') and (end_line == start_line + 1):
                problem = 'Code tag is split across two lines.'
            # A mismatch overrides the code-tag message set above.
            if start_tag != end_tag:
                problem = 'Mismatched tag.'
            elif check_indent and (end_line > start_line + max_lines):
                if end_col != start_col:
                    problem = 'Bad indentation.'
            if problem:
                raise Exception('''
                    fn: %s
                    %s
                    start:
                        %s
                        line %d, col %d
                    end tag:
                        %s
                        line %d, col %d
                    ''' % (fn, problem, start_token.s, start_line, start_col,
                           end_tag, end_line, end_col))
            state.matcher = old_matcher
            state.depth -= 1

        state.matcher = f

    for token in tokens:
        kind = token.kind

        if kind in ('html_start', 'handlebars_start'):
            start_tag_matcher(token)
        elif kind == 'django_start':
            # Only Django block tags have a matching end tag.
            if is_django_block_tag(token.tag):
                start_tag_matcher(token)
        elif kind in ('html_end', 'handlebars_end', 'django_end'):
            state.matcher(token)

    null_token = Token(
        kind=None,
        s='(NO TAG)',
        tag='NO TAG',
        line=0,
        col=0,
    )

    # Anything still open is reported by feeding the innermost matcher a
    # placeholder end tag, which it will flag as a mismatch.
    if state.depth != 0:
        state.matcher(null_token)
|
|
|
|
|
|
|
|
def is_special_html_tag(s, tag):
    # type: (str, str) -> bool
    '''
    Return True for HTML comments and for link, meta, and !DOCTYPE tags.

    s is the full tag text; tag is the extracted tag name.
    '''
    if s.startswith('<!--'):
        return True
    return tag in ('link', 'meta', '!DOCTYPE')
|
|
|
|
|
|
|
|
def is_django_block_tag(tag):
    # type: (str) -> bool
    '''
    Return True when tag is one of the Django tag names listed below,
    i.e. the ones treated as opening a block.
    '''
    block_tags = (
        'autoescape',
        'block',
        'comment',
        'for',
        'if',
        'ifequal',
        'verbatim',
        'blocktrans',
        'trans',
        'raw',
    )
    return tag in block_tags
|
|
|
|
|
|
|
|
def get_handlebars_tag(text, i):
    # type: (str, int) -> str
    '''
    Return the Handlebars tag that starts at offset i of text, from its
    opening braces through the closing "}}" inclusive.

    Raises Exception('Tag missing }}') when the first "}" found after the
    opening braces is not immediately followed by another "}", or when
    there is no "}" at all.
    '''
    # Skip the two opening braces, then locate the first closing brace.
    end = text.find('}', i + 2)
    # Slicing (rather than indexing end + 1) keeps a tag truncated after a
    # single "}" from reading past the end of the text.
    if end == -1 or text[end:end + 2] != '}}':
        raise Exception('Tag missing }}')
    return text[i:end + 2]
|
|
|
|
|
|
|
|
def get_django_tag(text, i):
    # type: (str, int) -> str
    '''
    Return the Django tag that starts at offset i of text, from its
    opening "{%" through the closing "%}" inclusive.

    Raises Exception('Tag missing %}') when the first "%" found after the
    opening is not immediately followed by "}", or when there is no
    further "%" at all.
    '''
    # Skip the opening "{%", then locate the next percent sign.
    end = text.find('%', i + 2)
    # Slicing (rather than indexing end + 1) keeps a tag truncated right
    # after "%" from reading past the end of the text.
    if end == -1 or text[end:end + 2] != '%}':
        raise Exception('Tag missing %}')
    return text[i:end + 2]
|
|
|
|
|
|
|
|
def get_html_tag(text, i):
    # type: (str, int) -> str
    '''
    Return the HTML tag that starts at offset i of text, up to and
    including its closing ">".  A ">" that appears between a pair of
    double quotes does not end the tag.

    Raises Exception('Tag missing >') when the text ends first.
    '''
    length = len(text)
    pos = i + 1
    inside_quotes = False
    while pos < length:
        ch = text[pos]
        if ch == '>' and not inside_quotes:
            break
        if ch == '"':
            # Each double quote toggles between quoted and unquoted.
            inside_quotes = not inside_quotes
        pos += 1
    if pos == length or text[pos] != '>':
        raise Exception('Tag missing >')
    return text[i:pos + 1]
|
|
|
|
|
2016-08-02 03:00:12 +02:00
|
|
|
class Node(object):
    '''
    A node in the tree of HTML tags.

    token    -- the Token for this tag (None for the synthetic root)
    children -- child nodes, in document order
    parent   -- the enclosing node, or None for the root
    '''

    def __init__(self, token, parent):
        # type: (Token, Optional[Node]) -> None
        self.token = token
        self.children = []  # type: List[Node]
        # Keep the parent that was passed in; previously the argument was
        # accepted but discarded, leaving every node with parent None.
        self.parent = parent  # type: Optional[Node]
|
|
|
|
|
|
|
|
class TagInfo(object):
    '''
    The tag name, CSS classes, and ids of one HTML tag, along with the
    token it came from.
    '''

    def __init__(self, tag, classes, ids, token):
        # type: (str, List[str], List[str], Token) -> None
        self.tag = tag
        self.classes = classes
        self.ids = ids
        self.token = token

        # Selector-style words: the bare tag, ".class" entries, "#id" entries.
        words = [self.tag]
        words.extend('.' + css_class for css_class in classes)
        words.extend('#' + element_id for element_id in ids)
        self.words = words

    def text(self):
        # type: () -> str
        '''
        Return the tag followed by each class prefixed with "." and each
        id prefixed with "#", e.g. "p.a.b#x".
        '''
        pieces = [self.tag]
        pieces += ['.' + css_class for css_class in self.classes]
        pieces += ['#' + element_id for element_id in self.ids]
        return ''.join(pieces)
|
|
|
|
|
|
|
|
def get_tag_info(token):
    # type: (Token) -> TagInfo
    '''
    Build a TagInfo for token by pulling the class and id attribute
    values out of the tag text.  Both double- and single-quoted attribute
    values are recognized; each value is split on whitespace.
    '''
    markup = token.s
    classes = []  # type: List[str]
    ids = []  # type: List[str]

    # (destination list, pattern) pairs, tried in this order.
    patterns = [
        (classes, ' class="(.*?)"'),
        (classes, " class='(.*?)'"),
        (ids, ' id="(.*?)"'),
        (ids, " id='(.*?)'"),
    ]

    for bucket, pattern in patterns:
        match = re.search(pattern, markup)
        if not match:
            continue
        for captured in match.groups():
            bucket.extend(captured.split())

    return TagInfo(tag=token.tag, classes=classes, ids=ids, token=token)
|
|
|
|
|
|
|
|
class HtmlTreeBranch(object):
    '''
    One root-to-leaf path through the HTML tag tree.

    For <p><div id='yo'>bla<span class='bar'></span></div></p> the branch
    holds the tags from the outermost element down to the leaf, which is
    conceptually something like "p div(#yo) span(.bar)".
    '''

    def __init__(self, tags, fn):
        # type: (List[TagInfo], str) -> None
        self.tags = tags
        self.fn = fn
        # The branch is located at the line of its last (leaf) tag.
        self.line = tags[-1].token.line

        # Every selector word that appears anywhere along the branch.
        self.words = {word for tag in tags for word in tag.words}  # type: Set[str]

    def staircase_text(self):
        # type: () -> str
        '''
        Render the branch one tag per line, each line indented four
        spaces further than the one before it:

            html
                body.main-section
                    p#intro

        The result starts with a newline and ends with one.
        '''
        rows = [
            ' ' * (4 * (depth + 1)) + t.text() + '\n'
            for depth, t in enumerate(self.tags)
        ]
        return '\n' + ''.join(rows)

    def text(self):
        # type: () -> str
        '''
        Render the branch on a single line, tags separated by spaces:

            html body.main-section p#intro
        '''
        rendered = [t.text() for t in self.tags]
        return ' '.join(rendered)
|
|
|
|
|
|
|
|
def html_branches(fn):
    # type: (str) -> List[HtmlTreeBranch]
    '''
    Read the template at path fn and return one HtmlTreeBranch for every
    leaf of its HTML tag tree, i.e. for every tag that has no child tags.
    '''
    # Use a context manager so the file handle is closed promptly.
    with open(fn) as template_file:
        text = template_file.read()

    tree = html_tag_tree(text)
    branches = []  # type: List[HtmlTreeBranch]

    def walk(node, tag_info_list=None):
        # type: (Node, Optional[List[TagInfo]]) -> None
        # Depth-first walk carrying the tags seen from the top of the
        # tree down to node; emits a branch at each leaf.
        info = get_tag_info(node.token)
        if tag_info_list is None:
            tag_info_list = [info]
        else:
            # Copy so sibling subtrees do not share one list.
            tag_info_list = tag_info_list[:] + [info]

        if node.children:
            for child in node.children:
                walk(node=child, tag_info_list=tag_info_list)
        else:
            branches.append(HtmlTreeBranch(tags=tag_info_list, fn=fn))

    # The root node is synthetic (it has no token), so start from its children.
    for node in tree.children:
        walk(node, None)

    return branches
|
|
|
|
|
|
|
|
def html_tag_tree(text):
    # type: (str) -> Node
    '''
    Tokenize text and arrange its HTML tags into a tree.

    Returns a synthetic root Node (token None) whose descendants are the
    start and singleton tags in document order.  Tags for which
    is_special_html_tag is true are left out.  Non-HTML tokens are ignored.
    '''
    root = Node(token=None, parent=None)
    # Stack of nodes whose end tag has not been seen yet; the root stays
    # at the bottom.
    open_nodes = [root]

    for token in tokenize(text):
        kind = token.kind

        if kind == 'html_end':
            open_nodes.pop()
            continue

        if kind not in ('html_start', 'html_singleton'):
            continue

        if is_special_html_tag(token.s, token.tag):
            continue

        enclosing = open_nodes[-1]
        child = Node(token=token, parent=enclosing)
        enclosing.children.append(child)

        # Only start tags can contain later tags; singletons stay leaves.
        if kind == 'html_start':
            open_nodes.append(child)

    return root
|
|
|
|
|