mirror of https://github.com/zulip/zulip.git
Factor out HtmlTreeBranch and related code from template parser.
This code is not directly related to the template parser, so it can safely live in its own file. The only significant change to the code is to the signature of `html_branches` so that it can be called without requiring a file. Since it's only used in html_grep, that has been updated to reflect this change. Fixes: #1774.
This commit is contained in:
parent
4d3350bd7b
commit
331617efab
|
@ -0,0 +1,163 @@
|
||||||
|
from __future__ import absolute_import
|
||||||
|
from __future__ import print_function
|
||||||
|
|
||||||
|
from typing import Optional
|
||||||
|
from .template_parser import Token
|
||||||
|
|
||||||
|
import re
|
||||||
|
|
||||||
|
from .template_parser import (
|
||||||
|
is_special_html_tag,
|
||||||
|
tokenize,
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
class HtmlBranchesException(Exception):
    """
    Exception type declared for the HTML branch helpers in this module.

    It adds nothing on top of Exception.
    """
    # TODO: Have callers pass in line numbers.
|
||||||
|
|
||||||
|
|
||||||
|
class HtmlTreeBranch(object):
    """
    A single path of tags through an HTML document, outermost tag first.

    For <p><div id='yo'>bla<span class='bar'></span></div></p>, the branch
    keeps a representation of every tag down to the last one, which would
    conceptually be something like "p div(#yo) span(.bar)".
    """

    def __init__(self, tags, fn):
        # type: (List[TagInfo], Optional[str]) -> None
        """
        tags: the tag descriptions along the branch; must be non-empty
              because the last entry supplies the line number.
        fn:   name of the file the markup came from (stored as given).
        """
        self.tags = tags
        self.fn = fn

        # The branch is reported at the line of its last tag.
        last_tag = tags[-1]
        self.line = last_tag.token.line

        # Union of the words of every tag on the branch.
        self.words = {word for tag in tags for word in tag.words}  # type: Set[str]

    def staircase_text(self):
        # type: () -> str
        """
        produces representation of a node in staircase-like format,
        one tag per line, each indented 4 spaces more than the previous:

            html
                body.main-section
                    p#intro

        The result starts with a newline and every tag line ends with one.
        """
        steps = [
            ' ' * (4 * depth) + tag.text() + '\n'
            for depth, tag in enumerate(self.tags, 1)
        ]
        return '\n' + ''.join(steps)

    def text(self):
        # type: () -> str
        """
        produces one-line representation of branch, tags separated by
        single spaces:

        html body.main-section p#intro
        """
        labels = [tag.text() for tag in self.tags]
        return ' '.join(labels)
|
||||||
|
|
||||||
|
|
||||||
|
class Node(object):
    """
    One element of an HTML tag tree.

    token:    the token this node was created from (None for the
              synthetic root that html_tag_tree creates).
    children: nodes nested directly inside this one, in document order.
    parent:   the enclosing node, or None for the root.
    """

    def __init__(self, token, parent):
        # type: (Optional[Token], Optional[Node]) -> None
        self.token = token
        self.children = []  # type: List[Node]
        # Keep the parent we were given; previously the argument was
        # silently dropped and every node reported parent=None.
        self.parent = parent  # type: Optional[Node]
|
||||||
|
|
||||||
|
|
||||||
|
class TagInfo(object):
    """
    Name, classes and ids of one HTML tag, plus the token it came from.

    `words` lists the tag name followed by each class prefixed with '.'
    and each id prefixed with '#'.
    """

    def __init__(self, tag, classes, ids, token):
        # type: (str, List[str], List[str], Token) -> None
        self.tag = tag
        self.classes = classes
        self.ids = ids
        self.token = token

        words = [self.tag]
        words.extend('.' + name for name in classes)
        words.extend('#' + name for name in ids)
        self.words = words

    def text(self):
        # type: () -> str
        """
        Return the tag in selector-like form, e.g. p.test1.test2#test:
        the tag name, then every class, then every id.
        """
        label = self.tag
        for prefix, names in (('.', self.classes), ('#', self.ids)):
            if names:
                label += prefix + prefix.join(names)
        return label
|
||||||
|
|
||||||
|
|
||||||
|
def get_tag_info(token):
    # type: (Token) -> TagInfo
    """
    Build a TagInfo for `token` from its tag name and raw text.

    The raw text (token.s) is searched for a class and an id attribute,
    in double- and in single-quoted form; each value found is split on
    whitespace. Only the first occurrence of each pattern is used.
    """
    markup = token.s
    tag = token.tag
    classes = []  # type: List[str]
    ids = []  # type: List[str]

    # Each pattern feeds the list it is paired with.
    attribute_patterns = [
        (classes, ' class="(.*?)"'),
        (classes, " class='(.*?)'"),
        (ids, ' id="(.*?)"'),
        (ids, " id='(.*?)'"),
    ]

    for bucket, pattern in attribute_patterns:
        match = re.search(pattern, markup)
        if not match:
            continue
        for value in match.groups():
            bucket.extend(value.split())

    return TagInfo(tag=tag, classes=classes, ids=ids, token=token)
|
||||||
|
|
||||||
|
|
||||||
|
def html_branches(text, fn=None):
    # type: (str, Optional[str]) -> List[HtmlTreeBranch]
    """
    Return one HtmlTreeBranch for every childless node in the tag tree
    of `text`, in the order a depth-first walk meets them.

    `fn` is passed through unchanged to each HtmlTreeBranch.
    """
    root = html_tag_tree(text)
    found = []  # type: List[HtmlTreeBranch]

    def descend(node, ancestors):
        # type: (Node, List[TagInfo]) -> None
        # Build a fresh list so sibling subtrees never share a path.
        path = ancestors + [get_tag_info(node.token)]

        if not node.children:
            found.append(HtmlTreeBranch(tags=path, fn=fn))
            return

        for child in node.children:
            descend(child, path)

    # The root itself carries no token, so start from its children.
    for top in root.children:
        descend(top, [])

    return found
|
||||||
|
|
||||||
|
|
||||||
|
def html_tag_tree(text):
    # type: (str) -> Node
    """
    Tokenize `text` and arrange its HTML tags into a tree of Nodes.

    Returns a token-less root Node whose children are the outermost
    tags. Start and singleton tags become nodes (unless
    is_special_html_tag says otherwise); only start tags can receive
    children. Every end tag closes the innermost open node.
    """
    tokens = tokenize(text)
    root = Node(token=None, parent=None)
    open_nodes = [root]

    for token in tokens:
        kind = token.kind

        if kind == 'html_end':
            open_nodes.pop()
            continue

        if kind not in ('html_start', 'html_singleton'):
            continue

        if is_special_html_tag(token.s, token.tag):
            continue

        enclosing = open_nodes[-1]
        child = Node(token=token, parent=enclosing)
        enclosing.children.append(child)
        if kind == 'html_start':
            # Stays open until its matching end tag pops it.
            open_nodes.append(child)

    return root
|
|
@ -3,14 +3,14 @@ from __future__ import print_function
|
||||||
from collections import defaultdict
|
from collections import defaultdict
|
||||||
from six.moves import range
|
from six.moves import range
|
||||||
|
|
||||||
from .template_parser import html_branches, Token, HtmlTreeBranch
|
from .html_branches import html_branches, HtmlTreeBranch
|
||||||
|
|
||||||
def show_all_branches(fns):
|
def show_all_branches(fns):
|
||||||
# type: (List[str]) -> None
|
# type: (List[str]) -> None
|
||||||
for fn in fns:
|
for fn in fns:
|
||||||
print(fn)
|
print(fn)
|
||||||
text = open(fn).read()
|
text = open(fn).read()
|
||||||
branches = html_branches(text)
|
branches = html_branches(text, fn=fn)
|
||||||
for branch in branches:
|
for branch in branches:
|
||||||
print(branch.text())
|
print(branch.text())
|
||||||
print('---')
|
print('---')
|
||||||
|
@ -27,7 +27,8 @@ class Grepper(object):
|
||||||
all_branches = [] # type: List[HtmlTreeBranch]
|
all_branches = [] # type: List[HtmlTreeBranch]
|
||||||
|
|
||||||
for fn in fns:
|
for fn in fns:
|
||||||
branches = html_branches(fn)
|
text = open(fn).read()
|
||||||
|
branches = html_branches(text, fn=fn)
|
||||||
all_branches += branches
|
all_branches += branches
|
||||||
|
|
||||||
self.word_dict = defaultdict(set) # type: Dict[str, Set[HtmlTreeBranch]]
|
self.word_dict = defaultdict(set) # type: Dict[str, Set[HtmlTreeBranch]]
|
||||||
|
|
|
@ -266,145 +266,3 @@ def get_html_tag(text, i):
|
||||||
raise TemplateParserException('Tag missing >')
|
raise TemplateParserException('Tag missing >')
|
||||||
s = text[i:end+1]
|
s = text[i:end+1]
|
||||||
return s
|
return s
|
||||||
|
|
||||||
class Node(object):
    '''
    One element of an HTML tag tree: the token it was built from, the
    nodes nested directly inside it, and the node that encloses it
    (None for the root).
    '''

    def __init__(self, token, parent):
        # type: (Optional[Token], Optional[Node]) -> None
        self.token = token
        self.children = []  # type: List[Node]
        # Store the supplied parent instead of discarding it; the old
        # code always left this as None.
        self.parent = parent  # type: Optional[Node]
|
|
||||||
|
|
||||||
class TagInfo(object):
    '''
    Describes one HTML tag: its name, classes, ids and source token.
    `words` holds the name, then '.'-prefixed classes, then '#'-prefixed
    ids.
    '''

    def __init__(self, tag, classes, ids, token):
        # type: (str, List[str], List[str], Token) -> None
        self.tag = tag
        self.classes = classes
        self.ids = ids
        self.token = token

        dotted = ['.' + name for name in classes]
        hashed = ['#' + name for name in ids]
        self.words = [self.tag] + dotted + hashed

    def text(self):
        # type: () -> str
        '''
        Return the tag name followed by its classes and then its ids,
        e.g. p.test1.test2#test.
        '''
        rendered = self.tag
        if self.classes:
            rendered = rendered + '.' + '.'.join(self.classes)
        if self.ids:
            rendered = rendered + '#' + '#'.join(self.ids)
        return rendered
|
|
||||||
|
|
||||||
def get_tag_info(token):
    # type: (Token) -> TagInfo
    '''
    Build a TagInfo from a token: the tag name comes from token.tag and
    the classes/ids from the first class and id attributes (double- or
    single-quoted) found in token.s, split on whitespace.
    '''
    source = token.s
    tag = token.tag
    classes = []  # type: List[str]
    ids = []  # type: List[str]

    # (destination list, pattern) pairs, tried in this order.
    lookups = (
        (classes, ' class="(.*?)"'),
        (classes, " class='(.*?)'"),
        (ids, ' id="(.*?)"'),
        (ids, " id='(.*?)'"),
    )

    for target, regex in lookups:
        found = re.search(regex, source)
        if found:
            for group in found.groups():
                target.extend(group.split())

    return TagInfo(tag=tag, classes=classes, ids=ids, token=token)
|
|
||||||
|
|
||||||
class HtmlTreeBranch(object):
    '''
    For <p><div id='yo'>bla<span class='bar'></span></div></p>, store a
    representation of the tags all the way down to the last one, which
    would conceptually be something like "p div(#yo) span(.bar)".
    '''

    def __init__(self, tags, fn):
        # type: (List[TagInfo], str) -> None
        self.tags = tags
        self.fn = fn
        # Line number of the last tag on the branch; tags must be non-empty.
        self.line = tags[-1].token.line

        # All words contributed by any tag on the branch.
        collected = set()  # type: Set[str]
        for tag in tags:
            collected.update(tag.words)
        self.words = collected

    def staircase_text(self):
        # type: () -> str
        '''
        produces representation of a node in staircase-like format,
        each tag on its own line and indented 4 spaces deeper than the
        one before it:

            html
                body.main-section
                    p#intro

        '''
        lines = ['\n']
        depth = 1
        for tag in self.tags:
            lines.append(' ' * (4 * depth) + tag.text() + '\n')
            depth += 1
        return ''.join(lines)

    def text(self):
        # type: () -> str
        '''
        produces one-line representation of branch:

        html body.main-section p#intro
        '''
        return ' '.join([tag.text() for tag in self.tags])
|
|
||||||
|
|
||||||
def html_branches(fn):
    # type: (str) -> List[HtmlTreeBranch]
    '''
    Read the file named `fn` and return one HtmlTreeBranch for every
    childless node of its HTML tag tree, in depth-first order.
    '''
    # Use a context manager so the file handle is closed as soon as the
    # text is read, rather than whenever the object is garbage collected.
    with open(fn) as f:
        text = f.read()

    tree = html_tag_tree(text)
    branches = []  # type: List[HtmlTreeBranch]

    def walk(node, tag_info_list=None):
        # type: (Node, Optional[List[TagInfo]]) -> None
        info = get_tag_info(node.token)
        if tag_info_list is None:
            tag_info_list = [info]
        else:
            # Copy so sibling subtrees do not share one list.
            tag_info_list = tag_info_list[:] + [info]

        if node.children:
            for child in node.children:
                walk(node=child, tag_info_list=tag_info_list)
        else:
            tree_branch = HtmlTreeBranch(tags=tag_info_list, fn=fn)
            branches.append(tree_branch)

    # The root node has no token of its own; walk each top-level tag.
    for node in tree.children:
        walk(node, None)

    return branches
|
|
||||||
|
|
||||||
def html_tag_tree(text):
    # type: (str) -> Node
    '''
    Tokenize `text` and return a token-less root Node whose descendants
    mirror the nesting of the HTML start/singleton tags (tags for which
    is_special_html_tag is true are left out). End tags close the
    innermost open node.
    '''
    tokens = tokenize(text)
    top_level = Node(token=None, parent=None)
    pending = [top_level]

    for token in tokens:
        is_start = token.kind == 'html_start'
        is_singleton = token.kind == 'html_singleton'

        if is_start or is_singleton:
            if is_special_html_tag(token.s, token.tag):
                continue
            owner = pending[-1]
            created = Node(token=token, parent=owner)
            owner.children.append(created)
            if is_start:
                # Only start tags can contain further tags.
                pending.append(created)
        elif token.kind == 'html_end':
            pending.pop()

    return top_level
|
|
||||||
|
|
||||||
|
|
|
@ -0,0 +1,102 @@
|
||||||
|
from __future__ import absolute_import
|
||||||
|
from __future__ import print_function
|
||||||
|
|
||||||
|
import unittest
|
||||||
|
|
||||||
|
import tools.lib.template_parser
|
||||||
|
|
||||||
|
from tools.lib.html_branches import (
|
||||||
|
get_tag_info,
|
||||||
|
html_branches,
|
||||||
|
html_tag_tree,
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
class TestHtmlBranches(unittest.TestCase):
    """Tests for get_tag_info, html_tag_tree and html_branches."""

    # Document shared by the tree and branch tests: comments and a
    # doctype around a small html/head/body structure.
    SAMPLE_PAGE = """
<!-- test -->
<!DOCTYPE html>
<html>
<!-- test -->
<head>
<title>Test</title>
<meta charset="utf-8" />
<link rel="stylesheet" href="style.css" />
</head>
<body>
<p>Hello<br />world!</p>
<p>Goodbye<!-- test -->world!</p>
</body>
</html>
<!-- test -->
"""

    def test_get_tag_info(self):
        # type: () -> None
        html = """
<p id="test" class="test1 test2">foo</p>
"""

        start_tag, end_tag = tools.lib.template_parser.tokenize(html)

        start_tag_info = get_tag_info(start_tag)
        end_tag_info = get_tag_info(end_tag)

        # Classes come before ids in the rendered text; the end tag
        # carries no attributes at all.
        self.assertEqual(start_tag_info.text(), 'p.test1.test2#test')
        self.assertEqual(end_tag_info.text(), 'p')

    def test_html_tag_tree(self):
        # type: () -> None
        tree = html_tag_tree(self.SAMPLE_PAGE)

        self.assertEqual(tree.children[0].token.kind, 'html_start')
        self.assertEqual(tree.children[0].token.tag, 'html')

        self.assertEqual(tree.children[0].children[0].token.kind, 'html_start')
        self.assertEqual(tree.children[0].children[0].token.tag, 'head')

        self.assertEqual(tree.children[0].children[0].children[0].token.kind, 'html_start')
        self.assertEqual(tree.children[0].children[0].children[0].token.tag, 'title')

        self.assertEqual(tree.children[0].children[1].token.kind, 'html_start')
        self.assertEqual(tree.children[0].children[1].token.tag, 'body')

        self.assertEqual(tree.children[0].children[1].children[0].token.kind, 'html_start')
        self.assertEqual(tree.children[0].children[1].children[0].token.tag, 'p')

        self.assertEqual(tree.children[0].children[1].children[0].children[0].token.kind, 'html_singleton')
        self.assertEqual(tree.children[0].children[1].children[0].children[0].token.tag, 'br')

        self.assertEqual(tree.children[0].children[1].children[1].token.kind, 'html_start')
        self.assertEqual(tree.children[0].children[1].children[1].token.tag, 'p')

    def test_html_branches(self):
        # type: () -> None
        branches = html_branches(self.SAMPLE_PAGE)

        self.assertEqual(branches[0].text(), 'html head title')
        self.assertEqual(branches[1].text(), 'html body p br')
        self.assertEqual(branches[2].text(), 'html body p')

        # staircase_text indents the first tag by 4 spaces and every
        # following tag by 4 more, so the expectations must do the same.
        self.assertEqual(
            branches[0].staircase_text(),
            '\n    html\n        head\n            title\n')
        self.assertEqual(
            branches[1].staircase_text(),
            '\n    html\n        body\n            p\n                br\n')
        self.assertEqual(
            branches[2].staircase_text(),
            '\n    html\n        body\n            p\n')
|
|
@ -7,8 +7,6 @@ import unittest
|
||||||
try:
|
try:
|
||||||
from tools.lib.template_parser import (
|
from tools.lib.template_parser import (
|
||||||
TemplateParserException,
|
TemplateParserException,
|
||||||
get_tag_info,
|
|
||||||
html_tag_tree,
|
|
||||||
is_django_block_tag,
|
is_django_block_tag,
|
||||||
tokenize,
|
tokenize,
|
||||||
validate,
|
validate,
|
||||||
|
@ -228,60 +226,3 @@ class ParserTest(unittest.TestCase):
|
||||||
token = tokenize(tag)[0]
|
token = tokenize(tag)[0]
|
||||||
self.assertEqual(token.kind, 'django_end')
|
self.assertEqual(token.kind, 'django_end')
|
||||||
self.assertEqual(token.tag, 'if')
|
self.assertEqual(token.tag, 'if')
|
||||||
|
|
||||||
def test_get_tag_info(self):
    # type: () -> None
    # A start tag renders with its classes and ids; the matching end
    # tag renders as the bare tag name.
    html = '''
<p id="test" class="test1 test2">foo</p>
'''

    tokens = tokenize(html)
    start_tag, end_tag = tokens

    rendered_start = get_tag_info(start_tag).text()
    rendered_end = get_tag_info(end_tag).text()

    self.assertEqual(rendered_start, 'p.test1.test2#test')
    self.assertEqual(rendered_end, 'p')
|
|
||||||
|
|
||||||
def test_html_tag_tree(self):
    # type: () -> None
    # Comments and the doctype must not show up as nodes; the remaining
    # tags must nest as html > (head > title, body > (p > br, p)).
    html = '''
<!-- test -->
<!DOCTYPE html>
<html>
<!-- test -->
<head>
<title>Test</title>
<meta charset="utf-8" />
<link rel="stylesheet" href="style.css" />
</head>
<body>
<p>Hello<br />world!</p>
<p>Goodbye<!-- test -->world!</p>
</body>
</html>
<!-- test -->
'''

    tree = html_tag_tree(html)

    html_node = tree.children[0]
    head_node = html_node.children[0]
    body_node = html_node.children[1]
    first_p = body_node.children[0]
    second_p = body_node.children[1]

    # (node, expected token kind, expected tag name)
    expectations = [
        (html_node, 'html_start', 'html'),
        (head_node, 'html_start', 'head'),
        (head_node.children[0], 'html_start', 'title'),
        (body_node, 'html_start', 'body'),
        (first_p, 'html_start', 'p'),
        (first_p.children[0], 'html_singleton', 'br'),
        (second_p, 'html_start', 'p'),
    ]

    for node, kind, tag in expectations:
        self.assertEqual(node.token.kind, kind)
        self.assertEqual(node.token.tag, tag)
|
|
||||||
|
|
Loading…
Reference in New Issue