Factor out HtmlTreeBranch and related code from template parser.

This code is not directly related to the template parser, so it can safely live in its own file. The only significant change to the code is to the signature of `html_branches` so that it can be called without requiring a file. Since it's only used in html_grep, that has been updated to reflect this change. Fixes: #1774.
2016-09-11 14:23:29 -04:00 · 2016-09-11 14:23:29 -04:00 · 331617efab
parent 4d3350bd7b
commit 331617efab
5 changed files with 269 additions and 204 deletions
--- a/tools/lib/html_branches.py
+++ b/tools/lib/html_branches.py
@ -0,0 +1,163 @@
 from __future__ import absolute_import
 from __future__ import print_function
 from typing import Optional
 from .template_parser import Token
 import re
 from .template_parser import (
    is_special_html_tag,
    tokenize,
 )
 class HtmlBranchesException(Exception):
    """
    Exception type for errors in HTML branch handling.

    NOTE(review): nothing in the visible part of this module raises it yet.
    """
    # TODO: Have callers pass in line numbers.
 class HtmlTreeBranch(object):
    """
    One root-to-leaf path through a tree of HTML tags.

    For <p><div id='yo'>bla<span class='bar'></span></div></p>, the branch
    holds the tags from the outermost one down to the leaf, which would
    conceptually be something like "p div(#yo) span(.bar)".
    """
    def __init__(self, tags, fn):
        # type: (List[TagInfo], Optional[str]) -> None
        self.tags = tags
        self.fn = fn
        # A branch is located at the line of its leaf (last) tag.
        leaf = tags[-1]
        self.line = leaf.token.line
        # Union of the words of every tag along the branch.
        self.words = {word for info in tags for word in info.words}  # type: Set[str]
    def staircase_text(self):
        # type: () -> str
        """
        Render the branch one tag per line, each level indented four
        spaces further than the one above it:
            html
                body.main-section
                    p#intro
        The result starts with a newline and every row ends with one.
        """
        step = ' ' * 4
        rows = [
            step * (depth + 1) + info.text() + '\n'
            for depth, info in enumerate(self.tags)
        ]
        return '\n' + ''.join(rows)
    def text(self):
        # type: () -> str
        """
        Render the branch on a single line, tags separated by spaces:
        html body.main-section p#intro
        """
        labels = [info.text() for info in self.tags]
        return ' '.join(labels)
 class Node(object):
    """
    A node in the HTML tag tree: a token, a parent link and child nodes.

    html_tag_tree() creates its root with token=None and parent=None.
    """
    def __init__(self, token, parent):
        # type: (Optional[Token], Optional[Node]) -> None
        self.token = token
        self.children = []  # type: List[Node]
        # Keep the parent that was passed in.  It used to be discarded
        # (always set to None), so no node could reach its parent.
        self.parent = parent  # type: Optional[Node]
 class TagInfo(object):
    """
    The tag name, classes and ids of one tag, plus the token it came from.

    `words` lists the tag name, then each class prefixed with '.', then
    each id prefixed with '#'.
    """
    def __init__(self, tag, classes, ids, token):
        # type: (str, List[str], List[str], Token) -> None
        self.tag = tag
        self.classes = classes
        self.ids = ids
        self.token = token
        words = [self.tag]
        words.extend('.' + name for name in classes)
        words.extend('#' + name for name in ids)
        self.words = words
    def text(self):
        # type: () -> str
        """Return a compact label such as 'p.test1.test2#test'."""
        label = self.tag
        if self.classes:
            label += ''.join('.' + name for name in self.classes)
        if self.ids:
            label += ''.join('#' + name for name in self.ids)
        return label
 def get_tag_info(token):
    # type: (Token) -> TagInfo
    """
    Collect the tag name, classes and ids of a tag token into a TagInfo.

    Reads token.s (the text that is searched for attributes) and
    token.tag.  For each of the four patterns below (class/id, double- or
    single-quoted) only the first match in the text is used; its value is
    split on whitespace.
    """
    s = token.s
    tag = token.tag
    classes = []  # type: List[str]
    ids = []  # type: List[str]
    # Use \s instead of a literal space so that attributes preceded by a
    # newline or tab (tags wrapped over several lines) are found as well.
    # Requiring some whitespace still keeps names such as data-id from
    # matching.
    searches = [
        (classes, r'\sclass="(.*?)"'),
        (classes, r"\sclass='(.*?)'"),
        (ids, r'\sid="(.*?)"'),
        (ids, r"\sid='(.*?)'"),
    ]
    for found, regex in searches:
        m = re.search(regex, s)
        if m:
            found.extend(m.group(1).split())
    return TagInfo(tag=tag, classes=classes, ids=ids, token=token)
 def html_branches(text, fn=None):
    # type: (str, Optional[str]) -> List[HtmlTreeBranch]
    """
    Return one HtmlTreeBranch per leaf of the tag tree built from `text`.

    Each branch holds the TagInfo of every tag from a top-level tag down
    to a tag without children.  `fn` is stored on each branch unchanged.
    """
    found = []  # type: List[HtmlTreeBranch]
    def descend(node, ancestors):
        # type: (Node, List[TagInfo]) -> None
        # Build a new list at every level so sibling subtrees never
        # share (and extend) the same path.
        path = ancestors + [get_tag_info(node.token)]
        if not node.children:
            found.append(HtmlTreeBranch(tags=path, fn=fn))
            return
        for child in node.children:
            descend(child, path)
    # The root returned by html_tag_tree carries no token, so start from
    # its children.
    for top in html_tag_tree(text).children:
        descend(top, [])
    return found
 def html_tag_tree(text):
    # type: (str) -> Node
    """
    Tokenize `text` and build a tree of Nodes from its HTML tags.

    Returns a root Node (token=None, parent=None).  Start tags and
    singleton tags become children of the innermost start tag that is
    still open; tags for which is_special_html_tag() is true get no node.
    """
    tokens = tokenize(text)
    top_level = Node(token=None, parent=None)
    stack = [top_level]
    for token in tokens:
        if token.kind in ('html_start', 'html_singleton'):
            if is_special_html_tag(token.s, token.tag):
                # No node is created for a special tag, so there is
                # nothing to push.  The push used to run for these tags
                # too, with `node` unbound or left over from an earlier
                # tag.
                # NOTE(review): assumes special tags never produce a
                # matching html_end token -- confirm against tokenize().
                continue
            parent = stack[-1]
            node = Node(token=token, parent=parent)
            parent.children.append(node)
            # Only start tags stay open for later tags; singletons
            # remain leaves.
            if token.kind == 'html_start':
                stack.append(node)
        elif token.kind == 'html_end':
            stack.pop()
    return top_level
--- a/tools/lib/html_grep.py
+++ b/tools/lib/html_grep.py
@ -3,14 +3,14 @@ from __future__ import print_function
 from collections import defaultdict
 from six.moves import range
-from .template_parser import html_branches, Token, HtmlTreeBranch
+from .html_branches import html_branches, HtmlTreeBranch
 def show_all_branches(fns):
    # type: (List[str]) -> None
    for fn in fns:
        print(fn)
        text = open(fn).read()
-        branches = html_branches(text)
+        branches = html_branches(text, fn=fn)
        for branch in branches:
            print(branch.text())
        print('---')
@ -27,7 +27,8 @@ class Grepper(object):
        all_branches = [] # type: List[HtmlTreeBranch]
        for fn in fns:
-            branches = html_branches(fn)
+            text = open(fn).read()
            branches = html_branches(text, fn=fn)
            all_branches += branches
        self.word_dict = defaultdict(set) # type: Dict[str, Set[HtmlTreeBranch]]
--- a/tools/lib/template_parser.py
+++ b/tools/lib/template_parser.py
@ -266,145 +266,3 @@ def get_html_tag(text, i):
        raise TemplateParserException('Tag missing >')
    s = text[i:end+1]
    return s
 class Node(object):
    '''
    A node in the HTML tag tree: a token, a parent link and child nodes.

    html_tag_tree() creates its root with token=None and parent=None.
    '''
    def __init__(self, token, parent):
        # type: (Optional[Token], Optional[Node]) -> None
        self.token = token
        self.children = [] # type: List[Node]
        # Keep the parent that was passed in.  It used to be discarded
        # (always set to None), so no node could reach its parent.
        self.parent = parent # type: Optional[Node]
 class TagInfo(object):
    '''
    The tag name, classes and ids of one tag, plus the token it came from.

    `words` lists the tag name, then each class prefixed with '.', then
    each id prefixed with '#'.
    '''
    def __init__(self, tag, classes, ids, token):
        # type: (str, List[str], List[str], Token) -> None
        self.tag = tag
        self.classes = classes
        self.ids = ids
        self.token = token
        words = [self.tag]
        words.extend('.' + name for name in classes)
        words.extend('#' + name for name in ids)
        self.words = words
    def text(self):
        # type: () -> str
        '''Return a compact label such as 'p.test1.test2#test'.'''
        label = self.tag
        if self.classes:
            label += ''.join('.' + name for name in self.classes)
        if self.ids:
            label += ''.join('#' + name for name in self.ids)
        return label
 def get_tag_info(token):
    # type: (Token) -> TagInfo
    '''
    Collect the tag name, classes and ids of a tag token into a TagInfo.

    Reads token.s (the text that is searched for attributes) and
    token.tag.  For each of the four patterns below (class/id, double- or
    single-quoted) only the first match in the text is used; its value is
    split on whitespace.
    '''
    s = token.s
    tag = token.tag
    classes = [] # type: List[str]
    ids = [] # type: List[str]
    # Use \s instead of a literal space so that attributes preceded by a
    # newline or tab (tags wrapped over several lines) are found as well.
    # Requiring some whitespace still keeps names such as data-id from
    # matching.
    searches = [
        (classes, r'\sclass="(.*?)"'),
        (classes, r"\sclass='(.*?)'"),
        (ids, r'\sid="(.*?)"'),
        (ids, r"\sid='(.*?)'"),
    ]
    for found, regex in searches:
        m = re.search(regex, s)
        if m:
            found.extend(m.group(1).split())
    return TagInfo(tag=tag, classes=classes, ids=ids, token=token)
 class HtmlTreeBranch(object):
    '''
    One root-to-leaf path through a tree of HTML tags.

    For <p><div id='yo'>bla<span class='bar'></span></div></p>, the branch
    holds the tags from the outermost one down to the leaf, which would
    conceptually be something like "p div(#yo) span(.bar)".
    '''
    def __init__(self, tags, fn):
        # type: (List[TagInfo], str) -> None
        self.tags = tags
        self.fn = fn
        # A branch is located at the line of its leaf (last) tag.
        leaf = tags[-1]
        self.line = leaf.token.line
        # Union of the words of every tag along the branch.
        self.words = {word for info in tags for word in info.words} # type: Set[str]
    def staircase_text(self):
        # type: () -> str
        '''
        Render the branch one tag per line, each level indented four
        spaces further than the one above it:
            html
                body.main-section
                    p#intro
        The result starts with a newline and every row ends with one.
        '''
        step = ' ' * 4
        rows = [
            step * (depth + 1) + info.text() + '\n'
            for depth, info in enumerate(self.tags)
        ]
        return '\n' + ''.join(rows)
    def text(self):
        # type: () -> str
        '''
        Render the branch on a single line, tags separated by spaces:
        html body.main-section p#intro
        '''
        labels = [info.text() for info in self.tags]
        return ' '.join(labels)
 def html_branches(fn):
    # type: (str) -> List[HtmlTreeBranch]
    '''
    Read the file `fn` and return one HtmlTreeBranch per leaf of its
    HTML tag tree.

    Each branch holds the TagInfo of every tag from a top-level tag down
    to a tag without children, and records `fn` as its file name.
    '''
    # Close the file deterministically rather than leaving the handle to
    # the garbage collector.
    with open(fn) as f:
        text = f.read()
    tree = html_tag_tree(text)
    branches = [] # type: List[HtmlTreeBranch]
    def walk(node, tag_info_list=None):
        # type: (Node, Optional[List[TagInfo]]) -> None
        info = get_tag_info(node.token)
        # Build a new list at every level so sibling subtrees never
        # share (and extend) the same path.
        if tag_info_list is None:
            tag_info_list = [info]
        else:
            tag_info_list = tag_info_list + [info]
        if node.children:
            for child in node.children:
                walk(node=child, tag_info_list=tag_info_list)
        else:
            branches.append(HtmlTreeBranch(tags=tag_info_list, fn=fn))
    # The root returned by html_tag_tree carries no token, so start from
    # its children.
    for node in tree.children:
        walk(node, None)
    return branches
 def html_tag_tree(text):
    # type: (str) -> Node
    '''
    Tokenize `text` and build a tree of Nodes from its HTML tags.

    Returns a root Node (token=None, parent=None).  Start tags and
    singleton tags become children of the innermost start tag that is
    still open; tags for which is_special_html_tag() is true get no node.
    '''
    tokens = tokenize(text)
    top_level = Node(token=None, parent=None)
    stack = [top_level]
    for token in tokens:
        if token.kind in ('html_start', 'html_singleton'):
            if is_special_html_tag(token.s, token.tag):
                # No node is created for a special tag, so there is
                # nothing to push.  The push used to run for these tags
                # too, with `node` unbound or left over from an earlier
                # tag.
                # NOTE(review): assumes special tags never produce a
                # matching html_end token -- confirm against tokenize().
                continue
            parent = stack[-1]
            node = Node(token=token, parent=parent)
            parent.children.append(node)
            # Only start tags stay open for later tags; singletons
            # remain leaves.
            if token.kind == 'html_start':
                stack.append(node)
        elif token.kind == 'html_end':
            stack.pop()
    return top_level
--- a/tools/tests/test_html_branches.py
+++ b/tools/tests/test_html_branches.py
@ -0,0 +1,102 @@
 from __future__ import absolute_import
 from __future__ import print_function
 import unittest
 import tools.lib.template_parser
 from tools.lib.html_branches import (
    get_tag_info,
    html_branches,
    html_tag_tree,
 )
 class TestHtmlBranches(unittest.TestCase):
    """Tests for get_tag_info, html_tag_tree and html_branches."""
    # Page shared by the tree and branch tests.  It mixes comments, a
    # doctype, meta/link singletons and nested tags.
    SAMPLE_PAGE = """
            <!-- test -->
            <!DOCTYPE html>
            <html>
            <!-- test -->
            <head>
                <title>Test</title>
                <meta charset="utf-8" />
                <link rel="stylesheet" href="style.css" />
            </head>
            <body>
                <p>Hello<br />world!</p>
                <p>Goodbye<!-- test -->world!</p>
            </body>
            </html>
            <!-- test -->
        """
    def test_get_tag_info(self):
        # type: () -> None
        html = """
            <p id="test" class="test1 test2">foo</p>
        """
        # The snippet must tokenize to exactly a start tag and an end tag.
        start_tag, end_tag = tools.lib.template_parser.tokenize(html)
        start_text = get_tag_info(start_tag).text()
        end_text = get_tag_info(end_tag).text()
        self.assertEqual(start_text, 'p.test1.test2#test')
        self.assertEqual(end_text, 'p')
    def test_html_tag_tree(self):
        # type: () -> None
        tree = html_tag_tree(self.SAMPLE_PAGE)
        html_node = tree.children[0]
        head = html_node.children[0]
        body = html_node.children[1]
        first_p = body.children[0]
        expected = [
            (html_node, 'html_start', 'html'),
            (head, 'html_start', 'head'),
            (head.children[0], 'html_start', 'title'),
            (body, 'html_start', 'body'),
            (first_p, 'html_start', 'p'),
            (first_p.children[0], 'html_singleton', 'br'),
            (body.children[1], 'html_start', 'p'),
        ]
        for node, kind, tag in expected:
            self.assertEqual(node.token.kind, kind)
            self.assertEqual(node.token.tag, tag)
    def test_html_branches(self):
        # type: () -> None
        branches = html_branches(self.SAMPLE_PAGE)
        one_line = [
            'html head title',
            'html body p br',
            'html body p',
        ]
        staircase = [
            '\n    html\n        head\n            title\n',
            '\n    html\n        body\n            p\n                br\n',
            '\n    html\n        body\n            p\n',
        ]
        # Index explicitly so that a missing branch is an error rather
        # than a silently skipped comparison.
        for i, wanted in enumerate(one_line):
            self.assertEqual(branches[i].text(), wanted)
        for i, wanted in enumerate(staircase):
            self.assertEqual(branches[i].staircase_text(), wanted)
--- a/tools/tests/test_template_parser.py
+++ b/tools/tests/test_template_parser.py
@ -7,8 +7,6 @@ import unittest
 try:
    from tools.lib.template_parser import (
        TemplateParserException,
        get_tag_info,
        html_tag_tree,
        is_django_block_tag,
        tokenize,
        validate,
@ -228,60 +226,3 @@ class ParserTest(unittest.TestCase):
        token = tokenize(tag)[0]
        self.assertEqual(token.kind, 'django_end')
        self.assertEqual(token.tag, 'if')
    def test_get_tag_info(self):
        # type: () -> None
        html = '''
            <p id="test" class="test1 test2">foo</p>
        '''
        # The snippet must tokenize to exactly a start tag and an end tag.
        tokens = tokenize(html)
        start_tag, end_tag = tokens
        start_text = get_tag_info(start_tag).text()
        end_text = get_tag_info(end_tag).text()
        self.assertEqual(start_text, 'p.test1.test2#test')
        self.assertEqual(end_text, 'p')
    def test_html_tag_tree(self):
        # type: () -> None
        html = '''
            <!-- test -->
            <!DOCTYPE html>
            <html>
            <!-- test -->
            <head>
                <title>Test</title>
                <meta charset="utf-8" />
                <link rel="stylesheet" href="style.css" />
            </head>
            <body>
                <p>Hello<br />world!</p>
                <p>Goodbye<!-- test -->world!</p>
            </body>
            </html>
            <!-- test -->
        '''
        tree = html_tag_tree(html)
        html_node = tree.children[0]
        head = html_node.children[0]
        body = html_node.children[1]
        first_p = body.children[0]
        # (node, expected token kind, expected tag name)
        expected = [
            (html_node, 'html_start', 'html'),
            (head, 'html_start', 'head'),
            (head.children[0], 'html_start', 'title'),
            (body, 'html_start', 'body'),
            (first_p, 'html_start', 'p'),
            (first_p.children[0], 'html_singleton', 'br'),
            (body.children[1], 'html_start', 'p'),
        ]
        for node, kind, tag in expected:
            self.assertEqual(node.token.kind, kind)
            self.assertEqual(node.token.tag, tag)