zulip/tools/lib/html_branches.py

from typing import Dict, List, Optional, Set

import re
from collections import defaultdict

from .template_parser import (
    tokenize,
    Token,
)


class HtmlBranchesException(Exception):
    """
    Exception type declared for HTML branch handling.

    It adds nothing on top of Exception; the code visible in this file
    does not raise it itself.
    """
    # TODO: Have callers pass in line numbers.


class HtmlTreeBranch:
    """
    One path of tags from an outermost tag down to a leaf tag.

    For <p><div id='yo'>bla<span class='bar'></span></div></p>, the branch
    holds the tags all the way down to the leaf, conceptually something
    like "p div(#yo) span(.bar)".
    """

    def __init__(self, tags, fn):
        # type: (List['TagInfo'], Optional[str]) -> None
        """
        tags: tag infos ordered from outermost to leaf; must be non-empty
            because the last entry supplies the line number.
        fn: optional file name, stored as given.
        """
        self.tags = tags
        self.fn = fn

        # The branch is reported at the line of its leaf (last) tag.
        leaf = tags[-1]
        self.line = leaf.token.line

        # Every word contributed by any tag on the branch, de-duplicated.
        self.words = {word for tag in tags for word in tag.words}  # type: Set[str]

    def staircase_text(self):
        # type: () -> str
        """
        Render the branch with one tag per line, each level indented four
        more spaces than the one before it:

            html
                body.main-section
                    p#intro

        The result starts with a newline and ends with one.
        """
        steps = [
            ' ' * (4 * depth) + tag.text() + '\n'
            for depth, tag in enumerate(self.tags, 1)
        ]
        return '\n' + ''.join(steps)

    def text(self):
        # type: () -> str
        """
        Render the branch on a single line, tags separated by spaces:

        html body.main-section p#intro
        """
        rendered_tags = [tag.text() for tag in self.tags]
        return ' '.join(rendered_tags)


class Node:
    """
    A node of the tag tree built from template tokens.

    Attributes:
        token: the token this node wraps (html_tag_tree passes None for
            its synthetic top-level node).
        children: nodes nested directly under this one, in the order they
            were appended; starts out empty.
        parent: the node passed in as the parent, or None when there is
            none.
    """

    def __init__(self, token, parent):
        # type: (Token, Optional[Node]) -> None
        self.token = token
        self.children = []  # type: List[Node]
        # Keep the parent link the caller supplies.  Previously the
        # argument was ignored and this attribute was always None.
        self.parent = parent  # type: Optional[Node]


class TagInfo:
    """
    The tag name, classes and ids of one tag, plus the token it came from.

    `words` is a flat list holding the tag name, then each class prefixed
    with '.', then each id prefixed with '#'.
    """

    def __init__(self, tag, classes, ids, token):
        # type: (str, List[str], List[str], Token) -> None
        self.tag = tag
        self.classes = classes
        self.ids = ids
        self.token = token

        dotted_classes = ['.' + name for name in classes]
        hashed_ids = ['#' + name for name in ids]
        self.words = [self.tag] + dotted_classes + hashed_ids

    def text(self):
        # type: () -> str
        """
        Return the tag in selector-like form: the tag name followed by
        every class as '.name' and then every id as '#name', e.g.
        'div.a.b#x'.
        """
        rendered = self.tag
        if self.classes:
            rendered += ''.join('.' + name for name in self.classes)
        if self.ids:
            rendered += ''.join('#' + name for name in self.ids)
        return rendered


def get_tag_info(token):
    # type: (Token) -> TagInfo
    """
    Build a TagInfo for a token from its tag name (token.tag) and the
    class/id attribute values found in its raw text (token.s).

    Each attribute is looked up in both its double-quoted and its
    single-quoted form; only the first occurrence of each form is used,
    and double-quoted values come first in the resulting lists.
    """
    raw = token.s
    tag_name = token.tag

    def attribute_values(patterns):
        # type: (List[str]) -> List[str]
        # Gather the split-up values of the first match of each pattern.
        values = []  # type: List[str]
        for pattern in patterns:
            match = re.search(pattern, raw)
            if match is None:
                continue
            for captured in match.groups():
                values.extend(split_for_id_and_class(captured))
        return values

    class_names = attribute_values([' class="(.*?)"', " class='(.*?)'"])
    id_names = attribute_values([' id="(.*?)"', " id='(.*?)'"])

    return TagInfo(tag=tag_name, classes=class_names, ids=id_names, token=token)


def split_for_id_and_class(element):
    # type: (str) -> List[str]
    """
    Split the value of an id or class attribute into its individual names.

    Names are separated by spaces, but a space that appears after a '{'
    and before the next '}' is kept, so template variables stay in one
    piece.  For example 'red black {{ a|b|c }}' becomes
    ['red', 'black', '{{ a|b|c }}'].  Runs of spaces never yield empty
    entries.
    """
    pieces = []  # type: List[str]
    current = []  # type: List[str]
    inside_braces = False

    for char in element:
        if char == '{':
            inside_braces = True
        elif char == '}':
            inside_braces = False

        if char != ' ' or inside_braces:
            current.append(char)
            continue

        # A separating space: close off the name collected so far, if any.
        if current:
            pieces.append(''.join(current))
        current = []

    if current:
        pieces.append(''.join(current))

    return pieces


def html_branches(text, fn=None):
    # type: (str, Optional[str]) -> List[HtmlTreeBranch]
    """
    Return one HtmlTreeBranch for every leaf tag in `text`.

    The text is turned into a tag tree with html_tag_tree, and each
    childless node produces a branch listing the TagInfo of every tag
    from its outermost ancestor down to itself.  Branches come out in
    depth-first order.  `fn` is passed through to each branch unchanged.
    """
    root = html_tag_tree(text)
    found = []  # type: List[HtmlTreeBranch]

    def descend(node, ancestors):
        # type: (Node, List[TagInfo]) -> None
        # Build a fresh list so sibling subtrees never share a path.
        path = ancestors + [get_tag_info(node.token)]

        if not node.children:
            found.append(HtmlTreeBranch(tags=path, fn=fn))
            return

        for child in node.children:
            descend(child, path)

    # The root is a placeholder without a token, so start at its children.
    for top_level_node in root.children:
        descend(top_level_node, [])

    return found


def html_tag_tree(text):
    # type: (str) -> Node
    """
    Tokenize `text` and arrange its HTML tags into a tree of Nodes.

    Returns a placeholder top-level Node (its token is None) whose
    children are the outermost tags.  Start tags and singleton tags
    become nodes under the currently open tag; a start tag then becomes
    the open tag until an end token closes it.  Tokens of any other kind
    are ignored.  End tokens are not checked against the tag they close.
    """
    tokens = tokenize(text)
    root = Node(token=None, parent=None)
    # Innermost currently-open node is always the last entry.
    open_nodes = [root]

    for token in tokens:
        kind = token.kind

        if kind == 'html_end':
            open_nodes.pop()
            continue

        if kind not in ('html_start', 'html_singleton'):
            continue

        enclosing = open_nodes[-1]
        node = Node(token=token, parent=enclosing)
        enclosing.children.append(node)

        # Only start tags can contain further tags.
        if kind == 'html_start':
            open_nodes.append(node)

    return root


def build_id_dict(templates):
    # type: (List[str]) -> (Dict[str, List[str]])
    """
    Map every id found in the given template files to where it occurs.

    Each file in `templates` is read and tokenized, and every id found on
    any token is recorded under that id as a string of the form
    "Line <line>:<file name>".  An id seen several times gets several
    entries, in the order encountered.  The result is a defaultdict, so
    looking up an unknown id yields (and stores) an empty list.
    """
    locations_by_id = defaultdict(list)  # type: (Dict[str, List[str]])

    for path in templates:
        with open(path) as template_file:
            contents = template_file.read()

        for token in tokenize(contents):
            tag_info = get_tag_info(token)

            for id_name in tag_info.ids:
                locations_by_id[id_name].append(
                    "Line " + str(tag_info.token.line) + ":" + path)

    return locations_by_id
mypy: Added Dict, List and Set imports. Fixed mypy errors associated with the upgrade. 2017-03-03 19:01:52 +01:00			`from typing import Dict, List, Optional, Set`
Factor out HtmlTreeBranch and related code from template parser. This code is not directly related to the template parser, so it can safely live in its own file. The only significant change to the code is to the signature of `html_branches` so that it can be called without requiring a file. Since it's only used in html_grep, that has been updated to reflect this change. Fixes: #1774. 2016-09-11 20:23:29 +02:00
			`import re`
Lint for duplicate ids in templates. In this commit we enhance our current template linter to detect duplicate ids and report them during lint checks. html_branches.py was topped up with a new function build_id_dict for the purpose. Also the get_tag_info function in same file was updated to parse ids and classes more robustly in cases of template variables. split_for_id_and_class function was added to serve this purpose. Unit tests for both the functions were created under tests/test_html_branches. Also a directory under tests called test_template_data was created to hold templates for testing under newly created functionality. check_templates was modified to print to console any duplicates detected. showell reviewed my commit and helped me out. Fixes #2950. 2017-01-06 15:11:15 +01:00			`from collections import defaultdict`
Factor out HtmlTreeBranch and related code from template parser. This code is not directly related to the template parser, so it can safely live in its own file. The only significant change to the code is to the signature of `html_branches` so that it can be called without requiring a file. Since it's only used in html_grep, that has been updated to reflect this change. Fixes: #1774. 2016-09-11 20:23:29 +02:00
			`from .template_parser import (`
			`tokenize,`
Clean up imports in html_branches.py. 2016-09-12 00:30:10 +02:00			`Token,`
Factor out HtmlTreeBranch and related code from template parser. This code is not directly related to the template parser, so it can safely live in its own file. The only significant change to the code is to the signature of `html_branches` so that it can be called without requiring a file. Since it's only used in html_grep, that has been updated to reflect this change. Fixes: #1774. 2016-09-11 20:23:29 +02:00			`)`


			`class HtmlBranchesException(Exception):`
			`# TODO: Have callers pass in line numbers.`
			`pass`


Remove inheritance from object. 2017-11-05 11:57:15 +01:00			`class HtmlTreeBranch:`
Factor out HtmlTreeBranch and related code from template parser. This code is not directly related to the template parser, so it can safely live in its own file. The only significant change to the code is to the signature of `html_branches` so that it can be called without requiring a file. Since it's only used in html_grep, that has been updated to reflect this change. Fixes: #1774. 2016-09-11 20:23:29 +02:00			`"""`
			`For <p><div id='yo'>bla<span class='bar'></span></div></p>, store a`
			`representation of the tags all the way down to the leaf, which would`
			`conceptually be something like "p div(#yo) span(.bar)".`
			`"""`

			`def __init__(self, tags, fn):`
mypy: Fix future syntax errors and other minor mistakes. When we move to the Python 3 mypy syntax, we can't reference a class before its definition. 2017-10-05 08:05:41 +02:00			`# type: (List['TagInfo'], Optional[str]) -> None`
Factor out HtmlTreeBranch and related code from template parser. This code is not directly related to the template parser, so it can safely live in its own file. The only significant change to the code is to the signature of `html_branches` so that it can be called without requiring a file. Since it's only used in html_grep, that has been updated to reflect this change. Fixes: #1774. 2016-09-11 20:23:29 +02:00			`self.tags = tags`
			`self.fn = fn`
			`self.line = tags[-1].token.line`

			`self.words = set() # type: Set[str]`
			`for tag in tags:`
			`for word in tag.words:`
			`self.words.add(word)`

			`def staircase_text(self):`
			`# type: () -> str`
			`"""`
			`produces representation of a node in staircase-like format:`

			`html`
			`body.main-section`
			`p#intro`

			`"""`
			`res = '\n'`
			`indent = ' ' * 4`
			`for t in self.tags:`
			`res += indent + t.text() + '\n'`
			`indent += ' ' * 4`
			`return res`

			`def text(self):`
			`# type: () -> str`
			`"""`
			`produces one-line representation of branch:`

			`html body.main-section p#intro`
			`"""`
			`return ' '.join(t.text() for t in self.tags)`


Remove inheritance from object. 2017-11-05 11:57:15 +01:00			`class Node:`
mypy: Allow Optional parameters in html_branches.py. 2017-08-09 20:44:32 +02:00			`def __init__(self, token, parent): # FIXME parent parameter is not used!`
			`# type: (Token, Optional[Node]) -> None`
Factor out HtmlTreeBranch and related code from template parser. This code is not directly related to the template parser, so it can safely live in its own file. The only significant change to the code is to the signature of `html_branches` so that it can be called without requiring a file. Since it's only used in html_grep, that has been updated to reflect this change. Fixes: #1774. 2016-09-11 20:23:29 +02:00			`self.token = token`
			`self.children = [] # type: List[Node]`
			`self.parent = None # type: Optional[Node]`


Remove inheritance from object. 2017-11-05 11:57:15 +01:00			`class TagInfo:`
Factor out HtmlTreeBranch and related code from template parser. This code is not directly related to the template parser, so it can safely live in its own file. The only significant change to the code is to the signature of `html_branches` so that it can be called without requiring a file. Since it's only used in html_grep, that has been updated to reflect this change. Fixes: #1774. 2016-09-11 20:23:29 +02:00			`def __init__(self, tag, classes, ids, token):`
			`# type: (str, List[str], List[str], Token) -> None`
			`self.tag = tag`
			`self.classes = classes`
			`self.ids = ids`
			`self.token = token`
			`self.words = \`
			`[self.tag] + \`
			`['.' + s for s in classes] + \`
			`['#' + s for s in ids]`

			`def text(self):`
			`# type: () -> str`
			`s = self.tag`
			`if self.classes:`
			`s += '.' + '.'.join(self.classes)`
			`if self.ids:`
			`s += '#' + '#'.join(self.ids)`
			`return s`


			`def get_tag_info(token):`
			`# type: (Token) -> TagInfo`
			`s = token.s`
			`tag = token.tag`
			`classes = [] # type: List[str]`
			`ids = [] # type: List[str]`

			`searches = [`
			`(classes, ' class="(.*?)"'),`
			`(classes, " class='(.*?)'"),`
			`(ids, ' id="(.*?)"'),`
			`(ids, " id='(.*?)'"),`
			`]`

			`for lst, regex in searches:`
			`m = re.search(regex, s)`
			`if m:`
			`for g in m.groups():`
Lint for duplicate ids in templates. In this commit we enhance our current template linter to detect duplicate ids and report them during lint checks. html_branches.py was topped up with a new function build_id_dict for the purpose. Also the get_tag_info function in same file was updated to parse ids and classes more robustly in cases of template variables. split_for_id_and_class function was added to serve this purpose. Unit tests for both the functions were created under tests/test_html_branches. Also a directory under tests called test_template_data was created to hold templates for testing under newly created functionality. check_templates was modified to print to console any duplicates detected. showell reviewed my commit and helped me out. Fixes #2950. 2017-01-06 15:11:15 +01:00			`lst += split_for_id_and_class(g)`
Factor out HtmlTreeBranch and related code from template parser. This code is not directly related to the template parser, so it can safely live in its own file. The only significant change to the code is to the signature of `html_branches` so that it can be called without requiring a file. Since it's only used in html_grep, that has been updated to reflect this change. Fixes: #1774. 2016-09-11 20:23:29 +02:00
			`return TagInfo(tag=tag, classes=classes, ids=ids, token=token)`


Lint for duplicate ids in templates. In this commit we enhance our current template linter to detect duplicate ids and report them during lint checks. html_branches.py was topped up with a new function build_id_dict for the purpose. Also the get_tag_info function in same file was updated to parse ids and classes more robustly in cases of template variables. split_for_id_and_class function was added to serve this purpose. Unit tests for both the functions were created under tests/test_html_branches. Also a directory under tests called test_template_data was created to hold templates for testing under newly created functionality. check_templates was modified to print to console any duplicates detected. showell reviewed my commit and helped me out. Fixes #2950. 2017-01-06 15:11:15 +01:00			`def split_for_id_and_class(element):`
			`# type: (str) -> List[str]`
			`# Here we split a given string which is expected to contain id or class`
			`# attributes from HTML tags. This also takes care of template variables`
			`# in string during splitting process. For eg. 'red black {{ a\|b\|c }}'`
			`# is split as ['red', 'black', '{{ a\|b\|c }}']`
pep8: Add compliance with rule E261 to html_branches.py. 2017-05-07 16:54:55 +02:00			`outside_braces = True # type: bool`
Lint for duplicate ids in templates. In this commit we enhance our current template linter to detect duplicate ids and report them during lint checks. html_branches.py was topped up with a new function build_id_dict for the purpose. Also the get_tag_info function in same file was updated to parse ids and classes more robustly in cases of template variables. split_for_id_and_class function was added to serve this purpose. Unit tests for both the functions were created under tests/test_html_branches. Also a directory under tests called test_template_data was created to hold templates for testing under newly created functionality. check_templates was modified to print to console any duplicates detected. showell reviewed my commit and helped me out. Fixes #2950. 2017-01-06 15:11:15 +01:00			`lst = []`
			`s = ''`

			`for ch in element:`
			`if ch == '{':`
			`outside_braces = False`
			`if ch == '}':`
			`outside_braces = True`
			`if ch == ' ' and outside_braces:`
			`if not s == '':`
			`lst.append(s)`
			`s = ''`
			`else:`
			`s += ch`
			`if not s == '':`
			`lst.append(s)`

			`return lst`


Factor out HtmlTreeBranch and related code from template parser. This code is not directly related to the template parser, so it can safely live in its own file. The only significant change to the code is to the signature of `html_branches` so that it can be called without requiring a file. Since it's only used in html_grep, that has been updated to reflect this change. Fixes: #1774. 2016-09-11 20:23:29 +02:00			`def html_branches(text, fn=None):`
mypy: Allow Optional parameters in html_branches.py. 2017-08-09 20:44:32 +02:00			`# type: (str, Optional[str]) -> List[HtmlTreeBranch]`
Factor out HtmlTreeBranch and related code from template parser. This code is not directly related to the template parser, so it can safely live in its own file. The only significant change to the code is to the signature of `html_branches` so that it can be called without requiring a file. Since it's only used in html_grep, that has been updated to reflect this change. Fixes: #1774. 2016-09-11 20:23:29 +02:00			`tree = html_tag_tree(text)`
			`branches = [] # type: List[HtmlTreeBranch]`

			`def walk(node, tag_info_list=None):`
Fix several new errors caught by mypy 0.501. Clear out a bunch of easy to review errors, so we can focus on the more complicated ones. 2017-03-03 20:30:49 +01:00			`# type: (Node, Optional[List[TagInfo]]) -> None`
Factor out HtmlTreeBranch and related code from template parser. This code is not directly related to the template parser, so it can safely live in its own file. The only significant change to the code is to the signature of `html_branches` so that it can be called without requiring a file. Since it's only used in html_grep, that has been updated to reflect this change. Fixes: #1774. 2016-09-11 20:23:29 +02:00			`info = get_tag_info(node.token)`
			`if tag_info_list is None:`
			`tag_info_list = [info]`
			`else:`
			`tag_info_list = tag_info_list[:] + [info]`

			`if node.children:`
			`for child in node.children:`
			`walk(node=child, tag_info_list=tag_info_list)`
			`else:`
			`tree_branch = HtmlTreeBranch(tags=tag_info_list, fn=fn)`
			`branches.append(tree_branch)`

			`for node in tree.children:`
			`walk(node, None)`

			`return branches`


			`def html_tag_tree(text):`
			`# type: (str) -> Node`
			`tokens = tokenize(text)`
			`top_level = Node(token=None, parent=None)`
			`stack = [top_level]`

			`for token in tokens:`
tools: Simplify html_tag_tree(). Because of some recent changes to the tokenizer, we no longer need to call is_special_html_tag() to filter out special tags. I also tried to make the start/end logic for pushing/popping the stack more obvious. 2016-09-11 22:36:55 +02:00			`# Add tokens to the Node tree first (conditionally).`
Factor out HtmlTreeBranch and related code from template parser. This code is not directly related to the template parser, so it can safely live in its own file. The only significant change to the code is to the signature of `html_branches` so that it can be called without requiring a file. Since it's only used in html_grep, that has been updated to reflect this change. Fixes: #1774. 2016-09-11 20:23:29 +02:00			`if token.kind in ('html_start', 'html_singleton'):`
tools: Simplify html_tag_tree(). Because of some recent changes to the tokenizer, we no longer need to call is_special_html_tag() to filter out special tags. I also tried to make the start/end logic for pushing/popping the stack more obvious. 2016-09-11 22:36:55 +02:00			`parent = stack[-1]`
pep8: Fix E225 pep8 violations. 2016-11-28 23:29:01 +01:00			`node = Node(token=token, parent=parent)`
tools: Simplify html_tag_tree(). Because of some recent changes to the tokenizer, we no longer need to call is_special_html_tag() to filter out special tags. I also tried to make the start/end logic for pushing/popping the stack more obvious. 2016-09-11 22:36:55 +02:00			`parent.children.append(node)`

			`# Then update the stack to have the next node that`
			`# we will be appending to at the top.`
			`if token.kind == 'html_start':`
			`stack.append(node)`
Factor out HtmlTreeBranch and related code from template parser. This code is not directly related to the template parser, so it can safely live in its own file. The only significant change to the code is to the signature of `html_branches` so that it can be called without requiring a file. Since it's only used in html_grep, that has been updated to reflect this change. Fixes: #1774. 2016-09-11 20:23:29 +02:00			`elif token.kind == 'html_end':`
			`stack.pop()`

			`return top_level`
Lint for duplicate ids in templates. In this commit we enhance our current template linter to detect duplicate ids and report them during lint checks. html_branches.py was topped up with a new function build_id_dict for the purpose. Also the get_tag_info function in same file was updated to parse ids and classes more robustly in cases of template variables. split_for_id_and_class function was added to serve this purpose. Unit tests for both the functions were created under tests/test_html_branches. Also a directory under tests called test_template_data was created to hold templates for testing under newly created functionality. check_templates was modified to print to console any duplicates detected. showell reviewed my commit and helped me out. Fixes #2950. 2017-01-06 15:11:15 +01:00

			`def build_id_dict(templates):`
mypy: Fix future syntax errors and other minor mistakes. When we move to the Python 3 mypy syntax, we can't reference a class before its definition. 2017-10-05 08:05:41 +02:00			`# type: (List[str]) -> (Dict[str, List[str]])`
			`template_id_dict = defaultdict(list) # type: (Dict[str, List[str]])`
Lint for duplicate ids in templates. In this commit we enhance our current template linter to detect duplicate ids and report them during lint checks. html_branches.py was topped up with a new function build_id_dict for the purpose. Also the get_tag_info function in same file was updated to parse ids and classes more robustly in cases of template variables. split_for_id_and_class function was added to serve this purpose. Unit tests for both the functions were created under tests/test_html_branches. Also a directory under tests called test_template_data was created to hold templates for testing under newly created functionality. check_templates was modified to print to console any duplicates detected. showell reviewed my commit and helped me out. Fixes #2950. 2017-01-06 15:11:15 +01:00
			`for fn in templates:`
python: Modernize legacy Python 2 syntax with pyupgrade. Generated by `pyupgrade --py3-plus --keep-percent-format` on all our Python code except `zthumbor` and `zulip-ec2-configure-interfaces`, followed by manual indentation fixes. Signed-off-by: Anders Kaseorg <anders@zulipchat.com> 2020-04-09 21:51:58 +02:00			`with open(fn) as f:`
python: Migrate open statements to use with. This is low priority, but it's nice to be consistently using the best practice pattern. Fixes: #12419. 2019-07-14 21:37:08 +02:00			`text = f.read()`
Lint for duplicate ids in templates. In this commit we enhance our current template linter to detect duplicate ids and report them during lint checks. html_branches.py was topped up with a new function build_id_dict for the purpose. Also the get_tag_info function in same file was updated to parse ids and classes more robustly in cases of template variables. split_for_id_and_class function was added to serve this purpose. Unit tests for both the functions were created under tests/test_html_branches. Also a directory under tests called test_template_data was created to hold templates for testing under newly created functionality. check_templates was modified to print to console any duplicates detected. showell reviewed my commit and helped me out. Fixes #2950. 2017-01-06 15:11:15 +01:00			`list_tags = tokenize(text)`

			`for tag in list_tags:`
			`info = get_tag_info(tag)`

			`for ids in info.ids:`
			`template_id_dict[ids].append("Line " + str(info.token.line) + ":" + fn)`

			`return template_id_dict`