from typing import Dict, List, Optional, Set

import re
from collections import defaultdict

from .template_parser import (
    tokenize,
    Token,
)


class HtmlBranchesException(Exception):
    # TODO: Have callers pass in line numbers.
    pass


class HtmlTreeBranch:
    """
    For <p><div id='yo'>bla<span class='bar'></span></div></p>, store a
    representation of the tags all the way down to the leaf, which would
    conceptually be something like "p div(#yo) span(.bar)".
    """

    def __init__(self, tags, fn):
        # type: (List['TagInfo'], Optional[str]) -> None
        self.tags = tags
        self.fn = fn
        self.line = tags[-1].token.line

        self.words = set()  # type: Set[str]
        for tag in tags:
            for word in tag.words:
                self.words.add(word)

    def staircase_text(self):
        # type: () -> str
        """
        Produces a representation of a branch in staircase-like format:

            html
                body.main-section
                    p#intro

        """
        res = '\n'
        indent = ' ' * 4
        for t in self.tags:
            res += indent + t.text() + '\n'
            indent += ' ' * 4
        return res

    def text(self):
        # type: () -> str
        """
        Produces a one-line representation of the branch:

        html body.main-section p#intro
        """
        return ' '.join(t.text() for t in self.tags)


class Node:
    def __init__(self, token, parent):
        # type: (Token, Optional[Node]) -> None
        self.token = token
        self.children = []  # type: List[Node]
        self.parent = parent  # type: Optional[Node]


class TagInfo:
    def __init__(self, tag, classes, ids, token):
        # type: (str, List[str], List[str], Token) -> None
        self.tag = tag
        self.classes = classes
        self.ids = ids
        self.token = token
        self.words = \
            [self.tag] + \
            ['.' + s for s in classes] + \
            ['#' + s for s in ids]

    def text(self):
        # type: () -> str
        s = self.tag
        if self.classes:
            s += '.' + '.'.join(self.classes)
        if self.ids:
            s += '#' + '#'.join(self.ids)
        return s


def get_tag_info(token):
    # type: (Token) -> TagInfo
    s = token.s
    tag = token.tag
    classes = []  # type: List[str]
    ids = []  # type: List[str]

    searches = [
        (classes, ' class="(.*?)"'),
        (classes, " class='(.*?)'"),
        (ids, ' id="(.*?)"'),
        (ids, " id='(.*?)'"),
    ]

    for lst, regex in searches:
        m = re.search(regex, s)
        if m:
            for g in m.groups():
                lst += split_for_id_and_class(g)

    return TagInfo(tag=tag, classes=classes, ids=ids, token=token)


def split_for_id_and_class(element):
    # type: (str) -> List[str]
    # Split a string that is expected to hold the value of an id or class
    # attribute from an HTML tag.  Template variables are kept intact while
    # splitting, so 'red black {{ a|b|c }}' becomes
    # ['red', 'black', '{{ a|b|c }}'].
    outside_braces = True  # type: bool
    lst = []
    s = ''

    for ch in element:
        if ch == '{':
            outside_braces = False
        if ch == '}':
            outside_braces = True
        if ch == ' ' and outside_braces:
            if not s == '':
                lst.append(s)
            s = ''
        else:
            s += ch

    if not s == '':
        lst.append(s)
    return lst
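# A minimal sketch added for illustration (not part of the original module):
# it demonstrates how split_for_id_and_class() keeps template variables intact
# while splitting on spaces.  The helper name below is hypothetical.
def _split_for_id_and_class_examples():
    # type: () -> None
    # Spaces inside {{ ... }} are protected by the brace tracking above.
    assert split_for_id_and_class('red black {{ a|b|c }}') == \
        ['red', 'black', '{{ a|b|c }}']
    assert split_for_id_and_class('btn {{ mode }} active') == \
        ['btn', '{{ mode }}', 'active']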
def html_branches(text, fn=None):
    # type: (str, Optional[str]) -> List[HtmlTreeBranch]

    tree = html_tag_tree(text)
    branches = []  # type: List[HtmlTreeBranch]

    def walk(node, tag_info_list=None):
        # type: (Node, Optional[List[TagInfo]]) -> None
        info = get_tag_info(node.token)
        if tag_info_list is None:
            tag_info_list = [info]
        else:
            tag_info_list = tag_info_list[:] + [info]

        if node.children:
            for child in node.children:
                walk(node=child, tag_info_list=tag_info_list)
        else:
            tree_branch = HtmlTreeBranch(tags=tag_info_list, fn=fn)
            branches.append(tree_branch)

    for node in tree.children:
        walk(node, None)

    return branches


def html_tag_tree(text):
    # type: (str) -> Node
    tokens = tokenize(text)
    top_level = Node(token=None, parent=None)
    stack = [top_level]

    for token in tokens:
        # Add tokens to the Node tree first (conditionally).
        if token.kind in ('html_start', 'html_singleton'):
            parent = stack[-1]
            node = Node(token=token, parent=parent)
            parent.children.append(node)

        # Then update the stack to have the next node that
        # we will be appending to at the top.
        if token.kind == 'html_start':
            stack.append(node)
        elif token.kind == 'html_end':
            stack.pop()

    return top_level


def build_id_dict(templates):
    # type: (List[str]) -> Dict[str, List[str]]
    template_id_dict = defaultdict(list)  # type: Dict[str, List[str]]

    for fn in templates:
        with open(fn, 'r') as f:
            text = f.read()

        list_tags = tokenize(text)

        for tag in list_tags:
            info = get_tag_info(tag)

            for ids in info.ids:
                template_id_dict[ids].append("Line " + str(info.token.line) + ":" + fn)

    return template_id_dict
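# Illustrative usage sketch (not part of the original module).  It assumes
# template_parser.tokenize() emits html_start/html_end tokens for plain HTML;
# under that assumption, html_branches() yields one HtmlTreeBranch per leaf
# tag.  The file paths below are hypothetical.
if __name__ == '__main__':
    fragment = '<div id="yo" class="foo bar"><span class="intro">hi</span></div>'
    for branch in html_branches(fragment, fn='example.html'):
        print(branch.text())            # expected: div.foo.bar#yo span.intro
        print(branch.staircase_text())
    # build_id_dict() reads template files from disk, so pass real paths, e.g.:
    # print(dict(build_id_dict(['templates/example.html'])))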