# Provenance (recovered from the patch header this module was extracted from):
#
#   From df53f54cbd3dac43350593efe173bcbf20f61365
#   From: Zev Benjamin
#   Date: Fri, 31 May 2013 15:03:16 -0400
#   Subject: [PATCH] Add library to diff HTML fragments and mark changes
#
#   I would really like to parse the HTML we produce from the library to
#   ensure that we don't generate malformed HTML.  This is unfortunately
#   hard because we both want pretty strict parsing and we want to parse
#   html5 fragments.  For now, we just do a basic sanity check.
#
#   We also may want to switch to Google Diff-Match-Patch, as that can
#   clean up the resulting diffs.
#
#   (imported from commit 3772f92135cfd7423c335335f861f2c11462a8db)
#
#   Target path in the patch: zephyr/lib/html_diff.py (new file)
"""Diff two HTML fragments and mark the changed text with highlight spans."""

import difflib
import logging
import platform

# TODO: handle changes in link hrefs


def highlight_with_class(klass, text):
    """Return *text* wrapped in a ``<span>`` whose CSS class is *klass*."""
    return '<span class="%s">%s</span>' % (klass, text)


def highlight_inserted(text):
    """Mark *text* as inserted."""
    return highlight_with_class('highlight_text_inserted', text)


def highlight_deleted(text):
    """Mark *text* as deleted."""
    return highlight_with_class('highlight_text_deleted', text)


def highlight_replaced(text):
    """Mark *text* as replaced."""
    return highlight_with_class('highlight_text_replaced', text)


def chunkize(text, in_tag):
    """Split *text* into ``('text' | 'tag', substring)`` chunks.

    *in_tag* says whether *text* begins inside an HTML tag (i.e. after a
    ``<`` that has not been closed yet).  Returns ``(chunks, in_tag)``
    where the returned flag is the state at the end of *text*, so the
    caller can feed consecutive slices of one document through this
    function.
    """
    start = 0
    chunks = []
    for idx, c in enumerate(text):
        if c == '<':
            in_tag = True
            # Flush whatever preceded the tag opener.
            if start != idx:
                chunks.append(('text', text[start:idx]))
            start = idx
        elif c == '>':
            in_tag = False
            # start <= idx always holds here, so this slice is never empty.
            chunks.append(('tag', text[start:idx + 1]))
            start = idx + 1

    # Trailing remainder: part of an unfinished tag, or plain text.
    if start != len(text):
        chunks.append(('tag' if in_tag else 'text', text[start:]))
    return chunks, in_tag


def highlight_chunks(chunks, highlight_func):
    """Join *chunks*, passing only the ``'text'`` ones through *highlight_func*.

    Tag chunks are emitted verbatim so that no markup is ever inserted
    inside a tag.
    """
    return ''.join(
        highlight_func(text) if kind == 'text' else text
        for kind, text in chunks
    )


def verify_html(html):
    """Return True if ``<`` and ``>`` in *html* are balanced and not nested."""
    # TODO: Actually parse the resulting HTML to ensure we don't
    # create mal-formed markup.  This is unfortunately hard because
    # we both want pretty strict parsing and we want to parse html5
    # fragments.  For now, we do a basic sanity check.
    in_tag = False
    for c in html:
        if c == '<':
            if in_tag:
                return False
            in_tag = True
        elif c == '>':
            if not in_tag:
                return False
            in_tag = False
    # An unterminated tag at the end is also malformed.
    return not in_tag


def highlight_html_differences(s1, s2):
    """Return *s2* with its differences from *s1* wrapped in highlight spans.

    Inserted and replaced text is wrapped in a span; each deleted run is
    represented by a single highlighted non-breaking space.  Changes that
    fall inside a tag are left unmarked.

    If the result fails ``verify_html``, the failure is logged, an
    internal error message is sent, and *s2* is returned unmodified.
    """
    sm = difflib.SequenceMatcher(lambda c: c in " \t\v\n", s1, s2, autojunk=False)
    retval = ''
    in_tag = False

    for op, i1, i2, j1, j2 in sm.get_opcodes():
        if op == 'replace':
            chunks, in_tag = chunkize(s2[j1:j2], in_tag)
            retval += highlight_chunks(chunks, highlight_replaced)
        elif op == 'delete':
            # Nothing from s2 is consumed, so in_tag is unchanged.  Skip the
            # marker inside a tag: a span there would corrupt the tag, which
            # is why insert/replace also leave tag-internal text unmarked.
            if not in_tag:
                # An entity rather than a bare space, so the marker cannot be
                # collapsed away by HTML whitespace handling.
                retval += highlight_deleted('&nbsp;')
        elif op == 'insert':
            chunks, in_tag = chunkize(s2[j1:j2], in_tag)
            retval += highlight_chunks(chunks, highlight_inserted)
        elif op == 'equal':
            # Only the tag state matters here; the text is copied verbatim.
            _, in_tag = chunkize(s2[j1:j2], in_tag)
            retval += s2[j1:j2]

    if not verify_html(retval):
        # Imported here, as in the original, rather than at module level.
        from zephyr.lib.actions import internal_send_message
        # We probably want more information here
        logging.getLogger('').error('HTML diff produced mal-formed HTML')

        subject = "HTML diff failure on %s" % (platform.node(),)
        internal_send_message("humbug+errors@humbughq.com", "stream",
                              "errors", subject, "HTML diff produced malformed HTML")
        return s2

    return retval