From df53f54cbd3dac43350593efe173bcbf20f61365 Mon Sep 17 00:00:00 2001
From: Zev Benjamin <zev@humbughq.com>
Date: Fri, 31 May 2013 15:03:16 -0400
Subject: [PATCH] Add library to diff HTML fragments and mark changes

I would really like to parse the HTML we produce from the library to
ensure that we don't generate malformed-HTML.  This is unfortunately
hard because we both want pretty strict parsing and we want to parse
html5 fragments.  For now, we just do a basic sanity check.

We also may want to switch to Google Diff-Match-Patch, as that can
clean up the resulting diffs.

(imported from commit 3772f92135cfd7423c335335f861f2c11462a8db)
---
 zephyr/lib/html_diff.py | 97 +++++++++++++++++++++++++++++++++++++++++
 1 file changed, 97 insertions(+)
 create mode 100644 zephyr/lib/html_diff.py
diff --git a/zephyr/lib/html_diff.py b/zephyr/lib/html_diff.py
new file mode 100644
index 0000000000..c169989e39
--- /dev/null
+++ b/zephyr/lib/html_diff.py
@@ -0,0 +1,97 @@
+import difflib
+import platform
+import logging
+
+# TODO: handle changes in link hrefs
+
+def highlight_with_class(klass, text):
+    return '<span class="%s">%s</span>' % (klass, text)
+
+def highlight_inserted(text):
+    return highlight_with_class('highlight_text_inserted', text)
+
+def highlight_deleted(text):
+    return highlight_with_class('highlight_text_deleted', text)
+
+def highlight_replaced(text):
+    return highlight_with_class('highlight_text_replaced', text)
+
+def chunkize(text, in_tag):
+    start = 0
+    idx = 0
+    chunks = []
+    for c in text:
+        if c == '<':
+            in_tag = True
+            if start != idx:
+                chunks.append(('text', text[start:idx]))
+            start = idx
+        elif c == '>':
+            in_tag = False
+            if start != idx + 1:
+                chunks.append(('tag', text[start:idx + 1]))
+            start = idx + 1
+        idx += 1
+
+    if start != idx:
+        chunks.append(('tag' if in_tag else 'text', text[start:idx]))
+    return chunks, in_tag
+    
+def highlight_chunks(chunks, highlight_func):
+    retval = ''
+    for type, text in chunks:
+        if type == 'text':
+            retval += highlight_func(text)
+        else:
+            retval += text
+    return retval
+
+def verify_html(html):
+    # TODO: Actually parse the resulting HTML to ensure we don't
+    # create mal-formed markup.  This is unfortunately hard because
+    # we both want pretty strict parsing and we want to parse html5
+    # fragments.  For now, we do a basic sanity check.
+    in_tag = False
+    for c in html:
+        if c == '<':
+            if in_tag:
+                return False
+            in_tag = True
+        elif c == '>':
+            if not in_tag:
+                return False
+            in_tag = False
+    if in_tag:
+        return False
+    return True
+
+def highlight_html_differences(s1, s2):
+    sm = difflib.SequenceMatcher(lambda c: c in " \t\v\n", s1, s2, autojunk=False)
+    retval = ''
+    in_tag = False
+
+    for op, i1, i2, j1, j2 in sm.get_opcodes():
+        if op == 'replace':
+            chunks, in_tag = chunkize(s2[j1:j2], in_tag)
+            retval += highlight_chunks(chunks, highlight_replaced)
+        elif op == 'delete':
+            retval += highlight_deleted('&nbsp;')
+        elif op == 'insert':
+            chunks, in_tag = chunkize(s2[j1:j2], in_tag)
+            retval += highlight_chunks(chunks, highlight_inserted)
+        elif op == 'equal':
+            chunks, in_tag = chunkize(s2[j1:j2], in_tag)
+            retval += s2[j1:j2]
+
+    if not verify_html(retval):
+        from zephyr.lib.actions import internal_send_message
+        # We probably want more information here
+        logging.getLogger('').error('HTML diff produced mal-formed HTML')
+
+        subject = "HTML diff failure on %s" % (platform.node(),)
+        internal_send_message("humbug+errors@humbughq.com", "stream",
+                              "errors", subject, "HTML diff produced malformed HTML")
+        return s2
+
+    return retval
+