mirror of https://github.com/zulip/zulip.git
Add library to diff HTML fragments and mark changes
I would really like to parse the HTML we produce from the library to ensure that we don't generate malformed HTML. This is unfortunately hard because we want both fairly strict parsing and the ability to parse HTML5 fragments. For now, we just do a basic sanity check. We may also want to switch to Google Diff-Match-Patch, as that can clean up the resulting diffs. (imported from commit 3772f92135cfd7423c335335f861f2c11462a8db)
This commit is contained in:
parent
835dd6673a
commit
df53f54cbd
|
@ -0,0 +1,97 @@
|
|||
import difflib
|
||||
import platform
|
||||
import logging
|
||||
|
||||
# TODO: handle changes in link hrefs
|
||||
|
||||
def highlight_with_class(klass, text):
    """Wrap ``text`` in a ``<span>`` whose ``class`` attribute is ``klass``.

    Both values are interpolated verbatim; no HTML escaping is applied.
    """
    span_template = '<span class="%s">%s</span>'
    return span_template % (klass, text)
|
||||
|
||||
def highlight_inserted(text):
    """Return ``text`` wrapped in a span carrying the
    ``highlight_text_inserted`` class (via ``highlight_with_class``)."""
    css_class = 'highlight_text_inserted'
    return highlight_with_class(css_class, text)
|
||||
|
||||
def highlight_deleted(text):
    """Return ``text`` wrapped in a span carrying the
    ``highlight_text_deleted`` class (via ``highlight_with_class``)."""
    css_class = 'highlight_text_deleted'
    return highlight_with_class(css_class, text)
|
||||
|
||||
def highlight_replaced(text):
    """Return ``text`` wrapped in a span carrying the
    ``highlight_text_replaced`` class (via ``highlight_with_class``)."""
    css_class = 'highlight_text_replaced'
    return highlight_with_class(css_class, text)
|
||||
|
||||
def chunkize(text, in_tag):
    """Split an HTML fragment into alternating markup and text pieces.

    ``text`` may start or end in the middle of a tag, because callers feed
    it arbitrary slices of a larger document.  ``in_tag`` says whether the
    slice begins inside a tag (after a ``<`` with no matching ``>`` yet).

    Returns a ``(chunks, in_tag)`` pair.  ``chunks`` is a list of
    ``(kind, substring)`` tuples where ``kind`` is ``'tag'`` or ``'text'``;
    concatenating the substrings in order reproduces ``text``.  The returned
    ``in_tag`` is the state at the end of the slice, to be passed to the
    call for the following slice.
    """
    start = 0
    chunks = []
    for idx, c in enumerate(text):
        if c == '<':
            if start != idx:
                # Label the pending run by the state we were in *before*
                # this '<'.  If we were already inside a tag the run is
                # markup, and must not be reported as highlightable text.
                chunks.append(('tag' if in_tag else 'text', text[start:idx]))
            in_tag = True
            start = idx
        elif c == '>':
            # The '>' itself belongs to the chunk, so the slice is never
            # empty and no emptiness check is needed here.
            chunks.append(('tag', text[start:idx + 1]))
            in_tag = False
            start = idx + 1

    # Whatever follows the last bracket: an unfinished tag or trailing text.
    if start != len(text):
        chunks.append(('tag' if in_tag else 'text', text[start:]))
    return chunks, in_tag
|
||||
|
||||
def highlight_chunks(chunks, highlight_func):
    """Join ``chunks`` back into one string, highlighting only the text.

    ``chunks`` is an iterable of ``(kind, substring)`` pairs.  Substrings
    whose kind is ``'text'`` are passed through ``highlight_func``; every
    other substring is emitted unchanged.
    """
    pieces = [
        highlight_func(body) if kind == 'text' else body
        for kind, body in chunks
    ]
    return ''.join(pieces)
|
||||
|
||||
def verify_html(html):
    """Cheap sanity check that angle brackets in ``html`` are balanced.

    Returns ``True`` only when ``<`` and ``>`` strictly alternate, starting
    with ``<`` and ending with ``>`` (or when there are none at all).
    """
    # TODO: Really parse the generated HTML so malformed markup is caught.
    # That is hard because the parser would have to be fairly strict and
    # also accept HTML5 fragments, so for now only bracket alternation is
    # checked.
    tag_open = False
    for ch in html:
        if ch not in ('<', '>'):
            continue
        opening = (ch == '<')
        # '<' while a tag is open, or '>' while none is open, is invalid.
        if opening == tag_open:
            return False
        tag_open = opening
    # A tag left open at the end is invalid as well.
    return not tag_open
|
||||
|
||||
def highlight_html_differences(s1, s2):
    """Return ``s2`` with ``<span>`` markers showing how it differs from ``s1``.

    The two strings are compared character by character with
    ``difflib.SequenceMatcher``.  Inserted and replaced runs of ``s2`` are
    wrapped by ``highlight_inserted`` / ``highlight_replaced``, but only the
    portions outside of tags; markup is copied through untouched.
    A deletion is shown as a highlighted single space.

    If the assembled result fails ``verify_html``, the failure is logged,
    an error message is sent, and plain ``s2`` is returned instead.
    """
    sm = difflib.SequenceMatcher(lambda c: c in " \t\v\n", s1, s2, autojunk=False)
    pieces = []
    # Whether the current position in s2 is inside a tag; threaded through
    # chunkize because opcode boundaries can fall in the middle of a tag.
    in_tag = False

    for op, i1, i2, j1, j2 in sm.get_opcodes():
        if op == 'delete':
            # A deletion consumes nothing from s2, so in_tag is unchanged.
            # Only emit the placeholder outside of tags: a span inside a
            # tag would make the output malformed and force the fallback
            # below.  Replacements and insertions inside a tag are already
            # left unmarked, so this keeps the three cases consistent.
            if not in_tag:
                pieces.append(highlight_deleted(' '))
            continue

        segment = s2[j1:j2]
        if op == 'replace':
            chunks, in_tag = chunkize(segment, in_tag)
            pieces.append(highlight_chunks(chunks, highlight_replaced))
        elif op == 'insert':
            chunks, in_tag = chunkize(segment, in_tag)
            pieces.append(highlight_chunks(chunks, highlight_inserted))
        elif op == 'equal':
            # Unchanged content is copied verbatim; chunkize is run only
            # to keep in_tag up to date.
            _, in_tag = chunkize(segment, in_tag)
            pieces.append(segment)

    retval = ''.join(pieces)

    if not verify_html(retval):
        # Imported here rather than at module level -- presumably to avoid
        # an import cycle; TODO confirm.
        from zephyr.lib.actions import internal_send_message
        # We probably want more information here
        logging.getLogger('').error('HTML diff produced mal-formed HTML')

        subject = "HTML diff failure on %s" % (platform.node(),)
        internal_send_message("humbug+errors@humbughq.com", "stream",
                              "errors", subject, "HTML diff produced malformed HTML")
        return s2

    return retval
|
||||
|
Loading…
Reference in New Issue