2013-10-10 21:37:26 +02:00
|
|
|
from __future__ import absolute_import
|
2016-06-17 16:59:29 +02:00
|
|
|
|
2017-03-03 19:01:52 +01:00
|
|
|
from typing import Callable, List, Tuple, Text
|
2013-10-10 21:37:26 +02:00
|
|
|
|
2013-10-31 18:33:19 +01:00
|
|
|
from django.conf import settings
|
|
|
|
|
2013-05-31 22:33:26 +02:00
|
|
|
from diff_match_patch import diff_match_patch
|
2013-05-31 21:03:16 +02:00
|
|
|
import platform
|
|
|
|
import logging
|
|
|
|
|
|
|
|
# TODO: handle changes in link hrefs
|
|
|
|
|
|
|
|
def highlight_with_class(klass, text):
    # type: (Text, Text) -> Text
    """Return ``text`` wrapped in a ``<span>`` whose class attribute is ``klass``.

    Both values are interpolated verbatim; no HTML escaping is applied here.
    """
    span_template = '<span class="%s">%s</span>'
    return span_template % (klass, text)
|
|
|
|
|
|
|
|
def highlight_inserted(text):
    # type: (Text) -> Text
    """Wrap ``text`` in the span used to mark inserted content."""
    css_class = 'highlight_text_inserted'
    return highlight_with_class(css_class, text)
|
|
|
|
|
|
|
|
def highlight_deleted(text):
    # type: (Text) -> Text
    """Wrap ``text`` in the span used to mark deleted content."""
    css_class = 'highlight_text_deleted'
    return highlight_with_class(css_class, text)
|
|
|
|
|
|
|
|
def chunkize(text, in_tag):
    # type: (Text, bool) -> Tuple[List[Tuple[Text, Text]], bool]
    """Split an HTML fragment into alternating 'tag' and 'text' chunks.

    The fragment may begin or end in the middle of a tag, so the tag state is
    passed in and handed back to let a caller chain consecutive fragments.

    Args:
        text: The fragment to split.
        in_tag: Whether the fragment starts inside a tag (after a '<' that
            has not yet been closed by a '>').

    Returns:
        A ``(chunks, in_tag)`` pair.  ``chunks`` is a list of
        ``(kind, substring)`` tuples where ``kind`` is ``'tag'`` or
        ``'text'``; concatenating the substrings in order reproduces
        ``text``.  ``in_tag`` is the tag state after the last character.
    """
    chunks = []  # type: List[Tuple[Text, Text]]
    start = 0  # index of the first character not yet emitted in a chunk
    for idx, c in enumerate(text):
        if c == '<':
            in_tag = True
            # Whatever precedes the '<' is emitted as text, regardless of
            # the tag state the fragment was entered with.
            if start != idx:
                chunks.append(('text', text[start:idx]))
            start = idx
        elif c == '>':
            in_tag = False
            # start <= idx always holds here, so this slice is never empty
            # and includes the closing '>'.
            chunks.append(('tag', text[start:idx + 1]))
            start = idx + 1

    # Flush the trailing remainder; it is part of a tag only if a '<' is
    # still open (or the fragment was entered inside a tag and never left).
    if start != len(text):
        chunks.append(('tag' if in_tag else 'text', text[start:]))
    return chunks, in_tag
|
2013-06-05 00:55:30 +02:00
|
|
|
|
2013-05-31 21:03:16 +02:00
|
|
|
def highlight_chunks(chunks, highlight_func):
    # type: (List[Tuple[Text, Text]], Callable[[Text], Text]) -> Text
    """Reassemble chunks into one string, highlighting only the text chunks.

    Args:
        chunks: ``(kind, substring)`` pairs; a ``kind`` of ``'text'`` marks
            a substring to highlight, any other kind is copied verbatim.
        highlight_func: Callable applied to each ``'text'`` substring; its
            return value replaces that substring in the output.

    Returns:
        The concatenation of all (possibly highlighted) substrings, or an
        empty string when ``chunks`` is empty.
    """
    # ``kind`` rather than ``type`` so the builtin is not shadowed.
    return u''.join(
        highlight_func(chunk_text) if kind == 'text' else chunk_text
        for kind, chunk_text in chunks
    )
|
|
|
|
|
|
|
|
def verify_html(html):
    # type: (Text) -> bool
    """Return True when the angle brackets in ``html`` are properly paired.

    The check fails on a '<' seen while a tag is already open, on a '>'
    seen while no tag is open, and on input that ends inside a tag.
    """
    # TODO: Actually parse the resulting HTML to ensure we don't
    # create mal-formed markup.  This is unfortunately hard because
    # we both want pretty strict parsing and we want to parse html5
    # fragments.  For now, we do a basic sanity check.
    tag_open = False
    for ch in html:
        if ch not in '<>':
            continue
        opening = (ch == '<')
        # A bracket that would leave the state unchanged is misplaced:
        # either a nested '<' or a '>' with nothing to close.
        if opening == tag_open:
            return False
        tag_open = opening
    return not tag_open
|
|
|
|
|
2017-06-10 01:49:12 +02:00
|
|
|
def check_tags(text):
    # type: (Text) -> Text
    """Patch up a diff fragment whose tag brackets were split off.

    A single trailing '<' is dropped, and a '<' is prepended when a '>'
    appears before any '<' in the remaining text.
    """
    # The current diffing algorithm produces malformed html when text is
    # added to existing new lines.  This patch manually corrects that.
    if text.endswith('<'):
        text = text[:-1]
    for ch in text:
        if ch == '<':
            # A tag opens before any stray '>'; nothing to repair.
            return text
        if ch == '>':
            return '<' + text
    return text
|
|
|
|
|
2013-05-31 21:03:16 +02:00
|
|
|
def highlight_html_differences(s1, s2):
    # type: (Text, Text) -> Text
    """Return HTML showing how ``s2`` differs from ``s1``.

    The two strings are diffed with diff_match_patch.  Text outside of tags
    is wrapped with ``highlight_deleted`` for deletions and
    ``highlight_inserted`` for insertions; unchanged runs are copied as-is.

    Args:
        s1: The old HTML.
        s2: The new HTML.

    Returns:
        The highlighted HTML, or ``s2`` unchanged when the assembled result
        fails ``verify_html``.  In that case an error is logged and, if
        ``settings.ERROR_BOT`` is not None, a message is sent through
        ``internal_send_message`` (presumably to the "errors" stream --
        confirm against that function's signature).
    """
    differ = diff_match_patch()
    ops = differ.diff_main(s1, s2)
    differ.diff_cleanupSemantic(ops)

    pieces = []  # type: List[Text]
    in_tag = False

    for idx, (op, text) in enumerate(ops):
        text = check_tags(text)
        if idx != 0:
            prev_text = check_tags(ops[idx - 1][1])
            # Remove visual offset from editing newlines
            if '<p><br>' in text:
                text = text.replace('<p><br>', '<p>')
            elif prev_text.endswith('<p>') and text.startswith('<br>'):
                text = text[len('<br>'):]

        # Every op updates the tag state, including unchanged runs, so the
        # next fragment knows whether it starts inside a tag.
        chunks, in_tag = chunkize(text, in_tag)
        if op == diff_match_patch.DIFF_DELETE:
            pieces.append(highlight_chunks(chunks, highlight_deleted))
        elif op == diff_match_patch.DIFF_INSERT:
            pieces.append(highlight_chunks(chunks, highlight_inserted))
        elif op == diff_match_patch.DIFF_EQUAL:
            pieces.append(text)

    retval = u''.join(pieces)

    if not verify_html(retval):
        # Imported at call time rather than at module level, as in the
        # original code.
        from zerver.lib.actions import internal_send_message
        from zerver.models import get_system_bot
        # We probably want more information here
        logging.getLogger('').error('HTML diff produced mal-formed HTML')

        if settings.ERROR_BOT is not None:
            subject = "HTML diff failure on %s" % (platform.node(),)
            realm = get_system_bot(settings.ERROR_BOT).realm
            internal_send_message(realm, settings.ERROR_BOT, "stream",
                                  "errors", subject, "HTML diff produced malformed HTML")
        # Fall back to the new content without any highlighting.
        return s2

    return retval
|