markdown: Render ordered lists using <ol> markup.

This brings us in line, and also allows us to style these more like unordered lists, which is visually more appealing. On the backend, we now use the default list blockprocessor + sane list extension of python-markdown to get proper list markup; on the frontend, we mostly return to upstream's code as they have followed CommonMark on this issue. Using <ol> here necessarily removes the behaviour of not renumbering on lists written like 3, 4, 7; hopefully users will be OK with the change. Fixes #12822.
2019-08-11 13:41:34 +08:00 · 2019-08-11 13:41:34 +08:00 · 574c35c0b8
parent 4a176714e1
commit 574c35c0b8
8 changed files with 119 additions and 242 deletions
--- a/frontend_tests/node_tests/markdown.js
+++ b/frontend_tests/node_tests/markdown.js
@ -284,7 +284,7 @@ run_test('marked', () => {
        {input: 'Some text first\n* a\n* list \n* here\n\nand then after',
         expected: '<p>Some text first</p>\n<ul>\n<li>a</li>\n<li>list </li>\n<li>here</li>\n</ul>\n<p>and then after</p>'},
        {input: '1. an\n2. ordered \n3. list',
-         expected: '<p>1. an<br>\n2. ordered<br>\n3. list</p>'},
+         expected: '<ol>\n<li>an</li>\n<li>ordered </li>\n<li>list</li>\n</ol>'},
        {input: '\n~~~quote\nquote this for me\n~~~\nthanks\n',
         expected: '<blockquote>\n<p>quote this for me</p>\n</blockquote>\n<p>thanks</p>'},
        {input: 'This is a @**Cordelia Lear** mention',
--- a/static/js/markdown.js
+++ b/static/js/markdown.js
@ -341,82 +341,6 @@ exports.set_realm_filters = function (realm_filters) {
    marked.InlineLexer.rules.zulip.realm_filters = marked_rules;
 };

-var preprocess_auto_olists = (function () {
-    var TAB_LENGTH = 2;
-    var re = /^( *)(\d+)\. +(.*)/;
-
-    function getIndent(match) {
-        return Math.floor(match[1].length / TAB_LENGTH);
-    }
-
-    function extendArray(arr, arr2) {
-        Array.prototype.push.apply(arr, arr2);
-    }
-
-    function renumber(mlist) {
-        if (!mlist.length) {
-            return [];
-        }
-
-        var startNumber = parseInt(mlist[0][2], 10);
-        var changeNumbers = _.every(mlist, function (m) {
-            return startNumber === parseInt(m[2], 10);
-        });
-
-        var counter = startNumber;
-        return _.map(mlist, function (m) {
-            var number = changeNumbers ? counter.toString() : m[2];
-            counter += 1;
-            return m[1] + number + '. ' + m[3];
-        });
-    }
-
-    return function (src) {
-        var newLines = [];
-        var currentList = [];
-        var currentIndent = 0;
-
-        _.each(src.split('\n'), function (line) {
-            var m = line.match(re);
-            var isNextItem = m && currentList.length && currentIndent === getIndent(m);
-            var isBlankLine = line.trim() === "";
-            if (!isNextItem && !isBlankLine) {
-                // This is a non-blank line that doesn't start with a
-                // bullet, so we're done with the previous numbered
-                // list and can start a new one.
-                extendArray(newLines, renumber(currentList));
-                currentList = [];
-            }
-
-            if (!m) {
-                // This line doesn't start with a bullet.  If it's not
-                // between bullets of a list (i.e. `currentList =
-                // []`), this is just normal content outside a bulleted
-                // list and we can append it to `new_lines`.
-                if (!currentList.length) {
-                    newLines.push(line);
-                }
-
-                // Otherwise, it's a blank line in between bullets,
-                // because if this was a bullet, `m` would be truthy,
-                // and if it wasn't blank, we could have terminated the
-                // list (see above).  We can just skip this blank line
-                // syntax, as our bulleted list CSS styling will
-                // control vertical spacing between bullets.
-            } else if (isNextItem) {
-                currentList.push(m);
-            } else {
-                currentList = [m];
-                currentIndent = getIndent(m);
-            }
-        });
-
-        extendArray(newLines, renumber(currentList));
-
-        return newLines.join('\n');
-    };
-}());
-
 exports.initialize = function () {

    function disable_markdown_regex(rules, name) {
@ -466,11 +390,6 @@ exports.initialize = function () {
        return emoji.translate_emoticons_to_names(src);
    }

-    // Disable ordered lists
-    // We used GFM + tables, so replace the list start regex for that ruleset
-    // We remove the |[\d+]\. that matches the numbering in a numbered list
-    marked.Lexer.rules.tables.list = /^( *)((?:\*)) [\s\S]+?(?:\n+(?=(?: *[\-*_]){3,} *(?:\n+|$))|\n{2,}(?! )(?!\1(?:\*) )\n*|\s*$)/;
-
    // Disable lheadings
    // We only keep the # Heading format.
    disable_markdown_regex(marked.Lexer.rules.tables, 'lheading');
@ -517,7 +436,6 @@ exports.initialize = function () {
        renderer: r,
        preprocessors: [
            preprocess_code_blocks,
-            preprocess_auto_olists,
            preprocess_translate_emoticons,
        ],
    });
--- a/static/styles/rendered_markdown.scss
+++ b/static/styles/rendered_markdown.scss
@ -25,7 +25,7 @@

    /* Ensure ordered lists are nicely centered in 1-line messages */
    ol {
-        margin: 2px 0 5px 6px;
+        margin: 2px 0 5px 20px;
    }

    /* Swap the left and right margins of ordered list for Right-To-Left languages */
--- a/static/third/marked/lib/marked.js
+++ b/static/third/marked/lib/marked.js
@ -26,8 +26,7 @@ var block = {
  text: /^[^\n]+/
 };

-// Zulip modification: Remove numbers from this pattern.
-block.bullet = /(?:[*+-])/;
+block.bullet = /(?:[*+-]|\d{1,9}\.)/;
 block.item = /^( *)(bull) [^\n]*(?:\n(?!\1bull )[^\n]*)*/;
 block.item = replace(block.item, 'gm')
  (/bull/g, block.bullet)
@ -167,9 +166,12 @@ Lexer.prototype.token = function(src, top, bq) {
    , bull
    , b
    , item
+    , listStart
+    , listItems
    , space
    , i
-    , l;
+    , l
+    , isordered;

  while (src) {
    // newline
@ -284,15 +286,21 @@ Lexer.prototype.token = function(src, top, bq) {
    if (cap = this.rules.list.exec(src)) {
      src = src.substring(cap[0].length);
      bull = cap[2];
+      isordered = bull.length > 1;

-      this.tokens.push({
+      listStart = {
        type: 'list_start',
-        ordered: bull.length > 1
-      });
+        ordered: isordered,
+        start: isordered ? +bull : '',
+        loose: false
+      };
+
+      this.tokens.push(listStart);

      // Get each top-level item.
      cap = cap[0].match(this.rules.item);

+      listItems = [];
      next = false;
      l = cap.length;
      i = 0;
@ -303,7 +311,7 @@ Lexer.prototype.token = function(src, top, bq) {
        // Remove the list item's bullet
        // so it is seen as the next token.
        space = item.length;
-        item = item.replace(/^ *([*+-]|\d+\.) +/, '');
+        item = item.replace(/^ *([*+-]|\d+\.) */, '');

        // Outdent whatever the
        // list item contains. Hacky.
@ -316,9 +324,10 @@ Lexer.prototype.token = function(src, top, bq) {

        // Determine whether the next list item belongs here.
        // Backpedal if it does not belong in this list.
-        if (this.options.smartLists && i !== l - 1) {
+        if (i !== l - 1) {
          b = block.bullet.exec(cap[i + 1])[0];
-          if (bull !== b && !(bull.length > 1 && b.length > 1)) {
+          if (bull.length > 1 ? b.length === 1
+            : (b.length > 1 || (this.options.smartLists && b !== bull))) {
            src = cap.slice(i + 1).join('\n') + src;
            i = l - 1;
          }
@ -333,20 +342,34 @@ Lexer.prototype.token = function(src, top, bq) {
          if (!loose) loose = next;
        }

-        this.tokens.push({
-          type: loose
-            ? 'loose_item_start'
-            : 'list_item_start'
-        });
+        if (loose) {
+          listStart.loose = true;
+        }
+
+        t = {
+          type: 'list_item_start',
+          loose: loose
+        };
+
+        listItems.push(t);
+        this.tokens.push(t);

        // Recurse.
-        this.token(item, false, bq);
+        this.token(item, false);

        this.tokens.push({
          type: 'list_item_end'
        });
      }

+      if (listStart.loose) {
+        l = listItems.length;
+        i = 0;
+        for (; i < l; i++) {
+          listItems[i].loose = true;
+        }
+      }
+
      this.tokens.push({
        type: 'list_end'
      });
@ -1031,9 +1054,10 @@ Renderer.prototype.hr = function() {
  return this.options.xhtml ? '<hr/>\n' : '<hr>\n';
 };

-Renderer.prototype.list = function(body, ordered) {
-  var type = ordered ? 'ol' : 'ul';
-  return '<' + type + '>\n' + body + '</' + type + '>\n';
+Renderer.prototype.list = function(body, ordered, start) {
+  var type = ordered ? 'ol' : 'ul',
+      startatt = (ordered && start !== 1) ? (' start="' + start + '"') : '';
+  return '<' + type + startatt + '>\n' + body + '</' + type + '>\n';
 };

 Renderer.prototype.listitem = function(text) {
@ -1282,34 +1306,27 @@ Parser.prototype.tok = function() {
    }
    case 'list_start': {
      var body = ''
-        , ordered = this.token.ordered;
+        , ordered = this.token.ordered
+        , start = this.token.start;

      while (this.next().type !== 'list_end') {
        body += this.tok();
      }

-      return this.renderer.list(body, ordered);
+      return this.renderer.list(body, ordered, start);
    }
    case 'list_item_start': {
-      var body = '';
+      var body = ''
+        , loose = this.token.loose;

      while (this.next().type !== 'list_item_end') {
-        body += this.token.type === 'text'
+        body += !loose && this.token.type === 'text'
          ? this.parseText()
          : this.tok();
      }

      return this.renderer.listitem(body);
    }
-    case 'loose_item_start': {
-      var body = '';
-
-      while (this.next().type !== 'list_item_end') {
-        body += this.tok();
-      }
-
-      return this.renderer.listitem(body);
-    }
    case 'html': {
      var html = !this.token.pre && !this.options.pedantic
        ? this.inline.output(this.token.text)
--- a/zerver/lib/bugdown/init.py
+++ b/zerver/lib/bugdown/init.py
@ -27,7 +27,7 @@ import requests
 from django.conf import settings
 from django.db.models import Q

-from markdown.extensions import codehilite, nl2br, tables
+from markdown.extensions import codehilite, nl2br, tables, sane_lists
 from zerver.lib.bugdown import fenced_code
 from zerver.lib.bugdown.fenced_code import FENCE_RE
 from zerver.lib.camo import get_camo_url
@ -1421,24 +1421,24 @@ class AutoLink(CompiledPattern):
        db_data = self.markdown.zulip_db_data
        return url_to_a(db_data, url)

-class UListProcessor(markdown.blockprocessors.UListProcessor):
-    """ Process unordered list blocks.
-
-        Based on markdown.blockprocessors.UListProcessor, but does not accept
-        '+' or '-' as a bullet character."""
-
-    TAG = 'ul'
-    RE = re.compile('^[ ]{0,3}[*][ ]+(.*)')
-
+class OListProcessor(sane_lists.SaneOListProcessor):
    def __init__(self, parser: Any) -> None:
-
-        # HACK: Set the tab length to 2 just for the initialization of
-        # this class, so that bulleted lists (and only bulleted lists)
-        # work off 2-space indentation.
        parser.markdown.tab_length = 2
        super().__init__(parser)
        parser.markdown.tab_length = 4

+class UListProcessor(sane_lists.SaneUListProcessor):
+    """ Does not accept '+' or '-' as a bullet character. """
+
+    def __init__(self, parser: Any) -> None:
+        parser.markdown.tab_length = 2
+        super().__init__(parser)
+        parser.markdown.tab_length = 4
+
+        self.RE = re.compile('^[ ]{0,%d}[*][ ]+(.*)' % (self.tab_length - 1,))
+        self.CHILD_RE = re.compile(r'^[ ]{0,%d}(([*]))[ ]+(.*)' %
+                                   (self.tab_length - 1,))
+
 class ListIndentProcessor(markdown.blockprocessors.ListIndentProcessor):
    """ Process unordered list blocks.

@ -1482,16 +1482,15 @@ class BlockQuoteProcessor(markdown.blockprocessors.BlockQuoteProcessor):
        # And then run the upstream processor's code for removing the '>'
        return super().clean(line)

-class BugdownUListPreprocessor(markdown.preprocessors.Preprocessor):
-    """ Allows unordered list blocks that come directly after a
-        paragraph to be rendered as an unordered list
+class BugdownListPreprocessor(markdown.preprocessors.Preprocessor):
+    """ Allows list blocks that come directly after another block
+        to be rendered as a list.

        Detects paragraphs that have a matching list item that comes
        directly after a line of text, and inserts a newline between
        to satisfy Markdown"""

-    LI_RE = re.compile('^[ ]{0,3}[*][ ]+(.*)', re.MULTILINE)
-    HANGING_ULIST_RE = re.compile('^.+\\n([ ]{0,3}[*][ ]+.*)', re.MULTILINE)
+    LI_RE = re.compile(r'^[ ]{0,3}(\*|\d\.)[ ]+(.*)', re.MULTILINE)

    def run(self, lines: List[str]) -> List[str]:
        """ Insert a newline between a paragraph and ulist if missing """
@ -1509,91 +1508,17 @@ class BugdownUListPreprocessor(markdown.preprocessors.Preprocessor):
                fence = None

            # If we're not in a fenced block and we detect an upcoming list
-            #  hanging off a paragraph, add a newline
-            if (not fence and lines[i] and
-                self.LI_RE.match(lines[i+1]) and
-                    not self.LI_RE.match(lines[i])):
-
-                copy.insert(i+inserts+1, '')
-                inserts += 1
+            # hanging off any block (including a list of another type), add
+            # a newline.
+            li1 = self.LI_RE.match(lines[i])
+            li2 = self.LI_RE.match(lines[i+1])
+            if not fence and lines[i]:
+                if (li2 and not li1) or (li1 and li2 and
+                                         (len(li1.group(1)) == 1) != (len(li2.group(1)) == 1)):
+                    copy.insert(i+inserts+1, '')
+                    inserts += 1
        return copy

-class AutoNumberOListPreprocessor(markdown.preprocessors.Preprocessor):
-    """ Finds a sequence of lines numbered by the same number"""
-    RE = re.compile(r'^([ ]*)(\d+)\.[ ]+(.*)')
-    TAB_LENGTH = 2
-
-    def run(self, lines: List[str]) -> List[str]:
-        new_lines = []  # type: List[str]
-        current_list = []  # type: List[Match[str]]
-        current_indent = 0
-
-        for line in lines:
-            m = self.RE.match(line)
-
-            # Remember if this line is a continuation of already started list
-            is_next_item = (m and current_list
-                            and current_indent == len(m.group(1)) // self.TAB_LENGTH)
-
-            is_blank_line = line.strip() == ""
-
-            if not is_next_item and not is_blank_line:
-                # This is a non-blank line that doesn't start with a
-                # bullet, so we're done with the previous numbered
-                # list and can start a new one.
-                new_lines.extend(self.renumber(current_list))
-                current_list = []
-
-            if not m:
-                # This line doesn't start with a bullet.  If it's not
-                # between bullets of a list (i.e. `current_list =
-                # []`), this is just normal content outside a bulleted
-                # list and we can append it to `new_lines`.
-                if not current_list:
-                    # Ordinary line
-                    new_lines.append(line)
-
-                # Otherwise, it's a blank line in between bullets,
-                # because if this was a bullet, `m` would be truthy,
-                # and if it wasn't blank, we could have terminated the
-                # list (see above).  We can just skip this blank line
-                # syntax, as our bulleted list CSS styling will
-                # control vertical spacing between bullets.
-            elif is_next_item:
-                # Another list item
-                current_list.append(m)
-            else:
-                # First list item
-                current_list = [m]
-                current_indent = len(m.group(1)) // self.TAB_LENGTH
-
-        new_lines.extend(self.renumber(current_list))
-
-        return new_lines
-
-    def renumber(self, mlist: List[Match[str]]) -> List[str]:
-        if not mlist:
-            return []
-
-        start_number = int(mlist[0].group(2))
-
-        # Change numbers only if every one is the same
-        change_numbers = True
-        for m in mlist:
-            if int(m.group(2)) != start_number:
-                change_numbers = False
-                break
-
-        lines = []  # type: List[str]
-        counter = start_number
-
-        for m in mlist:
-            number = str(counter) if change_numbers else m.group(2)
-            lines.append('%s%s. %s' % (m.group(1), number, m.group(3)))
-            counter += 1
-
-        return lines
-
 # Name for the outer capture group we use to separate whitespace and
 # other delimiters from the actual content.  This value won't be an
 # option in user-entered capture groups.
@ -1874,8 +1799,7 @@ class Bugdown(markdown.Markdown):
        # html_block - insecure
        # reference - references don't make sense in a chat context.
        preprocessors = markdown.util.Registry()
-        preprocessors.register(AutoNumberOListPreprocessor(self), 'auto_number_olist', 40)
-        preprocessors.register(BugdownUListPreprocessor(self), 'hanging_ulists', 35)
+        preprocessors.register(BugdownListPreprocessor(self), 'hanging_lists', 35)
        preprocessors.register(markdown.preprocessors.NormalizeWhitespace(self), 'normalize_whitespace', 30)
        preprocessors.register(fenced_code.FencedBlockPreprocessor(self), 'fenced_code_block', 25)
        preprocessors.register(AlertWordsNotificationProcessor(self), 'custom_text_notifications', 20)
@ -1897,6 +1821,7 @@ class Bugdown(markdown.Markdown):
        parser.blockprocessors.register(HashHeaderProcessor(parser), 'hashheader', 78)
        # We get priority 75 from 'table' extension
        parser.blockprocessors.register(markdown.blockprocessors.HRProcessor(parser), 'hr', 70)
+        parser.blockprocessors.register(OListProcessor(parser), 'olist', 68)
        parser.blockprocessors.register(UListProcessor(parser), 'ulist', 65)
        parser.blockprocessors.register(ListIndentProcessor(parser), 'indent', 60)
        parser.blockprocessors.register(BlockQuoteProcessor(parser), 'quote', 55)
--- a/zerver/lib/push_notifications.py
+++ b/zerver/lib/push_notifications.py
@ -489,15 +489,31 @@ def get_mobile_push_content(rendered_content: str) -> str:
        quote_text += '\n'
        return quote_text

+    def render_olist(ol: lxml.html.HtmlElement) -> str:
+        items = []
+        counter = int(ol.get('start')) if ol.get('start') else 1
+        nested_levels = len(list(ol.iterancestors('ol')))
+        indent = ('\n' + '  ' * nested_levels) if nested_levels else ''
+
+        for li in ol:
+            items.append(indent + str(counter) + '. ' + process(li).strip())
+            counter += 1
+
+        return '\n'.join(items)
+
    def process(elem: lxml.html.HtmlElement) -> str:
-        plain_text = get_text(elem)
-        sub_text = ''
-        for child in elem:
-            sub_text += process(child)
-        if elem.tag == 'blockquote':
-            sub_text = format_as_quote(sub_text)
-        plain_text += sub_text
-        plain_text += elem.tail or ""
+        plain_text = ''
+        if elem.tag == 'ol':
+            plain_text = render_olist(elem)
+        else:
+            plain_text = get_text(elem)
+            sub_text = ''
+            for child in elem:
+                sub_text += process(child)
+            if elem.tag == 'blockquote':
+                sub_text = format_as_quote(sub_text)
+            plain_text += sub_text
+            plain_text += elem.tail or ""
        return plain_text

    if settings.PUSH_NOTIFICATION_REDACT_CONTENT:
--- a/zerver/tests/fixtures/markdown_test_cases.json
+++ b/zerver/tests/fixtures/markdown_test_cases.json
@ -292,27 +292,27 @@
    },
    {
      "name": "numbered_list",
-      "input": "1. A\n 2. B",
-      "expected_output": "<p>1. A<br>\n 2. B</p>",
-      "text_content": "1. A\n 2. B"
+      "input": "1. A\n2. B\n  3. C",
+      "expected_output": "<ol>\n<li>A</li>\n<li>B<ol start=\"3\">\n<li>C</li>\n</ol>\n</li>\n</ol>",
+      "text_content": "1. A\n2. B\n  3. C"
    },
    {
-      "name": "auto_renumbered_list",
-      "input": "1. A\n1. B\n 1. C\n1. D",
-      "expected_output": "<p>1. A<br>\n2. B<br>\n 3. C<br>\n4. D</p>",
-      "text_content": "1. A\n2. B\n 3. C\n4. D"
+      "name": "auto_renumbered_list_trick",
+      "input": "1. A\n1. B\n  1. C\n1. D",
+      "expected_output": "<ol>\n<li>A</li>\n<li>B<ol>\n<li>C</li>\n</ol>\n</li>\n<li>D</li>\n</ol>",
+      "text_content": "1. A\n2. B\n  1. C\n3. D"
    },
    {
      "name": "auto_renumbered_list_from",
      "input": "3. A\n3. B\n3. C\n3. D",
-      "expected_output": "<p>3. A<br>\n4. B<br>\n5. C<br>\n6. D</p>",
+      "expected_output": "<ol start=\"3\">\n<li>A</li>\n<li>B</li>\n<li>C</li>\n<li>D</li>\n</ol>",
      "text_content": "3. A\n4. B\n5. C\n6. D"
    },
    {
-      "name": "not_auto_renumbered_list",
-      "input": "1. A\n3. B\n 2. C\n1. D",
-      "expected_output": "<p>1. A<br>\n3. B<br>\n 2. C<br>\n1. D</p>",
-      "text_content": "1. A\n3. B\n 2. C\n1. D"
+      "name": "two_auto_renumbered_lists",
+      "input": "1. A\n3. B\n  2. C\n1. D",
+      "expected_output": "<ol>\n<li>A</li>\n<li>B<ol start=\"2\">\n<li>C</li>\n</ol>\n</li>\n<li>D</li>\n</ol>",
+      "text_content": "1. A\n2. B\n  2. C\n3. D"
    },
    {
      "name": "linkify_interference",
@ -775,8 +775,9 @@
    {
      "name": "auto_renumbered_list_blankline",
      "input": "1. A\n\n1. B\n\n1. C\n\n1. D\nordinary paragraph\n1. AA\n\n1. BB",
-      "expected_output": "<p>1. A<br>\n2. B<br>\n3. C<br>\n4. D<br>\nordinary paragraph<br>\n1. AA<br>\n2. BB</p>",
-      "text_content": "1. A\n2. B\n3. C\n4. D\nordinary paragraph\n1. AA\n2. BB"
+      "expected_output": "<ol>\n<li>\n<p>A</p>\n</li>\n<li>\n<p>B</p>\n</li>\n<li>\n<p>C</p>\n</li>\n<li>\n<p>D<br>\nordinary paragraph</p>\n</li>\n<li>\n<p>AA</p>\n</li>\n<li>\n<p>BB</p>\n</li>\n</ol>",
+      "marked_expected_output": "<ol>\n<li><p>A</p>\n</li>\n<li><p>B</p>\n</li>\n<li><p>C</p>\n</li>\n<li><p>D<br>\nordinary paragraph</p>\n</li>\n<li><p>AA</p>\n</li>\n<li><p>BB</p>\n</li>\n</ol>",
+      "text_content": "1. A\n2. B\n3. C\n4. D\nordinary paragraph\n5. AA\n6. BB"
    }
  ],
  "linkify_tests": [
--- a/zerver/tests/test_bugdown.py
+++ b/zerver/tests/test_bugdown.py
@ -463,7 +463,7 @@ class BugdownTest(ZulipTestCase):
        converted = render_markdown(msg, content)
        self.assertEqual(converted, expected)

-        content = '>- http://cdn.wallpapersafari.com/13/6/16eVjx.jpeg\n\nAwesome!'
+        content = '>* http://cdn.wallpapersafari.com/13/6/16eVjx.jpeg\n\nAwesome!'
        expected = '<blockquote>\n<ul>\n<li><a href="http://cdn.wallpapersafari.com/13/6/16eVjx.jpeg" target="_blank" title="http://cdn.wallpapersafari.com/13/6/16eVjx.jpeg">http://cdn.wallpapersafari.com/13/6/16eVjx.jpeg</a></li>\n</ul>\n</blockquote>\n<p>Awesome!</p>'
        sender_user_profile = self.example_user('othello')
        msg = Message(sender=sender_user_profile, sending_client=get_client("test"))