markdown: Add data-codehilite-language attr for fenced code.

When converting fenced code blocks, we add the language (if specified) as a data attribute by tweaking the generated HTML. This allows the frontend to use the attribute to display a view-in-playground option for code blocks. We use Pygments to look up the lexer subclass name and use that, rather than the raw language string, as the attribute value. This maps different language aliases (like `js` and `javascript`) to a single canonical name (like `JavaScript`), so the client does not have to deal with multiple tags corresponding to the same language; if no lexer matches the language, the language string itself is used. The HTML structure for a message like this: ``` js ..content.. ``` would now be: <div class="codehilite" data-codehilite-language="JavaScript"> <pre>..content..</pre> </div> Tests and fixtures amended.
2020-09-06 12:11:37 +05:30 · 2020-09-06 12:11:37 +05:30 · 033351609d
parent e9d0bdea65
commit 033351609d
3 changed files with 27 additions and 4 deletions
--- a/zerver/lib/markdown/fenced_code.py
+++ b/zerver/lib/markdown/fenced_code.py
@ -80,7 +80,10 @@ from typing import Any, Dict, Iterable, List, Mapping, MutableSequence, Optional

 import markdown
 from django.utils.html import escape
+from lxml import etree
 from markdown.extensions.codehilite import CodeHilite, CodeHiliteExtension
+from pygments.lexers import get_lexer_by_name
+from pygments.util import ClassNotFound

 from zerver.lib.exceptions import MarkdownRenderingException
 from zerver.lib.tex import render_tex
@ -392,6 +395,24 @@ class FencedBlockPreprocessor(markdown.preprocessors.Preprocessor):
        else:
            code = CODE_WRAP.format(langclass, self._escape(text))

+        # In order to display a "view-in-playground" option in the frontend,
+        # we need to know the language used in the codeblock. We tweak the HTML
+        # CodeHilite generates to add this language as a data-attribute.
+        if lang:
+            parsed_code = etree.HTML(code)
+            div_tag = parsed_code[0][0]
+            # We get the lexer subclass name instead of directly processing the lang, to avoid
+            # different tags being generated for each of the lang's aliases. E.g. `js` and
+            # `javascript` would both be mapped to `JavaScript`. In case no lexer with that alias
+            # is found, we fall back to using the lang itself as the attribute value.
+            try:
+                lexer_subclass_name = get_lexer_by_name(lang).name
+            except ClassNotFound:
+                lexer_subclass_name = lang
+            div_tag.attrib['data-codehilite-language'] = lexer_subclass_name
+            # lxml serializes empty tags like <span></span> as <span/> by default;
+            # specifying method="c14n" when converting to string prevents that.
+            code = etree.tostring(div_tag, method="c14n").decode()
        return code

    def format_quote(self, text: str) -> str:
--- a/zerver/tests/fixtures/markdown_test_cases.json
+++ b/zerver/tests/fixtures/markdown_test_cases.json
@ -3,7 +3,7 @@
    {
      "name": "codeblock_hilite",
      "input": "Hamlet said:\n~~~~.python \ndef speak(self):\n    x = 1\n~~~~",
-      "expected_output": "<p>Hamlet said:</p>\n<div class=\"codehilite\"><pre><span></span><code><span class=\"k\">def</span> <span class=\"nf\">speak</span><span class=\"p\">(</span><span class=\"bp\">self</span><span class=\"p\">):</span>\n    <span class=\"n\">x</span> <span class=\"o\">=</span> <span class=\"mi\">1</span>\n</code></pre></div>",
+      "expected_output": "<p>Hamlet said:</p>\n<div class=\"codehilite\" data-codehilite-language=\"Python\"><pre><span></span><code><span class=\"k\">def</span> <span class=\"nf\">speak</span><span class=\"p\">(</span><span class=\"bp\">self</span><span class=\"p\">):</span>\n    <span class=\"n\">x</span> <span class=\"o\">=</span> <span class=\"mi\">1</span>\n</code></pre></div>",
      "marked_expected_output": "<p>Hamlet said:</p>\n<div class=\"codehilite\"><pre><span></span><code>def speak(self):\n    x = 1\n</code></pre></div>",
      "text_content": "Hamlet said:\ndef speak(self):\n    x = 1\n"
    },
@ -786,13 +786,13 @@
    {
      "name": "tex_fenced_tex",
      "input": "```tex\n\n\\pi \\textbf{ is not } 3.14\n```",
-      "expected_output": "<div class=\"codehilite\"><pre><span></span><code><span class=\"k\">\\pi</span> <span class=\"k\">\\textbf</span><span class=\"nb\">{</span> is not <span class=\"nb\">}</span> 3.14\n</code></pre></div>",
+      "expected_output": "<div class=\"codehilite\" data-codehilite-language=\"TeX\"><pre><span></span><code><span class=\"k\">\\pi</span> <span class=\"k\">\\textbf</span><span class=\"nb\">{</span> is not <span class=\"nb\">}</span> 3.14\n</code></pre></div>",
      "marked_expected_output": "<div class=\"codehilite\"><pre><span></span><code>\\pi \\textbf{ is not } 3.14\n</code></pre></div>"
    },
    {
      "name": "tex_fenced_latex",
      "input": "```latex\n\n\\pi \\textbf{ is not } 3.14\n```",
-      "expected_output": "<div class=\"codehilite\"><pre><span></span><code><span class=\"k\">\\pi</span> <span class=\"k\">\\textbf</span><span class=\"nb\">{</span> is not <span class=\"nb\">}</span> 3.14\n</code></pre></div>",
+      "expected_output": "<div class=\"codehilite\" data-codehilite-language=\"TeX\"><pre><span></span><code><span class=\"k\">\\pi</span> <span class=\"k\">\\textbf</span><span class=\"nb\">{</span> is not <span class=\"nb\">}</span> 3.14\n</code></pre></div>",
      "marked_expected_output": "<div class=\"codehilite\"><pre><span></span><code>\\pi \\textbf{ is not } 3.14\n</code></pre></div>"
    },
    {
--- a/zerver/tests/test_markdown.py
+++ b/zerver/tests/test_markdown.py
@ -1376,6 +1376,7 @@ class MarkdownTest(ZulipTestCase):
        msg_without_language = markdown_convert_wrapper(text.format(''))
        msg_with_quote = markdown_convert_wrapper(text.format('quote'))
        msg_with_math = markdown_convert_wrapper(text.format('math'))
+        msg_with_none = markdown_convert_wrapper(text.format('none'))

        # Render with default=javascript
        do_set_realm_property(realm, 'default_code_block_language', 'javascript')
@ -1403,7 +1404,8 @@ class MarkdownTest(ZulipTestCase):
        self.assertTrue(msg_with_python == msg_with_python_default_js == msg_without_language_default_py)
        self.assertTrue(msg_with_quote == msg_without_language_default_quote)
        self.assertTrue(msg_with_math == msg_without_language_default_math)
-        self.assertTrue(msg_without_language == msg_with_none_default_py == msg_without_language_final)
+        self.assertTrue(msg_without_language == msg_without_language_final)
+        self.assertTrue(msg_with_none == msg_with_none_default_py)

        # Test checking inside nested quotes
        nested_text = "````quote\n\n{}\n\n{}````".format(text.format('js'), text.format(''))