From ae58ed5a74d9fb902bc5f746086cf0400009abcf Mon Sep 17 00:00:00 2001
From: Tim Abbott <tabbott@zulip.com>
Date: Mon, 14 Sep 2020 21:43:56 -0700
Subject: [PATCH] markdown: Tweak data-code-language testing and comments.

This should clarify the precise decisions we've made about the
intended semantics of this feature.
---
 zerver/lib/markdown/fenced_code.py            | 33 ++++++++++++-------
 .../tests/fixtures/markdown_test_cases.json   |  7 ++++
 2 files changed, 28 insertions(+), 12 deletions(-)
diff --git a/zerver/lib/markdown/fenced_code.py b/zerver/lib/markdown/fenced_code.py
index 6e3f3cfb8c..56093b5f95 100644
--- a/zerver/lib/markdown/fenced_code.py
+++ b/zerver/lib/markdown/fenced_code.py
@@ -395,23 +395,32 @@ class FencedBlockPreprocessor(markdown.preprocessors.Preprocessor):
         else:
             code = CODE_WRAP.format(langclass, self._escape(text))
 
-        # In order to display a "view-in-playground" option in the frontend,
-        # we need to know the language used in the codeblock. We tweak the HTML
-        # CodeHilite generates to add this language as a data-attribute.
+        # To support our "view in playground" feature, the frontend
+        # needs to know what Pygments language was used for
+        # highlighting this code block.  We record this in a data
+        # attribute attached to the outer `pre` element.
+        # Unfortunately, the pygments API doesn't offer a way to add
+        # this, so we need to do it in a post-processing step.
         if lang:
             parsed_code = etree.HTML(code)
             div_tag = parsed_code[0][0]
-            # We get the lexer subclass name instead of directly processing the lang, to avoid
-            # different tags being generated for each of the lang's alias. Eg: `js` and `javascript`
-            # would now be mapped to `JavaScript`. In case no lexer with that alias is found, we
-            # return back the text, wrapped in a data-codehilite tag.
+
+            # For the value of our data element, we get the lexer
+            # subclass name instead of directly using the language,
+            # since that canonicalizes aliases (e.g., `js` and
+            # `javascript` will be mapped to `JavaScript`).
             try:
-                lexer_subclass_name = get_lexer_by_name(lang).name
+                code_language = get_lexer_by_name(lang).name
             except ClassNotFound:
-                lexer_subclass_name = lang
-            div_tag.attrib['data-code-language'] = lexer_subclass_name
-            # Lxml implicitly converts tags like <span></span> into <span/>
-            # specifying method="c14n" when converting to string, prevents that.
+                # If there isn't a Pygments lexer by this name, we
+                # still tag it with the user's data-code-language
+                # value, since this allows hooking up a "playground"
+                # for custom "languages" that aren't known to Pygments.
+                code_language = lang
+
+            div_tag.attrib['data-code-language'] = code_language
+            # lxml implicitly converts tags like <span></span> into <span/>.
+            # Specifying method="c14n" when converting to string prevents that.
             code = etree.tostring(div_tag, method="c14n").decode()
         return code
 
diff --git a/zerver/tests/fixtures/markdown_test_cases.json b/zerver/tests/fixtures/markdown_test_cases.json
index 455b8b0514..726605e5f6 100644
--- a/zerver/tests/fixtures/markdown_test_cases.json
+++ b/zerver/tests/fixtures/markdown_test_cases.json
@@ -7,6 +7,13 @@
       "marked_expected_output": "<p>Hamlet said:</p>\n<div class=\"codehilite\"><pre><span></span><code>def speak(self):\n    x = 1\n</code></pre></div>",
       "text_content": "Hamlet said:\ndef speak(self):\n    x = 1\n"
     },
+    {
+      "name": "codeblock_hilite",
+      "input": "``` inventedlanguage\ndef speak(self):\n    x = 1\n```",
+      "expected_output": "<div class=\"codehilite\" data-code-language=\"inventedlanguage\"><pre><span></span><code>def speak(self):\n    x = 1\n</code></pre></div>",
+      "marked_expected_output": "<div class=\"codehilite\"><pre><span></span><code>def speak(self):\n    x = 1\n</code></pre></div>",
+      "text_content": "def speak(self):\n    x = 1\n"
+    },
     {
       "name": "ampampamp",
       "input": "& &amp; &amp;amp;\n~~~~\n& &amp; &amp;amp;\n~~~~\n    & &amp; &amp;amp;",