From a724a38c03838262f6d13969066b2bffd05da3b4 Mon Sep 17 00:00:00 2001
From: YashRE42 <33805964+YashRE42@users.noreply.github.com>
Date: Thu, 14 Feb 2019 21:45:30 +0530
Subject: [PATCH] markdown: Improve handling of broken img urls.

Some URLs which end with image file extensions (e.g. .jpg) may link to
HTML pages. This adds handling for linx.li, wikipedia.org and
pasteboard.co. Where possible, we redirect to the actual image URL;
otherwise, we do not attempt to render the link as an image.

Fixes #10438.
---
 zerver/lib/bugdown/__init__.py | 29 +++++++++++++++++++++++++++++
 zerver/tests/test_bugdown.py   | 11 +++++++++++
 2 files changed, 40 insertions(+)

diff --git a/zerver/lib/bugdown/__init__.py b/zerver/lib/bugdown/__init__.py
index 4d56b2808e..a38c95e9dd 100644
--- a/zerver/lib/bugdown/__init__.py
+++ b/zerver/lib/bugdown/__init__.py
@@ -601,12 +601,33 @@ class InlineInterestingLinkProcessor(markdown.treeprocessors.Treeprocessor):
         if not self.markdown.image_preview_enabled:
             return False
         parsed_url = urllib.parse.urlparse(url)
+        # Reject html urls that end with img extensions and cannot be corrected to an image url
+        if parsed_url.netloc == 'pasteboard.co':
+            return False
+
         # List from http://support.google.com/chromeos/bin/answer.py?hl=en&answer=183093
         for ext in [".bmp", ".gif", ".jpg", "jpeg", ".png", ".webp"]:
             if parsed_url.path.lower().endswith(ext):
                 return True
         return False
 
+    def corrected_image_source(self, url: str) -> Optional[str]:
+        # Returns the actual image url for linx.li and *.wikipedia.org
+        # links (whose image-looking urls point to html pages), or None
+        # for any other host.  It's structurally very similar to
+        # dropbox_image, and possibly should be rewritten to use open graph.
+        parsed_url = urllib.parse.urlparse(url)
+        if parsed_url.netloc.lower().endswith('.wikipedia.org'):
+            # Redirecting from "/wiki/File:" to "/wiki/Special:FilePath/File:"
+            # A possible alternative, that avoids the redirect after hitting "Special:"
+            # is using the first characters of md5($filename) to generate the url
+            domain = parsed_url.scheme + "://" + parsed_url.netloc
+            correct_url = domain + parsed_url.path[:6] + 'Special:FilePath' + parsed_url.path[5:]
+            return correct_url
+        if parsed_url.netloc.lower() == 'linx.li':
+            return 'https://linx.li/s' + parsed_url.path
+        return None
+
     def dropbox_image(self, url: str) -> Optional[Dict[str, Any]]:
         # TODO: The returned Dict could possibly be a TypedDict in future.
         parsed_url = urllib.parse.urlparse(url)
@@ -978,9 +999,17 @@ class InlineInterestingLinkProcessor(markdown.treeprocessors.Treeprocessor):
                       class_attr=class_attr,
                       already_thumbnailed=True)
                 continue
+
             if self.is_image(url):
+                image_source = self.corrected_image_source(url)
+                if image_source is not None:
+                    found_url = ResultWithFamily(
+                        family=found_url.family,
+                        result=(image_source, image_source)
+                    )
                 self.handle_image_inlining(root, found_url)
                 continue
+
             if get_tweet_id(url) is not None:
                 if rendered_tweet_count >= self.TWITTER_MAX_TO_PREVIEW:
                     # Only render at most one tweet per message
diff --git a/zerver/tests/test_bugdown.py b/zerver/tests/test_bugdown.py
index fb6cb64cdb..fbae52124e 100644
--- a/zerver/tests/test_bugdown.py
+++ b/zerver/tests/test_bugdown.py
@@ -448,6 +448,17 @@ class BugdownTest(ZulipTestCase):
         converted = render_markdown(msg, content)
         self.assertEqual(converted, expected)
 
+    @override_settings(INLINE_IMAGE_PREVIEW=True)
+    def test_corrected_image_source(self) -> None:
+        # Only the wikipedia.org rewrite is tested; linx.li urls can be expected to expire.
+        content = 'https://en.wikipedia.org/wiki/File:Wright_of_Derby,_The_Orrery.jpg'
+        expected = '<div class="message_inline_image"><a href="https://en.wikipedia.org/wiki/Special:FilePath/File:Wright_of_Derby,_The_Orrery.jpg" target="_blank" title="https://en.wikipedia.org/wiki/Special:FilePath/File:Wright_of_Derby,_The_Orrery.jpg"><img data-src-fullsize="/thumbnail?url=https%3A%2F%2Fen.wikipedia.org%2Fwiki%2FSpecial%3AFilePath%2FFile%3AWright_of_Derby%2C_The_Orrery.jpg&amp;size=full" src="/thumbnail?url=https%3A%2F%2Fen.wikipedia.org%2Fwiki%2FSpecial%3AFilePath%2FFile%3AWright_of_Derby%2C_The_Orrery.jpg&amp;size=thumbnail"></a></div>'
+
+        sender_user_profile = self.example_user('othello')
+        msg = Message(sender=sender_user_profile, sending_client=get_client("test"))
+        converted = render_markdown(msg, content)
+        self.assertEqual(converted, expected)
+
     @override_settings(INLINE_IMAGE_PREVIEW=False)
     def test_image_preview_enabled(self) -> None:
         ret = bugdown.image_preview_enabled()