zulip/zephyr/lib/bugdown/__init__.py

import markdown
import logging
import traceback
import urlparse
import re

from zephyr.lib.avatar  import gravatar_hash
from zephyr.lib.bugdown import codehilite, fenced_code
from zephyr.lib.bugdown.fenced_code import FENCE_RE

class Gravatar(markdown.inlinepatterns.Pattern):
    """Inline pattern that renders a matched e-mail address as a Gravatar image."""

    def handleMatch(self, match):
        """Return an ``img`` element for the address in the ``email`` group.

        The image source is the secure Gravatar URL built from
        ``gravatar_hash`` of the address (identicon fallback, size 30).
        """
        digest = gravatar_hash(match.group('email'))
        avatar_url = 'https://secure.gravatar.com/avatar/%s?d=identicon&s=30' % (digest,)

        image = markdown.util.etree.Element('img')
        image.set('class', 'message_body_gravatar img-rounded')
        image.set('src', avatar_url)
        return image

def fixup_link(link):
    """Apply the attributes that every rendered link carries.

    ``target`` is set to ``_blank`` and ``title`` is set to whatever the
    element's ``href`` attribute currently holds.  The element is modified
    in place; nothing is returned.
    """
    destination = link.get('href')
    link.set('target', '_blank')
    link.set('title', destination)

class AutoLink(markdown.inlinepatterns.Pattern):
    """Inline pattern that turns a bare URL into a link labelled with that URL."""

    def handleMatch(self, match):
        """Return an ``a`` element whose href and text are the ``url`` group."""
        target = match.group('url')

        anchor = markdown.util.etree.Element('a')
        anchor.text = target
        anchor.set('href', target)
        # Same target/title treatment as every other link we emit.
        fixup_link(anchor)
        return anchor

class UListProcessor(markdown.blockprocessors.OListProcessor):
    """ Block processor for unordered lists.

        Modelled on markdown.blockprocessors.UListProcessor, except that
        only '*' introduces an item; '+' and '-' are not treated as
        bullet characters."""

    # Up to three leading spaces, a literal '*', at least one space, then
    # the item text.
    RE = re.compile(r'^[ ]{0,3}[*][ ]+(.*)')
    TAG = 'ul'

class BugdownUListPreprocessor(markdown.preprocessors.Preprocessor):
    """ Lets an unordered list start on the line right after a paragraph.

        Wherever a list item directly follows a non-empty line that is
        not itself a list item, an empty line is inserted between the
        two so that Markdown recognises the list.  Lines inside fenced
        code blocks are left alone."""

    LI_RE = re.compile(r'^[ ]{0,3}[*][ ]+(.*)', re.MULTILINE)
    HANGING_ULIST_RE = re.compile(r'^.+\n([ ]{0,3}[*][ ]+.*)', re.MULTILINE)

    def run(self, lines):
        """ Return a copy of lines with separating blank lines added """
        result = []
        open_fence = None

        # Walk each line together with the one that follows it; the final
        # line has no successor and is appended after the loop.
        for current, upcoming in zip(lines, lines[1:]):
            result.append(current)

            # Track fenced code blocks: a fence opens on the first fence
            # line seen and closes on a later line with the same marker.
            fence_match = FENCE_RE.match(current)
            if fence_match:
                marker = fence_match.group('fence')
                if not open_fence:
                    open_fence = marker
                elif open_fence == marker:
                    open_fence = None

            # Nothing to do inside a fence or after an empty line.
            if open_fence or not current:
                continue

            # A list item hanging off a non-list line needs a blank line
            # in front of it.
            if self.LI_RE.match(upcoming) and not self.LI_RE.match(current):
                result.append('')

        result.extend(lines[-1:])
        return result

# Based on markdown.inlinepatterns.LinkPattern
class LinkPattern(markdown.inlinepatterns.Pattern):
    """ Return a link element from the given match. """

    # Schemes that are legitimately written without a network location.
    LOCLESS_SCHEMES = ('mailto', 'news')
    # The only schemes allowed into a rendered href.  Anything else
    # (javascript:, data:, vbscript:, ...) is rejected even when the URL
    # appears to have a netloc, e.g. "javascript://host/%0Aalert(1)".
    ALLOWED_SCHEMES = LOCLESS_SCHEMES + ('http', 'https', 'ftp', 'ftps')

    def handleMatch(self, m):
        """Build the ``a`` element for an explicit Markdown link.

        Group 2 of the match is used as the link text and group 9 as the
        raw destination.  A destination wrapped in angle brackets has them
        removed; the result is stripped, unescaped and passed through
        sanitize_url().  A missing destination yields an empty href.
        """
        el = markdown.util.etree.Element("a")
        el.text = m.group(2)
        href = m.group(9)

        if href:
            if href[0] == "<":
                href = href[1:-1]
            el.set("href", self.sanitize_url(self.unescape(href.strip())))
        else:
            el.set("href", "")

        fixup_link(el)
        return el

    def sanitize_url(self, url):
        """
        Sanitize a url against xss attacks.
        See the docstring on markdown.inlinepatterns.LinkPattern.sanitize_url.

        Returns the re-assembled url when it is considered safe, and the
        empty string otherwise.  A url is rejected when it cannot be
        parsed, when its scheme is not in ALLOWED_SCHEMES, when it has no
        network location and its scheme is not in LOCLESS_SCHEMES, or when
        its path, params, query or fragment contains a colon.
        """
        try:
            parts = urlparse.urlparse(url.replace(' ', '%20'))
            scheme, netloc, path, params, query, fragment = parts
        except ValueError:
            # Bad url - so bad it couldn't be parsed.
            return ''

        # Humbug modification: If scheme is not specified, assume http://
        # It's unlikely that users want relative links within humbughq.com.
        # We re-enter sanitize_url because netloc etc. need to be re-parsed.
        if not scheme:
            return self.sanitize_url('http://' + url)

        # Checking only for a netloc is not enough: a url such as
        # javascript://foo/%0Aalert(1) has one, yet still runs script when
        # clicked.  Only let known-harmless schemes through.
        if scheme not in self.ALLOWED_SCHEMES:
            return ''

        if netloc == '' and scheme not in self.LOCLESS_SCHEMES:
            # This fails regardless of anything else.
            # Return immediately to save additional processing
            return ''

        # A colon in the path, params, query or fragment could smuggle in
        # a second scheme, so treat it as unsafe.
        for part in parts[2:]:
            if ":" in part:
                # Not a safe url
                return ''

        # Url passes all tests. Return url as-is.
        return urlparse.urlunparse(parts)

class Bugdown(markdown.Extension):
    """Markdown extension that removes unwanted stock syntax and installs
    the custom patterns and processors defined in this module."""

    def extendMarkdown(self, md, md_globals):
        """Adjust the preprocessors, block processors and inline patterns of md."""
        del md.preprocessors['reference']

        removed_inline_patterns = (
            'image_link', 'image_reference', 'automail',
            'autolink', 'link', 'reference', 'short_reference',
            'escape',
        )
        for name in removed_inline_patterns:
            del md.inlinePatterns[name]

        removed_block_processors = ('hashheader', 'setextheader', 'olist', 'ulist')
        for name in removed_block_processors:
            del md.parser.blockprocessors[name]

        # Put back unordered lists, using our '*'-only processor.
        md.parser.blockprocessors.add('ulist', UListProcessor(md.parser), '>hr')

        gravatar_regex = r'!gravatar\((?P<email>[^)]*)\)'
        md.inlinePatterns.add('gravatar', Gravatar(gravatar_regex), '_begin')

        explicit_link = LinkPattern(markdown.inlinepatterns.LINK_RE, md)
        md.inlinePatterns.add('link', explicit_link, '>backtick')

        # A link starts at a word boundary, and ends at space or end-of-input.
        # But any trailing punctuation (other than /) is not included.
        # We accomplish this with a non-greedy match followed by a greedy
        # lookahead assertion.
        #
        # markdown.inlinepatterns.Pattern compiles this with re.UNICODE, which
        # is important because we're using \w.
        link_regex = r'\b(?P<url>https?://[^\s]+?)(?=[^\w/]*(\s|\Z))'
        # Registered after 'link' so explicit links are handled first.
        md.inlinePatterns.add('autolink', AutoLink(link_regex), '>link')

        hanging_lists = BugdownUListPreprocessor(md)
        md.preprocessors.add('hanging_ulists', hanging_lists, "_begin")

# The single, module-wide Markdown engine used by convert(): nl2br,
# our local codehilite and fenced_code extensions, and the Bugdown
# extension, with raw HTML escaped and HTML output.
_md_engine = markdown.Markdown(
    extensions=[
        'nl2br',
        codehilite.makeExtension(configs=[
            ('force_linenos', False),
            ('guess_lang', False),
        ]),
        fenced_code.makeExtension(),
        Bugdown(),
    ],
    safe_mode='escape',
    output_format='html',
)

# Markdown parser failures get logged, but for privacy reasons the message
# text itself must not end up in the logs.  As a compromise, every
# alphanumeric character is replaced with 'x' before logging.
#
# The result also goes through repr() to improve reproducibility, and to
# escape terminal control codes, which can do surprisingly nasty things.
_privacy_re = re.compile(r'\w', flags=re.UNICODE)
def _sanitize_for_log(md):
    """Return the repr() of md with each word character masked as 'x'."""
    masked = _privacy_re.sub('x', md)
    return repr(masked)

def convert(md):
    """Convert Markdown to HTML, with Humbug-specific settings and hacks.

    md is the Markdown source text.  Returns the rendered HTML.  If the
    parser raises, the failure is logged (with the input masked by
    _sanitize_for_log) and a fixed apology paragraph is returned instead,
    so callers never see a parser exception.
    """

    # Reset the parser; otherwise it will get slower over time.
    _md_engine.reset()

    try:
        html = _md_engine.convert(md)
    except Exception:
        # Only genuine errors are swallowed here; KeyboardInterrupt and
        # SystemExit must keep propagating, which a bare "except:" would
        # prevent.
        # FIXME: Do something more reasonable here!
        html = '<p>[Humbug note: Sorry, we could not understand the formatting of your message]</p>'
        # format_exc() ends with a newline, so the two parts of the message
        # end up on separate lines.
        logging.getLogger('').error(
            'Exception in Markdown parser: %sInput (sanitized) was: %s',
            traceback.format_exc(), _sanitize_for_log(md))

    return html
Move our various Markdown hacks into their own file (imported from commit b03a5c64cc95964936c4aba7d667807969e35d21) 2012-10-15 22:03:50 +02:00			`import markdown`
bugdown: Log Markdown parser failures (imported from commit 9e225a32b71edbfd9007cc2fbae32be31896233d) 2012-10-22 05:06:28 +02:00			`import logging`
			`import traceback`
bugdown: Use a local copy of markdown.inlinepatterns.LinkPattern With changes for the way things are imported in bugdown. (imported from commit 11adf7911f7bb945367221f8fa317caa71de3fc4) 2012-12-04 19:57:54 +01:00			`import urlparse`
bugdown: Hide alphanumeric characters in exception logs, for privacy (imported from commit 39481494b7910307f56e566035c1b464c83d196e) 2012-10-25 21:38:47 +02:00			`import re`
Move our various Markdown hacks into their own file (imported from commit b03a5c64cc95964936c4aba7d667807969e35d21) 2012-10-15 22:03:50 +02:00
Use our local copy of codehilite (imported from commit bc9704a19017061d55ff0e16589d57ee0c46caa5) 2012-10-20 05:34:14 +02:00			`from zephyr.lib.avatar import gravatar_hash`
bugdown: Use our local copy of fenced_code And wire it up to our local copy of codehilite. This fixes highlighting in fenced code blocks, e.g. ~~~~ .js var x = function () { return "hi"; }; ~~~~ (imported from commit 0efb0c9b98a3acdf55e18bb1918af7960f3425be) 2012-11-19 18:31:03 +01:00			`from zephyr.lib.bugdown import codehilite, fenced_code`
Handle lists that start immediately after paragraphs (imported from commit 055593d7a324598e133d53db0c33103016426c8f) 2013-01-24 19:35:20 +01:00			`from zephyr.lib.bugdown.fenced_code import FENCE_RE`
Allow users to embed Gravatars in messages We'll use this internally for the commit bot. We might eventually disable it for external users. (imported from commit 3136cd9faadc6b81355889d2ee6472985da87fbe) 2012-10-17 04:42:19 +02:00
			`class Gravatar(markdown.inlinepatterns.Pattern):`
			`def handleMatch(self, match):`
			`img = markdown.util.etree.Element('img')`
			`img.set('class', 'message_body_gravatar img-rounded')`
			`img.set('src', 'https://secure.gravatar.com/avatar/%s?d=identicon&s=30'`
bugdown: Use named match for Gravatar email (imported from commit 778c4b6d754f975c89a91336593d2f62e49249d5) 2012-10-22 02:15:44 +02:00			`% (gravatar_hash(match.group('email')),))`
Allow users to embed Gravatars in messages We'll use this internally for the commit bot. We might eventually disable it for external users. (imported from commit 3136cd9faadc6b81355889d2ee6472985da87fbe) 2012-10-17 04:42:19 +02:00			`return img`

bugdown: Set link target, title attributes on the server For consistency. Fixes #266. (imported from commit 63e199a0fe4534df804a82cc98a1fdcf8ccb45da) 2012-12-04 20:15:50 +01:00			`def fixup_link(link):`
			`"""Set certain attributes we want on every link."""`
			`link.set('target', '_blank')`
			`link.set('title', link.get('href'))`

bugdown: Linkify using a Markdown extension This prevents trying to linkify inside code blocks. (imported from commit 97dd20ecee19f41650aa98e68aa8e9908ece5b33) 2012-10-22 02:32:18 +02:00			`class AutoLink(markdown.inlinepatterns.Pattern):`
			`def handleMatch(self, match):`
			`url = match.group('url')`
			`a = markdown.util.etree.Element('a')`
			`a.set('href', url)`
			`a.text = url`
bugdown: Set link target, title attributes on the server For consistency. Fixes #266. (imported from commit 63e199a0fe4534df804a82cc98a1fdcf8ccb45da) 2012-12-04 20:15:50 +01:00			`fixup_link(a)`
bugdown: Linkify using a Markdown extension This prevents trying to linkify inside code blocks. (imported from commit 97dd20ecee19f41650aa98e68aa8e9908ece5b33) 2012-10-22 02:32:18 +02:00			`return a`

bugdown: Disable + as a bullet character for an unordered list Fixes #272. (imported from commit 8afaf14965ed1f6a4bb3ccfc9d4c2d807148666d) 2012-11-02 18:25:37 +01:00			`class UListProcessor(markdown.blockprocessors.OListProcessor):`
			`""" Process unordered list blocks.`

			`Based on markdown.blockprocessors.UListProcessor, but does not accept`
Don't accept - as valid list delimiters (imported from commit 287353a29289ee536a59f47f87ff66893bf261ec) 2013-01-23 23:07:01 +01:00			`'+' or '-' as a bullet character."""`
bugdown: Disable + as a bullet character for an unordered list Fixes #272. (imported from commit 8afaf14965ed1f6a4bb3ccfc9d4c2d807148666d) 2012-11-02 18:25:37 +01:00
			`TAG = 'ul'`
Don't accept - as valid list delimiters (imported from commit 287353a29289ee536a59f47f87ff66893bf261ec) 2013-01-23 23:07:01 +01:00			`RE = re.compile(r'^[ ]{0,3}[*][ ]+(.*)')`
bugdown: Disable + as a bullet character for an unordered list Fixes #272. (imported from commit 8afaf14965ed1f6a4bb3ccfc9d4c2d807148666d) 2012-11-02 18:25:37 +01:00
Handle lists that start immediately after paragraphs (imported from commit 055593d7a324598e133d53db0c33103016426c8f) 2013-01-24 19:35:20 +01:00			`class BugdownUListPreprocessor(markdown.preprocessors.Preprocessor):`
			`""" Allows unordered list blocks that come directly after a`
			`paragraph to be rendered as an unordered list`

			`Detects paragraphs that have a matching list item that comes`
			`directly after a line of text, and inserts a newline between`
			`to satisfy Markdown"""`

			`LI_RE = re.compile(r'^[ ]{0,3}[*][ ]+(.*)', re.MULTILINE)`
			`HANGING_ULIST_RE = re.compile(r'^.+\n([ ]{0,3}[*][ ]+.*)', re.MULTILINE)`

			`def run(self, lines):`
			`""" Insert a newline between a paragraph and ulist if missing """`
			`inserts = 0`
			`fence = None`
			`copy = lines[:]`
			`for i in xrange(len(lines) - 1):`
			`# Ignore anything that is inside a fenced code block`
			`m = FENCE_RE.match(lines[i])`
			`if not fence and m:`
			`fence = m.group('fence')`
			`elif fence and m and fence == m.group('fence'):`
			`fence = None`

			`# If we're not in a fenced block and we detect an upcoming list`
			`# hanging off a paragraph, add a newline`
			`if not fence and lines[i] and \`
			`self.LI_RE.match(lines[i+1]) and not self.LI_RE.match(lines[i]):`
			`copy.insert(i+inserts+1, '')`
			`inserts += 1`
			`return copy`

bugdown: Use a local copy of markdown.inlinepatterns.LinkPattern With changes for the way things are imported in bugdown. (imported from commit 11adf7911f7bb945367221f8fa317caa71de3fc4) 2012-12-04 19:57:54 +01:00			`# Based on markdown.inlinepatterns.LinkPattern`
			`class LinkPattern(markdown.inlinepatterns.Pattern):`
			`""" Return a link element from the given match. """`
			`def handleMatch(self, m):`
			`el = markdown.util.etree.Element("a")`
			`el.text = m.group(2)`
			`href = m.group(9)`

			`if href:`
			`if href[0] == "<":`
			`href = href[1:-1]`
			`el.set("href", self.sanitize_url(self.unescape(href.strip())))`
			`else:`
			`el.set("href", "")`

bugdown: Set link target, title attributes on the server For consistency. Fixes #266. (imported from commit 63e199a0fe4534df804a82cc98a1fdcf8ccb45da) 2012-12-04 20:15:50 +01:00			`fixup_link(el)`
bugdown: Use a local copy of markdown.inlinepatterns.LinkPattern With changes for the way things are imported in bugdown. (imported from commit 11adf7911f7bb945367221f8fa317caa71de3fc4) 2012-12-04 19:57:54 +01:00			`return el`

			`def sanitize_url(self, url):`
			`"""`
			`Sanitize a url against xss attacks.`
			`See the docstring on markdown.inlinepatterns.LinkPattern.sanitize_url.`
			`"""`
			`try:`
bugdown: Fix confusing variable use in LinkPattern (imported from commit 1538911149263340a5ea210c6d804a937f07cd5e) 2012-12-04 20:01:07 +01:00			`parts = urlparse.urlparse(url.replace(' ', '%20'))`
			`scheme, netloc, path, params, query, fragment = parts`
bugdown: Use a local copy of markdown.inlinepatterns.LinkPattern With changes for the way things are imported in bugdown. (imported from commit 11adf7911f7bb945367221f8fa317caa71de3fc4) 2012-12-04 19:57:54 +01:00			`except ValueError:`
			`# Bad url - so bad it couldn't be parsed.`
			`return ''`

bugdown: Assume http:// for links without a protocol It's unlikely that users want relative links within humbughq.com. Fixes #447. (imported from commit d43a5758e6df448b07f56dc2de28078adaab8aeb) 2012-12-04 20:04:34 +01:00			`# Humbug modification: If scheme is not specified, assume http://`
			`# It's unlikely that users want relative links within humbughq.com.`
			`# We re-enter sanitize_url because netloc etc. need to be re-parsed.`
			`if not scheme:`
			`return self.sanitize_url('http://' + url)`

bugdown: Use a local copy of markdown.inlinepatterns.LinkPattern With changes for the way things are imported in bugdown. (imported from commit 11adf7911f7bb945367221f8fa317caa71de3fc4) 2012-12-04 19:57:54 +01:00			`locless_schemes = ['', 'mailto', 'news']`
			`if netloc == '' and scheme not in locless_schemes:`
			`# This fails regardless of anything else.`
			`# Return immediately to save additional proccessing`
			`return ''`

bugdown: Fix confusing variable use in LinkPattern (imported from commit 1538911149263340a5ea210c6d804a937f07cd5e) 2012-12-04 20:01:07 +01:00			`for part in parts[2:]:`
bugdown: Use a local copy of markdown.inlinepatterns.LinkPattern With changes for the way things are imported in bugdown. (imported from commit 11adf7911f7bb945367221f8fa317caa71de3fc4) 2012-12-04 19:57:54 +01:00			`if ":" in part:`
			`# Not a safe url`
			`return ''`

			`# Url passes all tests. Return url as-is.`
bugdown: Fix confusing variable use in LinkPattern (imported from commit 1538911149263340a5ea210c6d804a937f07cd5e) 2012-12-04 20:01:07 +01:00			`return urlparse.urlunparse(parts)`
bugdown: Use a local copy of markdown.inlinepatterns.LinkPattern With changes for the way things are imported in bugdown. (imported from commit 11adf7911f7bb945367221f8fa317caa71de3fc4) 2012-12-04 19:57:54 +01:00
Markdown: Disable images (imported from commit 6656b15fa690b463265af6384a73529ee635f688) 2012-10-16 17:35:58 +02:00			`class Bugdown(markdown.Extension):`
			`def extendMarkdown(self, md, md_globals):`
bugdown: Disable reference-based links This is syntax like Here's [a link][] [a link]: http://google.com This is not very useful for short chat-style messages. It will confuse users, especially because we don't document it. And disabling it saves the effort of applying the same link fixups as elsewhere. (imported from commit c23391465486db545302b79c084b4f9cd5cdcc6a) 2012-12-04 20:22:14 +01:00			`del md.preprocessors['reference']`

			`for k in ('image_link', 'image_reference', 'automail',`
bugdown: Remove special treatment of backslash Fixes #562. (imported from commit aa39cf390ef44275c7d5a84ca954c75535d372b1) 2012-12-11 21:19:15 +01:00			`'autolink', 'link', 'reference', 'short_reference',`
			`'escape'):`
bugdown: Decrease code duplication (imported from commit 40158134b60e477d8a353d050fa62c9ded0e9e9f) 2012-10-22 02:35:36 +02:00			`del md.inlinePatterns[k]`

bugdown: Disable + as a bullet character for an unordered list Fixes #272. (imported from commit 8afaf14965ed1f6a4bb3ccfc9d4c2d807148666d) 2012-11-02 18:25:37 +01:00			`for k in ('hashheader', 'setextheader', 'olist', 'ulist'):`
bugdown: Decrease code duplication (imported from commit 40158134b60e477d8a353d050fa62c9ded0e9e9f) 2012-10-22 02:35:36 +02:00			`del md.parser.blockprocessors[k]`
Markdown: Disable images (imported from commit 6656b15fa690b463265af6384a73529ee635f688) 2012-10-16 17:35:58 +02:00
bugdown: Disable + as a bullet character for an unordered list Fixes #272. (imported from commit 8afaf14965ed1f6a4bb3ccfc9d4c2d807148666d) 2012-11-02 18:25:37 +01:00			`md.parser.blockprocessors.add('ulist', UListProcessor(md.parser), '>hr')`

bugdown: Use named match for Gravatar email (imported from commit 778c4b6d754f975c89a91336593d2f62e49249d5) 2012-10-22 02:15:44 +02:00			`md.inlinePatterns.add('gravatar', Gravatar(r'!gravatar\((?P<email>[^)]*)\)'), '_begin')`
bugdown: Remove special treatment of backslash Fixes #562. (imported from commit aa39cf390ef44275c7d5a84ca954c75535d372b1) 2012-12-11 21:19:15 +01:00			`md.inlinePatterns.add('link', LinkPattern(markdown.inlinepatterns.LINK_RE, md), '>backtick')`
Allow users to embed Gravatars in messages We'll use this internally for the commit bot. We might eventually disable it for external users. (imported from commit 3136cd9faadc6b81355889d2ee6472985da87fbe) 2012-10-17 04:42:19 +02:00
bugdown: Remove trailing punctuation from automatic links And allow parentheses etc. within a link, if they're not at the end. Fixes #401. (imported from commit 5261fce74fe381ceece2e7406776cc5bde19deb9) 2012-11-20 19:33:10 +01:00			`# A link starts at a word boundary, and ends at space or end-of-input.`
			`# But any trailing punctuation (other than /) is not included.`
			`# We accomplish this with a non-greedy match followed by a greedy`
			`# lookahead assertion.`
			`#`
			`# markdown.inlinepatterns.Pattern compiles this with re.UNICODE, which`
			`# is important because we're using \w.`
			`link_regex = r'\b(?P<url>https?://[^\s]+?)(?=[^\w/]*(\s|\Z))'`
bugdown: Install autolink pattern after link pattern This fixes explicit links. (imported from commit 6867d271344d35c2c8d54b3393219113a095bc4f) 2012-10-22 23:47:49 +02:00			`md.inlinePatterns.add('autolink', AutoLink(link_regex), '>link')`
bugdown: Linkify using a Markdown extension This prevents trying to linkify inside code blocks. (imported from commit 97dd20ecee19f41650aa98e68aa8e9908ece5b33) 2012-10-22 02:32:18 +02:00
Handle lists that start immediately after paragraphs (imported from commit 055593d7a324598e133d53db0c33103016426c8f) 2013-01-24 19:35:20 +01:00			`md.preprocessors.add('hanging_ulists',`
			`BugdownUListPreprocessor(md),`
			`"_begin")`

bugdown: Use Markdown.reset() instead of re-creating the whole parser (imported from commit 45a65453f2178a6a73392e2bd1e7d6d03de0e0e7) 2012-11-20 20:15:55 +01:00			`_md_engine = markdown.Markdown(`
			`safe_mode = 'escape',`
			`output_format = 'html',`
			`extensions = ['nl2br',`
			`codehilite.makeExtension(configs=[`
			`('force_linenos', False),`
			`('guess_lang', False)]),`
			`fenced_code.makeExtension(),`
			`Bugdown()])`
Move our various Markdown hacks into their own file (imported from commit b03a5c64cc95964936c4aba7d667807969e35d21) 2012-10-15 22:03:50 +02:00
bugdown: Hide alphanumeric characters in exception logs, for privacy (imported from commit 39481494b7910307f56e566035c1b464c83d196e) 2012-10-25 21:38:47 +02:00			`# We want to log Markdown parser failures, but shouldn't log the actual input`
			`# message for privacy reasons. The compromise is to replace all alphanumeric`
			`# characters with 'x'.`
			`#`
			`# We also use repr() to improve reproducibility, and to escape terminal control`
			`# codes, which can do surprisingly nasty things.`
			`_privacy_re = re.compile(r'\w', flags=re.UNICODE)`
			`def _sanitize_for_log(md):`
			`return repr(_privacy_re.sub('x', md))`

Move our various Markdown hacks into their own file (imported from commit b03a5c64cc95964936c4aba7d667807969e35d21) 2012-10-15 22:03:50 +02:00			`def convert(md):`
			`"""Convert Markdown to HTML, with Humbug-specific settings and hacks."""`
bugdown: Use Markdown.reset() instead of re-creating the whole parser (imported from commit 45a65453f2178a6a73392e2bd1e7d6d03de0e0e7) 2012-11-20 20:15:55 +01:00
			`# Reset the parser; otherwise it will get slower over time.`
			`_md_engine.reset()`
Move our various Markdown hacks into their own file (imported from commit b03a5c64cc95964936c4aba7d667807969e35d21) 2012-10-15 22:03:50 +02:00
			`try:`
			`html = _md_engine.convert(md)`
			`except:`
			`# FIXME: Do something more reasonable here!`
			`html = '<p>[Humbug note: Sorry, we could not understand the formatting of your message]</p>'`
bugdown: Hide alphanumeric characters in exception logs, for privacy (imported from commit 39481494b7910307f56e566035c1b464c83d196e) 2012-10-25 21:38:47 +02:00			`logging.getLogger('').error('Exception in Markdown parser: %sInput (sanitized) was: %s'`
			`% (traceback.format_exc(), _sanitize_for_log(md)))`
Move our various Markdown hacks into their own file (imported from commit b03a5c64cc95964936c4aba7d667807969e35d21) 2012-10-15 22:03:50 +02:00
			`return html`