# zulip/zephyr/lib/bugdown/__init__.py

import markdown
import logging
import traceback
import re

from zephyr.lib.avatar  import gravatar_hash
from zephyr.lib.bugdown import codehilite, fenced_code

class Gravatar(markdown.inlinepatterns.Pattern):
    """Inline pattern that renders a matched e-mail address as a Gravatar
    image element."""

    def handleMatch(self, match):
        """Build the <img> element for one match.

        The 'email' named group of the match is passed through
        gravatar_hash and substituted into a secure.gravatar.com avatar URL,
        which becomes the image's 'src'.
        """
        avatar_url = ('https://secure.gravatar.com/avatar/%s?d=identicon&s=30'
                      % (gravatar_hash(match.group('email')),))

        element = markdown.util.etree.Element('img')
        element.set('class', 'message_body_gravatar img-rounded')
        element.set('src', avatar_url)
        return element

class AutoLink(markdown.inlinepatterns.Pattern):
    """Inline pattern that turns a bare URL into an anchor element."""

    def handleMatch(self, match):
        """Return an <a> element whose href and visible text are both the
        match's 'url' named group."""
        target = match.group('url')
        anchor = markdown.util.etree.Element('a')
        anchor.text = target
        anchor.set('href', target)
        return anchor

class UListProcessor(markdown.blockprocessors.OListProcessor):
    """Block processor for unordered (bulleted) lists.

    Modelled on markdown.blockprocessors.UListProcessor; the difference is
    that only '*' and '-' are accepted as bullet characters, so '+' does
    not start a list item.
    """

    # Up to three leading spaces, one bullet character ('*' or '-'), at
    # least one space, then the rest of the line (captured).
    RE = re.compile(r'^[ ]{0,3}[*-][ ]+(.*)')

    # Tag name for the list element this processor handles.
    TAG = 'ul'

class Bugdown(markdown.Extension):
    """Markdown extension that removes some built-in syntax and installs the
    custom patterns and processors defined in this module."""

    def extendMarkdown(self, md, md_globals):
        """Adjust the inline patterns and block processors of *md*.

        md_globals is accepted to satisfy the extension interface and is
        not used here.
        """
        inline = md.inlinePatterns
        blocks = md.parser.blockprocessors

        # Drop the built-in image and automatic-link inline patterns.
        for name in ('image_link', 'image_reference', 'automail', 'autolink'):
            del inline[name]

        # Drop the built-in header and list block processors.
        for name in ('hashheader', 'setextheader', 'olist', 'ulist'):
            del blocks[name]

        # Re-register unordered lists using our own processor, positioned
        # relative to the 'hr' processor.
        blocks.add('ulist', UListProcessor(md.parser), '>hr')

        # !gravatar(<email>) embeds an avatar image.
        gravatar_regex = r'!gravatar\((?P<email>[^)]*)\)'
        inline.add('gravatar', Gravatar(gravatar_regex), '_begin')

        # An automatic link begins at a word boundary and runs until
        # whitespace or the end of the input, except that trailing
        # punctuation (anything other than '/') is left out of the link.
        # This is done with a non-greedy match for the URL followed by a
        # greedy lookahead assertion.
        #
        # markdown.inlinepatterns.Pattern compiles the regex with re.UNICODE,
        # which matters because the lookahead relies on \w.
        link_regex = r'\b(?P<url>https?://[^\s]+?)(?=[^\w/]*(\s|\Z))'
        inline.add('autolink', AutoLink(link_regex), '>link')

# Module-wide Markdown parser instance, built once at import time with the
# nl2br, codehilite, fenced_code and Bugdown extensions enabled.
_md_engine = markdown.Markdown(
    safe_mode='escape',
    output_format='html',
    extensions=[
        'nl2br',
        codehilite.makeExtension(configs=[
            ('force_linenos', False),
            ('guess_lang', False),
        ]),
        fenced_code.makeExtension(),
        Bugdown(),
    ],
)

# We want to log Markdown parser failures, but shouldn't log the actual input
# message for privacy reasons.  The compromise is to replace all alphanumeric
# characters with 'x'.
#
# We also use repr() to improve reproducibility, and to escape terminal control
# codes, which can do surprisingly nasty things.
_privacy_re = re.compile(r'\w', flags=re.UNICODE)
def _sanitize_for_log(md):
    return repr(_privacy_re.sub('x', md))

def _linkify(match):
    url = match.group('url')
    return ' [%s](%s) ' % (url, url)

def convert(md):
    """Convert Markdown to HTML, with Humbug-specific settings and hacks.

    Returns the HTML produced by the module-level parser for the input
    *md*.  If the parser raises an Exception, a fixed apology paragraph is
    returned instead, and the traceback is logged together with a sanitized
    copy of the input (see _sanitize_for_log).
    """

    # Reset the parser; otherwise it will get slower over time.
    _md_engine.reset()

    try:
        html = _md_engine.convert(md)
    except Exception:
        # Catch Exception rather than using a bare except, so that
        # KeyboardInterrupt and SystemExit are not swallowed and turned
        # into an apology message.
        #
        # FIXME: Do something more reasonable here!
        html = '<p>[Humbug note: Sorry, we could not understand the formatting of your message]</p>'
        # The arguments are handed to the logger for formatting; the message
        # text that ends up in the log is unchanged.
        logging.getLogger('').error(
            'Exception in Markdown parser: %sInput (sanitized) was: %s',
            traceback.format_exc(), _sanitize_for_log(md))

    return html
Move our various Markdown hacks into their own file (imported from commit b03a5c64cc95964936c4aba7d667807969e35d21) 2012-10-15 22:03:50 +02:00			`import markdown`
bugdown: Log Markdown parser failures (imported from commit 9e225a32b71edbfd9007cc2fbae32be31896233d) 2012-10-22 05:06:28 +02:00			`import logging`
			`import traceback`
bugdown: Hide alphanumeric characters in exception logs, for privacy (imported from commit 39481494b7910307f56e566035c1b464c83d196e) 2012-10-25 21:38:47 +02:00			`import re`
Move our various Markdown hacks into their own file (imported from commit b03a5c64cc95964936c4aba7d667807969e35d21) 2012-10-15 22:03:50 +02:00
Use our local copy of codehilite (imported from commit bc9704a19017061d55ff0e16589d57ee0c46caa5) 2012-10-20 05:34:14 +02:00			`from zephyr.lib.avatar import gravatar_hash`
bugdown: Use our local copy of fenced_code And wire it up to our local copy of codehilite. This fixes highlighting in fenced code blocks, e.g. ~~~~ .js var x = function () { return "hi"; }; ~~~~ (imported from commit 0efb0c9b98a3acdf55e18bb1918af7960f3425be) 2012-11-19 18:31:03 +01:00			`from zephyr.lib.bugdown import codehilite, fenced_code`
Allow users to embed Gravatars in messages We'll use this internally for the commit bot. We might eventually disable it for external users. (imported from commit 3136cd9faadc6b81355889d2ee6472985da87fbe) 2012-10-17 04:42:19 +02:00
			`class Gravatar(markdown.inlinepatterns.Pattern):`
			`def handleMatch(self, match):`
			`img = markdown.util.etree.Element('img')`
			`img.set('class', 'message_body_gravatar img-rounded')`
			`img.set('src', 'https://secure.gravatar.com/avatar/%s?d=identicon&s=30'`
bugdown: Use named match for Gravatar email (imported from commit 778c4b6d754f975c89a91336593d2f62e49249d5) 2012-10-22 02:15:44 +02:00			`% (gravatar_hash(match.group('email')),))`
Allow users to embed Gravatars in messages We'll use this internally for the commit bot. We might eventually disable it for external users. (imported from commit 3136cd9faadc6b81355889d2ee6472985da87fbe) 2012-10-17 04:42:19 +02:00			`return img`

bugdown: Linkify using a Markdown extension This prevents trying to linkify inside code blocks. (imported from commit 97dd20ecee19f41650aa98e68aa8e9908ece5b33) 2012-10-22 02:32:18 +02:00			`class AutoLink(markdown.inlinepatterns.Pattern):`
			`def handleMatch(self, match):`
			`url = match.group('url')`
			`a = markdown.util.etree.Element('a')`
			`a.set('href', url)`
			`a.text = url`
			`return a`

bugdown: Disable + as a bullet character for an unordered list Fixes #272. (imported from commit 8afaf14965ed1f6a4bb3ccfc9d4c2d807148666d) 2012-11-02 18:25:37 +01:00			`class UListProcessor(markdown.blockprocessors.OListProcessor):`
			`""" Process unordered list blocks.`

			`Based on markdown.blockprocessors.UListProcessor, but does not accept`
			`'+' as a bullet character."""`

			`TAG = 'ul'`
			`RE = re.compile(r'^[ ]{0,3}[*-][ ]+(.*)')`

Markdown: Disable images (imported from commit 6656b15fa690b463265af6384a73529ee635f688) 2012-10-16 17:35:58 +02:00			`class Bugdown(markdown.Extension):`
			`def extendMarkdown(self, md, md_globals):`
bugdown: Disable builtin 'autolink' feature This only linkifies inside angle brackets, per print md.inlinePatterns['autolink'].getCompiledRegExp().pattern We have our own linkification extension. (imported from commit 20cab11aaafee075e0caf933d8d197717976988c) 2012-10-22 02:40:58 +02:00			`for k in ('image_link', 'image_reference', 'automail', 'autolink'):`
bugdown: Decrease code duplication (imported from commit 40158134b60e477d8a353d050fa62c9ded0e9e9f) 2012-10-22 02:35:36 +02:00			`del md.inlinePatterns[k]`

bugdown: Disable + as a bullet character for an unordered list Fixes #272. (imported from commit 8afaf14965ed1f6a4bb3ccfc9d4c2d807148666d) 2012-11-02 18:25:37 +01:00			`for k in ('hashheader', 'setextheader', 'olist', 'ulist'):`
bugdown: Decrease code duplication (imported from commit 40158134b60e477d8a353d050fa62c9ded0e9e9f) 2012-10-22 02:35:36 +02:00			`del md.parser.blockprocessors[k]`
Markdown: Disable images (imported from commit 6656b15fa690b463265af6384a73529ee635f688) 2012-10-16 17:35:58 +02:00
bugdown: Disable + as a bullet character for an unordered list Fixes #272. (imported from commit 8afaf14965ed1f6a4bb3ccfc9d4c2d807148666d) 2012-11-02 18:25:37 +01:00			`md.parser.blockprocessors.add('ulist', UListProcessor(md.parser), '>hr')`

bugdown: Use named match for Gravatar email (imported from commit 778c4b6d754f975c89a91336593d2f62e49249d5) 2012-10-22 02:15:44 +02:00			`md.inlinePatterns.add('gravatar', Gravatar(r'!gravatar\((?P<email>[^)]*)\)'), '_begin')`
Allow users to embed Gravatars in messages We'll use this internally for the commit bot. We might eventually disable it for external users. (imported from commit 3136cd9faadc6b81355889d2ee6472985da87fbe) 2012-10-17 04:42:19 +02:00
bugdown: Remove trailing punctuation from automatic links And allow parentheses etc. within a link, if they're not at the end. Fixes #401. (imported from commit 5261fce74fe381ceece2e7406776cc5bde19deb9) 2012-11-20 19:33:10 +01:00			`# A link starts at a word boundary, and ends at space or end-of-input.`
			`# But any trailing punctuation (other than /) is not included.`
			`# We accomplish this with a non-greedy match followed by a greedy`
			`# lookahead assertion.`
			`#`
			`# markdown.inlinepatterns.Pattern compiles this with re.UNICODE, which`
			`# is important because we're using \w.`
			`link_regex = r'\b(?P<url>https?://[^\s]+?)(?=[^\w/]*(\s|\Z))'`
bugdown: Install autolink pattern after link pattern This fixes explicit links. (imported from commit 6867d271344d35c2c8d54b3393219113a095bc4f) 2012-10-22 23:47:49 +02:00			`md.inlinePatterns.add('autolink', AutoLink(link_regex), '>link')`
bugdown: Linkify using a Markdown extension This prevents trying to linkify inside code blocks. (imported from commit 97dd20ecee19f41650aa98e68aa8e9908ece5b33) 2012-10-22 02:32:18 +02:00
bugdown: Use Markdown.reset() instead of re-creating the whole parser (imported from commit 45a65453f2178a6a73392e2bd1e7d6d03de0e0e7) 2012-11-20 20:15:55 +01:00			`_md_engine = markdown.Markdown(`
			`safe_mode = 'escape',`
			`output_format = 'html',`
			`extensions = ['nl2br',`
			`codehilite.makeExtension(configs=[`
			`('force_linenos', False),`
			`('guess_lang', False)]),`
			`fenced_code.makeExtension(),`
			`Bugdown()])`
Move our various Markdown hacks into their own file (imported from commit b03a5c64cc95964936c4aba7d667807969e35d21) 2012-10-15 22:03:50 +02:00
bugdown: Hide alphanumeric characters in exception logs, for privacy (imported from commit 39481494b7910307f56e566035c1b464c83d196e) 2012-10-25 21:38:47 +02:00			`# We want to log Markdown parser failures, but shouldn't log the actual input`
			`# message for privacy reasons. The compromise is to replace all alphanumeric`
			`# characters with 'x'.`
			`#`
			`# We also use repr() to improve reproducibility, and to escape terminal control`
			`# codes, which can do surprisingly nasty things.`
			`_privacy_re = re.compile(r'\w', flags=re.UNICODE)`
			`def _sanitize_for_log(md):`
			`return repr(_privacy_re.sub('x', md))`

Move our various Markdown hacks into their own file (imported from commit b03a5c64cc95964936c4aba7d667807969e35d21) 2012-10-15 22:03:50 +02:00			`def _linkify(match):`
			`url = match.group('url')`
			`return ' [%s](%s) ' % (url, url)`

			`def convert(md):`
			`"""Convert Markdown to HTML, with Humbug-specific settings and hacks."""`
bugdown: Use Markdown.reset() instead of re-creating the whole parser (imported from commit 45a65453f2178a6a73392e2bd1e7d6d03de0e0e7) 2012-11-20 20:15:55 +01:00
			`# Reset the parser; otherwise it will get slower over time.`
			`_md_engine.reset()`
Move our various Markdown hacks into their own file (imported from commit b03a5c64cc95964936c4aba7d667807969e35d21) 2012-10-15 22:03:50 +02:00
			`try:`
			`html = _md_engine.convert(md)`
			`except:`
			`# FIXME: Do something more reasonable here!`
			`html = '<p>[Humbug note: Sorry, we could not understand the formatting of your message]</p>'`
bugdown: Hide alphanumeric characters in exception logs, for privacy (imported from commit 39481494b7910307f56e566035c1b464c83d196e) 2012-10-25 21:38:47 +02:00			`logging.getLogger('').error('Exception in Markdown parser: %sInput (sanitized) was: %s'`
			`% (traceback.format_exc(), _sanitize_for_log(md)))`
Move our various Markdown hacks into their own file (imported from commit b03a5c64cc95964936c4aba7d667807969e35d21) 2012-10-15 22:03:50 +02:00
			`return html`