zulip/zephyr/lib/bugdown/__init__.py

import markdown
import logging
import traceback
import urlparse
import re

from django.core import mail

from zephyr.lib.avatar  import gravatar_hash
from zephyr.lib.bugdown import codehilite, fenced_code
from zephyr.lib.bugdown.fenced_code import FENCE_RE
from zephyr.lib.timeout import timeout

class Gravatar(markdown.inlinepatterns.Pattern):
    """Inline pattern that renders its match as a Gravatar <img> element."""

    def handleMatch(self, match):
        """Build the <img> for the address captured in the 'email' group."""
        digest = gravatar_hash(match.group('email'))
        element = markdown.util.etree.Element('img')
        element.set('class', 'message_body_gravatar img-rounded')
        element.set(
            'src',
            'https://secure.gravatar.com/avatar/%s?d=identicon&s=30' % (digest,))
        return element

def fixup_link(link):
    """Stamp the attributes that every link we emit should have.

    `link` is modified in place: 'target' becomes '_blank' and 'title' is
    set to whatever the element's 'href' currently is.
    """
    for attribute, value in (('target', '_blank'),
                             ('title', link.get('href'))):
        link.set(attribute, value)


def sanitize_url(url):
    """
    Sanitize a url against xss attacks.
    See the docstring on markdown.inlinepatterns.LinkPattern.sanitize_url.

    Returns the re-assembled url when it passes every check below, or ''
    when it cannot be parsed or does not look safe.  A url without a scheme
    is treated as an http:// url.
    """
    try:
        parts = urlparse.urlparse(url.replace(' ', '%20'))
    except ValueError:
        # Bad url - so bad it couldn't be parsed.
        return ''
    scheme, netloc = parts[0], parts[1]

    # Humbug modification: If scheme is not specified, assume http://
    # It's unlikely that users want relative links within humbughq.com.
    # We re-enter sanitize_url because netloc etc. need to be re-parsed.
    if not scheme:
        return sanitize_url('http://' + url)

    # Script-executing schemes are never acceptable, even when they carry a
    # network location: "javascript://host/%0Aalert(1)" has a netloc and no
    # ':' in its path, so the checks below alone would let it through.
    if scheme.lower() in ('javascript', 'vbscript', 'data'):
        return ''

    # Only these schemes may legitimately come without a network location.
    if netloc == '' and scheme not in ('mailto', 'news'):
        # This fails regardless of anything else.
        # Return immediately to save additional processing
        return ''

    # A ':' anywhere after the netloc (path, params, query, fragment) could
    # be smuggling in a second scheme.
    for part in parts[2:]:
        if ":" in part:
            # Not a safe url
            return ''

    # Url passes all tests. Return it re-assembled from its parsed parts.
    return urlparse.urlunparse(parts)

def url_to_a(url):
    """Return an <a> element that links to, and displays, `url`.

    Text that looks like a bare e-mail address is linked with a mailto:
    href; everything else is linked as-is.  The href always goes through
    sanitize_url, and the element gets the standard link attributes from
    fixup_link.
    """
    a = markdown.util.etree.Element('a')
    # Only scheme-less text can be an e-mail address.  Anything that already
    # contains ':' (e.g. "https://example.com/@user") must not be prefixed
    # with "mailto:": sanitize_url rejects a ':' after the scheme, so such a
    # link would end up with an empty href.
    if '@' in url and ':' not in url:
        href = 'mailto:' + url
    else:
        href = url
    a.set('href', sanitize_url(href))
    a.text = url
    fixup_link(a)
    return a

class AutoLink(markdown.inlinepatterns.Pattern):
    """Inline pattern that turns its matched 'url' group into a link."""

    def handleMatch(self, match):
        """Return an <a> for the match, or the plain text if it is http(s)."""
        text = match.group('url')
        # This pattern also fires on https?:// links that were matched
        # already; hand those back untouched so they are not linked twice.
        if text.startswith(('http:', 'https:')):
            return text
        return url_to_a(text)

class HttpLink(markdown.inlinepatterns.Pattern):
    """Inline pattern that always links its matched 'url' group."""

    def handleMatch(self, match):
        """Return the <a> element for the captured url."""
        return url_to_a(match.group('url'))

class UListProcessor(markdown.blockprocessors.OListProcessor):
    """ Block processor for unordered lists that only accepts '*' bullets.

        Modelled on markdown.blockprocessors.UListProcessor; '+' and '-'
        are deliberately not accepted as bullet characters."""

    # A list item: at most three leading spaces, a '*', one or more spaces,
    # then the item text.
    RE = re.compile(r'^[ ]{0,3}[*][ ]+(.*)')
    TAG = 'ul'

class BugdownUListPreprocessor(markdown.preprocessors.Preprocessor):
    """ Lets an unordered list start on the line right after a paragraph.

        Wherever a list item directly follows a line of ordinary text, an
        empty line is inserted between the two, which is what Markdown
        needs in order to see the list as a separate block."""

    LI_RE = re.compile(r'^[ ]{0,3}[*][ ]+(.*)', re.MULTILINE)
    HANGING_ULIST_RE = re.compile(r'^.+\n([ ]{0,3}[*][ ]+.*)', re.MULTILINE)

    def run(self, lines):
        """ Return a copy of `lines` with the missing blank lines added """
        output = []
        open_fence = None
        is_item = self.LI_RE.match
        for current, following in zip(lines, lines[1:]):
            # Track fenced code blocks: a fence line opens a block when none
            # is open, and closes it when it matches the opening fence.
            marker = FENCE_RE.match(current)
            if marker:
                if not open_fence:
                    open_fence = marker.group('fence')
                elif open_fence == marker.group('fence'):
                    open_fence = None

            output.append(current)

            # Inside a fence, or on an empty line, there is nothing to fix.
            if open_fence or not current:
                continue
            # A list item hanging off a non-item line gets a separator.
            if is_item(following) and not is_item(current):
                output.append('')

        # The loop pairs each line with its successor, so the final line
        # (if there is one) still has to be carried over.
        output.extend(lines[-1:])
        return output

# Local variant of markdown.inlinepatterns.LinkPattern
class LinkPattern(markdown.inlinepatterns.Pattern):
    """ Build an <a> element out of a matched inline link. """
    def handleMatch(self, m):
        """ Group 2 is the link text, group 9 the (optional) target. """
        el = markdown.util.etree.Element("a")
        el.text = m.group(2)

        target = m.group(9)
        if target:
            # Drop the surrounding characters of a "<...>"-style target.
            if target.startswith("<"):
                target = target[1:-1]
            target = sanitize_url(self.unescape(target.strip()))
        else:
            target = ""
        el.set("href", target)

        fixup_link(el)
        return el

class Bugdown(markdown.Extension):
    """Markdown extension that installs the Humbug-specific syntax rules."""

    def extendMarkdown(self, md, md_globals):
        """Reconfigure the Markdown instance `md` in place.

        Removes the stock preprocessors, inline patterns and block
        processors listed below, then registers our own bold, unordered
        list, gravatar, link and auto-link handling.  `md_globals` is
        accepted for the extension interface but not used here.
        """
        del md.preprocessors['reference']

        for k in ('image_link', 'image_reference', 'automail',
                  'autolink', 'link', 'reference', 'short_reference',
                  'escape', 'strong_em', 'emphasis', 'emphasis2',
                  'strong'):
            del md.inlinePatterns[k]

        # Custom bold syntax: **foo** but not __foo__
        md.inlinePatterns.add('strong',
            markdown.inlinepatterns.SimpleTagPattern(r'(\*\*)(.+?)\2', 'strong'),
            '>not_strong')

        for k in ('hashheader', 'setextheader', 'olist', 'ulist'):
            del md.parser.blockprocessors[k]

        # Re-add unordered lists using our '*'-only processor.
        md.parser.blockprocessors.add('ulist', UListProcessor(md.parser), '>hr')

        md.inlinePatterns.add('gravatar', Gravatar(r'!gravatar\((?P<email>[^)]*)\)'), '_begin')
        md.inlinePatterns.add('link', LinkPattern(markdown.inlinepatterns.LINK_RE, md), '>backtick')

        # markdown.inlinepatterns.Pattern compiles this with re.UNICODE, which
        # is important because we're using \w.
        #
        # This rule must come after the built-in 'link' markdown linkifier to
        # avoid errors.
        http_link_regex = r'\b(?P<url>https?://[^\s]+?)(?=[^\w/]*(\s|\Z))'
        md.inlinePatterns.add('http_autolink', HttpLink(http_link_regex), '>link')

        # A link starts at a word boundary, and ends at space, punctuation, or end-of-input.
        #
        # We detect a url by checking for the TLD, and building around it.
        #
        # To support () in urls but not match ending ) when a url is inside a parenthesis,
        # we match at maximum one set of matching parens in a url. We could extend this
        # to match two parenthetical groups, at the cost of more regex complexity.
        #
        # This rule must come after the http_autolink rule we add above to avoid double
        # linkifying.
        #
        # Each TLD is spliced into the regex right after a literal '\.', so it
        # must be listed without a leading dot and must be escaped: otherwise
        # the '.' in 'co.uk' matches any character, and an entry such as
        # '.ca' demands an extra character and never matches "example.ca".
        # 'co.uk' stays ahead of 'co' so the longer suffix is tried first.
        tlds = '|'.join(re.escape(tld) for tld in [
            'co.uk', 'com', 'co', 'biz', 'gd', 'org', 'net', 'ly', 'edu', 'mil',
            'gov', 'info', 'me', 'it', 'ca', 'tv', 'fm', 'io', 'gl'])
        link_regex = r"\b(?P<url>[^\s]+\.(%s)(?:/[^\s()\":]*?|([^\s()\":]*\([^\s()\":]*\)[^\s()\":]*))?)(?=([:;\?\),\.\'\"]\Z|[:;\?\),\.\'\"]\s|\Z|\s))" % (tlds,)
        md.inlinePatterns.add('autolink', AutoLink(link_regex), '>http_autolink')

        # Runs first, so lists hanging off a paragraph are already separated
        # by a blank line when the other preprocessors see the text.
        md.preprocessors.add('hanging_ulists',
                                 BugdownUListPreprocessor(md),
                                 "_begin")

# Module-wide Markdown engine, constructed once when this module is imported.
# convert() below reuses this single instance and calls reset() on it before
# rendering each message.
_md_engine = markdown.Markdown(
    safe_mode     = 'escape',
    output_format = 'html',
    # 'nl2br' is named by string; the rest are extension objects: our local
    # codehilite and fenced_code (imported from zephyr.lib.bugdown above)
    # and the Bugdown extension defined in this module.
    extensions    = ['nl2br',
        codehilite.makeExtension(configs=[
            ('force_linenos', False),
            ('guess_lang',    False)]),
        fenced_code.makeExtension(),
        Bugdown()])

# Markdown parser failures get logged, but the message itself must not end up
# in the logs for privacy reasons.  As a compromise, every word character of
# the input is replaced with 'x' before it is logged.
#
# The masked text is additionally passed through repr(): that makes failures
# easier to reproduce and escapes terminal control codes, which can do
# surprisingly nasty things.
_privacy_re = re.compile(r'\w', re.UNICODE)
def _sanitize_for_log(md):
    """Return the repr() of `md` with every word character masked as 'x'."""
    masked = _privacy_re.sub('x', md)
    return repr(masked)

def convert(md):
    """Convert Markdown to HTML, with Humbug-specific settings and hacks.

    Returns the rendered HTML.  If rendering raises (or exceeds the time
    limit), a fixed apology paragraph is returned instead and the failure
    is reported to the log, to the "devel" stream and to the admins by
    e-mail; only a sanitized form of the input is included in the reports.
    """

    # Reset the parser; otherwise it will get slower over time.
    _md_engine.reset()

    try:
        # Spend at most 5 seconds rendering.
        # Sometimes Python-Markdown is really slow; see
        # https://trac.humbughq.com/ticket/345
        html = timeout(5, _md_engine.convert, md)
    except Exception:
        # Deliberately broad, because any parser failure should degrade to the
        # apology message, but not a bare "except:", which would also swallow
        # KeyboardInterrupt and SystemExit.
        #
        # NOTE(review): these are imported here rather than at module level,
        # presumably to avoid an import cycle -- confirm before moving them.
        from zephyr.models import Recipient
        from zephyr.lib.actions import internal_send_message

        # Format the traceback once, before any of the reporting calls run,
        # so the log entry and the e-mail carry the same text.
        failure = traceback.format_exc()
        cleaned = _sanitize_for_log(md)

        html = '<p>[Humbug note: Sorry, we could not understand the formatting of your message]</p>'

        # Output error to log as well as sending a humbug and email
        logging.getLogger('').error('Exception in Markdown parser: %sInput (sanitized) was: %s'
            % (failure, cleaned))
        subject = "Markdown parser failure"
        internal_send_message("humbug+errors@humbughq.com",
                Recipient.STREAM, "devel", subject,
                "Markdown parser failed, message sent to devel@")
        mail.mail_admins(subject, "Failed message: %s\n\n%s\n\n" % (
                                    cleaned, failure),
                         fail_silently=False)

    return html
Move our various Markdown hacks into their own file (imported from commit b03a5c64cc95964936c4aba7d667807969e35d21) 2012-10-15 22:03:50 +02:00			`import markdown`
bugdown: Log Markdown parser failures (imported from commit 9e225a32b71edbfd9007cc2fbae32be31896233d) 2012-10-22 05:06:28 +02:00			`import logging`
			`import traceback`
bugdown: Use a local copy of markdown.inlinepatterns.LinkPattern With changes for the way things are imported in bugdown. (imported from commit 11adf7911f7bb945367221f8fa317caa71de3fc4) 2012-12-04 19:57:54 +01:00			`import urlparse`
bugdown: Hide alphanumeric characters in exception logs, for privacy (imported from commit 39481494b7910307f56e566035c1b464c83d196e) 2012-10-25 21:38:47 +02:00			`import re`
Move our various Markdown hacks into their own file (imported from commit b03a5c64cc95964936c4aba7d667807969e35d21) 2012-10-15 22:03:50 +02:00
Send humbug & email on markdown parse failure (imported from commit c132ab91f1ff9ffdbe5f740980bf362b16c6bba6) 2013-01-31 19:57:25 +01:00			`from django.core import mail`

Use our local copy of codehilite (imported from commit bc9704a19017061d55ff0e16589d57ee0c46caa5) 2012-10-20 05:34:14 +02:00			`from zephyr.lib.avatar import gravatar_hash`
bugdown: Use our local copy of fenced_code And wire it up to our local copy of codehilite. This fixes highlighting in fenced code blocks, e.g. ~~~~ .js var x = function () { return "hi"; }; ~~~~ (imported from commit 0efb0c9b98a3acdf55e18bb1918af7960f3425be) 2012-11-19 18:31:03 +01:00			`from zephyr.lib.bugdown import codehilite, fenced_code`
Handle lists that start immediately after paragraphs (imported from commit 055593d7a324598e133d53db0c33103016426c8f) 2013-01-24 19:35:20 +01:00			`from zephyr.lib.bugdown.fenced_code import FENCE_RE`
bugdown: Spend at most 5 seconds rendering a message (imported from commit bc092acc8b2b9f8a63af669de06c6f7512ccf8c9) 2013-01-29 21:47:53 +01:00			`from zephyr.lib.timeout import timeout`
Allow users to embed Gravatars in messages We'll use this internally for the commit bot. We might eventually disable it for external users. (imported from commit 3136cd9faadc6b81355889d2ee6472985da87fbe) 2012-10-17 04:42:19 +02:00
			`class Gravatar(markdown.inlinepatterns.Pattern):`
			`def handleMatch(self, match):`
			`img = markdown.util.etree.Element('img')`
			`img.set('class', 'message_body_gravatar img-rounded')`
			`img.set('src', 'https://secure.gravatar.com/avatar/%s?d=identicon&s=30'`
bugdown: Use named match for Gravatar email (imported from commit 778c4b6d754f975c89a91336593d2f62e49249d5) 2012-10-22 02:15:44 +02:00			`% (gravatar_hash(match.group('email')),))`
Allow users to embed Gravatars in messages We'll use this internally for the commit bot. We might eventually disable it for external users. (imported from commit 3136cd9faadc6b81355889d2ee6472985da87fbe) 2012-10-17 04:42:19 +02:00			`return img`

bugdown: Set link target, title attributes on the server For consistency. Fixes #266. (imported from commit 63e199a0fe4534df804a82cc98a1fdcf8ccb45da) 2012-12-04 20:15:50 +01:00			`def fixup_link(link):`
			`"""Set certain attributes we want on every link."""`
			`link.set('target', '_blank')`
			`link.set('title', link.get('href'))`

Sanitize links to prevent XSS, and handle emails (imported from commit 622396efde50d9f5e3501f5d780c344ad0692662) 2013-02-01 23:15:05 +01:00
			`def sanitize_url(url):`
			`"""`
			`Sanitize a url against xss attacks.`
			`See the docstring on markdown.inlinepatterns.LinkPattern.sanitize_url.`
			`"""`
			`try:`
			`parts = urlparse.urlparse(url.replace(' ', '%20'))`
			`scheme, netloc, path, params, query, fragment = parts`
			`except ValueError:`
			`# Bad url - so bad it couldn't be parsed.`
			`return ''`

			`# Humbug modification: If scheme is not specified, assume http://`
			`# It's unlikely that users want relative links within humbughq.com.`
			`# We re-enter sanitize_url because netloc etc. need to be re-parsed.`
			`if not scheme:`
			`return sanitize_url('http://' + url)`

			`locless_schemes = ['', 'mailto', 'news']`
			`if netloc == '' and scheme not in locless_schemes:`
			`# This fails regardless of anything else.`
			`# Return immediately to save additional proccessing`
			`return ''`

			`for part in parts[2:]:`
			`if ":" in part:`
			`# Not a safe url`
			`return ''`

			`# Url passes all tests. Return url as-is.`
			`return urlparse.urlunparse(parts)`

Add https?: greedy url matching before falling back to our url guesser (imported from commit 9e6e5a0522e6501b354a56223c2639841d290d4b) 2013-02-11 20:49:48 +01:00			`def url_to_a(url):`
			`a = markdown.util.etree.Element('a')`
			`if '@' in url:`
			`href = 'mailto:' + url`
			`else:`
			`href = url`
			`a.set('href', sanitize_url(href))`
			`a.text = url`
			`fixup_link(a)`
			`return a`

bugdown: Linkify using a Markdown extension This prevents trying to linkify inside code blocks. (imported from commit 97dd20ecee19f41650aa98e68aa8e9908ece5b33) 2012-10-22 02:32:18 +02:00			`class AutoLink(markdown.inlinepatterns.Pattern):`
			`def handleMatch(self, match):`
			`url = match.group('url')`
Add https?: greedy url matching before falling back to our url guesser (imported from commit 9e6e5a0522e6501b354a56223c2639841d290d4b) 2013-02-11 20:49:48 +01:00			`# As this will also match already-matched https?:// links,`
			`# don't doubly-link them`
			`if url[:5] == 'http:' or url[:6] == 'https:':`
			`return url`
			`return url_to_a(url)`
Sanitize links to prevent XSS, and handle emails (imported from commit 622396efde50d9f5e3501f5d780c344ad0692662) 2013-02-01 23:15:05 +01:00
Add https?: greedy url matching before falling back to our url guesser (imported from commit 9e6e5a0522e6501b354a56223c2639841d290d4b) 2013-02-11 20:49:48 +01:00			`class HttpLink(markdown.inlinepatterns.Pattern):`
			`def handleMatch(self, match):`
			`url = match.group('url')`
			`return url_to_a(url)`
bugdown: Linkify using a Markdown extension This prevents trying to linkify inside code blocks. (imported from commit 97dd20ecee19f41650aa98e68aa8e9908ece5b33) 2012-10-22 02:32:18 +02:00
bugdown: Disable + as a bullet character for an unordered list Fixes #272. (imported from commit 8afaf14965ed1f6a4bb3ccfc9d4c2d807148666d) 2012-11-02 18:25:37 +01:00			`class UListProcessor(markdown.blockprocessors.OListProcessor):`
			`""" Process unordered list blocks.`

			`Based on markdown.blockprocessors.UListProcessor, but does not accept`
Don't accept - as valid list delimiters (imported from commit 287353a29289ee536a59f47f87ff66893bf261ec) 2013-01-23 23:07:01 +01:00			`'+' or '-' as a bullet character."""`
bugdown: Disable + as a bullet character for an unordered list Fixes #272. (imported from commit 8afaf14965ed1f6a4bb3ccfc9d4c2d807148666d) 2012-11-02 18:25:37 +01:00
			`TAG = 'ul'`
Don't accept - as valid list delimiters (imported from commit 287353a29289ee536a59f47f87ff66893bf261ec) 2013-01-23 23:07:01 +01:00			`RE = re.compile(r'^[ ]{0,3}[][ ]+(.)')`
bugdown: Disable + as a bullet character for an unordered list Fixes #272. (imported from commit 8afaf14965ed1f6a4bb3ccfc9d4c2d807148666d) 2012-11-02 18:25:37 +01:00
Handle lists that start immediately after paragraphs (imported from commit 055593d7a324598e133d53db0c33103016426c8f) 2013-01-24 19:35:20 +01:00			`class BugdownUListPreprocessor(markdown.preprocessors.Preprocessor):`
			`""" Allows unordered list blocks that come directly after a`
			`paragraph to be rendered as an unordered list`

			`Detects paragraphs that have a matching list item that comes`
			`directly after a line of text, and inserts a newline between`
			`to satisfy Markdown"""`

			`LI_RE = re.compile(r'^[ ]{0,3}[][ ]+(.)', re.MULTILINE)`
			`HANGING_ULIST_RE = re.compile(r'^.+\n([ ]{0,3}[][ ]+.)', re.MULTILINE)`

			`def run(self, lines):`
			`""" Insert a newline between a paragraph and ulist if missing """`
			`inserts = 0`
			`fence = None`
			`copy = lines[:]`
			`for i in xrange(len(lines) - 1):`
			`# Ignore anything that is inside a fenced code block`
			`m = FENCE_RE.match(lines[i])`
			`if not fence and m:`
			`fence = m.group('fence')`
			`elif fence and m and fence == m.group('fence'):`
			`fence = None`

			`# If we're not in a fenced block and we detect an upcoming list`
			`# hanging off a paragraph, add a newline`
			`if not fence and lines[i] and \`
			`self.LI_RE.match(lines[i+1]) and not self.LI_RE.match(lines[i]):`
			`copy.insert(i+inserts+1, '')`
			`inserts += 1`
			`return copy`

bugdown: Use a local copy of markdown.inlinepatterns.LinkPattern With changes for the way things are imported in bugdown. (imported from commit 11adf7911f7bb945367221f8fa317caa71de3fc4) 2012-12-04 19:57:54 +01:00			`# Based on markdown.inlinepatterns.LinkPattern`
			`class LinkPattern(markdown.inlinepatterns.Pattern):`
			`""" Return a link element from the given match. """`
			`def handleMatch(self, m):`
			`el = markdown.util.etree.Element("a")`
			`el.text = m.group(2)`
			`href = m.group(9)`

			`if href:`
			`if href[0] == "<":`
			`href = href[1:-1]`
Sanitize links to prevent XSS, and handle emails (imported from commit 622396efde50d9f5e3501f5d780c344ad0692662) 2013-02-01 23:15:05 +01:00			`el.set("href", sanitize_url(self.unescape(href.strip())))`
bugdown: Use a local copy of markdown.inlinepatterns.LinkPattern With changes for the way things are imported in bugdown. (imported from commit 11adf7911f7bb945367221f8fa317caa71de3fc4) 2012-12-04 19:57:54 +01:00			`else:`
			`el.set("href", "")`

bugdown: Set link target, title attributes on the server For consistency. Fixes #266. (imported from commit 63e199a0fe4534df804a82cc98a1fdcf8ccb45da) 2012-12-04 20:15:50 +01:00			`fixup_link(el)`
bugdown: Use a local copy of markdown.inlinepatterns.LinkPattern With changes for the way things are imported in bugdown. (imported from commit 11adf7911f7bb945367221f8fa317caa71de3fc4) 2012-12-04 19:57:54 +01:00			`return el`

Markdown: Disable images (imported from commit 6656b15fa690b463265af6384a73529ee635f688) 2012-10-16 17:35:58 +02:00			`class Bugdown(markdown.Extension):`
			`def extendMarkdown(self, md, md_globals):`
bugdown: Disable reference-based links This is syntax like Here's [a link][] [a link]: http://google.com This is not very useful for short chat-style messages. It will confuse users, especially because we don't document it. And disabling it saves the effort of applying the same link fixups as elsewhere. (imported from commit c23391465486db545302b79c084b4f9cd5cdcc6a) 2012-12-04 20:22:14 +01:00			`del md.preprocessors['reference']`

			`for k in ('image_link', 'image_reference', 'automail',`
bugdown: Remove special treatment of backslash Fixes #562. (imported from commit aa39cf390ef44275c7d5a84ca954c75535d372b1) 2012-12-11 21:19:15 +01:00			`'autolink', 'link', 'reference', 'short_reference',`
bugdown: Disable italics and __foo__-style bold (imported from commit c35d6980db6c59828514a90eec199a7494625495) 2013-01-31 21:13:09 +01:00			`'escape', 'strong_em', 'emphasis', 'emphasis2',`
			`'strong'):`
bugdown: Decrease code duplication (imported from commit 40158134b60e477d8a353d050fa62c9ded0e9e9f) 2012-10-22 02:35:36 +02:00			`del md.inlinePatterns[k]`

bugdown: Disable italics and __foo__-style bold (imported from commit c35d6980db6c59828514a90eec199a7494625495) 2013-01-31 21:13:09 +01:00			`# Custom bold syntax: foo but not __foo__`
			`md.inlinePatterns.add('strong',`
			`markdown.inlinepatterns.SimpleTagPattern(r'(\\)(.+?)\2', 'strong'),`
			`'>not_strong')`

bugdown: Disable + as a bullet character for an unordered list Fixes #272. (imported from commit 8afaf14965ed1f6a4bb3ccfc9d4c2d807148666d) 2012-11-02 18:25:37 +01:00			`for k in ('hashheader', 'setextheader', 'olist', 'ulist'):`
bugdown: Decrease code duplication (imported from commit 40158134b60e477d8a353d050fa62c9ded0e9e9f) 2012-10-22 02:35:36 +02:00			`del md.parser.blockprocessors[k]`
Markdown: Disable images (imported from commit 6656b15fa690b463265af6384a73529ee635f688) 2012-10-16 17:35:58 +02:00
bugdown: Disable + as a bullet character for an unordered list Fixes #272. (imported from commit 8afaf14965ed1f6a4bb3ccfc9d4c2d807148666d) 2012-11-02 18:25:37 +01:00			`md.parser.blockprocessors.add('ulist', UListProcessor(md.parser), '>hr')`

bugdown: Use named match for Gravatar email (imported from commit 778c4b6d754f975c89a91336593d2f62e49249d5) 2012-10-22 02:15:44 +02:00			`md.inlinePatterns.add('gravatar', Gravatar(r'!gravatar\((?P<email>[^)]*)\)'), '_begin')`
bugdown: Remove special treatment of backslash Fixes #562. (imported from commit aa39cf390ef44275c7d5a84ca954c75535d372b1) 2012-12-11 21:19:15 +01:00			`md.inlinePatterns.add('link', LinkPattern(markdown.inlinepatterns.LINK_RE, md), '>backtick')`
Allow users to embed Gravatars in messages We'll use this internally for the commit bot. We might eventually disable it for external users. (imported from commit 3136cd9faadc6b81355889d2ee6472985da87fbe) 2012-10-17 04:42:19 +02:00
Add https?: greedy url matching before falling back to our url guesser (imported from commit 9e6e5a0522e6501b354a56223c2639841d290d4b) 2013-02-11 20:49:48 +01:00			`# markdown.inlinepatterns.Pattern compiles this with re.UNICODE, which`
			`# is important because we're using \w.`
			`#`
			`# This rule must come after the built-in 'link' markdown linkifier to`
			`# avoid errors.`
			`http_link_regex = r'\b(?P<url>https?://[^\s]+?)(?=[^\w/]*(\s\|\Z))'`
			`md.inlinePatterns.add('http_autolink', HttpLink(http_link_regex), '>link')`

Rework linkify regex to match more urls (imported from commit 0e1a1df88363374ffbc802f83f43eb0fac8c99ea) 2013-02-01 20:04:28 +01:00			`# A link starts at a word boundary, and ends at space, punctuation, or end-of-input.`
bugdown: Remove trailing punctuation from automatic links And allow parentheses etc. within a link, if they're not at the end. Fixes #401. (imported from commit 5261fce74fe381ceece2e7406776cc5bde19deb9) 2012-11-20 19:33:10 +01:00			`#`
Rework linkify regex to match more urls (imported from commit 0e1a1df88363374ffbc802f83f43eb0fac8c99ea) 2013-02-01 20:04:28 +01:00			`# We detect a url by checking for the TLD, and building around it.`
			`#`
			`# To support () in urls but not match ending ) when a url is inside a parenthesis,`
			`# we match at maximum one set of matching parens in a url. We could extend this`
			`# to match two parenthetical groups, at the cost of more regex complexity.`
Add https?: greedy url matching before falling back to our url guesser (imported from commit 9e6e5a0522e6501b354a56223c2639841d290d4b) 2013-02-11 20:49:48 +01:00			`#`
			`# This rule must come after the http_autolink rule we add above to avoid double`
			`# linkifying.`
Match .co.uk before .co in linkification (imported from commit 2461cb4d49eef15431dde30dda646b25dc860a7b) 2013-02-05 19:04:45 +01:00			`tlds = '\|'.join(['co.uk', 'com', 'co', 'biz', 'gd', 'org', 'net', 'ly', 'edu', 'mil',`
Handle chars at end of link better (imported from commit 53842bc17bbb9cfb555738ee6b72291d7ce61d67) 2013-02-06 15:33:24 +01:00			`'gov', 'info', 'me', 'it', '.ca', 'tv', 'fm', 'io', 'gl'])`
			`link_regex = r"\b(?P<url>[^\s]+\.(%s)(?:/[^\s()\":]?\|([^\s()\":]\([^\s()\":]\)[^\s()\":]))?)(?=([:;\?\),\.\'\"]\Z\|[:;\?\),\.\'\"]\s\|\Z\|\s))" % (tlds,)`
Add https?: greedy url matching before falling back to our url guesser (imported from commit 9e6e5a0522e6501b354a56223c2639841d290d4b) 2013-02-11 20:49:48 +01:00			`md.inlinePatterns.add('autolink', AutoLink(link_regex), '>http_autolink')`
bugdown: Linkify using a Markdown extension This prevents trying to linkify inside code blocks. (imported from commit 97dd20ecee19f41650aa98e68aa8e9908ece5b33) 2012-10-22 02:32:18 +02:00
Handle lists that start immediately after paragraphs (imported from commit 055593d7a324598e133d53db0c33103016426c8f) 2013-01-24 19:35:20 +01:00			`md.preprocessors.add('hanging_ulists',`
			`BugdownUListPreprocessor(md),`
			`"_begin")`

bugdown: Use Markdown.reset() instead of re-creating the whole parser (imported from commit 45a65453f2178a6a73392e2bd1e7d6d03de0e0e7) 2012-11-20 20:15:55 +01:00			`_md_engine = markdown.Markdown(`
			`safe_mode = 'escape',`
			`output_format = 'html',`
			`extensions = ['nl2br',`
			`codehilite.makeExtension(configs=[`
			`('force_linenos', False),`
			`('guess_lang', False)]),`
			`fenced_code.makeExtension(),`
			`Bugdown()])`
Move our various Markdown hacks into their own file (imported from commit b03a5c64cc95964936c4aba7d667807969e35d21) 2012-10-15 22:03:50 +02:00
bugdown: Hide alphanumeric characters in exception logs, for privacy (imported from commit 39481494b7910307f56e566035c1b464c83d196e) 2012-10-25 21:38:47 +02:00			`# We want to log Markdown parser failures, but shouldn't log the actual input`
			`# message for privacy reasons. The compromise is to replace all alphanumeric`
			`# characters with 'x'.`
			`#`
			`# We also use repr() to improve reproducibility, and to escape terminal control`
			`# codes, which can do surprisingly nasty things.`
			`_privacy_re = re.compile(r'\w', flags=re.UNICODE)`
			`def _sanitize_for_log(md):`
			`return repr(_privacy_re.sub('x', md))`

Move our various Markdown hacks into their own file (imported from commit b03a5c64cc95964936c4aba7d667807969e35d21) 2012-10-15 22:03:50 +02:00			`def convert(md):`
			`"""Convert Markdown to HTML, with Humbug-specific settings and hacks."""`
bugdown: Use Markdown.reset() instead of re-creating the whole parser (imported from commit 45a65453f2178a6a73392e2bd1e7d6d03de0e0e7) 2012-11-20 20:15:55 +01:00
			`# Reset the parser; otherwise it will get slower over time.`
			`_md_engine.reset()`
Move our various Markdown hacks into their own file (imported from commit b03a5c64cc95964936c4aba7d667807969e35d21) 2012-10-15 22:03:50 +02:00
			`try:`
bugdown: Spend at most 5 seconds rendering a message (imported from commit bc092acc8b2b9f8a63af669de06c6f7512ccf8c9) 2013-01-29 21:47:53 +01:00			`# Spend at most 5 seconds rendering.`
			`# Sometimes Python-Markdown is really slow; see`
			`# https://trac.humbughq.com/ticket/345`
			`html = timeout(5, _md_engine.convert, md)`
Move our various Markdown hacks into their own file (imported from commit b03a5c64cc95964936c4aba7d667807969e35d21) 2012-10-15 22:03:50 +02:00			`except:`
Send humbug & email on markdown parse failure (imported from commit c132ab91f1ff9ffdbe5f740980bf362b16c6bba6) 2013-01-31 19:57:25 +01:00			`from zephyr.models import Recipient`
			`from zephyr.lib.actions import internal_send_message`

			`cleaned = _sanitize_for_log(md)`

Move our various Markdown hacks into their own file (imported from commit b03a5c64cc95964936c4aba7d667807969e35d21) 2012-10-15 22:03:50 +02:00			`html = '<p>[Humbug note: Sorry, we could not understand the formatting of your message]</p>'`
Send humbug & email on markdown parse failure (imported from commit c132ab91f1ff9ffdbe5f740980bf362b16c6bba6) 2013-01-31 19:57:25 +01:00
			`# Output error to log as well as sending a humbug and email`
bugdown: Hide alphanumeric characters in exception logs, for privacy (imported from commit 39481494b7910307f56e566035c1b464c83d196e) 2012-10-25 21:38:47 +02:00			`logging.getLogger('').error('Exception in Markdown parser: %sInput (sanitized) was: %s'`
Send humbug & email on markdown parse failure (imported from commit c132ab91f1ff9ffdbe5f740980bf362b16c6bba6) 2013-01-31 19:57:25 +01:00			`% (traceback.format_exc(), cleaned))`
			`subject = "Markdown parser failure"`
			`internal_send_message("humbug+errors@humbughq.com",`
			`Recipient.STREAM, "devel", subject,`
			`"Markdown parser failed, message sent to devel@")`
			`mail.mail_admins(subject, "Failed message: %s\n\n%s\n\n" % (`
			`cleaned, traceback.format_exc()),`
			`fail_silently=False)`
Move our various Markdown hacks into their own file (imported from commit b03a5c64cc95964936c4aba7d667807969e35d21) 2012-10-15 22:03:50 +02:00
			`return html`