import markdown
import logging
import traceback
import urlparse
import re

from zephyr.lib.avatar import gravatar_hash
from zephyr.lib.bugdown import codehilite, fenced_code

class Gravatar(markdown.inlinepatterns.Pattern):
    def handleMatch(self, match):
        img = markdown.util.etree.Element('img')
        img.set('class', 'message_body_gravatar img-rounded')
        img.set('src', 'https://secure.gravatar.com/avatar/%s?d=identicon&s=30'
                % (gravatar_hash(match.group('email')),))
        return img

def fixup_link(link):
    """Set certain attributes we want on every link."""
    link.set('target', '_blank')
    link.set('title', link.get('href'))

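# For illustration: the Gravatar pattern above turns markup like
#   !gravatar(user@example.com)
# into an <img> tag along the lines of
#   <img class="message_body_gravatar img-rounded"
#        src="https://secure.gravatar.com/avatar/<hash>?d=identicon&s=30">
# where <hash> is whatever gravatar_hash() computes for the email address.
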
class AutoLink(markdown.inlinepatterns.Pattern):
    def handleMatch(self, match):
        url = match.group('url')
        a = markdown.util.etree.Element('a')
        a.set('href', url)
        a.text = url
        fixup_link(a)
        return a

class UListProcessor(markdown.blockprocessors.OListProcessor):
    """ Process unordered list blocks.

        Based on markdown.blockprocessors.UListProcessor, but does not accept
        '+' as a bullet character."""

    TAG = 'ul'
    RE = re.compile(r'^[ ]{0,3}[*-][ ]+(.*)')

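# For illustration, the UListProcessor above treats lines like
#   * first item
#   - second item
# (with up to three leading spaces) as bullets, but unlike the stock
# markdown UListProcessor it leaves
#   + third item
# as plain text.
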
# Based on markdown.inlinepatterns.LinkPattern
class LinkPattern(markdown.inlinepatterns.Pattern):
    """ Return a link element from the given match. """
    def handleMatch(self, m):
        el = markdown.util.etree.Element("a")
        el.text = m.group(2)
        href = m.group(9)

        if href:
            if href[0] == "<":
                href = href[1:-1]
            el.set("href", self.sanitize_url(self.unescape(href.strip())))
        else:
            el.set("href", "")

        fixup_link(el)
        return el

    def sanitize_url(self, url):
        """
        Sanitize a URL against XSS attacks.
        See the docstring on markdown.inlinepatterns.LinkPattern.sanitize_url.
        """
        try:
            parts = urlparse.urlparse(url.replace(' ', '%20'))
            scheme, netloc, path, params, query, fragment = parts
        except ValueError:
            # Bad URL - so bad it couldn't be parsed.
            return ''

        # Humbug modification: If scheme is not specified, assume http://
        # It's unlikely that users want relative links within humbughq.com.
        # We re-enter sanitize_url because netloc etc. need to be re-parsed.
        if not scheme:
            return self.sanitize_url('http://' + url)

        locless_schemes = ['', 'mailto', 'news']
        if netloc == '' and scheme not in locless_schemes:
            # This fails regardless of anything else.
            # Return immediately to save additional processing.
            return ''

        for part in parts[2:]:
            if ":" in part:
                # Not a safe URL
                return ''

        # URL passes all tests.  Return it as-is.
        return urlparse.urlunparse(parts)

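    # A rough sketch of how sanitize_url behaves on some illustrative inputs:
    #   sanitize_url('humbughq.com/help')       -> 'http://humbughq.com/help'
    #   sanitize_url('javascript:alert(1)')     -> ''  (scheme but no netloc)
    #   sanitize_url('mailto:user@example.com') -> 'mailto:user@example.com'
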
class Bugdown(markdown.Extension):
    def extendMarkdown(self, md, md_globals):
        del md.preprocessors['reference']

        for k in ('image_link', 'image_reference', 'automail',
                  'autolink', 'link', 'reference', 'short_reference',
                  'escape'):
            del md.inlinePatterns[k]

        for k in ('hashheader', 'setextheader', 'olist', 'ulist'):
            del md.parser.blockprocessors[k]

        md.parser.blockprocessors.add('ulist', UListProcessor(md.parser), '>hr')

        md.inlinePatterns.add('gravatar', Gravatar(r'!gravatar\((?P<email>[^)]*)\)'), '_begin')
        md.inlinePatterns.add('link', LinkPattern(markdown.inlinepatterns.LINK_RE, md), '>backtick')

        # A link starts at a word boundary, and ends at space or end-of-input.
        # But any trailing punctuation (other than /) is not included.
        # We accomplish this with a non-greedy match followed by a greedy
        # lookahead assertion.
        #
        # markdown.inlinepatterns.Pattern compiles this with re.UNICODE, which
        # is important because we're using \w.
        link_regex = r'\b(?P<url>https?://[^\s]+?)(?=[^\w/]*(\s|\Z))'
        md.inlinePatterns.add('autolink', AutoLink(link_regex), '>link')

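# For illustration, given the message "see https://example.com/foo." the
# autolink regex above should match https://example.com/foo -- the trailing
# period falls outside the link -- while a trailing slash, as in
# "https://example.com/bar/ ", is kept as part of the URL.
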
_md_engine = markdown.Markdown(
    safe_mode = 'escape',
    output_format = 'html',
    extensions = ['nl2br',
        codehilite.makeExtension(configs=[
            ('force_linenos', False),
            ('guess_lang', False)]),
        fenced_code.makeExtension(),
        Bugdown()])

# We want to log Markdown parser failures, but shouldn't log the actual input
# message for privacy reasons. The compromise is to replace all alphanumeric
# characters with 'x'.
#
# We also use repr() to improve reproducibility, and to escape terminal control
# codes, which can do surprisingly nasty things.
_privacy_re = re.compile(r'\w', flags=re.UNICODE)
def _sanitize_for_log(md):
    return repr(_privacy_re.sub('x', md))

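# For illustration, _sanitize_for_log("Hi, see https://x.com!") should come out
# roughly as "'xx, xxx xxxxx://x.xxx!'": letters and digits are masked, while
# punctuation and overall structure survive for debugging.
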
def _linkify(match):
    url = match.group('url')
    return ' [%s](%s) ' % (url, url)

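# _linkify is shaped like a re.sub() callback: for a match whose 'url' group
# is, say, https://example.com, it returns the Markdown form
# ' [https://example.com](https://example.com) ' (note the surrounding spaces).
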
def convert(md):
    """Convert Markdown to HTML, with Humbug-specific settings and hacks."""

    # Reset the parser; otherwise it will get slower over time.
    _md_engine.reset()

    try:
        html = _md_engine.convert(md)
    except:
        # FIXME: Do something more reasonable here!
        html = '<p>[Humbug note: Sorry, we could not understand the formatting of your message]</p>'
        logging.getLogger('').error('Exception in Markdown parser: %sInput (sanitized) was: %s'
                                    % (traceback.format_exc(), _sanitize_for_log(md)))

    return html
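
# A minimal usage sketch, assuming the zephyr package and its dependencies are
# importable so the module-level engine above initializes:
#   >>> convert('**hello** https://example.com/')
# is expected to yield roughly
#   '<p><strong>hello</strong> <a href="https://example.com/" target="_blank"
#    title="https://example.com/">https://example.com/</a></p>'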