2012-10-15 22:03:50 +02:00
|
|
|
import markdown
|
2012-10-22 05:06:28 +02:00
|
|
|
import logging
|
|
|
|
import traceback
|
2012-12-04 19:57:54 +01:00
|
|
|
import urlparse
|
2012-10-25 21:38:47 +02:00
|
|
|
import re
|
2013-03-01 22:07:27 +01:00
|
|
|
import os.path
|
|
|
|
import glob
|
2013-03-08 06:27:16 +01:00
|
|
|
import urllib2
|
2013-05-01 22:49:16 +02:00
|
|
|
import itertools
|
2013-03-08 06:27:16 +01:00
|
|
|
import simplejson
|
2013-03-08 20:48:14 +01:00
|
|
|
import twitter
|
2013-05-07 00:31:46 +02:00
|
|
|
import platform
|
2012-10-15 22:03:50 +02:00
|
|
|
|
2013-05-07 20:50:25 +02:00
|
|
|
import httplib2
|
|
|
|
|
2013-05-01 22:49:16 +02:00
|
|
|
from hashlib import sha1
|
|
|
|
|
2013-01-31 19:57:25 +01:00
|
|
|
from django.core import mail
|
2013-03-08 06:27:16 +01:00
|
|
|
from django.conf import settings
|
2013-01-31 19:57:25 +01:00
|
|
|
|
2012-10-20 05:34:14 +02:00
|
|
|
from zephyr.lib.avatar import gravatar_hash
|
2012-11-19 18:31:03 +01:00
|
|
|
from zephyr.lib.bugdown import codehilite, fenced_code
|
2013-01-24 19:35:20 +01:00
|
|
|
from zephyr.lib.bugdown.fenced_code import FENCE_RE
|
2013-04-23 17:01:33 +02:00
|
|
|
from zephyr.lib.timeout import timeout, TimeoutExpired
|
2013-05-01 22:49:16 +02:00
|
|
|
from zephyr.lib.cache import cache_with_key, cache_get_many, cache_set_many
|
2013-04-29 22:22:07 +02:00
|
|
|
from embedly import Embedly
|
|
|
|
|
2013-05-07 20:50:13 +02:00
|
|
|
# Shared Embedly API client.  The 2.5s timeout bounds how long a single
# oembed lookup can stall message rendering (the overall render timeout
# used by convert() is 5 seconds).
embedly_client = Embedly(settings.EMBEDLY_KEY, timeout=2.5)
|
2012-10-17 04:42:19 +02:00
|
|
|
|
2013-03-18 22:51:08 +01:00
|
|
|
# Format version of the bugdown rendering; stored along with rendered
# messages so that we can efficiently determine what needs to be re-rendered
version = 1
|
|
|
|
|
2013-04-02 17:08:00 +02:00
|
|
|
def list_of_tlds():
    """Return the list of known TLDs, lowercased and sorted longest-first.

    The longest-first order matters: the autolink regex joins these with
    '|', and regex alternation takes the first branch that matches, so
    longer TLDs must be tried before TLDs that are their prefixes.
    """
    # HACK we manually blacklist .py (it shows up constantly in pasted
    # code/paths and is not worth linkifying).
    blacklist = ['py']

    # tlds-alpha-by-domain.txt comes from http://data.iana.org/TLD/tlds-alpha-by-domain.txt
    tlds_file = os.path.join(os.path.dirname(__file__), 'tlds-alpha-by-domain.txt')
    # Use a with-block so the file handle is closed deterministically
    # (the original relied on the garbage collector).  Comment lines in
    # the IANA file start with '#'.
    with open(tlds_file, 'r') as f:
        tlds = [tld.lower().strip() for tld in f
                if not tld.startswith('#')]
    # Compare against the normalized form; the original compared the raw
    # line (including its trailing newline) against 'PY\n', which was
    # equivalent but fragile.
    tlds = [tld for tld in tlds if tld not in blacklist]
    tlds.sort(key=len, reverse=True)
    return tlds
|
|
|
|
|
2013-03-08 21:44:06 +01:00
|
|
|
def walk_tree(root, processor, stop_after_first=False):
    """Walk the element tree under `root`, applying `processor` to each
    descendant element (the root itself is never processed).

    Collects every non-None value `processor` returns; if
    `stop_after_first` is set, returns as soon as the first result is
    collected.

    Fix: the original used Element.getchildren(), which has been
    deprecated for years and was removed in Python 3.9; iterating the
    element and testing len() are the documented equivalents.
    """
    results = []
    stack = [root]

    while stack:
        currElement = stack.pop()
        for child in currElement:
            if len(child):
                # Non-leaf children are pushed so their own children get
                # visited on a later iteration (depth-first-ish order).
                stack.append(child)

            result = processor(child)
            if result is not None:
                results.append(result)
                if stop_after_first:
                    return results

    return results
|
|
|
|
|
2013-04-29 22:17:17 +02:00
|
|
|
def add_a(root, url, link, height=None):
    """Append an inline image preview to `root`.

    Produces <div class="message_inline_image"><a ...><img src=url></a></div>,
    where the anchor points at `link` and opens in a new tab.  `height` is
    accepted by the signature but not used by the current implementation.
    """
    container = markdown.util.etree.SubElement(root, "div")
    container.set("class", "message_inline_image")

    anchor = markdown.util.etree.SubElement(container, "a")
    anchor.set("href", link)
    anchor.set("target", "_blank")
    anchor.set("title", link)

    image = markdown.util.etree.SubElement(anchor, "img")
    image.set("src", url)
|
2013-05-01 22:49:16 +02:00
|
|
|
def hash_embedly_url(link):
    """Return the cache key under which Embedly data for `link` is stored."""
    digest = sha1(link).hexdigest()
    return 'embedly:' + digest
|
|
|
|
|
2013-04-29 22:22:07 +02:00
|
|
|
class EmbedlyProcessor(markdown.treeprocessors.Treeprocessor):
    """Append Embedly previews for supported links in a rendered message.

    Looks up oembed data for each <a> href — first in the database-backed
    cache, then via the Embedly API — and appends either the provider's
    embed HTML or a thumbnail to the tree.  Links we handle are recorded
    in markdown.processed_hrefs so later treeprocessors skip them.
    """

    def run(self, root):
        # Get all URLs from the blob
        found_urls = walk_tree(root, lambda e: e.get("href") if e.tag == "a" else None)

        supported_urls = []
        for link in found_urls:
            # Don't waste our quota with unsupported links or links otherwise
            # handled by our Twitter integration
            if not embedly_client.is_supported(link) or get_tweet_id(link):
                continue
            supported_urls.append(link)

        if not supported_urls or len(supported_urls) > 5:
            # Either zero urls or too many urls.
            return root

        # We want this to be able to easily reverse the hashing later
        keys_to_links = dict((hash_embedly_url(link), link) for link in supported_urls)
        cache_hits = cache_get_many(keys_to_links.keys(), cache_name="database")

        # Construct a dict of url => oembed_data pairs
        oembeds = dict((keys_to_links[key], cache_hits[key]) for key in cache_hits)

        to_process = [url for url in supported_urls if url not in oembeds]
        to_cache = {}

        # Only talk to Embedly when something actually missed the cache;
        # the original called oembed() even with an empty to_process list.
        responses = []
        if to_process:
            try:
                responses = embedly_client.oembed(to_process, maxwidth=250)
            except httplib2.socket.timeout:
                # We put this in its own try-except because it requires external
                # connectivity. If embedly flakes out, we don't want to not-render
                # the entire message; we just want to not show the embedly preview.
                logging.warning("Embedly Embed timeout for URLs: %s" % (" ".join(to_process)))
                logging.warning(traceback.format_exc())
                return root
            except Exception:
                # If things break for any other reason, don't make things sad.
                logging.warning(traceback.format_exc())
                return root

        for oembed_data in responses:
            # Don't cache transient errors (server-side failures); anything
            # else, including permanent errors, is cached so we don't retry.
            if oembed_data["type"] == "error" and \
                    oembed_data["error_code"] in (500, 501, 503):
                continue
            # Convert to dict because otherwise pickling won't work.
            to_cache[oembed_data["original_url"]] = dict(oembed_data)

        # Cache the newly collected data to the database
        cache_set_many(dict((hash_embedly_url(link), to_cache[link]) for link in to_cache),
                       cache_name="database")
        oembeds.update(to_cache)

        # Now let's process the URLs in order
        for link in supported_urls:
            # Use .get(): a URL whose response was a transient error was
            # deliberately not cached above, so indexing oembeds[link]
            # directly (as the original did) raised KeyError and aborted
            # rendering of the whole message.
            oembed_data = oembeds.get(link)
            if oembed_data is None:
                continue

            # Fix: the original wrote `in ("link")`, which is substring
            # matching against the *string* "link" (the parens are not a
            # tuple), not membership in a one-element tuple.
            if oembed_data["type"] == "link":
                continue
            elif oembed_data["type"] in ("video", "rich") and "script" not in oembed_data["html"]:
                # Trust the provider's embed HTML unless it carries script.
                placeholder = self.markdown.htmlStash.store(oembed_data["html"], safe=True)
                el = markdown.util.etree.SubElement(root, "p")
                el.text = placeholder
            else:
                try:
                    add_a(root,
                          oembed_data["thumbnail_url"],
                          link,
                          height=oembed_data["thumbnail_height"])
                except KeyError:
                    # We didn't have a thumbnail, so let's just bail and keep on going...
                    continue

            self.markdown.processed_hrefs.append(link)
        return root
|
|
|
|
|
2013-03-01 19:20:53 +01:00
|
|
|
class InlineImagePreviewProcessor(markdown.treeprocessors.Treeprocessor):
    """Append inline thumbnail previews for image-like links in a message.

    Handles direct image URLs, Dropbox share links, and YouTube videos.
    Links already consumed by another treeprocessor (recorded in
    markdown.processed_hrefs) are skipped.
    """

    def is_image(self, url):
        """Return True if the URL path ends in a known image extension."""
        parsed_url = urlparse.urlparse(url)
        # List from http://support.google.com/chromeos/bin/answer.py?hl=en&answer=183093
        # Fix: ".jpeg" was previously listed without its leading dot, so any
        # path merely *ending* in the letters "jpeg" (e.g. "/notjpeg") was
        # misclassified as an image.
        for ext in [".bmp", ".gif", ".jpg", ".jpeg", ".png", ".webp"]:
            if parsed_url.path.lower().endswith(ext):
                return True
        return False

    def dropbox_image(self, url):
        """Return a direct-download URL for a Dropbox image share link, else None."""
        if not self.is_image(url):
            return None
        parsed_url = urlparse.urlparse(url)
        if (parsed_url.netloc == 'dropbox.com' or parsed_url.netloc.endswith('.dropbox.com')) \
                and (parsed_url.path.startswith('/s/') or parsed_url.path.startswith('/sh/')):
            # ?dl=1 makes Dropbox serve the raw file rather than a preview page.
            return "%s?dl=1" % (url,)
        return None

    def youtube_image(self, url):
        """Return the default thumbnail URL for a YouTube video link, else None."""
        # Youtube video id extraction regular expression from http://pastebin.com/KyKAFv1s
        # If it matches, match.group(2) is the video id.
        youtube_re = r'^((?:https?://)?(?:youtu\.be/|(?:\w+\.)?youtube(?:-nocookie)?\.com/)(?:(?:(?:v|embed)/)|(?:(?:watch(?:_popup)?(?:\.php)?)?(?:\?|#!?)(?:.+&)?v=)))?([0-9A-Za-z_-]+)(?(1).+)?$'
        match = re.match(youtube_re, url)
        if match is None:
            return None
        return "http://i.ytimg.com/vi/%s/default.jpg" % (match.group(2),)

    # Search the tree for <a> tags and read their href values
    def find_images(self, root):
        def process_image_links(element):
            # Returns (image_src, original_href) for previewable links,
            # None otherwise.
            if element.tag != "a":
                return None

            url = element.get("href")
            youtube = self.youtube_image(url)
            if youtube is not None:
                return (youtube, url)
            dropbox = self.dropbox_image(url)
            if dropbox is not None:
                return (dropbox, url)
            if self.is_image(url):
                return (url, url)

        return walk_tree(root, process_image_links)

    def run(self, root):
        image_urls = self.find_images(root)
        for (url, link) in image_urls:
            # Skip links another treeprocessor (e.g. Embedly) already handled.
            if link in self.markdown.processed_hrefs:
                continue
            add_a(root, url, link)

        return root
|
|
|
|
|
2013-05-11 15:50:02 +02:00
|
|
|
# Results are cached in the database-backed cache, keyed by the tweet ID.
@cache_with_key(lambda tweet_id: tweet_id, cache_name="database", with_statsd_key="tweet_data")
def fetch_tweet_data(tweet_id):
    """Fetch a tweet's data dict from the Twitter API (or the test mock).

    Returns None on permanent errors, so that the None result is cached
    and we never retry; re-raises on transient errors (timeouts,
    rate-limiting, capacity issues) so nothing is cached and a later
    render will try again.
    """
    if settings.TEST_SUITE:
        # The test suite never talks to the real Twitter API.
        import testing_mocks
        res = testing_mocks.twitter(tweet_id)
    else:
        # Choose API credentials for the environment we are running in.
        if settings.STAGING_DEPLOYED:
            # Application: "Humbug HQ"
            api = twitter.Api(consumer_key = 'xxxxxxxxxxxxxxxxxxxxxx',
                              consumer_secret = 'xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx',
                              access_token_key = 'xxxxxxxxx-xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx',
                              access_token_secret = 'xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx')
        elif settings.DEPLOYED:
            # This is the real set of API credentials used by our real server,
            # and we probably shouldn't test with it just so we don't waste its requests
            # Application: "Humbug HQ - Production"
            api = twitter.Api(consumer_key = 'xxxxxxxxxxxxxxxxxxxxx',
                              consumer_secret = 'xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx',
                              access_token_key = 'xxxxxxxxx-xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx',
                              access_token_secret = 'xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx')
        else:
            # Application: "Humbug HQ Test"
            api = twitter.Api(consumer_key = 'xxxxxxxxxxxxxxxxxxxxxx',
                              consumer_secret = 'xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx',
                              access_token_key = 'xxxxxxxxx-xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx',
                              access_token_secret = 'xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx')
        try:
            # Sometimes Twitter hangs on responses.  Timing out here
            # will cause the Tweet to go through as-is with no inline
            # preview, rather than having the message be rejected
            # entirely. This timeout needs to be less than our overall
            # formatting timeout.
            res = timeout(3, api.GetStatus, tweet_id).AsDict()
        except TimeoutExpired as e:
            # We'd like to try again later and not cache the bad result,
            # so we need to re-raise the exception (just as though
            # we were being rate-limited)
            raise
        except twitter.TwitterError as e:
            t = e.args[0]
            if len(t) == 1 and ('code' in t[0]) and (t[0]['code'] == 34):
                # Code 34 means that the message doesn't exist; return
                # None so that we will cache the error
                return None
            elif len(t) == 1 and ('code' in t[0]) and (t[0]['code'] == 88 or
                                                       t[0]['code'] == 130):
                # Code 88 means that we were rate-limited and 130
                # means Twitter is having capacity issues; either way
                # just raise the error so we don't cache None and will
                # try again later.
                raise
            else:
                # It's not clear what to do in cases of other errors,
                # but for now it seems reasonable to log at error
                # level (so that we get notified), but then cache the
                # failure to proceed with our usual work
                logging.error(traceback.format_exc())
                return None
    return res
|
|
|
|
|
2013-04-30 21:37:22 +02:00
|
|
|
def get_tweet_id(url):
    """Return the status ID string for a link to an individual tweet.

    Returns False (falsy) for anything that is not a twitter.com status
    URL; callers rely only on the truthiness of the result.
    """
    parsed_url = urlparse.urlparse(url)
    if not (parsed_url.netloc == 'twitter.com' or parsed_url.netloc.endswith('.twitter.com')):
        return False

    # Current tweet IDs are 18 digits, but IDs of older tweets are
    # shorter, so accept 10-18 digits.  The original pattern required
    # exactly 18 and silently rejected links to older tweets.
    tweet_id_match = re.match(r'^/.*?/status(es)?/(?P<tweetid>\d{10,18})$', parsed_url.path)
    if not tweet_id_match:
        return False
    return tweet_id_match.group("tweetid")
|
|
|
|
|
|
|
|
|
2013-03-08 06:27:16 +01:00
|
|
|
class InlineInterestingLinkProcessor(markdown.treeprocessors.Treeprocessor):
    """Render rich inline previews for "interesting" links.

    Currently only Twitter status links are interesting; at most one
    preview is rendered per message (stop_after_first below).
    """

    def twitter_link(self, url):
        """Return a ('twitter', <div>) preview tuple for a tweet URL, else None."""
        tweet_id = get_tweet_id(url)

        if not tweet_id:
            return None

        try:
            res = fetch_tweet_data(tweet_id)
            if res is None:
                return None
            user = res['user']
            tweet = markdown.util.etree.Element("div")
            tweet.set("class", "twitter-tweet")
            img_a = markdown.util.etree.SubElement(tweet, 'a')
            img_a.set("href", url)
            img_a.set("target", "_blank")
            profile_img = markdown.util.etree.SubElement(img_a, 'img')
            profile_img.set('class', 'twitter-avatar')
            # For some reason, for, e.g. tweet 285072525413724161,
            # python-twitter does not give us a
            # profile_image_url_https, but instead puts that URL in
            # profile_image_url. So use _https if available, but fall
            # back gracefully.
            image_url = user.get('profile_image_url_https', user['profile_image_url'])
            profile_img.set('src', image_url)
            p = markdown.util.etree.SubElement(tweet, 'p')
            p.text = res['text']
            span = markdown.util.etree.SubElement(tweet, 'span')
            span.text = "- %s (@%s)" % (user['name'], user['screen_name'])

            return ('twitter', tweet)
        except Exception:
            # Fix: this was a bare `except:`, which would also swallow
            # SystemExit and KeyboardInterrupt.
            #
            # We put this in its own try-except because it requires external
            # connectivity. If Twitter flakes out, we don't want to not-render
            # the entire message; we just want to not show the Twitter preview.
            logging.warning(traceback.format_exc())
            return None

    # Search the tree for <a> tags and read their href values
    def find_interesting_links(self, root):
        def process_interesting_links(element):
            if element.tag != "a":
                return None

            url = element.get("href")
            return self.twitter_link(url)

        return walk_tree(root, process_interesting_links, stop_after_first=True)

    def run(self, root):
        interesting_links = self.find_interesting_links(root)
        for (service_name, data) in interesting_links:
            div = markdown.util.etree.SubElement(root, "div")
            div.set("class", "inline-preview-%s" % service_name)
            div.insert(0, data)
        return root
|
|
|
|
|
2012-10-17 04:42:19 +02:00
|
|
|
class Gravatar(markdown.inlinepatterns.Pattern):
    """Inline pattern rendering !gravatar(email) as a small avatar image."""

    def handleMatch(self, match):
        avatar = markdown.util.etree.Element('img')
        avatar.set('class', 'message_body_gravatar img-rounded')
        src = 'https://secure.gravatar.com/avatar/%s?d=identicon&s=30' \
            % (gravatar_hash(match.group('email')),)
        avatar.set('src', src)
        return avatar
|
|
|
|
|
2013-03-01 22:07:27 +01:00
|
|
|
# Build the list of known emoji names from the gemoji PNGs shipped in our
# static tree; the basename (without .png) of each file is the emoji name.
path_to_emoji = os.path.join(os.path.dirname(__file__), '..', '..',
                             # This should be zephyr/
                             'static', 'third', 'gemoji', 'images', 'emoji', '*.png')
emoji_list = [os.path.splitext(os.path.basename(fn))[0] for fn in glob.glob(path_to_emoji)]
|
|
|
|
|
|
|
|
def make_emoji(emoji_name, display_string):
    """Build the <img> element that renders the named emoji.

    display_string (the original :name: syntax) is kept as alt/title text.
    """
    img = markdown.util.etree.Element('img')
    img.set('src', 'static/third/gemoji/images/emoji/%s.png' % (emoji_name,))
    img.set('class', 'emoji')
    img.set('alt', display_string)
    img.set('title', display_string)
    return img
|
|
|
|
|
|
|
|
class Emoji(markdown.inlinepatterns.Pattern):
    """Inline pattern turning :name: into an emoji image when the name is known."""

    def handleMatch(self, match):
        syntax = match.group("syntax")
        emoji_name = syntax[1:-1]  # strip the surrounding colons
        if emoji_name in emoji_list:
            return make_emoji(emoji_name, syntax)
        # Unknown name: leave the literal :name: text untouched.
        return syntax
|
|
|
|
|
2013-03-29 20:17:33 +01:00
|
|
|
def fixup_link(link, target_blank=True):
    """Apply the attributes we want on every rendered link.

    Copies the href into the title, and (unless target_blank is False,
    e.g. for mailto: links) makes the link open in a new tab.
    """
    href = link.get('href')
    if target_blank:
        link.set('target', '_blank')
    link.set('title', href)
|
|
|
|
|
2013-02-01 23:15:05 +01:00
|
|
|
|
|
|
|
def sanitize_url(url):
    """
    Sanitize a url against xss attacks.
    See the docstring on markdown.inlinepatterns.LinkPattern.sanitize_url.

    Returns the (possibly rewritten) URL string, '' for an unparseable
    URL, or None when the URL must not be linkified at all.
    """
    try:
        parts = urlparse.urlparse(url.replace(' ', '%20'))
        scheme, netloc, path, params, query, fragment = parts
    except ValueError:
        # Bad url - so bad it couldn't be parsed.
        return ''

    # If there is no scheme or netloc and there is a '@' in the path,
    # treat it as a mailto: and set the appropriate scheme
    if scheme == '' and netloc == '' and '@' in path:
        scheme = 'mailto'

    # Humbug modification: If scheme is not specified, assume http://
    # It's unlikely that users want relative links within humbughq.com.
    # We re-enter sanitize_url because netloc etc. need to be re-parsed.
    if not scheme:
        return sanitize_url('http://' + url)

    # Schemes that legitimately have no network location.
    locless_schemes = ['mailto', 'news']
    if netloc == '' and scheme not in locless_schemes:
        # This fails regardless of anything else.
        # Return immediately to save additional processing
        return None

    # Upstream code will accept a URL like javascript://foo because it
    # appears to have a netloc.  Additionally there are plenty of other
    # schemes that do weird things like launch external programs.  To be
    # on the safe side, we whitelist the scheme.
    if scheme not in ('http', 'https', 'ftp', 'mailto'):
        return None

    # Upstream code scans path, parameters, and query for colon characters
    # because
    #
    #    some aliases [for javascript:] will appear to urlparse() to have
    #    no scheme. On top of that relative links (i.e.: "foo/bar.html")
    #    have no scheme.
    #
    # We already converted an empty scheme to http:// above, so we skip
    # the colon check, which would also forbid a lot of legitimate URLs.

    # Url passes all tests. Return url as-is.
    return urlparse.urlunparse((scheme, netloc, path, params, query, fragment))
|
2013-02-01 23:15:05 +01:00
|
|
|
|
2013-02-11 20:49:48 +01:00
|
|
|
def url_to_a(url):
    """Linkify `url`, falling back to plain text if sanitization rejects it."""
    href = sanitize_url(url)
    if href is None:
        # Rejected by sanitize_url; render it as plain text.
        return url

    anchor = markdown.util.etree.Element('a')
    anchor.set('href', href)
    anchor.text = url
    # mailto: links should not open a new tab.
    fixup_link(anchor, not href.startswith('mailto:'))
    return anchor
|
|
|
|
|
2012-10-22 02:32:18 +02:00
|
|
|
class AutoLink(markdown.inlinepatterns.Pattern):
    """Linkify bare TLD-based URLs (e.g. example.com/foo)."""

    def handleMatch(self, match):
        url = match.group('url')
        # As this will also match already-matched https?:// links,
        # don't doubly-link them
        if url.startswith('http:') or url.startswith('https:'):
            return url
        return url_to_a(url)
|
2013-02-01 23:15:05 +01:00
|
|
|
|
2013-02-11 20:49:48 +01:00
|
|
|
class HttpLink(markdown.inlinepatterns.Pattern):
    """Linkify explicit http(s):// URLs."""

    def handleMatch(self, match):
        return url_to_a(match.group('url'))
|
2012-10-22 02:32:18 +02:00
|
|
|
|
2012-11-02 18:25:37 +01:00
|
|
|
class UListProcessor(markdown.blockprocessors.OListProcessor):
    """ Process unordered list blocks.

    Based on markdown.blockprocessors.UListProcessor, but does not accept
    '+' or '-' as a bullet character."""

    TAG = 'ul'
    # Only '*' (with up to 3 leading spaces) starts a list item; '+'/'-'
    # are too common in ordinary chat text to treat as bullets.
    RE = re.compile(r'^[ ]{0,3}[*][ ]+(.*)')
|
2012-11-02 18:25:37 +01:00
|
|
|
|
2013-01-24 19:35:20 +01:00
|
|
|
class BugdownUListPreprocessor(markdown.preprocessors.Preprocessor):
    """ Allows unordered list blocks that come directly after a
    paragraph to be rendered as an unordered list

    Detects paragraphs that have a matching list item that comes
    directly after a line of text, and inserts a newline between
    to satisfy Markdown"""

    # Same bullet rule as UListProcessor: '*' with up to 3 leading spaces.
    LI_RE = re.compile(r'^[ ]{0,3}[*][ ]+(.*)', re.MULTILINE)
    # NOTE(review): HANGING_ULIST_RE is not referenced within this class;
    # presumably kept for external use or historical reasons — verify.
    HANGING_ULIST_RE = re.compile(r'^.+\n([ ]{0,3}[*][ ]+.*)', re.MULTILINE)

    def run(self, lines):
        """ Insert a newline between a paragraph and ulist if missing """
        inserts = 0
        fence = None
        # Work on a copy; `inserts` tracks how far indices in `copy` have
        # drifted from indices in the original `lines`.
        copy = lines[:]
        for i in xrange(len(lines) - 1):
            # Ignore anything that is inside a fenced code block
            m = FENCE_RE.match(lines[i])
            if not fence and m:
                fence = m.group('fence')
            elif fence and m and fence == m.group('fence'):
                fence = None

            # If we're not in a fenced block and we detect an upcoming list
            # hanging off a paragraph, add a newline
            if not fence and lines[i] and \
                    self.LI_RE.match(lines[i+1]) and not self.LI_RE.match(lines[i]):
                copy.insert(i+inserts+1, '')
                inserts += 1
        return copy
|
|
|
|
|
2012-12-04 19:57:54 +01:00
|
|
|
# Based on markdown.inlinepatterns.LinkPattern
|
|
|
|
# Based on markdown.inlinepatterns.LinkPattern
class LinkPattern(markdown.inlinepatterns.Pattern):
    """ Return a link element from the given match. """

    def handleMatch(self, m):
        # If any check rejects the link, fall back to the original
        # link syntax as plain text.
        original_syntax = m.group(0)

        href = m.group(9)
        if not href:
            return original_syntax

        # Strip angle brackets from <...>-style targets.
        if href.startswith("<"):
            href = href[1:-1]
        href = sanitize_url(self.unescape(href.strip()))
        if href is None:
            return original_syntax

        anchor = markdown.util.etree.Element('a')
        anchor.text = m.group(2)
        anchor.set('href', href)
        fixup_link(anchor)
        return anchor
|
|
|
|
|
2012-10-16 17:35:58 +02:00
|
|
|
class Bugdown(markdown.Extension):
    """Humbug's Markdown dialect: strips unwanted stock syntax and installs
    our custom inline patterns, block processors, and treeprocessors.

    Registration order below is load-bearing — several patterns must run
    before or after specific built-ins.
    """

    def extendMarkdown(self, md, md_globals):
        # We don't support reference-style link definitions.
        del md.preprocessors['reference']

        # Remove stock inline syntax we either don't want (images,
        # emphasis variants) or replace below (link/autolink/strong).
        for k in ('image_link', 'image_reference', 'automail',
                  'autolink', 'link', 'reference', 'short_reference',
                  'escape', 'strong_em', 'emphasis', 'emphasis2',
                  'strong'):
            del md.inlinePatterns[k]

        # Custom bold syntax: **foo** but not __foo__
        md.inlinePatterns.add('strong',
            markdown.inlinepatterns.SimpleTagPattern(r'(\*\*)([^\n]+?)\2', 'strong'),
            '>not_strong')

        # Drop headers and the stock list processors; we re-add a
        # restricted ulist below.
        for k in ('hashheader', 'setextheader', 'olist', 'ulist'):
            del md.parser.blockprocessors[k]

        md.parser.blockprocessors.add('ulist', UListProcessor(md.parser), '>hr')

        md.inlinePatterns.add('gravatar', Gravatar(r'!gravatar\((?P<email>[^)]*)\)'), '_begin')
        md.inlinePatterns.add('emoji', Emoji(r'(?<!\S)(?P<syntax>:[^:\s]+:)(?!\S)'), '_begin')
        md.inlinePatterns.add('link', LinkPattern(markdown.inlinepatterns.LINK_RE, md), '>backtick')

        # markdown.inlinepatterns.Pattern compiles this with re.UNICODE, which
        # is important because we're using \w.
        #
        # This rule must come after the built-in 'link' markdown linkifier to
        # avoid errors.
        #
        # We support up to 1 nested pair of parentheses in a url
        http_link_regex = r'\b(?P<url>https?://(?:(?:[^\s]+\([^\s)]+?\)[^\s]*?)|[^\s]+?))(?=[^\w/]*(\s|\Z))'
        md.inlinePatterns.add('http_autolink', HttpLink(http_link_regex), '>link')

        # A link starts at a word boundary, and ends at space, punctuation, or end-of-input.
        #
        # We detect a url by checking for the TLD, and building around it.
        #
        # To support () in urls but not match ending ) when a url is inside a parenthesis,
        # we match at maximum one set of matching parens in a url. We could extend this
        # to match two parenthetical groups, at the cost of more regex complexity.
        #
        # This rule must come after the http_autolink rule we add above to avoid double
        # linkifying.
        tlds = '|'.join(list_of_tlds())
        link_regex = r"\b(?P<url>[^\s]+\.(%s)(?:/[^\s()\":]*?|([^\s()\":]*\([^\s()\":]*\)[^\s()\":]*))?)(?=([:;\?\),\.\'\"]\Z|[:;\?\),\.\'\"]\s|\Z|\s))" % (tlds,)
        md.inlinePatterns.add('autolink', AutoLink(link_regex), '>http_autolink')

        md.preprocessors.add('hanging_ulists',
                             BugdownUListPreprocessor(md),
                             "_begin")

        # Shared between the preview treeprocessors so a link is only
        # previewed once.
        md.processed_hrefs = []
        # Embedly previews are enabled everywhere except production.
        if not settings.DEPLOYED or settings.STAGING_DEPLOYED:
            md.treeprocessors.add("embedly_processor", EmbedlyProcessor(md), "_end")
        md.treeprocessors.add("inline_images", InlineImagePreviewProcessor(md), "_end")
        md.treeprocessors.add("inline_interesting_links", InlineInterestingLinkProcessor(md), "_end")
|
2013-03-01 19:20:53 +01:00
|
|
|
|
2012-11-20 20:15:55 +01:00
|
|
|
# The single shared Markdown engine used by convert().  Built once at import
# time; convert() calls reset() before each message to keep it from slowing
# down over time.
_md_engine = markdown.Markdown(
    safe_mode = 'escape',
    output_format = 'html',
    extensions = ['nl2br',
        codehilite.makeExtension(configs=[
            ('force_linenos', False),
            ('guess_lang', False)]),
        fenced_code.makeExtension(),
        Bugdown()])
|
2012-10-15 22:03:50 +02:00
|
|
|
|
2012-10-25 21:38:47 +02:00
|
|
|
# We want to log Markdown parser failures, but shouldn't log the actual input
|
|
|
|
# message for privacy reasons. The compromise is to replace all alphanumeric
|
|
|
|
# characters with 'x'.
|
|
|
|
#
|
|
|
|
# We also use repr() to improve reproducibility, and to escape terminal control
|
|
|
|
# codes, which can do surprisingly nasty things.
|
|
|
|
_privacy_re = re.compile(r'\w', flags=re.UNICODE)
|
|
|
|
def _sanitize_for_log(md):
|
|
|
|
return repr(_privacy_re.sub('x', md))
|
|
|
|
|
2012-10-15 22:03:50 +02:00
|
|
|
def convert(md):
    """Convert Markdown to HTML, with Humbug-specific settings and hacks.

    Returns the rendered HTML, or None when rendering failed (the failure
    is logged, emailed to the admins, and reported to the devel stream).
    """

    # Reset the parser; otherwise it will get slower over time.
    _md_engine.reset()

    try:
        # Spend at most 5 seconds rendering.
        # Sometimes Python-Markdown is really slow; see
        # https://trac.humbughq.com/ticket/345
        return timeout(5, _md_engine.convert, md)
    except:
        # NOTE: bare except is deliberate here — *any* rendering failure
        # (including the timeout above) should fall through to the error
        # reporting below rather than propagate to the caller.
        # Imported here to avoid an import cycle at module load time.
        # NOTE(review): Recipient appears unused in this function — verify.
        from zephyr.models import Recipient
        from zephyr.lib.actions import internal_send_message

        # Never log the raw message content; see _sanitize_for_log.
        cleaned = _sanitize_for_log(md)

        # Output error to log as well as sending a humbug and email
        logging.getLogger('').error('Exception in Markdown parser: %sInput (sanitized) was: %s'
                                    % (traceback.format_exc(), cleaned))
        subject = "Markdown parser failure on %s" % (platform.node(),)
        internal_send_message("humbug+errors@humbughq.com", "stream",
                              "devel", subject, "Markdown parser failed, message sent to devel@")
        mail.mail_admins(subject, "Failed message: %s\n\n%s\n\n" % (
                cleaned, traceback.format_exc()),
                fail_silently=False)
        return None
|