from __future__ import absolute_import
# Zulip's main markdown implementation.  See docs/markdown.md for
# detailed documentation on our markdown syntax.

from typing import Any, Callable, Dict, Iterable, List, Optional, Tuple, TypeVar, Union
from typing.re import Match

import markdown
import logging
import traceback
from six.moves import urllib
import re
import os.path
import glob
import twitter
import platform
import time
import six.moves.html_parser
import httplib2
import itertools
import xml.etree.cElementTree as etree
from xml.etree.cElementTree import Element, SubElement

from collections import defaultdict

import requests

from django.core import mail
from django.conf import settings

from zerver.lib.avatar import gravatar_hash
from zerver.lib.bugdown import codehilite
from zerver.lib.bugdown import fenced_code # type: ignore # excluding fenced_code from checks
from zerver.lib.bugdown.fenced_code import FENCE_RE
from zerver.lib.camo import get_camo_url
from zerver.lib.timeout import timeout, TimeoutExpired
from zerver.lib.cache import cache_with_key, cache_get_many, cache_set_many
import zerver.lib.alert_words as alert_words
import zerver.lib.mention as mention
import six
from six.moves import range
from six import text_type

# Format version of the bugdown rendering; stored along with rendered
# messages so that we can efficiently determine what needs to be re-rendered
version = 1

_T = TypeVar('_T')
# We need to avoid this running at runtime, but mypy will see this.
# The problem is that under python 2, Element isn't exactly a type,
# which means that at runtime Union causes this to blow up.
if False:
    # mypy requires the Optional to be inside Union
    ElementStringNone = Union[Element, Optional[text_type]]

def list_of_tlds():
    # type: () -> List[str]
    # HACK we manually blacklist .py
    blacklist = ['PY\n', ]

    # tlds-alpha-by-domain.txt comes from http://data.iana.org/TLD/tlds-alpha-by-domain.txt
    tlds_file = os.path.join(os.path.dirname(__file__), 'tlds-alpha-by-domain.txt')
    tlds = [tld.lower().strip() for tld in open(tlds_file, 'r')
            if tld not in blacklist and not tld[0].startswith('#')]
    tlds.sort(key=len, reverse=True)
    return tlds

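# Walk an ElementTree rooted at `root` iteratively (depth-first), applying
# `processor` to each descendant element and collecting its non-None results.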
def walk_tree(root, processor, stop_after_first=False):
    # type: (Element, Callable[[Element], Optional[_T]], bool) -> List[_T]
    results = []
    stack = [root]

    while stack:
        currElement = stack.pop()
        for child in currElement.getchildren():
            if child.getchildren():
                stack.append(child)

            result = processor(child)
            if result is not None:
                results.append(result)
                if stop_after_first:
                    return results

    return results

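# Appends an inline preview to `root` for the given URL: a <div> wrapping an
# <a> around an <img>, plus title/description sub-divs for Dropbox-style
# "message_inline_ref" previews.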
# height is not actually used
def add_a(root, url, link, height="", title=None, desc=None,
          class_attr="message_inline_image"):
    # type: (Element, text_type, text_type, text_type, Optional[text_type], Optional[text_type], text_type) -> None
    title = title if title is not None else url_filename(link)
    title = title if title else ""
    desc = desc if desc is not None else ""

    div = markdown.util.etree.SubElement(root, "div")
    div.set("class", class_attr)
    a = markdown.util.etree.SubElement(div, "a")
    a.set("href", link)
    a.set("target", "_blank")
    a.set("title", title)
    img = markdown.util.etree.SubElement(a, "img")
    img.set("src", url)
    if class_attr == "message_inline_ref":
        summary_div = markdown.util.etree.SubElement(div, "div")
        title_div = markdown.util.etree.SubElement(summary_div, "div")
        title_div.set("class", "message_inline_image_title")
        title_div.text = title
        desc_div = markdown.util.etree.SubElement(summary_div, "desc")
        desc_div.set("class", "message_inline_image_desc")

@cache_with_key(lambda tweet_id: tweet_id, cache_name="database", with_statsd_key="tweet_data")
def fetch_tweet_data(tweet_id):
    # type: (str) -> Optional[Dict[str, Any]]
    if settings.TEST_SUITE:
        from . import testing_mocks
        res = testing_mocks.twitter(tweet_id)
    else:
        creds = {
            'consumer_key': settings.TWITTER_CONSUMER_KEY,
            'consumer_secret': settings.TWITTER_CONSUMER_SECRET,
            'access_token_key': settings.TWITTER_ACCESS_TOKEN_KEY,
            'access_token_secret': settings.TWITTER_ACCESS_TOKEN_SECRET,
        }
        if not all(creds.values()):
            return None

        try:
            api = twitter.Api(**creds)
            # Sometimes Twitter hangs on responses. Timing out here
            # will cause the Tweet to go through as-is with no inline
            # preview, rather than having the message be rejected
            # entirely. This timeout needs to be less than our overall
            # formatting timeout.
            tweet = timeout(3, api.GetStatus, tweet_id)
            res = tweet.AsDict()
            res['media'] = tweet.media # AsDict does not include media
        except AttributeError:
            logging.error('Unable to load twitter api, you may have the wrong '
                          'library installed, see https://github.com/zulip/zulip/issues/86')
            return None
        except TimeoutExpired as e:
            # We'd like to try again later and not cache the bad result,
            # so we need to re-raise the exception (just as though
            # we were being rate-limited)
            raise
        except twitter.TwitterError as e:
            t = e.args[0]
            if len(t) == 1 and ('code' in t[0]) and (t[0]['code'] == 34):
                # Code 34 means that the message doesn't exist; return
                # None so that we will cache the error
                return None
            elif len(t) == 1 and ('code' in t[0]) and (t[0]['code'] == 88 or
                                                       t[0]['code'] == 130):
                # Code 88 means that we were rate-limited and 130
                # means Twitter is having capacity issues; either way
                # just raise the error so we don't cache None and will
                # try again later.
                raise
            else:
                # It's not clear what to do in cases of other errors,
                # but for now it seems reasonable to log at error
                # level (so that we get notified), but then cache the
                # failure to proceed with our usual work
                logging.error(traceback.format_exc())
                return None
    return res

HEAD_START_RE = re.compile('^head[ >]')
HEAD_END_RE = re.compile('^/head[ >]')
META_START_RE = re.compile('^meta[ >]')
META_END_RE = re.compile('^/meta[ >]')

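# Fetch a URL and extract its Open Graph image/title/description by scanning
# just the <head> for <meta> tags and parsing that fragment with etree.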
def fetch_open_graph_image(url):
    # type: (text_type) -> Optional[Dict[text_type, Any]]
    in_head = False
    # HTML will auto-close meta tags; when we start the next tag, add a
    # closing tag if it has not been closed yet.
    last_closed = True
    head = []

    # TODO: What if response content is huge? Should we get headers first?
    try:
        content = requests.get(url, timeout=1).content
    except:
        return None

    # Extract the head and meta tags.
    # All meta tags are self-closing, have no children, or are closed
    # automatically.
    for part in content.split('<'):
        if not in_head and HEAD_START_RE.match(part):
            # Started the head node; output it to have a document root
            in_head = True
            head.append('<head>')
        elif in_head and HEAD_END_RE.match(part):
            # Found the end of the head; close any remaining tag, then stop
            # processing
            in_head = False
            if not last_closed:
                last_closed = True
                head.append('</meta>')
            head.append('</head>')
            break

        elif in_head and META_START_RE.match(part):
            # Found a meta node; copy it
            if not last_closed:
                head.append('</meta>')
                last_closed = True
            head.append('<')
            head.append(part)
            if '/>' not in part:
                last_closed = False

        elif in_head and META_END_RE.match(part):
            # End of a meta node; just copy it to close the tag
            head.append('<')
            head.append(part)
            last_closed = True

    try:
        doc = etree.fromstring(''.join(head))
    except etree.ParseError:
        return None
    og_image = doc.find('meta[@property="og:image"]')
    og_title = doc.find('meta[@property="og:title"]')
    og_desc = doc.find('meta[@property="og:description"]')
    title = None
    desc = None
    if og_image is not None:
        image = og_image.get('content')
    else:
        return None
    if og_title is not None:
        title = og_title.get('content')
    if og_desc is not None:
        desc = og_desc.get('content')
    return {'image': image, 'title': title, 'desc': desc}

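# Returns the numeric tweet id if `url` points at a tweet on twitter.com
# (e.g. https://twitter.com/user/status/1231241234), and False otherwise.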
def get_tweet_id(url):
    # type: (text_type) -> Union[bool, text_type]
    parsed_url = urllib.parse.urlparse(url)
    if not (parsed_url.netloc == 'twitter.com' or parsed_url.netloc.endswith('.twitter.com')):
        return False  # TODO: probably should return None instead and change return type to Optional[str]
    to_match = parsed_url.path
    # In old-style twitter.com/#!/wdaher/status/1231241234-style URLs, we need to look at the fragment instead
    if parsed_url.path == '/' and len(parsed_url.fragment) > 5:
        to_match = parsed_url.fragment

    tweet_id_match = re.match(r'^!?/.*?/status(es)?/(?P<tweetid>\d{10,18})(/photo/[0-9])?/?$', to_match)
    if not tweet_id_match:
        return False  # TODO: probably should return None instead and change return type to Optional[str]
    return tweet_id_match.group("tweetid")

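# Rewrites insecure http:// <img> sources in the rendered tree to go through
# the camo proxy (via get_camo_url); images already served from our own site
# are left alone.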
class InlineHttpsProcessor(markdown.treeprocessors.Treeprocessor):
    def run(self, root):
        # type: (Element) -> None
        # Get all URLs from the blob
        found_imgs = walk_tree(root, lambda e: e if e.tag == "img" else None)
        for img in found_imgs:
            url = img.get("src")
            if not url.startswith("http://"):
                # Don't rewrite images on our own site (e.g. emoji).
                continue
            img.set("src", get_camo_url(url))

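# Tree processor that scans a rendered message for "interesting" links
# (Dropbox shares, direct image links, tweets, YouTube videos) and appends
# inline previews for them to the end of the message.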
class InlineInterestingLinkProcessor(markdown.treeprocessors.Treeprocessor):
    TWITTER_MAX_IMAGE_HEIGHT = 400
    TWITTER_MAX_TO_PREVIEW = 3

    def __init__(self, md, bugdown):
        # type: (markdown.Markdown, Bugdown) -> None
        # Passing in bugdown for access to config to check if realm is zulip.com
        self.bugdown = bugdown
        markdown.treeprocessors.Treeprocessor.__init__(self, md)

    def is_image(self, url):
        # type: (text_type) -> bool
        if not settings.INLINE_IMAGE_PREVIEW:
            return False
        parsed_url = urllib.parse.urlparse(url)
        # List from http://support.google.com/chromeos/bin/answer.py?hl=en&answer=183093
        for ext in [".bmp", ".gif", ".jpg", ".jpeg", ".png", ".webp"]:
            if parsed_url.path.lower().endswith(ext):
                return True
        return False

    def dropbox_image(self, url):
        # type: (text_type) -> Optional[Dict]
        # TODO: specify details of returned Dict
        parsed_url = urllib.parse.urlparse(url)
        if (parsed_url.netloc == 'dropbox.com' or parsed_url.netloc.endswith('.dropbox.com')):
            is_album = parsed_url.path.startswith('/sc/') or parsed_url.path.startswith('/photos/')
            # Only allow preview Dropbox shared links
            if not (parsed_url.path.startswith('/s/') or
                    parsed_url.path.startswith('/sh/') or
                    is_album):
                return None

            # Try to retrieve open graph protocol info for a preview
            # This might be redundant right now for shared links for images.
            # However, we might want to make use of title and description
            # in the future. If the actual image is too big, we might also
            # want to use the open graph image.
            image_info = fetch_open_graph_image(url)

            is_image = is_album or self.is_image(url)

            # If it is from an album or not an actual image file,
            # just use open graph image.
            if is_album or not is_image:
                # Failed to follow link to find an image preview so
                # use placeholder image and guess filename
                if image_info is None:
                    return None

                image_info["is_image"] = is_image
                return image_info

            # Otherwise, try to retrieve the actual image.
            # This is because open graph image from Dropbox may have padding
            # and gifs do not work.
            # TODO: What if image is huge? Should we get headers first?
            if image_info is None:
                image_info = dict()
            image_info['is_image'] = True
            parsed_url_list = list(parsed_url)
            parsed_url_list[4] = "dl=1" # Replaces query
            image_info["image"] = urllib.parse.urlunparse(parsed_url_list)

            return image_info
        return None

    def youtube_image(self, url):
        # type: (text_type) -> Optional[text_type]
        if not settings.INLINE_IMAGE_PREVIEW:
            return None
        # Youtube video id extraction regular expression from http://pastebin.com/KyKAFv1s
        # If it matches, match.group(2) is the video id.
        youtube_re = r'^((?:https?://)?(?:youtu\.be/|(?:\w+\.)?youtube(?:-nocookie)?\.com/)(?:(?:(?:v|embed)/)|(?:(?:watch(?:_popup)?(?:\.php)?)?(?:\?|#!?)(?:.+&)?v=)))?([0-9A-Za-z_-]+)(?(1).+)?$'
        match = re.match(youtube_re, url)
        if match is None:
            return None
        return "https://i.ytimg.com/vi/%s/default.jpg" % (match.group(2),)

    def twitter_text(self, text, urls, user_mentions, media):
        # type: (text_type, Dict[text_type, text_type], List[Dict[text_type, Any]], List[Dict[text_type, Any]]) -> Element
        """
        Use data from the twitter API to turn links, mentions and media into A
        tags.

        This works by using the urls, user_mentions and media data from the
        twitter API.

        The first step is finding the locations of the URLs, mentions and media
        in the text. For each match we build a dictionary with the start
        location, end location, the URL to link to, and the text to show in the
        link.

        Next we sort the matches by start location. And for each we add the
        text from the end of the last link to the start of the current link to
        the output. The text needs to be added to the text attribute of the
        first node (the P tag) or the tail of the last link created.

        Finally we add any remaining text to the last node.
        """

        to_linkify = [] # type: List[Dict[text_type, Any]]
        # Build dicts for URLs
        for short_url, full_url in urls.items():
            for match in re.finditer(re.escape(short_url), text, re.IGNORECASE):
                to_linkify.append({
                    'start': match.start(),
                    'end': match.end(),
                    'url': short_url,
                    'text': full_url,
                })
        # Build dicts for mentions
        for user_mention in user_mentions:
            screen_name = user_mention['screen_name']
            mention_string = '@' + screen_name
            for match in re.finditer(re.escape(mention_string), text, re.IGNORECASE):
                to_linkify.append({
                    'start': match.start(),
                    'end': match.end(),
                    'url': 'https://twitter.com/' + urllib.parse.quote(screen_name),
                    'text': mention_string,
                })
        # Build dicts for media
        for media_item in media:
            short_url = media_item['url']
            expanded_url = media_item['expanded_url']
            for match in re.finditer(re.escape(short_url), text, re.IGNORECASE):
                to_linkify.append({
                    'start': match.start(),
                    'end': match.end(),
                    'url': short_url,
                    'text': expanded_url,
                })

        to_linkify.sort(key=lambda x: x['start'])
        p = current_node = markdown.util.etree.Element('p')

        def set_text(text):
            # type: (text_type) -> None
            """
            Helper to set the text or the tail of the current_node
            """
            if current_node == p:
                current_node.text = text
            else:
                current_node.tail = text

        current_index = 0
        for link in to_linkify:
            # The text we want to link starts inside already-linked text; skip it
            if link['start'] < current_index:
                continue
            # Add text from the end of last link to the start of the current
            # link
            set_text(text[current_index:link['start']])
            current_index = link['end']
            current_node = a = url_to_a(link['url'], link['text'])
            p.append(a)

        # Add any unused text
        set_text(text[current_index:])
        return p

    def twitter_link(self, url):
        # type: (text_type) -> Optional[Element]
        tweet_id = get_tweet_id(url)

        if not tweet_id:
            return None

        try:
            res = fetch_tweet_data(tweet_id)
            if res is None:
                return None
            user = res['user']
            tweet = markdown.util.etree.Element("div")
            tweet.set("class", "twitter-tweet")
            img_a = markdown.util.etree.SubElement(tweet, 'a')
            img_a.set("href", url)
            img_a.set("target", "_blank")
            profile_img = markdown.util.etree.SubElement(img_a, 'img')
            profile_img.set('class', 'twitter-avatar')
            # For some reason, for, e.g. tweet 285072525413724161,
            # python-twitter does not give us a
            # profile_image_url_https, but instead puts that URL in
            # profile_image_url. So use _https if available, but fall
            # back gracefully.
            image_url = user.get('profile_image_url_https', user['profile_image_url'])
            profile_img.set('src', image_url)

            ## TODO: unescape is an internal function, so we should
            ## use something else if we can find it
            text = six.moves.html_parser.HTMLParser().unescape(res['text'])
            urls = res.get('urls', {})
            user_mentions = res.get('user_mentions', [])
            media = res.get('media', [])
            p = self.twitter_text(text, urls, user_mentions, media)
            tweet.append(p)

            span = markdown.util.etree.SubElement(tweet, 'span')
            span.text = "- %s (@%s)" % (user['name'], user['screen_name'])

            # Add image previews
            for media_item in media:
                # Only photos have a preview image
                if media_item['type'] != 'photo':
                    continue

                # Find the image size that is smaller than
                # TWITTER_MAX_IMAGE_HEIGHT px tall or the smallest
                size_name_tuples = list(media_item['sizes'].items())
                size_name_tuples.sort(reverse=True,
                                      key=lambda x: x[1]['h'])
                for size_name, size in size_name_tuples:
                    if size['h'] < self.TWITTER_MAX_IMAGE_HEIGHT:
                        break

                media_url = '%s:%s' % (media_item['media_url_https'], size_name)
                img_div = markdown.util.etree.SubElement(tweet, 'div')
                img_div.set('class', 'twitter-image')
                img_a = markdown.util.etree.SubElement(img_div, 'a')
                img_a.set('href', media_item['url'])
                img_a.set('target', '_blank')
                img_a.set('title', media_item['url'])
                img = markdown.util.etree.SubElement(img_a, 'img')
                img.set('src', media_url)

            return tweet
        except:
            # We put this in its own try-except because it requires external
            # connectivity. If Twitter flakes out, we don't want to not-render
            # the entire message; we just want to not show the Twitter preview.
            logging.warning(traceback.format_exc())
            return None

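    # Previews are attempted per URL in this order: Dropbox share, direct
    # image link, tweet, then YouTube thumbnail; the first handler that
    # matches wins.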
    def run(self, root):
        # type: (Element) -> None
        # Get all URLs from the blob
        found_urls = walk_tree(root, lambda e: e.get("href") if e.tag == "a" else None)

        # If there are more than 5 URLs in the message, don't do inline previews
        if len(found_urls) == 0 or len(found_urls) > 5:
            return

        rendered_tweet_count = 0

        for url in found_urls:
            dropbox_image = self.dropbox_image(url)
            if dropbox_image is not None:
                class_attr = "message_inline_ref"
                is_image = dropbox_image["is_image"]
                if is_image:
                    class_attr = "message_inline_image"
                    # Not making use of title and description of images
                add_a(root, dropbox_image['image'], url,
                      title=dropbox_image.get('title', ""),
                      desc=dropbox_image.get('desc', ""),
                      class_attr=class_attr)
                continue
            if self.is_image(url):
                add_a(root, url, url)
                continue
            if get_tweet_id(url):
                if rendered_tweet_count >= self.TWITTER_MAX_TO_PREVIEW:
                    # Only render at most TWITTER_MAX_TO_PREVIEW tweets per message
                    continue
                twitter_data = self.twitter_link(url)
                if twitter_data is None:
                    # This link is not actually a tweet known to twitter
                    continue
                rendered_tweet_count += 1
                div = markdown.util.etree.SubElement(root, "div")
                div.set("class", "inline-preview-twitter")
                div.insert(0, twitter_data)
                continue
            youtube = self.youtube_image(url)
            if youtube is not None:
                add_a(root, youtube, url)
                continue

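# Renders !avatar(email) and !gravatar(email) syntax as a small avatar image
# for the given email address.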
class Avatar(markdown.inlinepatterns.Pattern):
    def handleMatch(self, match):
        # type: (Match[text_type]) -> Optional[Element]
        img = markdown.util.etree.Element('img')
        email_address = match.group('email')
        img.set('class', 'message_body_gravatar')
        img.set('src', '/avatar/%s?s=30' % (email_address,))
        img.set('title', email_address)
        img.set('alt', email_address)
        return img

if settings.VOYAGER:
    path_to_emoji = os.path.join(os.path.dirname(__file__), '..', '..', '..',
                                 'prod-static', 'serve', 'third', 'gemoji', 'images', 'emoji', '*.png')
else:
    path_to_emoji = os.path.join(os.path.dirname(__file__), '..', '..', '..',
                                 # This should be the root
                                 'static', 'third', 'gemoji', 'images', 'emoji', '*.png')

emoji_list = [os.path.splitext(os.path.basename(fn))[0] for fn in glob.glob(path_to_emoji)]

def make_emoji(emoji_name, src, display_string):
    # type: (text_type, text_type, text_type) -> Element
    elt = markdown.util.etree.Element('img')
    elt.set('src', src)
    elt.set('class', 'emoji')
    elt.set("alt", display_string)
    elt.set("title", display_string)
    return elt

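# Inline pattern for :emoji_name: syntax; realm-specific emoji take
# precedence over the built-in gemoji set.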
class Emoji(markdown.inlinepatterns.Pattern):
    def handleMatch(self, match):
        # type: (Match[text_type]) -> Optional[Element]
        orig_syntax = match.group("syntax")
        name = orig_syntax[1:-1]

        realm_emoji = {} # type: Dict[str, Dict[str, str]]
        if db_data is not None:
            realm_emoji = db_data['emoji']

        if current_message and name in realm_emoji:
            return make_emoji(name, realm_emoji[name]['display_url'], orig_syntax)
        elif name in emoji_list:
            src = '/static/third/gemoji/images/emoji/%s.png' % (name,)
            return make_emoji(name, src, orig_syntax)
        else:
            return None

class StreamSubscribeButton(markdown.inlinepatterns.Pattern):
    # This markdown extension has required javascript in
    # static/js/custom_markdown.js
    def handleMatch(self, match):
        # type: (Match[text_type]) -> Element
        stream_name = match.group('stream_name')
        stream_name = stream_name.replace('\\)', ')').replace('\\\\', '\\')

        span = markdown.util.etree.Element('span')
        span.set('class', 'inline-subscribe')
        span.set('data-stream-name', stream_name)

        button = markdown.util.etree.SubElement(span, 'button')
        button.text = 'Subscribe to ' + stream_name
        button.set('class', 'inline-subscribe-button btn')

        error = markdown.util.etree.SubElement(span, 'span')
        error.set('class', 'inline-subscribe-error')

        return span

class ModalLink(markdown.inlinepatterns.Pattern):
    """
    A pattern that allows including in-app modal links in messages.
    """
    def handleMatch(self, match):
        # type: (Match[text_type]) -> Element
        relative_url = match.group('relative_url')
        text = match.group('text')

        a_tag = markdown.util.etree.Element("a")
        a_tag.set("href", relative_url)
        a_tag.set("title", relative_url)
        a_tag.set("data-toggle", "modal")
        a_tag.text = text

        return a_tag

upload_re = re.compile(ur"^(?:https://%s.s3.amazonaws.com|/user_uploads/\d+)/[^/]*/([^/]*)$" % (settings.S3_BUCKET,))
def url_filename(url):
    # type: (text_type) -> text_type
    """Extract the filename if a URL is an uploaded file, or return the original URL"""
    match = upload_re.match(url)
    if match:
        return match.group(1)
    else:
        return url

def fixup_link(link, target_blank=True):
    # type: (markdown.util.etree.Element, bool) -> None
    """Set certain attributes we want on every link."""
    if target_blank:
        link.set('target', '_blank')
    link.set('title', url_filename(link.get('href')))

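# For example, "example.com/foo" is rewritten to "http://example.com/foo",
# while "javascript:alert(1)" is rejected (returns None) since its scheme is
# not in the whitelist below.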
def sanitize_url(url):
    # type: (text_type) -> Optional[text_type]
    """
    Sanitize a url against xss attacks.
    See the docstring on markdown.inlinepatterns.LinkPattern.sanitize_url.
    """
    try:
        parts = urllib.parse.urlparse(url.replace(' ', '%20'))
        scheme, netloc, path, params, query, fragment = parts
    except ValueError:
        # Bad url - so bad it couldn't be parsed.
        return ''

    # If there is no scheme or netloc and there is a '@' in the path,
    # treat it as a mailto: and set the appropriate scheme
    if scheme == '' and netloc == '' and '@' in path:
        scheme = 'mailto'
    elif scheme == '' and netloc == '' and len(path) > 0 and path[0] == '/':
        # Allow domain-relative links
        return urllib.parse.urlunparse(('', '', path, params, query, fragment))
    elif (scheme, netloc, path, params, query) == ('', '', '', '', '') and len(fragment) > 0:
        # Allow fragment links
        return urllib.parse.urlunparse(('', '', '', '', '', fragment))

    # Zulip modification: If scheme is not specified, assume http://
    # We re-enter sanitize_url because netloc etc. need to be re-parsed.
    if not scheme:
        return sanitize_url('http://' + url)

    locless_schemes = ['mailto', 'news']
    if netloc == '' and scheme not in locless_schemes:
        # This fails regardless of anything else.
        # Return immediately to save additional processing
        return None

    # Upstream code will accept a URL like javascript://foo because it
    # appears to have a netloc. Additionally there are plenty of other
    # schemes that do weird things like launch external programs. To be
    # on the safe side, we whitelist the scheme.
    if scheme not in ('http', 'https', 'ftp', 'mailto'):
        return None

    # Upstream code scans path, parameters, and query for colon characters
    # because
    #
    # some aliases [for javascript:] will appear to urllib.parse to have
    # no scheme. On top of that relative links (i.e.: "foo/bar.html")
    # have no scheme.
    #
    # We already converted an empty scheme to http:// above, so we skip
    # the colon check, which would also forbid a lot of legitimate URLs.

    # Url passes all tests. Return url as-is.
    return urllib.parse.urlunparse((scheme, netloc, path, params, query, fragment))

def url_to_a(url, text=None):
    # type: (text_type, Optional[text_type]) -> Union[Element, text_type]
    a = markdown.util.etree.Element('a')

    href = sanitize_url(url)
    if href is None:
        # Rejected by sanitize_url; render it as plain text.
        return url
    if text is None:
        text = markdown.util.AtomicString(url)

    a.set('href', href)
    a.text = text
    fixup_link(a, 'mailto:' not in href[:7])
    return a

class AutoLink(markdown.inlinepatterns.Pattern):
    def __init__(self, pattern):
        # type: (text_type) -> None
        markdown.inlinepatterns.Pattern.__init__(self, ' ')

        # HACK: we just had python-markdown compile an empty regex.
        # Now replace with the real regex compiled with the flags we want.

        self.pattern = pattern
        self.compiled_re = re.compile("^(.*?)%s(.*?)$" % pattern,
                                      re.DOTALL | re.UNICODE | re.VERBOSE)

    def handleMatch(self, match):
        # type: (Match[text_type]) -> ElementStringNone
        url = match.group('url')
        return url_to_a(url)

class UListProcessor(markdown.blockprocessors.OListProcessor):
    """ Process unordered list blocks.

        Based on markdown.blockprocessors.UListProcessor, but does not accept
        '+' or '-' as a bullet character."""

    TAG = 'ul'
    RE = re.compile(r'^[ ]{0,3}[*][ ]+(.*)')

class BugdownUListPreprocessor(markdown.preprocessors.Preprocessor):
    """ Allows unordered list blocks that come directly after a
        paragraph to be rendered as an unordered list

        Detects paragraphs that have a matching list item that comes
        directly after a line of text, and inserts a newline between
        the two to satisfy Markdown"""

    LI_RE = re.compile(r'^[ ]{0,3}[*][ ]+(.*)', re.MULTILINE)
    HANGING_ULIST_RE = re.compile(r'^.+\n([ ]{0,3}[*][ ]+.*)', re.MULTILINE)

    def run(self, lines):
        # type: (List[str]) -> List[str]
        """ Insert a newline between a paragraph and ulist if missing """
        inserts = 0
        fence = None
        copy = lines[:]
        for i in range(len(lines) - 1):
            # Ignore anything that is inside a fenced code block
            m = FENCE_RE.match(lines[i])
            if not fence and m:
                fence = m.group('fence')
            elif fence and m and fence == m.group('fence'):
                fence = None

            # If we're not in a fenced block and we detect an upcoming list
            # hanging off a paragraph, add a newline
            if not fence and lines[i] and \
                self.LI_RE.match(lines[i+1]) and not self.LI_RE.match(lines[i]):
                copy.insert(i+inserts+1, '')
                inserts += 1
        return copy

# Based on markdown.inlinepatterns.LinkPattern
class LinkPattern(markdown.inlinepatterns.Pattern):
    """ Return a link element from the given match. """
    def handleMatch(self, m):
        # type: (Match[text_type]) -> Optional[Element]
        href = m.group(9)
        if not href:
            return None

        if href[0] == "<":
            href = href[1:-1]
        href = sanitize_url(self.unescape(href.strip()))
        if href is None:
            return None

        el = markdown.util.etree.Element('a')
        el.text = m.group(2)
        el.set('href', href)
        fixup_link(el, target_blank=(href[:1] != '#'))
        return el

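# Realm filters are (regex, format string) pairs; for example, the filter
# ("#(?P<id>[0-9]{2,8})", "https://trac.example.com/ticket/%(id)s") would turn
# "#123" in a message into a link to that ticket. prepare_realm_pattern wraps
# the regex so it only fires on standalone "words".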
def prepare_realm_pattern(source):
    # type: (text_type) -> text_type
    """ Augment a realm filter so it only matches after start-of-string,
    whitespace, or opening delimiters, won't match if there are word
    characters directly after, and saves what was matched as "name". """
    return r"""(?<![^\s'"\(,:<])(?P<name>""" + source + ')(?!\w)'

# Given a regular expression pattern, linkifies groups that match it
# using the provided format string to construct the URL.
class RealmFilterPattern(markdown.inlinepatterns.Pattern):
    """ Applies a given realm filter to the input """
    def __init__(self, source_pattern, format_string, markdown_instance=None):
        # type: (str, str, Optional[markdown.Markdown]) -> None
        self.pattern = prepare_realm_pattern(source_pattern)
        self.format_string = format_string
        markdown.inlinepatterns.Pattern.__init__(self, self.pattern, markdown_instance)

    def handleMatch(self, m):
        # type: (Match[text_type]) -> Union[Element, text_type]
        return url_to_a(self.format_string % m.groupdict(),
                        m.group("name"))

class UserMentionPattern(markdown.inlinepatterns.Pattern):
    def find_user_for_mention(self, name):
        # type: (str) -> Tuple[bool, Optional[Dict[str, Any]]]
        if db_data is None:
            return (False, None)

        if mention.user_mention_matches_wildcard(name):
            return (True, None)

        user = db_data['full_names'].get(name.lower(), None)
        if user is None:
            user = db_data['short_names'].get(name.lower(), None)

        return (False, user)

    def handleMatch(self, m):
        # type: (Match[text_type]) -> Optional[Element]
        name = m.group(2) or m.group(3)

        if current_message:
            wildcard, user = self.find_user_for_mention(name)

            if wildcard:
                current_message.mentions_wildcard = True
                email = "*"
            elif user:
                current_message.mentions_user_ids.add(user['id'])
                name = user['full_name']
                email = user['email']
            else:
                # Don't highlight @mentions that don't refer to a valid user
                return None

            el = markdown.util.etree.Element("span")
            el.set('class', 'user-mention')
            el.set('data-user-email', email)
            el.text = "@%s" % (name,)
            return el

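# Preprocessor that records which users' alert words appear in the message
# (on current_message.user_ids_with_alert_words); it leaves the text itself
# unchanged.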
class AlertWordsNotificationProcessor(markdown.preprocessors.Preprocessor):
    def run(self, lines):
        # type: (Iterable[str]) -> Iterable[str]
        if current_message and db_data is not None:
            # We check for a user's custom notifications here, as we want
            # to check for plaintext words that depend on the recipient.
            realm_words = db_data['realm_alert_words']
            content = '\n'.join(lines).lower()

            allowed_before_punctuation = "|".join([r'\s', '^', r'[\(\".,\';\[\*`>]'])
            allowed_after_punctuation = "|".join([r'\s', '$', r'[\)\"\?:.,\';\]!\*`]'])

            for user_id, words in six.iteritems(realm_words):
                for word in words:
                    escaped = re.escape(word.lower())
                    match_re = re.compile(r'(?:%s)%s(?:%s)' %
                                          (allowed_before_punctuation,
                                           escaped,
                                           allowed_after_punctuation))
                    if re.search(match_re, content):
                        current_message.user_ids_with_alert_words.add(user_id)

        return lines

# This prevents realm_filters from running on the content of a
# Markdown link, breaking up the link. This is a monkey-patch, but it
# might be worth sending a version of this change upstream.
class AtomicLinkPattern(LinkPattern):
    def handleMatch(self, m):
        # type: (Match[text_type]) -> Optional[Element]
        ret = LinkPattern.handleMatch(self, m)
        if ret is None:
            return None
        if not isinstance(ret, six.string_types):
            ret.text = markdown.util.AtomicString(ret.text)
        return ret

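# The Bugdown extension wires everything above into python-markdown: it strips
# the stock patterns Zulip does not want, registers the Zulip-specific ones,
# and builds the autolink regex from the TLD list.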
class Bugdown(markdown.Extension):
    def extendMarkdown(self, md, md_globals):
        # type: (markdown.Markdown, Dict[str, Any]) -> None
        del md.preprocessors['reference']

        for k in ('image_link', 'image_reference', 'automail',
                  'autolink', 'link', 'reference', 'short_reference',
                  'escape', 'strong_em', 'emphasis', 'emphasis2',
                  'linebreak', 'strong'):
            del md.inlinePatterns[k]
        try:
            # linebreak2 was removed upstream in version 3.2.1, so
            # don't throw an error if it is not there
            del md.inlinePatterns['linebreak2']
        except Exception:
            pass

        md.preprocessors.add("custom_text_notifications", AlertWordsNotificationProcessor(md), "_end")

        # Custom bold syntax: **foo** but not __foo__
        md.inlinePatterns.add('strong',
                              markdown.inlinepatterns.SimpleTagPattern(r'(\*\*)([^\n]+?)\2', 'strong'),
                              '>not_strong')

        for k in ('hashheader', 'setextheader', 'olist', 'ulist'):
            del md.parser.blockprocessors[k]

        md.parser.blockprocessors.add('ulist', UListProcessor(md.parser), '>hr')

        # Note that !gravatar syntax should be deprecated long term.
        md.inlinePatterns.add('avatar', Avatar(r'!avatar\((?P<email>[^)]*)\)'), '_begin')
        md.inlinePatterns.add('gravatar', Avatar(r'!gravatar\((?P<email>[^)]*)\)'), '_begin')

        md.inlinePatterns.add('stream_subscribe_button', StreamSubscribeButton(r'!_stream_subscribe_button\((?P<stream_name>(?:[^)\\]|\\\)|\\)*)\)'), '_begin')
        md.inlinePatterns.add(
            'modal_link',
            ModalLink(r'!modal_link\((?P<relative_url>[^)]*), (?P<text>[^)]*)\)'),
            '_begin')
        md.inlinePatterns.add('usermention', UserMentionPattern(mention.find_mentions), '>backtick')
        md.inlinePatterns.add('emoji', Emoji(r'(?<!\w)(?P<syntax>:[^:\s]+:)(?!\w)'), '_end')
        md.inlinePatterns.add('link', AtomicLinkPattern(markdown.inlinepatterns.LINK_RE, md), '>backtick')

        for (pattern, format_string) in self.getConfig("realm_filters"):
            md.inlinePatterns.add('realm_filters/%s' % (pattern,),
                                  RealmFilterPattern(pattern, format_string), '>link')

        # A link starts at a word boundary, and ends at space, punctuation, or end-of-input.
        #
        # We detect a url either by the `https?://` or by building around the TLD.

        # In lieu of having a recursive regex (which python doesn't support) to match
        # arbitrary numbers of nested matching parenthesis, we manually build a regexp that
        # can match up to six levels of nesting.
        # The inner_paren_contents chunk matches the innermost non-parenthesis-holding text,
        # and the paren_group matches text with, optionally, a matching set of parens
        inner_paren_contents = r"[^\s()\"]*"
        paren_group = r"""
                    [^\s()\"]*?            # Containing characters that won't end the URL
                    (?: \( %s \)           # and more characters in matched parens
                        [^\s()\"]*?        # followed by more characters
                    )*                     # zero-or-more sets of paired parens
                   """
        nested_paren_chunk = paren_group
        for i in range(6):
            nested_paren_chunk = nested_paren_chunk % (paren_group,)
        nested_paren_chunk = nested_paren_chunk % (inner_paren_contents,)
        tlds = '|'.join(list_of_tlds())
        link_regex = r"""
            (?<![^\s'"\(,:<])    # Start after whitespace or specified chars
                                 # (Double-negative lookbehind to allow start-of-string)
            (?P<url>             # Main group
                (?:(?:           # Domain part
                    https?://[\w.:@-]+?   # If it has a protocol, anything goes.
                   |(?:                   # Or, if not, be more strict to avoid false-positives
                        (?:[\w-]+\.)+     # One or more domain components, separated by dots
                        (?:%s)            # TLDs (filled in via format from tlds-alpha-by-domain.txt)
                    )
                )
                (?:/             # A path, beginning with /
                    %s           # zero-to-6 sets of paired parens
                )?)              # Path is optional
                | (?:[\w.-]+\@[\w.-]+\.[\w]+) # Email is separate, since it can't have a path
            )
            (?=                  # URL must be followed by (not included in group)
                [!:;\?\),\.\'\"\>]*         # Optional punctuation characters
                (?:\Z|\s)        # followed by whitespace or end of string
            )
            """ % (tlds, nested_paren_chunk)
        md.inlinePatterns.add('autolink', AutoLink(link_regex), '>link')

        md.preprocessors.add('hanging_ulists',
                             BugdownUListPreprocessor(md),
                             "_begin")

        md.treeprocessors.add("inline_interesting_links", InlineInterestingLinkProcessor(md, self), "_end")

        if settings.CAMO_URI:
            md.treeprocessors.add("rewrite_to_https", InlineHttpsProcessor(md), "_end")

        if self.getConfig("realm") == "mit.edu/zephyr_mirror":
            # Disable almost all inline patterns for mit.edu users' traffic that is mirrored
            # Note that inline_interesting_links is a treeprocessor and thus is not removed
            for k in md.inlinePatterns.keys():
                if k not in ["autolink"]:
                    del md.inlinePatterns[k]
            for k in md.treeprocessors.keys():
                if k not in ["inline_interesting_links", "inline", "rewrite_to_https"]:
                    del md.treeprocessors[k]
            for k in md.preprocessors.keys():
                if k not in ["custom_text_notifications"]:
                    del md.preprocessors[k]
            for k in md.parser.blockprocessors.keys():
                if k not in ["paragraph"]:
                    del md.parser.blockprocessors[k]

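# One Markdown engine is kept per realm domain (plus a "default" engine), so
# that realm filters and realm-specific config are baked in at construction.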
md_engines = {}
realm_filter_data = {} # type: Dict[text_type, List[Tuple[text_type, text_type]]]

def make_md_engine(key, opts):
    # type: (text_type, Dict[str, Any]) -> None
    md_engines[key] = markdown.Markdown(
        safe_mode     = 'escape',
        output_format = 'html',
        extensions    = ['nl2br',
                         'tables',
                         codehilite.makeExtension(configs=[
                             ('force_linenos', False),
                             ('guess_lang', False)]),
                         fenced_code.makeExtension(),
                         Bugdown(opts)])

def subject_links(domain, subject):
    # type: (text_type, text_type) -> List[text_type]
    from zerver.models import get_realm, RealmFilter, realm_filters_for_domain
    matches = [] # type: List[text_type]

    realm_filters = realm_filters_for_domain(domain)

    for realm_filter in realm_filters:
        pattern = prepare_realm_pattern(realm_filter[0])
        for m in re.finditer(pattern, subject):
            matches += [realm_filter[1] % m.groupdict()]
    return matches

def make_realm_filters(domain, filters):
    # type: (text_type, List[Tuple[text_type, text_type]]) -> None
    global md_engines, realm_filter_data
    if domain in md_engines:
        del md_engines[domain]
    realm_filter_data[domain] = filters

    # Because of how the Markdown config API works, this has a confusingly
    # large number of layers of dicts/arrays :(
    make_md_engine(domain, {"realm_filters": [filters, "Realm-specific filters for %s" % (domain,)],
                            "realm": [domain, "Realm name"]})

def maybe_update_realm_filters(domain):
    # type: (Optional[text_type]) -> None
    from zerver.models import realm_filters_for_domain, all_realm_filters

    # If domain is None, load all filters
    if domain is None:
        all_filters = all_realm_filters()
        all_filters['default'] = []
        for domain, filters in six.iteritems(all_filters):
            make_realm_filters(domain, filters)
        # Hack to ensure that getConfig("realm") is right for mirrored Zephyrs
        make_realm_filters("mit.edu/zephyr_mirror", [])
    else:
        realm_filters = realm_filters_for_domain(domain)
        if domain not in realm_filter_data or realm_filter_data[domain] != realm_filters:
            # Data has changed, re-load filters
            make_realm_filters(domain, realm_filters)

maybe_update_realm_filters(domain=None)

# We want to log Markdown parser failures, but shouldn't log the actual input
# message for privacy reasons. The compromise is to replace all alphanumeric
# characters with 'x'.
#
# We also use repr() to improve reproducibility, and to escape terminal control
# codes, which can do surprisingly nasty things.
_privacy_re = re.compile(r'\w', flags=re.UNICODE)
def _sanitize_for_log(md):
    # type: (text_type) -> text_type
    return repr(_privacy_re.sub('x', md))

# Filters such as UserMentionPattern need a message, but python-markdown
# provides no way to pass extra params through to a pattern. Thus, a global.
current_message = None # type: Any # Should be Message but bugdown doesn't import models.py.

# We avoid doing DB queries in our markdown thread to avoid the overhead of
# opening a new DB connection. These connections tend to live longer than the
# threads themselves, as well.
db_data = None # type: Optional[Dict[text_type, Any]]

def do_convert(md, realm_domain=None, message=None):
    # type: ignore # (text_type, Optional[text_type], Message) -> Optional[text_type]
    """Convert Markdown to HTML, with Zulip-specific settings and hacks."""
    from zerver.models import get_active_user_dicts_in_realm, UserProfile

    if message:
        maybe_update_realm_filters(message.get_realm().domain)

    if realm_domain in md_engines:
        _md_engine = md_engines[realm_domain]
    else:
        _md_engine = md_engines["default"]
    # Reset the parser; otherwise it will get slower over time.
    _md_engine.reset()

    global current_message
    current_message = message

    # Pre-fetch data from the DB that is used in the bugdown thread
    global db_data
    if message:
        realm_users = get_active_user_dicts_in_realm(message.get_realm())

        db_data = {'realm_alert_words': alert_words.alert_words_in_realm(message.get_realm()),
                   'full_names': dict((user['full_name'].lower(), user) for user in realm_users),
                   'short_names': dict((user['short_name'].lower(), user) for user in realm_users),
                   'emoji': message.get_realm().get_emoji()}

    try:
        # Spend at most 5 seconds rendering.
        # Sometimes Python-Markdown is really slow; see
        # https://trac.zulip.net/ticket/345
        return timeout(5, _md_engine.convert, md)
    except:
        from zerver.lib.actions import internal_send_message

        cleaned = _sanitize_for_log(md)

        # Output error to log as well as sending a zulip and email
        logging.getLogger('').error('Exception in Markdown parser: %sInput (sanitized) was: %s'
                                    % (traceback.format_exc(), cleaned))
        subject = "Markdown parser failure on %s" % (platform.node(),)
        if settings.ERROR_BOT is not None:
            internal_send_message(settings.ERROR_BOT, "stream",
                                  "errors", subject, "Markdown parser failed, email sent with details.")
        mail.mail_admins(subject, "Failed message: %s\n\n%s\n\n" % (
                             cleaned, traceback.format_exc()),
                         fail_silently=False)
        return None
    finally:
        current_message = None
        db_data = None

bugdown_time_start = 0.0
bugdown_total_time = 0.0
bugdown_total_requests = 0

def get_bugdown_time():
    # type: () -> float
    return bugdown_total_time

def get_bugdown_requests():
    # type: () -> int
    return bugdown_total_requests

def bugdown_stats_start():
    # type: () -> None
    global bugdown_time_start
    bugdown_time_start = time.time()

def bugdown_stats_finish():
    # type: () -> None
    global bugdown_total_time
    global bugdown_total_requests
    global bugdown_time_start
    bugdown_total_requests += 1
    bugdown_total_time += (time.time() - bugdown_time_start)

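# Example usage (illustrative; `message` is a zerver Message object, or None
# when rendering outside the context of a particular message):
#   rendered_html = convert("check out **this**: https://zulip.org", "example.com", message)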
def convert(md, realm_domain=None, message=None):
    # type: (text_type, Optional[text_type], Optional[Any]) -> Optional[text_type]
    bugdown_stats_start()
    ret = do_convert(md, realm_domain, message)
    bugdown_stats_finish()
    return ret