# Zulip's main markdown implementation. See docs/subsystems/markdown.md for
# detailed documentation on our markdown syntax.

from typing import (Any, Callable, Dict, Iterable, List, NamedTuple,
                    Optional, Set, Tuple, TypeVar, Union, cast)
from mypy_extensions import TypedDict
from typing.re import Match, Pattern

import markdown
import logging
import traceback
import urllib
import re
import regex
import os
import html
import time
import functools
import ujson
import xml.etree.cElementTree as etree
from xml.etree.cElementTree import Element

from collections import deque, defaultdict

import requests

from django.conf import settings
from django.db.models import Q

from markdown.extensions import codehilite, nl2br, tables
from zerver.lib.bugdown import fenced_code
from zerver.lib.bugdown.fenced_code import FENCE_RE
from zerver.lib.camo import get_camo_url
from zerver.lib.emoji import translate_emoticons, emoticon_regex
from zerver.lib.mention import possible_mentions, \
    possible_user_group_mentions, extract_user_group
from zerver.lib.url_encoding import encode_stream
from zerver.lib.thumbnail import user_uploads_or_external
from zerver.lib.timeout import timeout, TimeoutExpired
from zerver.lib.cache import cache_with_key, NotFoundInCache
from zerver.lib.url_preview import preview as link_preview
from zerver.models import (
    all_realm_filters,
    get_active_streams,
    MAX_MESSAGE_LENGTH,
    Message,
    Realm,
    realm_filters_for_realm,
    UserProfile,
    UserGroup,
    UserGroupMembership,
)
import zerver.lib.mention as mention
from zerver.lib.tex import render_tex
from zerver.lib.exceptions import BugdownRenderingException

ReturnT = TypeVar('ReturnT')

def one_time(method: Callable[[], ReturnT]) -> Callable[[], ReturnT]:
    '''
    Use this decorator with extreme caution.
    The function you wrap should have no dependency
    on any arguments (no args, no kwargs) nor should
    it depend on any global state.
    '''
    val = None

    def cache_wrapper() -> ReturnT:
        nonlocal val
        if val is None:
            val = method()
        return val
    return cache_wrapper
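
# Example: get_compiled_stream_link_regex() below is wrapped with @one_time,
# so the stream-link regex is compiled only once per process.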

FullNameInfo = TypedDict('FullNameInfo', {
    'id': int,
    'email': str,
    'full_name': str,
})

DbData = Dict[str, Any]

# Format version of the bugdown rendering; stored along with rendered
# messages so that we can efficiently determine what needs to be re-rendered
version = 1

_T = TypeVar('_T')
ElementStringNone = Union[Element, Optional[str]]

AVATAR_REGEX = r'!avatar\((?P<email>[^)]*)\)'
GRAVATAR_REGEX = r'!gravatar\((?P<email>[^)]*)\)'
EMOJI_REGEX = r'(?P<syntax>:[\w\-\+]+:)'
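
# Helpers for compiling the filter patterns used below: verbose_compile allows
# whitespace and inline comments in the pattern (re.VERBOSE); normal_compile
# does not.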
def verbose_compile(pattern: str) -> Any:
    return re.compile(
        "^(.*?)%s(.*?)$" % pattern,
        re.DOTALL | re.UNICODE | re.VERBOSE
    )

def normal_compile(pattern: str) -> Any:
    return re.compile(
        r"^(.*?)%s(.*)$" % pattern,
        re.DOTALL | re.UNICODE
    )

STREAM_LINK_REGEX = r"""
                     (?<![^\s'"\(,:<])            # Start after whitespace or specified chars
                     \#\*\*                       # and after hash sign followed by double asterisks
                         (?P<stream_name>[^\*]+)  # stream name can contain anything
                     \*\*                         # ends by double asterisks
                    """

@one_time
def get_compiled_stream_link_regex() -> Pattern:
    return verbose_compile(STREAM_LINK_REGEX)

LINK_REGEX = None  # type: Pattern

def get_web_link_regex() -> Pattern:
    # We create this one time, but not at startup. So the
    # first message rendered in any process will have some
    # extra costs. It's roughly 75ms to run this code, so
    # caching the value in LINK_REGEX is super important here.
    global LINK_REGEX
    if LINK_REGEX is not None:
        return LINK_REGEX

    tlds = '|'.join(list_of_tlds())

    # A link starts at a word boundary, and ends at space, punctuation, or end-of-input.
    #
    # We detect a url either by the `https?://` or by building around the TLD.

    # In lieu of having a recursive regex (which python doesn't support) to match
    # arbitrary numbers of nested matching parenthesis, we manually build a regexp that
    # can match up to six levels of nested parentheses.
    # The inner_paren_contents chunk matches the innermost non-parenthesis-holding text,
    # and the paren_group matches text with, optionally, a matching set of parens
    inner_paren_contents = r"[^\s()\"]*"
    paren_group = r"""
                    [^\s()\"]*?            # Containing characters that won't end the URL
                    (?: \( %s \)           # and more characters in matched parens
                        [^\s()\"]*?        # followed by more characters
                    )*                     # zero-or-more sets of paired parens
                   """
    nested_paren_chunk = paren_group
    for i in range(6):
        nested_paren_chunk = nested_paren_chunk % (paren_group,)
    nested_paren_chunk = nested_paren_chunk % (inner_paren_contents,)

    file_links = r"| (?:file://(/[^/ ]*)+/?)" if settings.ENABLE_FILE_LINKS else r""
    REGEX = r"""
        (?<![^\s'"\(,:<])    # Start after whitespace or specified chars
                             # (Double-negative lookbehind to allow start-of-string)
        (?P<url>             # Main group
            (?:(?:           # Domain part
                https?://[\w.:@-]+?    # If it has a protocol, anything goes.
                |(?:                   # Or, if not, be more strict to avoid false-positives
                    (?:[\w-]+\.)+      # One or more domain components, separated by dots
                    (?:%s)             # TLDs (filled in via format from tlds-alpha-by-domain.txt)
                )
            )
            (?:/             # A path, beginning with /
                %s           # zero-to-6 sets of paired parens
            )?)              # Path is optional
            | (?:[\w.-]+\@[\w.-]+\.[\w]+)  # Email is separate, since it can't have a path
            %s               # File path start with file:///, enable by setting ENABLE_FILE_LINKS=True
            | (?:bitcoin:[13][a-km-zA-HJ-NP-Z1-9]{25,34})  # Bitcoin address pattern, see https://mokagio.github.io/tech-journal/2014/11/21/regex-bitcoin.html
        )
        (?=                  # URL must be followed by (not included in group)
            [!:;\?\),\.\'\"\>]*  # Optional punctuation characters
            (?:\Z|\s)            # followed by whitespace or end of string
        )
        """ % (tlds, nested_paren_chunk, file_links)
    LINK_REGEX = verbose_compile(REGEX)
    return LINK_REGEX

def clear_state_for_testing() -> None:
    # The link regex never changes in production, but our tests
    # try out both sides of ENABLE_FILE_LINKS, so we need
    # a way to clear it.
    global LINK_REGEX
    LINK_REGEX = None

bugdown_logger = logging.getLogger()

def rewrite_local_links_to_relative(db_data: Optional[DbData], link: str) -> str:
    """ If the link points to a local destination we can just switch to that
    instead of opening a new tab. """

    if db_data:
        realm_uri_prefix = db_data['realm_uri'] + "/"
        if link.startswith(realm_uri_prefix):
            # Strip the realm URI prefix (including its trailing `/`),
            # leaving a relative link such as `#narrow/...`.
            return link[len(realm_uri_prefix):]

    return link

def url_embed_preview_enabled_for_realm(message: Optional[Message]=None,
                                        realm: Optional[Realm]=None) -> bool:
    if not settings.INLINE_URL_EMBED_PREVIEW:
        return False

    if realm is None:
        if message is not None:
            realm = message.get_realm()

    if realm is None:
        # realm can be None for odd use cases
        # like generating documentation or running
        # test code
        return True

    return realm.inline_url_embed_preview

def image_preview_enabled_for_realm(message: Optional[Message]=None,
                                    realm: Optional[Realm]=None) -> bool:
    if not settings.INLINE_IMAGE_PREVIEW:
        return False

    if realm is None:
        if message is not None:
            realm = message.get_realm()

    if realm is None:
        # realm can be None for odd use cases
        # like generating documentation or running
        # test code
        return True

    return realm.inline_image_preview

def list_of_tlds() -> List[str]:
    # HACK we manually blacklist a few TLDs
    blacklist = ['PY\n', "MD\n"]

    # tlds-alpha-by-domain.txt comes from http://data.iana.org/TLD/tlds-alpha-by-domain.txt
    tlds_file = os.path.join(os.path.dirname(__file__), 'tlds-alpha-by-domain.txt')
    tlds = [tld.lower().strip() for tld in open(tlds_file, 'r')
            if tld not in blacklist and not tld[0].startswith('#')]
    tlds.sort(key=len, reverse=True)
    return tlds
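
# walk_tree does a breadth-first traversal of an ElementTree, applying
# `processor` to every element below the root and collecting the non-None
# results.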
def walk_tree(root: Element,
              processor: Callable[[Element], Optional[_T]],
              stop_after_first: bool=False) -> List[_T]:
    results = []
    queue = deque([root])

    while queue:
        currElement = queue.popleft()
        for child in currElement.getchildren():
            if child.getchildren():
                queue.append(child)

            result = processor(child)
            if result is not None:
                results.append(result)
                if stop_after_first:
                    return results

    return results
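
# ElementFamily/ResultWithFamily record, for each result produced by
# walk_tree_with_family, the matched child element together with its parent
# and grandparent.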
ElementFamily = NamedTuple('ElementFamily', [
    ('grandparent', Optional[Element]),
    ('parent', Element),
    ('child', Element)
])

ResultWithFamily = NamedTuple('ResultWithFamily', [
    ('family', ElementFamily),
    ('result', Any)
])

ElementPair = NamedTuple('ElementPair', [
    ('parent', Optional[Element]),
    ('value', Element)
])

def walk_tree_with_family(root: Element,
                          processor: Callable[[Element], Optional[_T]]
                          ) -> List[ResultWithFamily]:
    results = []

    queue = deque([ElementPair(parent=None, value=root)])
    while queue:
        currElementPair = queue.popleft()
        for child in currElementPair.value.getchildren():
            if child.getchildren():
                queue.append(ElementPair(parent=currElementPair, value=child))  # type: ignore # Lack of Deque support in typing module for Python 3.4.3
            result = processor(child)
            if result is not None:
                if currElementPair.parent is not None:
                    grandparent_element = cast(ElementPair, currElementPair.parent)
                    grandparent = grandparent_element.value
                else:
                    grandparent = None
                family = ElementFamily(
                    grandparent=grandparent,
                    parent=currElementPair.value,
                    child=child
                )

                results.append(ResultWithFamily(
                    family=family,
                    result=result
                ))

    return results
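
# add_a appends (or, when insertion_index is given, inserts) an inline image
# preview to `root`: a <div> containing an <a> that wraps an <img>, optionally
# routed through the /thumbnail endpoint.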
# height is not actually used
def add_a(
        root: Element,
        url: str,
        link: str,
        title: Optional[str]=None,
        desc: Optional[str]=None,
        class_attr: str="message_inline_image",
        data_id: Optional[str]=None,
        insertion_index: Optional[int]=None,
        already_thumbnailed: Optional[bool]=False
) -> None:
    title = title if title is not None else url_filename(link)
    title = title if title else ""
    desc = desc if desc is not None else ""

    if insertion_index is not None:
        div = markdown.util.etree.Element("div")
        root.insert(insertion_index, div)
    else:
        div = markdown.util.etree.SubElement(root, "div")

    div.set("class", class_attr)
    a = markdown.util.etree.SubElement(div, "a")
    a.set("href", link)
    a.set("target", "_blank")
    a.set("title", title)
    if data_id is not None:
        a.set("data-id", data_id)
    img = markdown.util.etree.SubElement(a, "img")
    if settings.THUMBNAIL_IMAGES and (not already_thumbnailed) and user_uploads_or_external(url):
        # See docs/thumbnailing.md for some high-level documentation.
        #
        # We strip leading '/' from relative URLs here to ensure
        # consistency in what gets passed to /thumbnail
        url = url.lstrip('/')
        img.set("src", "/thumbnail?url={0}&size=thumbnail".format(
            urllib.parse.quote(url, safe='')
        ))
        img.set('data-src-fullsize', "/thumbnail?url={0}&size=full".format(
            urllib.parse.quote(url, safe='')
        ))
    else:
        img.set("src", url)

    if class_attr == "message_inline_ref":
        summary_div = markdown.util.etree.SubElement(div, "div")
        title_div = markdown.util.etree.SubElement(summary_div, "div")
        title_div.set("class", "message_inline_image_title")
        title_div.text = title
        desc_div = markdown.util.etree.SubElement(summary_div, "desc")
        desc_div.set("class", "message_inline_image_desc")
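
# add_embed renders an Open Graph style link preview (image, title and
# description) as a "message_embed" <div> appended to `root`.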
def add_embed(root: Element, link: str, extracted_data: Dict[str, Any]) -> None:
    container = markdown.util.etree.SubElement(root, "div")
    container.set("class", "message_embed")

    img_link = extracted_data.get('image')
    if img_link:
        parsed_img_link = urllib.parse.urlparse(img_link)
        # Append domain where relative img_link url is given
        if not parsed_img_link.netloc:
            parsed_url = urllib.parse.urlparse(link)
            domain = '{url.scheme}://{url.netloc}/'.format(url=parsed_url)
            img_link = urllib.parse.urljoin(domain, img_link)
        img = markdown.util.etree.SubElement(container, "a")
        img.set("style", "background-image: url(" + img_link + ")")
        img.set("href", link)
        img.set("target", "_blank")
        img.set("class", "message_embed_image")

    data_container = markdown.util.etree.SubElement(container, "div")
    data_container.set("class", "data-container")

    title = extracted_data.get('title')
    if title:
        title_elm = markdown.util.etree.SubElement(data_container, "div")
        title_elm.set("class", "message_embed_title")
        a = markdown.util.etree.SubElement(title_elm, "a")
        a.set("href", link)
        a.set("target", "_blank")
        a.set("title", title)
        a.text = title
    description = extracted_data.get('description')
    if description:
        description_elm = markdown.util.etree.SubElement(data_container, "div")
        description_elm.set("class", "message_embed_description")
        description_elm.text = description

@cache_with_key(lambda tweet_id: tweet_id, cache_name="database", with_statsd_key="tweet_data")
def fetch_tweet_data(tweet_id: str) -> Optional[Dict[str, Any]]:
    if settings.TEST_SUITE:
        from . import testing_mocks
        res = testing_mocks.twitter(tweet_id)
    else:
        creds = {
            'consumer_key': settings.TWITTER_CONSUMER_KEY,
            'consumer_secret': settings.TWITTER_CONSUMER_SECRET,
            'access_token_key': settings.TWITTER_ACCESS_TOKEN_KEY,
            'access_token_secret': settings.TWITTER_ACCESS_TOKEN_SECRET,
        }
        if not all(creds.values()):
            return None

        # We lazily import twitter here because its import process is
        # surprisingly slow, and doing so has a significant impact on
        # the startup performance of `manage.py` commands.
        import twitter

        try:
            api = twitter.Api(tweet_mode='extended', **creds)
            # Sometimes Twitter hangs on responses. Timing out here
            # will cause the Tweet to go through as-is with no inline
            # preview, rather than having the message be rejected
            # entirely. This timeout needs to be less than our overall
            # formatting timeout.
            tweet = timeout(3, api.GetStatus, tweet_id)
            res = tweet.AsDict()
        except AttributeError:
            bugdown_logger.error('Unable to load twitter api, you may have the wrong '
                                 'library installed, see https://github.com/zulip/zulip/issues/86')
            return None
        except TimeoutExpired:
            # We'd like to try again later and not cache the bad result,
            # so we need to re-raise the exception (just as though
            # we were being rate-limited)
            raise
        except twitter.TwitterError as e:
            t = e.args[0]
            if len(t) == 1 and ('code' in t[0]) and (t[0]['code'] == 34):
                # Code 34 means that the message doesn't exist; return
                # None so that we will cache the error
                return None
            elif len(t) == 1 and ('code' in t[0]) and (t[0]['code'] == 88 or
                                                       t[0]['code'] == 130):
                # Code 88 means that we were rate-limited and 130
                # means Twitter is having capacity issues; either way
                # just raise the error so we don't cache None and will
                # try again later.
                raise
            else:
                # It's not clear what to do in cases of other errors,
                # but for now it seems reasonable to log at error
                # level (so that we get notified), but then cache the
                # failure to proceed with our usual work
                bugdown_logger.error(traceback.format_exc())
                return None
    return res
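
# Regexes for the hand-rolled <head>/<meta> scanner in fetch_open_graph_image
# below, which extracts Open Graph image/title/description tags from a page.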
HEAD_START_RE = re.compile('^head[ >]')
HEAD_END_RE = re.compile('^/head[ >]')
META_START_RE = re.compile('^meta[ >]')
META_END_RE = re.compile('^/meta[ >]')

def fetch_open_graph_image(url: str) -> Optional[Dict[str, Any]]:
    in_head = False
    # HTML will auto-close meta tags; when we start the next tag, add
    # a closing tag if it has not been closed yet.
    last_closed = True
    head = []
    # TODO: What if response content is huge? Should we get headers first?
    try:
        content = requests.get(url, timeout=1).text
    except Exception:
        return None
    # Extract the head and meta tags
    # All meta tags are self closing, have no children or are closed
    # automatically.
    for part in content.split('<'):
        if not in_head and HEAD_START_RE.match(part):
            # Started the head node; output it to have a document root
            in_head = True
            head.append('<head>')
        elif in_head and HEAD_END_RE.match(part):
            # Found the end of the head; close any remaining tag, then stop
            # processing
            in_head = False
            if not last_closed:
                last_closed = True
                head.append('</meta>')
            head.append('</head>')
            break

        elif in_head and META_START_RE.match(part):
            # Found a meta node; copy it
            if not last_closed:
                head.append('</meta>')
                last_closed = True
            head.append('<')
            head.append(part)
            if '/>' not in part:
                last_closed = False

        elif in_head and META_END_RE.match(part):
            # End of a meta node; just copy it to close the tag
            head.append('<')
            head.append(part)
            last_closed = True

    try:
        doc = etree.fromstring(''.join(head))
    except etree.ParseError:
        return None
    og_image = doc.find('meta[@property="og:image"]')
    og_title = doc.find('meta[@property="og:title"]')
    og_desc = doc.find('meta[@property="og:description"]')
    title = None
    desc = None
    if og_image is not None:
        image = og_image.get('content')
    else:
        return None
    if og_title is not None:
        title = og_title.get('content')
    if og_desc is not None:
        desc = og_desc.get('content')
    return {'image': image, 'title': title, 'desc': desc}
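
# get_tweet_id extracts the numeric tweet ID from the various twitter.com
# status URL formats (including old-style #! fragment URLs), returning None
# for URLs that aren't tweets.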
def get_tweet_id(url: str) -> Optional[str]:
    parsed_url = urllib.parse.urlparse(url)
    if not (parsed_url.netloc == 'twitter.com' or parsed_url.netloc.endswith('.twitter.com')):
        return None
    to_match = parsed_url.path
    # In old-style twitter.com/#!/wdaher/status/1231241234-style URLs,
    # we need to look at the fragment instead
    if parsed_url.path == '/' and len(parsed_url.fragment) > 5:
        to_match = parsed_url.fragment

    tweet_id_match = re.match(r'^!?/.*?/status(es)?/(?P<tweetid>\d{10,30})(/photo/[0-9])?/?$', to_match)
    if not tweet_id_match:
        return None
    return tweet_id_match.group("tweetid")

class InlineHttpsProcessor(markdown.treeprocessors.Treeprocessor):
    def run(self, root: Element) -> None:
        # Get all URLs from the blob
        found_imgs = walk_tree(root, lambda e: e if e.tag == "img" else None)
        for img in found_imgs:
            url = img.get("src")
            if not url.startswith("http://"):
                # Don't rewrite images on our own site (e.g. emoji).
                continue
            img.set("src", get_camo_url(url))

class BacktickPattern(markdown.inlinepatterns.Pattern):
    """ Return a `<code>` element containing the matching text. """
    def __init__(self, pattern: str) -> None:
        markdown.inlinepatterns.Pattern.__init__(self, pattern)
        self.ESCAPED_BSLASH = '%s%s%s' % (markdown.util.STX, ord('\\'), markdown.util.ETX)
        self.tag = 'code'

    def handleMatch(self, m: Match[str]) -> Union[str, Element]:
        if m.group(4):
            el = markdown.util.etree.Element(self.tag)
            # Modified to not strip whitespace
            el.text = markdown.util.AtomicString(m.group(4))
            return el
        else:
            return m.group(2).replace('\\\\', self.ESCAPED_BSLASH)
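
# InlineInterestingLinkProcessor scans the rendered message tree for URLs and
# attaches inline previews: images, Dropbox links, tweets, YouTube/Vimeo
# thumbnails, and Open Graph embeds.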
class InlineInterestingLinkProcessor(markdown.treeprocessors.Treeprocessor):
    TWITTER_MAX_IMAGE_HEIGHT = 400
    TWITTER_MAX_TO_PREVIEW = 3
    INLINE_PREVIEW_LIMIT_PER_MESSAGE = 5

    def __init__(self, md: markdown.Markdown) -> None:
        markdown.treeprocessors.Treeprocessor.__init__(self, md)

    def get_actual_image_url(self, url: str) -> str:
        # Add specific per-site cases to convert image-preview urls to image urls.
        # See https://github.com/zulip/zulip/issues/4658 for more information
        parsed_url = urllib.parse.urlparse(url)
        if (parsed_url.netloc == 'github.com' or parsed_url.netloc.endswith('.github.com')):
            # https://github.com/zulip/zulip/blob/master/static/images/logo/zulip-icon-128x128.png ->
            # https://raw.githubusercontent.com/zulip/zulip/master/static/images/logo/zulip-icon-128x128.png
            split_path = parsed_url.path.split('/')
            if len(split_path) > 3 and split_path[3] == "blob":
                return urllib.parse.urljoin('https://raw.githubusercontent.com',
                                            '/'.join(split_path[0:3] + split_path[4:]))

        return url

    def image_preview_enabled(self) -> bool:
        return image_preview_enabled_for_realm(
            self.markdown.zulip_message,
            self.markdown.zulip_realm,
        )

    def is_image(self, url: str) -> bool:
        if not self.image_preview_enabled():
            return False
        parsed_url = urllib.parse.urlparse(url)
        # List from http://support.google.com/chromeos/bin/answer.py?hl=en&answer=183093
        for ext in [".bmp", ".gif", ".jpg", ".jpeg", ".png", ".webp"]:
            if parsed_url.path.lower().endswith(ext):
                return True
        return False

    def dropbox_image(self, url: str) -> Optional[Dict[str, Any]]:
        # TODO: The returned Dict could possibly be a TypedDict in future.
        parsed_url = urllib.parse.urlparse(url)
        if (parsed_url.netloc == 'dropbox.com' or parsed_url.netloc.endswith('.dropbox.com')):
            is_album = parsed_url.path.startswith('/sc/') or parsed_url.path.startswith('/photos/')
            # Only allow preview Dropbox shared links
            if not (parsed_url.path.startswith('/s/') or
                    parsed_url.path.startswith('/sh/') or
                    is_album):
                return None

            # Try to retrieve open graph protocol info for a preview
            # This might be redundant right now for shared links for images.
            # However, we might want to make use of title and description
            # in the future. If the actual image is too big, we might also
            # want to use the open graph image.
            image_info = fetch_open_graph_image(url)

            is_image = is_album or self.is_image(url)

            # If it is from an album or not an actual image file,
            # just use open graph image.
            if is_album or not is_image:
                # Failed to follow link to find an image preview so
                # use placeholder image and guess filename
                if image_info is None:
                    return None

                image_info["is_image"] = is_image
                return image_info

            # Otherwise, try to retrieve the actual image.
            # This is because open graph image from Dropbox may have padding
            # and gifs do not work.
            # TODO: What if image is huge? Should we get headers first?
            if image_info is None:
                image_info = dict()
            image_info['is_image'] = True
            parsed_url_list = list(parsed_url)
            parsed_url_list[4] = "dl=1"  # Replaces query
            image_info["image"] = urllib.parse.urlunparse(parsed_url_list)

            return image_info
        return None

    def youtube_id(self, url: str) -> Optional[str]:
        if not self.image_preview_enabled():
            return None
        # Youtube video id extraction regular expression from http://pastebin.com/KyKAFv1s
        # If it matches, match.group(2) is the video id.
        youtube_re = r'^((?:https?://)?(?:youtu\.be/|(?:\w+\.)?youtube(?:-nocookie)?\.com/)' + \
                     r'(?:(?:(?:v|embed)/)|(?:(?:watch(?:_popup)?(?:\.php)?)?(?:\?|#!?)(?:.+&)?v=)))' + \
                     r'?([0-9A-Za-z_-]+)(?(1).+)?$'
        match = re.match(youtube_re, url)
        if match is None:
            return None
        return match.group(2)

    def youtube_image(self, url: str) -> Optional[str]:
        yt_id = self.youtube_id(url)

        if yt_id is not None:
            return "https://i.ytimg.com/vi/%s/default.jpg" % (yt_id,)
        return None

    def vimeo_id(self, url: str) -> Optional[str]:
        if not self.image_preview_enabled():
            return None
        # (http|https)?:\/\/(www\.)?vimeo.com\/(?:channels\/(?:\w+\/)?|groups\/([^\/]*)\/videos\/|)(\d+)(?:|\/\?)
        # If it matches, match.group('id') is the video id.

        vimeo_re = r'^((http|https)?:\/\/(www\.)?vimeo.com\/' + \
                   r'(?:channels\/(?:\w+\/)?|groups\/' + \
                   r'([^\/]*)\/videos\/|)(\d+)(?:|\/\?))$'
        match = re.match(vimeo_re, url)
        if match is None:
            return None
        return match.group(5)

    def vimeo_title(self, extracted_data: Dict[str, Any]) -> Optional[str]:
        title = extracted_data.get("title")
        if title is not None:
            return "Vimeo - {}".format(title)
        return None

    def twitter_text(self, text: str,
                     urls: List[Dict[str, str]],
                     user_mentions: List[Dict[str, Any]],
                     media: List[Dict[str, Any]]) -> Element:
        """
        Use data from the twitter API to turn links, mentions and media into A
        tags. Also convert unicode emojis to images.

        This works by using the urls, user_mentions and media data from
        the twitter API and searching for unicode emojis in the text using
        `unicode_emoji_regex`.

        The first step is finding the locations of the URLs, mentions, media and
        emoji in the text. For each match we build a dictionary with type, the start
        location, end location, the URL to link to, and the text (codepoint and title
        in case of emojis) to be used in the link (image in case of emojis).

        Next we sort the matches by start location. And for each we add the
        text from the end of the last link to the start of the current link to
        the output. The text needs to be added to the text attribute of the first
        node (the P tag) or the tail of the last link created.

        Finally we add any remaining text to the last node.
        """

        to_process = []  # type: List[Dict[str, Any]]
        # Build dicts for URLs
        for url_data in urls:
            short_url = url_data["url"]
            full_url = url_data["expanded_url"]
            for match in re.finditer(re.escape(short_url), text, re.IGNORECASE):
                to_process.append({
                    'type': 'url',
                    'start': match.start(),
                    'end': match.end(),
                    'url': short_url,
                    'text': full_url,
                })
        # Build dicts for mentions
        for user_mention in user_mentions:
            screen_name = user_mention['screen_name']
            mention_string = '@' + screen_name
            for match in re.finditer(re.escape(mention_string), text, re.IGNORECASE):
                to_process.append({
                    'type': 'mention',
                    'start': match.start(),
                    'end': match.end(),
                    'url': 'https://twitter.com/' + urllib.parse.quote(screen_name),
                    'text': mention_string,
                })
        # Build dicts for media
        for media_item in media:
            short_url = media_item['url']
            expanded_url = media_item['expanded_url']
            for match in re.finditer(re.escape(short_url), text, re.IGNORECASE):
                to_process.append({
                    'type': 'media',
                    'start': match.start(),
                    'end': match.end(),
                    'url': short_url,
                    'text': expanded_url,
                })
        # Build dicts for emojis
        for match in re.finditer(unicode_emoji_regex, text, re.IGNORECASE):
            orig_syntax = match.group('syntax')
            codepoint = unicode_emoji_to_codepoint(orig_syntax)
            if codepoint in codepoint_to_name:
                display_string = ':' + codepoint_to_name[codepoint] + ':'
                to_process.append({
                    'type': 'emoji',
                    'start': match.start(),
                    'end': match.end(),
                    'codepoint': codepoint,
                    'title': display_string,
                })

        to_process.sort(key=lambda x: x['start'])
        p = current_node = markdown.util.etree.Element('p')

        def set_text(text: str) -> None:
            """
            Helper to set the text or the tail of the current_node
            """
            if current_node == p:
                current_node.text = text
            else:
                current_node.tail = text

        db_data = self.markdown.zulip_db_data
        current_index = 0
        for item in to_process:
            # If the text we want to link starts inside already-linked text, skip it
            if item['start'] < current_index:
                continue
            # Add text from the end of last link to the start of the current
            # link
            set_text(text[current_index:item['start']])
            current_index = item['end']
            if item['type'] != 'emoji':
                current_node = elem = url_to_a(db_data, item['url'], item['text'])
            else:
                current_node = elem = make_emoji(item['codepoint'], item['title'])
            p.append(elem)

        # Add any unused text
        set_text(text[current_index:])
        return p
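
    # twitter_link builds the full inline tweet preview (author avatar, linked
    # tweet text, attribution line, and photo previews) for URLs that
    # get_tweet_id recognizes as tweets.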
    def twitter_link(self, url: str) -> Optional[Element]:
        tweet_id = get_tweet_id(url)

        if tweet_id is None:
            return None

        try:
            res = fetch_tweet_data(tweet_id)
            if res is None:
                return None
            user = res['user']  # type: Dict[str, Any]
            tweet = markdown.util.etree.Element("div")
            tweet.set("class", "twitter-tweet")
            img_a = markdown.util.etree.SubElement(tweet, 'a')
            img_a.set("href", url)
            img_a.set("target", "_blank")
            profile_img = markdown.util.etree.SubElement(img_a, 'img')
            profile_img.set('class', 'twitter-avatar')
            # For some reason, for, e.g. tweet 285072525413724161,
            # python-twitter does not give us a
            # profile_image_url_https, but instead puts that URL in
            # profile_image_url. So use _https if available, but fall
            # back gracefully.
            image_url = user.get('profile_image_url_https', user['profile_image_url'])
            profile_img.set('src', image_url)

            text = html.unescape(res['full_text'])
            urls = res.get('urls', [])
            user_mentions = res.get('user_mentions', [])
            media = res.get('media', [])  # type: List[Dict[str, Any]]
            p = self.twitter_text(text, urls, user_mentions, media)
            tweet.append(p)

            span = markdown.util.etree.SubElement(tweet, 'span')
            span.text = "- %s (@%s)" % (user['name'], user['screen_name'])

            # Add image previews
            for media_item in media:
                # Only photos have a preview image
                if media_item['type'] != 'photo':
                    continue

                # Find the image size that is smaller than
                # TWITTER_MAX_IMAGE_HEIGHT px tall or the smallest
                size_name_tuples = list(media_item['sizes'].items())
                size_name_tuples.sort(reverse=True,
                                      key=lambda x: x[1]['h'])
                for size_name, size in size_name_tuples:
                    if size['h'] < self.TWITTER_MAX_IMAGE_HEIGHT:
                        break

                media_url = '%s:%s' % (media_item['media_url_https'], size_name)
                img_div = markdown.util.etree.SubElement(tweet, 'div')
                img_div.set('class', 'twitter-image')
                img_a = markdown.util.etree.SubElement(img_div, 'a')
                img_a.set('href', media_item['url'])
                img_a.set('target', '_blank')
                img_a.set('title', media_item['url'])
                img = markdown.util.etree.SubElement(img_a, 'img')
                img.set('src', media_url)

            return tweet
        except Exception:
            # We put this in its own try-except because it requires external
            # connectivity. If Twitter flakes out, we don't want to not-render
            # the entire message; we just want to not show the Twitter preview.
            bugdown_logger.warning(traceback.format_exc())
            return None

    def get_url_data(self, e: Element) -> Optional[Tuple[str, str]]:
        if e.tag == "a":
            if e.text is not None:
                return (e.get("href"), e.text)
            return (e.get("href"), e.get("href"))
        return None

    def handle_image_inlining(self, root: Element, found_url: ResultWithFamily) -> None:
        grandparent = found_url.family.grandparent
        parent = found_url.family.parent
        ahref_element = found_url.family.child
        (url, text) = found_url.result
        actual_url = self.get_actual_image_url(url)

        # url != text usually implies a named link, which we opt not to remove
        url_eq_text = (url == text)

        if parent.tag == 'li':
            add_a(parent, self.get_actual_image_url(url), url, title=text)
            if not parent.text and not ahref_element.tail and url_eq_text:
                parent.remove(ahref_element)

        elif parent.tag == 'p':
            parent_index = None
            for index, uncle in enumerate(grandparent.getchildren()):
                if uncle is parent:
                    parent_index = index
                    break

            if parent_index is not None:
                ins_index = self.find_proper_insertion_index(grandparent, parent, parent_index)
                add_a(grandparent, actual_url, url, title=text, insertion_index=ins_index)

            else:
                # We're not inserting after parent, since parent not found.
                # Append to end of list of grandparent's children as normal
                add_a(grandparent, actual_url, url, title=text)

            # If link is alone in a paragraph, delete paragraph containing it
            if (len(parent.getchildren()) == 1 and
                    (not parent.text or parent.text == "\n") and
                    not ahref_element.tail and
                    url_eq_text):
                grandparent.remove(parent)

        else:
            # If none of the above criteria match, fall back to old behavior
            add_a(root, actual_url, url, title=text)

    def find_proper_insertion_index(self, grandparent: Element, parent: Element,
                                    parent_index_in_grandparent: int) -> int:
        # If there are several inline images from the same paragraph, ensure
        # that they are in the correct (and not reversed) order by inserting
        # after the last inline image from the paragraph 'parent'

        uncles = grandparent.getchildren()
        parent_links = [ele.attrib['href'] for ele in parent.iter(tag="a")]
        insertion_index = parent_index_in_grandparent

        while True:
            insertion_index += 1
            if insertion_index >= len(uncles):
                return insertion_index

            uncle = uncles[insertion_index]
            inline_image_classes = ['message_inline_image', 'message_inline_ref']
            if (
                    uncle.tag != 'div' or
                    'class' not in uncle.keys() or
                    uncle.attrib['class'] not in inline_image_classes
            ):
                return insertion_index

            uncle_link = list(uncle.iter(tag="a"))[0].attrib['href']
            if uncle_link not in parent_links:
                return insertion_index

    def is_absolute_url(self, url: str) -> bool:
        return bool(urllib.parse.urlparse(url).netloc)

    def run(self, root: Element) -> None:
        # Get all URLs from the blob
        found_urls = walk_tree_with_family(root, self.get_url_data)
        if len(found_urls) == 0 or len(found_urls) > self.INLINE_PREVIEW_LIMIT_PER_MESSAGE:
            return

        rendered_tweet_count = 0

        for found_url in found_urls:
            (url, text) = found_url.result
            if not self.is_absolute_url(url):
                if self.is_image(url):
                    self.handle_image_inlining(root, found_url)
                # We don't have a strong use case for doing url preview for relative links.
                continue

            dropbox_image = self.dropbox_image(url)
            if dropbox_image is not None:
                class_attr = "message_inline_ref"
                is_image = dropbox_image["is_image"]
                if is_image:
                    class_attr = "message_inline_image"
                # Not making use of title and description of images
                add_a(root, dropbox_image['image'], url,
                      title=dropbox_image.get('title', ""),
                      desc=dropbox_image.get('desc', ""),
                      class_attr=class_attr,
                      already_thumbnailed=True)
                continue
            if self.is_image(url):
                self.handle_image_inlining(root, found_url)
                continue
            if get_tweet_id(url) is not None:
                if rendered_tweet_count >= self.TWITTER_MAX_TO_PREVIEW:
                    # Only render at most TWITTER_MAX_TO_PREVIEW tweets per message
                    continue
                twitter_data = self.twitter_link(url)
                if twitter_data is None:
                    # This link is not actually a tweet known to twitter
                    continue
                rendered_tweet_count += 1
                div = markdown.util.etree.SubElement(root, "div")
                div.set("class", "inline-preview-twitter")
                div.insert(0, twitter_data)
                continue
            youtube = self.youtube_image(url)
            if youtube is not None:
                yt_id = self.youtube_id(url)
                add_a(root, youtube, url, None, None,
                      "youtube-video message_inline_image",
                      yt_id, already_thumbnailed=True)
                continue

            db_data = self.markdown.zulip_db_data
            if db_data and db_data['sent_by_bot']:
                continue

            if not url_embed_preview_enabled_for_realm(self.markdown.zulip_message,
                                                       self.markdown.zulip_realm):
                continue

            try:
                extracted_data = link_preview.link_embed_data_from_cache(url)
            except NotFoundInCache:
                self.markdown.zulip_message.links_for_preview.add(url)
                continue
            if extracted_data:
                vm_id = self.vimeo_id(url)
                if vm_id is not None:
                    vimeo_image = extracted_data.get('image')
                    vimeo_title = self.vimeo_title(extracted_data)
                    if vimeo_image is not None:
                        add_a(root, vimeo_image, url, vimeo_title,
                              None, "vimeo-video message_inline_image", vm_id,
                              already_thumbnailed=True)
                    if vimeo_title is not None:
                        found_url.family.child.text = vimeo_title
                else:
                    add_embed(root, url, extracted_data)
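
# Avatar handles the !avatar(email) and !gravatar(email) syntax, rendering an
# <img> tag that points at the named user's avatar.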
class Avatar(markdown.inlinepatterns.Pattern):
    def handleMatch(self, match: Match[str]) -> Optional[Element]:
        img = markdown.util.etree.Element('img')
        email_address = match.group('email')
        email = email_address.strip().lower()
        profile_id = None

        db_data = self.markdown.zulip_db_data
        if db_data is not None:
            user_dict = db_data['email_info'].get(email)
            if user_dict is not None:
                profile_id = user_dict['id']

        img.set('class', 'message_body_gravatar')
        img.set('src', '/avatar/{0}?s=30'.format(profile_id or email))
        img.set('title', email)
        img.set('alt', email)
        return img

def possible_avatar_emails(content: str) -> Set[str]:
    emails = set()
    for REGEX in [AVATAR_REGEX, GRAVATAR_REGEX]:
        matches = re.findall(REGEX, content)
        for email in matches:
            if email:
                emails.add(email)

    return emails
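
# Load the generated emoji name <-> codepoint mappings from
# STATIC_ROOT/generated/emoji.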
path_to_name_to_codepoint = os.path.join(settings.STATIC_ROOT,
                                         "generated", "emoji", "name_to_codepoint.json")
with open(path_to_name_to_codepoint) as name_to_codepoint_file:
    name_to_codepoint = ujson.load(name_to_codepoint_file)

path_to_codepoint_to_name = os.path.join(settings.STATIC_ROOT,
                                         "generated", "emoji", "codepoint_to_name.json")
with open(path_to_codepoint_to_name) as codepoint_to_name_file:
    codepoint_to_name = ujson.load(codepoint_to_name_file)

# All of our emojis (non-ZWJ sequences) belong to one of these unicode blocks:
# \U0001f100-\U0001f1ff - Enclosed Alphanumeric Supplement
# \U0001f200-\U0001f2ff - Enclosed Ideographic Supplement
# \U0001f300-\U0001f5ff - Miscellaneous Symbols and Pictographs
# \U0001f600-\U0001f64f - Emoticons (Emoji)
# \U0001f680-\U0001f6ff - Transport and Map Symbols
# \U0001f900-\U0001f9ff - Supplemental Symbols and Pictographs
# \u2000-\u206f - General Punctuation
# \u2300-\u23ff - Miscellaneous Technical
# \u2400-\u243f - Control Pictures
# \u2440-\u245f - Optical Character Recognition
# \u2460-\u24ff - Enclosed Alphanumerics
# \u2500-\u257f - Box Drawing
# \u2580-\u259f - Block Elements
# \u25a0-\u25ff - Geometric Shapes
# \u2600-\u26ff - Miscellaneous Symbols
# \u2700-\u27bf - Dingbats
# \u2900-\u297f - Supplemental Arrows-B
# \u2b00-\u2bff - Miscellaneous Symbols and Arrows
# \u3000-\u303f - CJK Symbols and Punctuation
# \u3200-\u32ff - Enclosed CJK Letters and Months
unicode_emoji_regex = '(?P<syntax>['\
    '\U0001F100-\U0001F64F' \
    '\U0001F680-\U0001F6FF' \
    '\U0001F900-\U0001F9FF' \
    '\u2000-\u206F' \
    '\u2300-\u27BF' \
    '\u2900-\u297F' \
    '\u2B00-\u2BFF' \
    '\u3000-\u303F' \
    '\u3200-\u32FF' \
    '])'
# The equivalent JS regex is \ud83c[\udd00-\udfff]|\ud83d[\udc00-\ude4f]|\ud83d[\ude80-\udeff]|
# \ud83e[\udd00-\uddff]|[\u2000-\u206f]|[\u2300-\u27bf]|[\u2b00-\u2bff]|[\u3000-\u303f]|
# [\u3200-\u32ff]. See below comments for explanation. The JS regex is used by marked.js for
# frontend unicode emoji processing.
# The JS regex \ud83c[\udd00-\udfff]|\ud83d[\udc00-\ude4f] represents U0001f100-\U0001f64f
# The JS regex \ud83d[\ude80-\udeff] represents \U0001f680-\U0001f6ff
# The JS regex \ud83e[\udd00-\uddff] represents \U0001f900-\U0001f9ff
# The JS regex [\u2000-\u206f] represents \u2000-\u206f
# The JS regex [\u2300-\u27bf] represents \u2300-\u27bf
# Similarly other JS regexes can be mapped to the respective unicode blocks.
# For more information, please refer to the following article:
# http://crocodillon.com/blog/parsing-emoji-unicode-in-javascript
2018-05-11 01:42:51 +02:00
|
|
|
def make_emoji(codepoint: str, display_string: str) -> Element:
|
2017-06-09 10:30:24 +02:00
|
|
|
# Replace underscore in emoji's title with space
|
|
|
|
title = display_string[1:-1].replace("_", " ")
|
2017-09-27 19:39:42 +02:00
|
|
|
span = markdown.util.etree.Element('span')
|
|
|
|
span.set('class', 'emoji emoji-%s' % (codepoint,))
|
|
|
|
span.set('title', title)
|
2019-01-14 08:45:37 +01:00
|
|
|
span.set('role', 'img')
|
|
|
|
span.set('aria-label', title)
|
2017-09-27 19:39:42 +02:00
|
|
|
span.text = display_string
|
|
|
|
return span
|
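# A minimal, illustrative check (never called in production) of the element
# that make_emoji() builds; the codepoint/name pair here is an assumption.
def _example_make_emoji() -> None:
    span = make_emoji('1f604', ':smiling_face:')
    assert span.tag == 'span'
    assert span.get('class') == 'emoji emoji-1f604'
    # Underscores in the emoji name become spaces in the title/aria-label.
    assert span.get('title') == 'smiling face'
    assert span.get('aria-label') == 'smiling face'
    assert span.text == ':smiling_face:'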
2013-03-01 22:07:27 +01:00
|
|
|
|
2018-05-11 01:42:51 +02:00
|
|
|
def make_realm_emoji(src: str, display_string: str) -> Element:
|
2017-05-01 01:25:03 +02:00
|
|
|
elt = markdown.util.etree.Element('img')
|
|
|
|
elt.set('src', src)
|
|
|
|
elt.set('class', 'emoji')
|
|
|
|
elt.set("alt", display_string)
|
2017-06-09 10:30:24 +02:00
|
|
|
elt.set("title", display_string[1:-1].replace("_", " "))
|
2017-05-01 01:25:03 +02:00
|
|
|
return elt
|
|
|
|
|
2018-05-11 01:42:51 +02:00
|
|
|
def unicode_emoji_to_codepoint(unicode_emoji: str) -> str:
|
2017-05-01 01:23:41 +02:00
|
|
|
codepoint = hex(ord(unicode_emoji))[2:]
|
2017-05-16 08:59:24 +02:00
|
|
|
# Unicode codepoints are a minimum of length 4, padded
|
|
|
|
# with zeroes if the length is less than 4.
|
|
|
|
while len(codepoint) < 4:
|
|
|
|
codepoint = '0' + codepoint
|
2017-05-01 01:23:41 +02:00
|
|
|
return codepoint
|
|
|
|
|
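# Illustrative only: the helper above is equivalent to zero-padding the hex
# value of the codepoint to at least four digits.
def _example_unicode_emoji_to_codepoint() -> None:
    # A BMP symbol below U+1000 gets padded (U+00A9 COPYRIGHT SIGN).
    assert unicode_emoji_to_codepoint('\u00a9') == '00a9'
    # Four-digit codepoints pass through unchanged.
    assert unicode_emoji_to_codepoint('\u2764') == '2764'
    # Astral-plane emoji already have five hex digits, so nothing is padded.
    assert unicode_emoji_to_codepoint('\U0001f604') == '1f604'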
2018-01-15 19:36:32 +01:00
|
|
|
class EmoticonTranslation(markdown.inlinepatterns.Pattern):
|
|
|
|
""" Translates emoticons like `:)` into emoji like `:smile:`. """
|
2018-05-11 01:42:51 +02:00
|
|
|
def handleMatch(self, match: Match[str]) -> Optional[Element]:
|
2018-11-07 16:26:33 +01:00
|
|
|
db_data = self.markdown.zulip_db_data
|
|
|
|
if db_data is None or not db_data['translate_emoticons']:
|
2018-01-15 19:36:32 +01:00
|
|
|
return None
|
|
|
|
|
|
|
|
emoticon = match.group('emoticon')
|
|
|
|
translated = translate_emoticons(emoticon)
|
|
|
|
name = translated[1:-1]
|
|
|
|
return make_emoji(name_to_codepoint[name], translated)
|
|
|
|
|
2016-06-24 20:03:56 +02:00
|
|
|
class UnicodeEmoji(markdown.inlinepatterns.Pattern):
|
2018-05-11 01:42:51 +02:00
|
|
|
def handleMatch(self, match: Match[str]) -> Optional[Element]:
|
2016-06-24 20:03:56 +02:00
|
|
|
orig_syntax = match.group('syntax')
|
2017-05-01 01:23:41 +02:00
|
|
|
codepoint = unicode_emoji_to_codepoint(orig_syntax)
|
2017-06-20 15:52:14 +02:00
|
|
|
if codepoint in codepoint_to_name:
|
|
|
|
display_string = ':' + codepoint_to_name[codepoint] + ':'
|
|
|
|
return make_emoji(codepoint, display_string)
|
2016-06-24 20:03:56 +02:00
|
|
|
else:
|
|
|
|
return None
|
|
|
|
|
2013-03-01 22:07:27 +01:00
|
|
|
class Emoji(markdown.inlinepatterns.Pattern):
|
2018-05-11 01:42:51 +02:00
|
|
|
def handleMatch(self, match: Match[str]) -> Optional[Element]:
|
2013-03-01 22:07:27 +01:00
|
|
|
orig_syntax = match.group("syntax")
|
|
|
|
name = orig_syntax[1:-1]
|
2013-08-22 20:50:00 +02:00
|
|
|
|
2018-05-11 01:42:51 +02:00
|
|
|
active_realm_emoji = {} # type: Dict[str, Dict[str, str]]
|
2018-11-07 16:26:33 +01:00
|
|
|
db_data = self.markdown.zulip_db_data
|
|
|
|
if db_data is not None:
|
|
|
|
active_realm_emoji = db_data['active_realm_emoji']
|
2013-08-22 20:50:00 +02:00
|
|
|
|
2018-11-07 15:24:36 +01:00
|
|
|
if self.markdown.zulip_message and name in active_realm_emoji:
|
2018-03-11 18:48:56 +01:00
|
|
|
return make_realm_emoji(active_realm_emoji[name]['source_url'], orig_syntax)
|
2017-05-01 01:13:28 +02:00
|
|
|
elif name == 'zulip':
|
2017-05-01 01:25:03 +02:00
|
|
|
return make_realm_emoji('/static/generated/emoji/images/emoji/unicode/zulip.png', orig_syntax)
|
2017-02-04 23:27:24 +01:00
|
|
|
elif name in name_to_codepoint:
|
2017-05-01 01:34:31 +02:00
|
|
|
return make_emoji(name_to_codepoint[name], orig_syntax)
|
2013-08-22 20:50:00 +02:00
|
|
|
else:
|
2013-09-27 20:04:46 +02:00
|
|
|
return None
|
2013-03-01 22:07:27 +01:00
|
|
|
|
2018-05-11 01:42:51 +02:00
|
|
|
def content_has_emoji_syntax(content: str) -> bool:
|
2017-09-15 03:08:15 +02:00
|
|
|
return re.search(EMOJI_REGEX, content) is not None
|
|
|
|
|
2014-01-22 22:20:42 +01:00
|
|
|
class ModalLink(markdown.inlinepatterns.Pattern):
|
|
|
|
"""
|
|
|
|
A pattern that allows including in-app modal links in messages.
|
|
|
|
"""
|
2016-11-29 07:22:02 +01:00
|
|
|
|
2018-05-11 01:42:51 +02:00
|
|
|
def handleMatch(self, match: Match[str]) -> Element:
|
2014-01-22 22:20:42 +01:00
|
|
|
relative_url = match.group('relative_url')
|
|
|
|
text = match.group('text')
|
|
|
|
|
|
|
|
a_tag = markdown.util.etree.Element("a")
|
|
|
|
a_tag.set("href", relative_url)
|
|
|
|
a_tag.set("title", relative_url)
|
|
|
|
a_tag.text = text
|
|
|
|
|
|
|
|
return a_tag
|
|
|
|
|
2017-03-20 16:56:39 +01:00
|
|
|
class Tex(markdown.inlinepatterns.Pattern):
|
2018-05-11 01:42:51 +02:00
|
|
|
def handleMatch(self, match: Match[str]) -> Element:
|
2017-03-20 16:56:39 +01:00
|
|
|
rendered = render_tex(match.group('body'), is_inline=True)
|
|
|
|
if rendered is not None:
|
|
|
|
return etree.fromstring(rendered.encode('utf-8'))
|
2017-07-09 01:28:18 +02:00
|
|
|
else: # Something went wrong while rendering
|
2017-03-20 16:56:39 +01:00
|
|
|
span = markdown.util.etree.Element('span')
|
|
|
|
span.set('class', 'tex-error')
|
|
|
|
span.text = '$$' + match.group('body') + '$$'
|
|
|
|
return span
|
|
|
|
|
2017-11-03 03:12:25 +01:00
|
|
|
upload_title_re = re.compile("^(https?://[^/]*)?(/user_uploads/\\d+)(/[^/]*)?/[^/]*/(?P<filename>[^/]*)$")
|
2018-05-11 01:42:51 +02:00
|
|
|
def url_filename(url: str) -> str:
|
2013-10-23 21:45:15 +02:00
|
|
|
"""Extract the filename if a URL is an uploaded file, or return the original URL"""
|
2016-06-14 04:39:33 +02:00
|
|
|
match = upload_title_re.match(url)
|
2013-10-23 21:45:15 +02:00
|
|
|
if match:
|
2016-06-14 04:09:33 +02:00
|
|
|
return match.group('filename')
|
2013-10-23 21:45:15 +02:00
|
|
|
else:
|
|
|
|
return url
|
|
|
|
|
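# Illustrative sketch (the sample paths are made up): URLs pointing at
# uploaded files are reduced to their filename; anything else passes through
# unchanged.
def _example_url_filename() -> None:
    assert url_filename('/user_uploads/2/ab/cdefg/report.pdf') == 'report.pdf'
    assert url_filename('https://example.com/user_uploads/2/ab/cdefg/report.pdf') == 'report.pdf'
    assert url_filename('https://example.com/some/page.html') == 'https://example.com/some/page.html'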
2017-11-05 11:15:10 +01:00
|
|
|
def fixup_link(link: markdown.util.etree.Element, target_blank: bool=True) -> None:
|
2012-12-04 20:15:50 +01:00
|
|
|
"""Set certain attributes we want on every link."""
|
2013-03-29 20:17:33 +01:00
|
|
|
if target_blank:
|
|
|
|
link.set('target', '_blank')
|
2017-01-24 06:21:14 +01:00
|
|
|
link.set('title', url_filename(link.get('href')))
|
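# Illustrative only: fixup_link() opens links in a new tab by default and
# uses the extracted filename as the hover title. The href is an assumption.
def _example_fixup_link() -> None:
    link = markdown.util.etree.Element('a')
    link.set('href', '/user_uploads/2/ab/cdefg/report.pdf')
    fixup_link(link)
    assert link.get('target') == '_blank'
    assert link.get('title') == 'report.pdf'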
2012-12-04 20:15:50 +01:00
|
|
|
|
2013-02-01 23:15:05 +01:00
|
|
|
|
2018-05-11 01:42:51 +02:00
|
|
|
def sanitize_url(url: str) -> Optional[str]:
|
2013-02-01 23:15:05 +01:00
|
|
|
"""
|
|
|
|
Sanitize a url against xss attacks.
|
|
|
|
See the docstring on markdown.inlinepatterns.LinkPattern.sanitize_url.
|
|
|
|
"""
|
|
|
|
try:
|
2016-01-24 03:39:44 +01:00
|
|
|
parts = urllib.parse.urlparse(url.replace(' ', '%20'))
|
2013-02-01 23:15:05 +01:00
|
|
|
scheme, netloc, path, params, query, fragment = parts
|
|
|
|
except ValueError:
|
|
|
|
# Bad url - so bad it couldn't be parsed.
|
|
|
|
return ''
|
|
|
|
|
2013-03-29 20:17:33 +01:00
|
|
|
# If there is no scheme or netloc and there is a '@' in the path,
|
|
|
|
# treat it as a mailto: and set the appropriate scheme
|
|
|
|
if scheme == '' and netloc == '' and '@' in path:
|
|
|
|
scheme = 'mailto'
|
2013-10-24 18:06:33 +02:00
|
|
|
elif scheme == '' and netloc == '' and len(path) > 0 and path[0] == '/':
|
|
|
|
# Allow domain-relative links
|
2016-01-24 03:39:44 +01:00
|
|
|
return urllib.parse.urlunparse(('', '', path, params, query, fragment))
|
2013-10-24 18:06:33 +02:00
|
|
|
elif (scheme, netloc, path, params, query) == ('', '', '', '', '') and len(fragment) > 0:
|
|
|
|
# Allow fragment links
|
2016-01-24 03:39:44 +01:00
|
|
|
return urllib.parse.urlunparse(('', '', '', '', '', fragment))
|
2013-03-29 20:17:33 +01:00
|
|
|
|
2013-08-06 21:32:15 +02:00
|
|
|
# Zulip modification: If scheme is not specified, assume http://
|
2013-02-01 23:15:05 +01:00
|
|
|
# We re-enter sanitize_url because netloc etc. need to be re-parsed.
|
|
|
|
if not scheme:
|
|
|
|
return sanitize_url('http://' + url)
|
|
|
|
|
2018-01-19 11:17:38 +01:00
|
|
|
locless_schemes = ['mailto', 'news', 'file', 'bitcoin']
|
2013-02-01 23:15:05 +01:00
|
|
|
if netloc == '' and scheme not in locless_schemes:
|
|
|
|
# This fails regardless of anything else.
|
2017-11-09 16:26:38 +01:00
|
|
|
# Return immediately to save additional processing
|
2013-02-26 22:41:39 +01:00
|
|
|
return None
|
2013-02-01 23:15:05 +01:00
|
|
|
|
2013-04-02 19:57:35 +02:00
|
|
|
# Upstream code will accept a URL like javascript://foo because it
|
|
|
|
# appears to have a netloc. Additionally there are plenty of other
|
|
|
|
# schemes that do weird things like launch external programs. To be
|
|
|
|
# on the safe side, we whitelist the scheme.
|
2018-01-19 11:17:38 +01:00
|
|
|
if scheme not in ('http', 'https', 'ftp', 'mailto', 'file', 'bitcoin'):
|
2013-04-02 19:57:35 +02:00
|
|
|
return None
|
|
|
|
|
2013-04-02 19:36:37 +02:00
|
|
|
# Upstream code scans path, parameters, and query for colon characters
|
|
|
|
# because
|
|
|
|
#
|
2016-01-24 03:39:44 +01:00
|
|
|
# some aliases [for javascript:] will appear to urllib.parse to have
|
2013-04-02 19:36:37 +02:00
|
|
|
# no scheme. On top of that relative links (i.e.: "foo/bar.html")
|
|
|
|
# have no scheme.
|
|
|
|
#
|
|
|
|
# We already converted an empty scheme to http:// above, so we skip
|
|
|
|
# the colon check, which would also forbid a lot of legitimate URLs.
|
2013-02-01 23:15:05 +01:00
|
|
|
|
|
|
|
# Url passes all tests. Return url as-is.
|
2016-01-24 03:39:44 +01:00
|
|
|
return urllib.parse.urlunparse((scheme, netloc, path, params, query, fragment))
|
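# A rough sketch of the behaviour above (illustrative, not exhaustive); the
# example URLs are assumptions.
def _example_sanitize_url() -> None:
    # Whitelisted schemes pass through; a missing scheme is assumed to be http.
    assert sanitize_url('https://zulip.com/') == 'https://zulip.com/'
    assert sanitize_url('zulip.com/integrations') == 'http://zulip.com/integrations'
    # Bare addresses become mailto: links; fragment-only links survive.
    assert sanitize_url('hamlet@example.com') == 'mailto:hamlet@example.com'
    assert sanitize_url('#settings') == '#settings'
    # Anything with a non-whitelisted scheme is rejected outright.
    assert sanitize_url('javascript://example.com/alert(1)') is None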
2013-02-01 23:15:05 +01:00
|
|
|
|
2018-11-07 16:07:34 +01:00
|
|
|
def url_to_a(db_data: Optional[DbData], url: str, text: Optional[str]=None) -> Union[Element, str]:
|
2013-02-11 20:49:48 +01:00
|
|
|
a = markdown.util.etree.Element('a')
|
2013-02-26 22:41:39 +01:00
|
|
|
|
2013-03-29 20:17:33 +01:00
|
|
|
href = sanitize_url(url)
|
2017-10-31 22:03:39 +01:00
|
|
|
target_blank = True
|
2013-02-26 22:41:39 +01:00
|
|
|
if href is None:
|
|
|
|
# Rejected by sanitize_url; render it as plain text.
|
|
|
|
return url
|
2013-06-05 17:45:57 +02:00
|
|
|
if text is None:
|
2013-10-02 21:14:22 +02:00
|
|
|
text = markdown.util.AtomicString(url)
|
2013-02-26 22:41:39 +01:00
|
|
|
|
2018-11-07 16:07:34 +01:00
|
|
|
href = rewrite_local_links_to_relative(db_data, href)
|
2018-04-02 19:31:21 +02:00
|
|
|
target_blank = not href.startswith("#narrow") and not href.startswith('mailto:')
|
2017-10-31 22:03:39 +01:00
|
|
|
|
2013-02-26 22:41:39 +01:00
|
|
|
a.set('href', href)
|
2013-06-05 17:45:57 +02:00
|
|
|
a.text = text
|
2017-10-31 22:03:39 +01:00
|
|
|
fixup_link(a, target_blank)
|
2013-02-11 20:49:48 +01:00
|
|
|
return a
|
|
|
|
|
2019-01-22 19:08:33 +01:00
|
|
|
class CompiledPattern(markdown.inlinepatterns.Pattern):
|
2018-11-07 15:24:36 +01:00
|
|
|
def __init__(self, compiled_re: Pattern, md: markdown.Markdown) -> None:
|
2019-01-22 19:11:50 +01:00
|
|
|
# This is similar to the superclass's small __init__ function,
|
|
|
|
# but we skip the compilation step and let the caller give us
|
|
|
|
# a compiled regex.
|
2018-11-03 17:12:15 +01:00
|
|
|
self.compiled_re = compiled_re
|
2019-01-22 19:11:50 +01:00
|
|
|
self.md = md
|
2013-06-21 23:42:33 +02:00
|
|
|
|
2019-01-22 19:08:33 +01:00
|
|
|
class AutoLink(CompiledPattern):
|
2018-05-11 01:42:51 +02:00
|
|
|
def handleMatch(self, match: Match[str]) -> ElementStringNone:
|
2013-02-11 20:49:48 +01:00
|
|
|
url = match.group('url')
|
2018-11-07 16:26:33 +01:00
|
|
|
db_data = self.markdown.zulip_db_data
|
2018-11-07 16:07:34 +01:00
|
|
|
return url_to_a(db_data, url)
|
2012-10-22 02:32:18 +02:00
|
|
|
|
2016-10-14 05:23:15 +02:00
|
|
|
class UListProcessor(markdown.blockprocessors.UListProcessor):
|
2012-11-02 18:25:37 +01:00
|
|
|
""" Process unordered list blocks.
|
|
|
|
|
|
|
|
Based on markdown.blockprocessors.UListProcessor, but does not accept
|
2013-01-23 23:07:01 +01:00
|
|
|
'+' or '-' as a bullet character."""
|
2012-11-02 18:25:37 +01:00
|
|
|
|
|
|
|
TAG = 'ul'
|
2017-11-03 03:12:25 +01:00
|
|
|
RE = re.compile('^[ ]{0,3}[*][ ]+(.*)')
|
2012-11-02 18:25:37 +01:00
|
|
|
|
2017-11-05 11:15:10 +01:00
|
|
|
def __init__(self, parser: Any) -> None:
|
2017-03-26 21:14:05 +02:00
|
|
|
|
|
|
|
# HACK: Set the tab length to 2 just for the initialization of
|
|
|
|
# this class, so that bulleted lists (and only bulleted lists)
|
|
|
|
# work off 2-space indentation.
|
|
|
|
parser.markdown.tab_length = 2
|
2017-10-27 08:28:23 +02:00
|
|
|
super().__init__(parser)
|
2017-03-26 21:14:05 +02:00
|
|
|
parser.markdown.tab_length = 4
|
|
|
|
|
|
|
|
class ListIndentProcessor(markdown.blockprocessors.ListIndentProcessor):
|
|
|
|
""" Process unordered list blocks.
|
|
|
|
|
|
|
|
Based on markdown.blockprocessors.ListIndentProcessor, but with 2-space indent
|
|
|
|
"""
|
|
|
|
|
2017-11-05 11:15:10 +01:00
|
|
|
def __init__(self, parser: Any) -> None:
|
2017-03-26 21:14:05 +02:00
|
|
|
|
|
|
|
# HACK: Set the tab length to 2 just for the initialization of
|
|
|
|
# this class, so that bulleted lists (and only bulleted lists)
|
|
|
|
# work off 2-space indentation.
|
|
|
|
parser.markdown.tab_length = 2
|
2017-10-27 08:28:23 +02:00
|
|
|
super().__init__(parser)
|
2017-03-26 21:14:05 +02:00
|
|
|
parser.markdown.tab_length = 4
|
|
|
|
|
2019-01-08 11:30:13 +01:00
|
|
|
class BlockQuoteProcessor(markdown.blockprocessors.BlockQuoteProcessor):
|
|
|
|
""" Process BlockQuotes.
|
|
|
|
|
|
|
|
Based on markdown.blockprocessors.BlockQuoteProcessor, but with 2-space indent
|
|
|
|
"""
|
|
|
|
|
|
|
|
# Original regex for blockquote is RE = re.compile(r'(^|\n)[ ]{0,3}>[ ]?(.*)')
|
|
|
|
RE = re.compile(r'(^|\n)(?!(?:[ ]{0,3}>\s*(?:$|\n))*(?:$|\n))'
|
|
|
|
r'[ ]{0,3}>[ ]?(.*)')
|
|
|
|
mention_re = re.compile(mention.find_mentions)
|
|
|
|
|
|
|
|
def clean(self, line: str) -> str:
|
|
|
|
# Silence all the mentions inside blockquotes
|
|
|
|
line = re.sub(self.mention_re, lambda m: "_@{}".format(m.group('match')), line)
|
|
|
|
|
|
|
|
# And then run the upstream processor's code for removing the '>'
|
|
|
|
return super().clean(line)
|
|
|
|
|
2013-01-24 19:35:20 +01:00
|
|
|
class BugdownUListPreprocessor(markdown.preprocessors.Preprocessor):
|
|
|
|
""" Allows unordered list blocks that come directly after a
|
|
|
|
paragraph to be rendered as an unordered list
|
|
|
|
|
|
|
|
Detects paragraphs that have a matching list item that comes
|
|
|
|
directly after a line of text, and inserts a newline between them
|
|
|
|
to satisfy Markdown"""
|
|
|
|
|
2017-11-03 03:12:25 +01:00
|
|
|
LI_RE = re.compile('^[ ]{0,3}[*][ ]+(.*)', re.MULTILINE)
|
|
|
|
HANGING_ULIST_RE = re.compile('^.+\\n([ ]{0,3}[*][ ]+.*)', re.MULTILINE)
|
2013-01-24 19:35:20 +01:00
|
|
|
|
2018-05-11 01:42:51 +02:00
|
|
|
def run(self, lines: List[str]) -> List[str]:
|
2013-01-24 19:35:20 +01:00
|
|
|
""" Insert a newline between a paragraph and ulist if missing """
|
|
|
|
inserts = 0
|
|
|
|
fence = None
|
|
|
|
copy = lines[:]
|
2015-11-01 17:15:05 +01:00
|
|
|
for i in range(len(lines) - 1):
|
2013-01-24 19:35:20 +01:00
|
|
|
# Ignore anything that is inside a fenced code block
|
|
|
|
m = FENCE_RE.match(lines[i])
|
|
|
|
if not fence and m:
|
|
|
|
fence = m.group('fence')
|
|
|
|
elif fence and m and fence == m.group('fence'):
|
|
|
|
fence = None
|
|
|
|
|
|
|
|
# If we're not in a fenced block and we detect an upcoming list
|
|
|
|
# hanging off a paragraph, add a newline
|
2016-11-30 21:54:51 +01:00
|
|
|
if (not fence and lines[i] and
|
|
|
|
self.LI_RE.match(lines[i+1]) and
|
2016-12-03 18:19:09 +01:00
|
|
|
not self.LI_RE.match(lines[i])):
|
2016-11-30 21:54:51 +01:00
|
|
|
|
2013-01-24 19:35:20 +01:00
|
|
|
copy.insert(i+inserts+1, '')
|
|
|
|
inserts += 1
|
|
|
|
return copy
|
|
|
|
|
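# A small illustrative run of the preprocessor above (constructed without a
# Markdown instance, which run() does not need): a bullet hanging directly
# off a paragraph gets a blank line inserted so python-markdown sees a list.
def _example_bugdown_ulist_preprocessor() -> None:
    preprocessor = BugdownUListPreprocessor()
    lines = ['Things to do:', '* write code', '* review code']
    assert preprocessor.run(lines) == [
        'Things to do:', '', '* write code', '* review code']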
2017-12-03 15:24:21 +01:00
|
|
|
class AutoNumberOListPreprocessor(markdown.preprocessors.Preprocessor):
|
|
|
|
""" Finds a sequence of lines numbered by the same number"""
|
|
|
|
RE = re.compile(r'^([ ]*)(\d+)\.[ ]+(.*)')
|
|
|
|
TAB_LENGTH = 2
|
|
|
|
|
2018-05-11 01:42:51 +02:00
|
|
|
def run(self, lines: List[str]) -> List[str]:
|
|
|
|
new_lines = [] # type: List[str]
|
|
|
|
current_list = [] # type: List[Match[str]]
|
2017-12-03 15:24:21 +01:00
|
|
|
current_indent = 0
|
|
|
|
|
|
|
|
for line in lines:
|
|
|
|
m = self.RE.match(line)
|
|
|
|
|
|
|
|
# Remember if this line is a continuation of an already started list
|
|
|
|
is_next_item = (m and current_list
|
|
|
|
and current_indent == len(m.group(1)) // self.TAB_LENGTH)
|
|
|
|
|
|
|
|
if not is_next_item:
|
|
|
|
# There are no more items in the list we were processing
|
|
|
|
new_lines.extend(self.renumber(current_list))
|
|
|
|
current_list = []
|
|
|
|
|
|
|
|
if not m:
|
|
|
|
# Ordinary line
|
|
|
|
new_lines.append(line)
|
|
|
|
elif is_next_item:
|
|
|
|
# Another list item
|
|
|
|
current_list.append(m)
|
|
|
|
else:
|
|
|
|
# First list item
|
|
|
|
current_list = [m]
|
|
|
|
current_indent = len(m.group(1)) // self.TAB_LENGTH
|
|
|
|
|
|
|
|
new_lines.extend(self.renumber(current_list))
|
|
|
|
|
|
|
|
return new_lines
|
|
|
|
|
2018-05-11 01:42:51 +02:00
|
|
|
def renumber(self, mlist: List[Match[str]]) -> List[str]:
|
2017-12-03 15:24:21 +01:00
|
|
|
if not mlist:
|
|
|
|
return []
|
|
|
|
|
|
|
|
start_number = int(mlist[0].group(2))
|
|
|
|
|
|
|
|
# Change numbers only if every one is the same
|
|
|
|
change_numbers = True
|
|
|
|
for m in mlist:
|
|
|
|
if int(m.group(2)) != start_number:
|
|
|
|
change_numbers = False
|
|
|
|
break
|
|
|
|
|
2018-05-11 01:42:51 +02:00
|
|
|
lines = [] # type: List[str]
|
2017-12-03 15:24:21 +01:00
|
|
|
counter = start_number
|
|
|
|
|
|
|
|
for m in mlist:
|
|
|
|
number = str(counter) if change_numbers else m.group(2)
|
|
|
|
lines.append('%s%s. %s' % (m.group(1), number, m.group(3)))
|
|
|
|
counter += 1
|
|
|
|
|
|
|
|
return lines
|
|
|
|
|
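# Illustrative only (again constructed without a Markdown instance): when
# every item repeats the same number the list is renumbered sequentially,
# while mixed numbering is left untouched.
def _example_auto_number_olist() -> None:
    preprocessor = AutoNumberOListPreprocessor()
    assert preprocessor.run(['1. foo', '1. bar', '1. baz']) == \
        ['1. foo', '2. bar', '3. baz']
    assert preprocessor.run(['1. foo', '3. bar']) == ['1. foo', '3. bar']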
2018-12-20 08:28:40 +01:00
|
|
|
# We need the following since the upgrade from py-markdown 2.6.11 to 3.0.1
|
|
|
|
# modifies the link handling significantly. The following is taken from
|
|
|
|
# py-markdown 2.6.11 markdown/inlinepatterns.py.
|
2019-01-22 19:31:25 +01:00
|
|
|
@one_time
|
2018-12-20 08:28:40 +01:00
|
|
|
def get_link_re() -> str:
|
2019-01-22 19:31:25 +01:00
|
|
|
'''
|
|
|
|
Very important--if you need to change this code to depend on
|
|
|
|
any arguments, you must eliminate the "one_time" decorator
|
|
|
|
and consider performance implications. We only want to compute
|
|
|
|
this value once.
|
|
|
|
'''
|
|
|
|
|
2018-12-20 08:28:40 +01:00
|
|
|
NOBRACKET = r'[^\]\[]*'
|
|
|
|
BRK = (
|
|
|
|
r'\[(' +
|
|
|
|
(NOBRACKET + r'(\[')*6 +
|
|
|
|
(NOBRACKET + r'\])*')*6 +
|
|
|
|
NOBRACKET + r')\]'
|
|
|
|
)
|
|
|
|
NOIMG = r'(?<!\!)'
|
|
|
|
|
|
|
|
# [text](url) or [text](<url>) or [text](url "title")
|
|
|
|
LINK_RE = NOIMG + BRK + \
|
|
|
|
r'''\(\s*(<.*?>|((?:(?:\(.*?\))|[^\(\)]))*?)\s*((['"])(.*?)\12\s*)?\)'''
|
2019-01-22 19:31:25 +01:00
|
|
|
return normal_compile(LINK_RE)
|
2018-12-20 08:28:40 +01:00
|
|
|
|
2018-05-11 01:42:51 +02:00
|
|
|
def prepare_realm_pattern(source: str) -> str:
|
2019-01-23 20:13:05 +01:00
|
|
|
""" Augment a realm filter to liberally match all occurences of the filter,
|
|
|
|
along with the preceding and following characters for further analysis in
|
|
|
|
the realm filter pattern, and save what was matched as "name". """
|
|
|
|
return r"""(?P<total>.?(?P<wrap>(?P<name>""" + source + r')).?)'
|
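# A hedged sketch of how an augmented realm filter behaves; the issue-tracker
# pattern and URL format string below are hypothetical.
def _example_prepare_realm_pattern() -> None:
    pattern = prepare_realm_pattern(r'#(?P<id>\d+)')
    m = re.search(pattern, 'fixed in #123 yesterday')
    assert m is not None and m.group('name') == '#123'
    # The named groups feed a realm filter's URL format string.
    url = 'https://tracker.example.com/ticket/%(id)s' % m.groupdict()
    assert url == 'https://tracker.example.com/ticket/123'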
2013-07-15 17:56:45 +02:00
|
|
|
|
2013-06-05 17:45:57 +02:00
|
|
|
# Given a regular expression pattern, linkifies groups that match it
|
|
|
|
# using the provided format string to construct the URL.
|
2019-01-23 20:13:05 +01:00
|
|
|
class RealmFilterPattern(markdown.inlinepatterns.InlineProcessor):
|
2013-06-05 17:45:57 +02:00
|
|
|
""" Applied a given realm filter to the input """
|
2016-11-29 07:22:02 +01:00
|
|
|
|
2018-05-11 01:42:51 +02:00
|
|
|
def __init__(self, source_pattern: str,
|
|
|
|
format_string: str,
|
2017-11-05 11:15:10 +01:00
|
|
|
markdown_instance: Optional[markdown.Markdown]=None) -> None:
|
2013-07-15 17:56:45 +02:00
|
|
|
self.pattern = prepare_realm_pattern(source_pattern)
|
2013-06-05 17:45:57 +02:00
|
|
|
self.format_string = format_string
|
2019-01-23 20:13:05 +01:00
|
|
|
# To properly convert realm patterns in languages that do not use spaces
|
|
|
|
# as separators, we have to apply a somewhat convoluted approach. The third
|
|
|
|
# party module `regex` has better unicode support than `re`. Also, we need
|
|
|
|
# to keep two regular expressions because of how word boundaries are computed.
|
|
|
|
#
|
|
|
|
# For example, consider the message: 'hello#123world'
|
|
|
|
# For pattern '#123', computed word boundaries are: 'hello{\b}#123world'
|
|
|
|
# and our pattern's beginning matches even when it
|
|
|
|
# shouldn't. A simple hack is to convert the pattern
|
|
|
|
# to 'a#123a' ('a' is a valid 'word' character).
|
|
|
|
# Now, we get no word boundaries as follows: 'helloa#123world'
|
|
|
|
# and we can safely reject this message.
|
|
|
|
# Conversely, in languages like Japanese that do
|
|
|
|
# not use spaces, a similar message would become: 'チケットは{\b}a#123a{\b}です'
|
|
|
|
# and we can convert this message.
|
|
|
|
|
|
|
|
# Regex: - (should have nothing but listed symbols before)
|
|
|
|
# - (should have word boundary on left with 'a')
|
|
|
|
# - (should have word boundary on right with the second 'a')
|
|
|
|
word_boundary_pattern = r"""(?<![^\w\s'"\(,:<])(\b)a{}a(\b)""".format(source_pattern)
|
|
|
|
flags = regex.WORD | regex.DOTALL | regex.UNICODE
|
|
|
|
self.word_boundary_pattern = regex.compile(word_boundary_pattern, flags=flags)
|
|
|
|
super().__init__(self.pattern, markdown_instance)
|
|
|
|
|
|
|
|
def handleMatch(self, m: Match[str], data: str) -> Tuple[Union[Element, str, None],
|
|
|
|
Union[int, None],
|
|
|
|
Union[int, None]]:
|
|
|
|
string_new = m.group('total').replace(m.group('wrap'), 'a' + m.group('wrap') + 'a')
|
|
|
|
if not self.word_boundary_pattern.search(string_new):
|
|
|
|
return None, None, None
|
2018-11-07 16:26:33 +01:00
|
|
|
db_data = self.markdown.zulip_db_data
|
2018-11-07 16:07:34 +01:00
|
|
|
return url_to_a(db_data,
|
|
|
|
self.format_string % m.groupdict(),
|
2019-01-23 20:13:05 +01:00
|
|
|
m.group("name")), m.start('name'), m.end('name')
|
2013-06-05 17:45:57 +02:00
|
|
|
|
2013-06-28 16:02:58 +02:00
|
|
|
class UserMentionPattern(markdown.inlinepatterns.Pattern):
|
2018-05-11 01:42:51 +02:00
|
|
|
def handleMatch(self, m: Match[str]) -> Optional[Element]:
|
2019-01-08 09:30:19 +01:00
|
|
|
match = m.group('match')
|
|
|
|
silent = m.group('silent') == '_'
|
2017-08-16 20:18:09 +02:00
|
|
|
|
2018-11-07 16:26:33 +01:00
|
|
|
db_data = self.markdown.zulip_db_data
|
|
|
|
if self.markdown.zulip_message and db_data is not None:
|
2017-08-16 20:18:09 +02:00
|
|
|
if match.startswith("**") and match.endswith("**"):
|
|
|
|
name = match[2:-2]
|
|
|
|
else:
|
2018-01-24 17:18:07 +01:00
|
|
|
return None
|
2013-06-28 16:02:58 +02:00
|
|
|
|
2017-08-16 20:18:09 +02:00
|
|
|
wildcard = mention.user_mention_matches_wildcard(name)
|
2018-08-19 00:02:17 +02:00
|
|
|
|
|
|
|
id_syntax_match = re.match(r'.+\|(?P<user_id>\d+)$', name)
|
|
|
|
if id_syntax_match:
|
|
|
|
id = id_syntax_match.group("user_id")
|
2018-11-07 16:26:33 +01:00
|
|
|
user = db_data['mention_data'].get_user_by_id(id)
|
2018-08-19 00:02:17 +02:00
|
|
|
else:
|
2018-11-02 09:24:27 +01:00
|
|
|
user = db_data['mention_data'].get_user_by_name(name)
|
2013-06-28 16:02:58 +02:00
|
|
|
|
|
|
|
if wildcard:
|
2018-11-07 15:24:36 +01:00
|
|
|
self.markdown.zulip_message.mentions_wildcard = True
|
2017-01-20 18:27:30 +01:00
|
|
|
user_id = "*"
|
2013-06-28 16:02:58 +02:00
|
|
|
elif user:
|
2019-01-08 09:30:19 +01:00
|
|
|
if not silent:
|
|
|
|
self.markdown.zulip_message.mentions_user_ids.add(user['id'])
|
2013-10-09 20:48:05 +02:00
|
|
|
name = user['full_name']
|
2017-01-20 18:27:30 +01:00
|
|
|
user_id = str(user['id'])
|
2013-06-28 16:02:58 +02:00
|
|
|
else:
|
|
|
|
# Don't highlight @mentions that don't refer to a valid user
|
|
|
|
return None
|
|
|
|
|
|
|
|
el = markdown.util.etree.Element("span")
|
2017-01-20 18:27:30 +01:00
|
|
|
el.set('data-user-id', user_id)
|
2019-01-08 09:30:19 +01:00
|
|
|
if silent:
|
|
|
|
el.set('class', 'user-mention silent')
|
|
|
|
else:
|
|
|
|
el.set('class', 'user-mention')
|
2013-06-28 16:02:58 +02:00
|
|
|
el.text = "@%s" % (name,)
|
|
|
|
return el
|
2017-03-03 20:30:49 +01:00
|
|
|
return None
|
2016-10-26 20:56:17 +02:00
|
|
|
|
2017-09-25 09:47:15 +02:00
|
|
|
class UserGroupMentionPattern(markdown.inlinepatterns.Pattern):
|
2018-05-11 01:42:51 +02:00
|
|
|
def handleMatch(self, m: Match[str]) -> Optional[Element]:
|
2017-09-25 09:47:15 +02:00
|
|
|
match = m.group(2)
|
|
|
|
|
2018-11-07 16:26:33 +01:00
|
|
|
db_data = self.markdown.zulip_db_data
|
|
|
|
if self.markdown.zulip_message and db_data is not None:
|
2017-09-25 09:47:15 +02:00
|
|
|
name = extract_user_group(match)
|
2018-11-07 16:26:33 +01:00
|
|
|
user_group = db_data['mention_data'].get_user_group(name)
|
2017-09-25 09:47:15 +02:00
|
|
|
if user_group:
|
2018-11-07 15:24:36 +01:00
|
|
|
self.markdown.zulip_message.mentions_user_group_ids.add(user_group.id)
|
2017-09-25 09:47:15 +02:00
|
|
|
name = user_group.name
|
|
|
|
user_group_id = str(user_group.id)
|
|
|
|
else:
|
|
|
|
# Don't highlight @-mentions that don't refer to a valid user
|
|
|
|
# group.
|
|
|
|
return None
|
|
|
|
|
|
|
|
el = markdown.util.etree.Element("span")
|
|
|
|
el.set('class', 'user-group-mention')
|
|
|
|
el.set('data-user-group-id', user_group_id)
|
|
|
|
el.text = "@%s" % (name,)
|
|
|
|
return el
|
|
|
|
return None
|
|
|
|
|
2019-01-22 19:08:33 +01:00
|
|
|
class StreamPattern(CompiledPattern):
|
2018-05-11 01:42:51 +02:00
|
|
|
def find_stream_by_name(self, name: Match[str]) -> Optional[Dict[str, Any]]:
|
2018-11-07 16:26:33 +01:00
|
|
|
db_data = self.markdown.zulip_db_data
|
|
|
|
if db_data is None:
|
2016-10-26 20:56:17 +02:00
|
|
|
return None
|
2018-11-07 16:26:33 +01:00
|
|
|
stream = db_data['stream_names'].get(name)
|
2016-10-26 20:56:17 +02:00
|
|
|
return stream
|
|
|
|
|
2018-05-11 01:42:51 +02:00
|
|
|
def handleMatch(self, m: Match[str]) -> Optional[Element]:
|
2016-10-26 20:56:17 +02:00
|
|
|
name = m.group('stream_name')
|
|
|
|
|
2018-11-07 15:24:36 +01:00
|
|
|
if self.markdown.zulip_message:
|
2016-10-26 20:56:17 +02:00
|
|
|
stream = self.find_stream_by_name(name)
|
|
|
|
if stream is None:
|
|
|
|
return None
|
|
|
|
el = markdown.util.etree.Element('a')
|
|
|
|
el.set('class', 'stream')
|
|
|
|
el.set('data-stream-id', str(stream['id']))
|
|
|
|
# TODO: We should quite possibly not be specifying the
|
|
|
|
# href here and instead having the browser auto-add the
|
|
|
|
# href when it processes a message with one of these, to
|
|
|
|
# provide more clarity to API clients.
|
2018-02-15 21:02:47 +01:00
|
|
|
stream_url = encode_stream(stream['id'], name)
|
|
|
|
el.set('href', '/#narrow/stream/{stream_url}'.format(stream_url=stream_url))
|
2017-11-03 03:12:25 +01:00
|
|
|
el.text = '#{stream_name}'.format(stream_name=name)
|
2016-10-26 20:56:17 +02:00
|
|
|
return el
|
2017-03-03 20:30:49 +01:00
|
|
|
return None
|
2016-10-26 20:56:17 +02:00
|
|
|
|
2018-05-11 01:42:51 +02:00
|
|
|
def possible_linked_stream_names(content: str) -> Set[str]:
|
2017-09-15 00:25:38 +02:00
|
|
|
matches = re.findall(STREAM_LINK_REGEX, content, re.VERBOSE)
|
|
|
|
return set(matches)
|
|
|
|
|
2013-09-03 22:41:17 +02:00
|
|
|
class AlertWordsNotificationProcessor(markdown.preprocessors.Preprocessor):
|
2018-05-11 01:42:51 +02:00
|
|
|
def run(self, lines: Iterable[str]) -> Iterable[str]:
|
2018-11-07 16:26:33 +01:00
|
|
|
db_data = self.markdown.zulip_db_data
|
|
|
|
if self.markdown.zulip_message and db_data is not None:
|
2017-03-15 02:06:22 +01:00
|
|
|
# We check for alert words here, the set of which is
|
2016-09-14 18:02:24 +02:00
|
|
|
# dependent on which users may see this message.
|
|
|
|
#
|
|
|
|
# Our caller passes in the list of possible_words. We
|
|
|
|
# don't do any special rendering; we just append the alert words
|
2018-11-07 15:24:36 +01:00
|
|
|
# we find to the set self.markdown.zulip_message.alert_words.
|
2016-09-14 18:02:24 +02:00
|
|
|
|
2018-11-07 16:26:33 +01:00
|
|
|
realm_words = db_data['possible_words']
|
2016-09-14 18:02:24 +02:00
|
|
|
|
2013-10-09 20:09:48 +02:00
|
|
|
content = '\n'.join(lines).lower()
|
|
|
|
|
2014-02-13 21:12:19 +01:00
|
|
|
allowed_before_punctuation = "|".join([r'\s', '^', r'[\(\".,\';\[\*`>]'])
|
|
|
|
allowed_after_punctuation = "|".join([r'\s', '$', r'[\)\"\?:.,\';\]!\*`]'])
|
2013-10-09 20:48:05 +02:00
|
|
|
|
2016-09-14 18:02:24 +02:00
|
|
|
for word in realm_words:
|
|
|
|
escaped = re.escape(word.lower())
|
2017-11-03 03:12:25 +01:00
|
|
|
match_re = re.compile('(?:%s)%s(?:%s)' %
|
2016-11-30 14:17:35 +01:00
|
|
|
(allowed_before_punctuation,
|
|
|
|
escaped,
|
|
|
|
allowed_after_punctuation))
|
2016-09-14 18:02:24 +02:00
|
|
|
if re.search(match_re, content):
|
2018-11-07 15:24:36 +01:00
|
|
|
self.markdown.zulip_message.alert_words.add(word)
|
2013-09-03 22:41:17 +02:00
|
|
|
|
|
|
|
return lines
|
|
|
|
|
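# Illustrative only: the boundary classes above make an alert word fire when
# it is set off by whitespace or common punctuation, but not when it is
# embedded in a longer word. The alert word "deploy" is an assumption.
def _example_alert_word_boundaries() -> None:
    allowed_before = "|".join([r'\s', '^', r'[\(\".,\';\[\*`>]'])
    allowed_after = "|".join([r'\s', '$', r'[\)\"\?:.,\';\]!\*`]'])
    word_re = re.compile('(?:%s)%s(?:%s)' %
                         (allowed_before, re.escape('deploy'), allowed_after))
    assert word_re.search('please (deploy) tonight') is not None
    assert word_re.search('the redeployment finished') is None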
2013-07-31 22:53:15 +02:00
|
|
|
# This prevents realm_filters from running on the content of a
|
|
|
|
# Markdown link, breaking up the link. This is a monkey-patch, but it
|
|
|
|
# might be worth sending a version of this change upstream.
|
2019-01-22 19:31:25 +01:00
|
|
|
class AtomicLinkPattern(CompiledPattern):
|
2019-01-22 19:21:56 +01:00
|
|
|
def get_element(self, m: Match[str]) -> Optional[Element]:
|
|
|
|
href = m.group(9)
|
|
|
|
if not href:
|
|
|
|
return None
|
|
|
|
|
|
|
|
if href[0] == "<":
|
|
|
|
href = href[1:-1]
|
|
|
|
href = sanitize_url(self.unescape(href.strip()))
|
|
|
|
if href is None:
|
|
|
|
return None
|
|
|
|
|
|
|
|
db_data = self.markdown.zulip_db_data
|
|
|
|
href = rewrite_local_links_to_relative(db_data, href)
|
|
|
|
|
|
|
|
el = markdown.util.etree.Element('a')
|
|
|
|
el.text = m.group(2)
|
|
|
|
el.set('href', href)
|
|
|
|
fixup_link(el, target_blank=(href[:1] != '#'))
|
|
|
|
return el
|
|
|
|
|
2018-05-11 01:42:51 +02:00
|
|
|
def handleMatch(self, m: Match[str]) -> Optional[Element]:
|
2019-01-22 19:21:56 +01:00
|
|
|
ret = self.get_element(m)
|
2013-09-18 22:04:18 +02:00
|
|
|
if ret is None:
|
|
|
|
return None
|
2017-09-27 10:06:17 +02:00
|
|
|
if not isinstance(ret, str):
|
2013-07-31 22:53:15 +02:00
|
|
|
ret.text = markdown.util.AtomicString(ret.text)
|
|
|
|
return ret
|
|
|
|
|
2018-12-20 08:28:40 +01:00
|
|
|
def get_sub_registry(r: markdown.util.Registry, keys: List[str]) -> markdown.util.Registry:
|
|
|
|
# Registry is a new class added by py-markdown to replace OrderedDict.
|
|
|
|
# Since Registry doesn't support .keys(), it is easier to make a new
|
|
|
|
# object instead of removing keys from the existing object.
|
|
|
|
new_r = markdown.util.Registry()
|
|
|
|
for k in keys:
|
|
|
|
new_r.register(r[k], k, r.get_index_for_name(k))
|
|
|
|
return new_r
|
|
|
|
|
2017-01-17 06:48:46 +01:00
|
|
|
# These are used as keys ("realm_filters_keys") to md_engines and the respective
|
2016-12-31 03:08:43 +01:00
|
|
|
# realm filter caches
|
|
|
|
DEFAULT_BUGDOWN_KEY = -1
|
|
|
|
ZEPHYR_MIRROR_BUGDOWN_KEY = -2
|
|
|
|
|
2019-01-20 09:10:58 +01:00
|
|
|
class Bugdown(markdown.Markdown):
|
2017-11-05 11:15:10 +01:00
|
|
|
def __init__(self, *args: Any, **kwargs: Union[bool, int, List[Any]]) -> None:
|
2016-10-14 05:23:15 +02:00
|
|
|
# define default configs
|
|
|
|
self.config = {
|
2017-11-11 15:43:42 +01:00
|
|
|
"realm_filters": [kwargs['realm_filters'],
|
|
|
|
"Realm-specific filters for realm_filters_key %s" % (kwargs['realm'],)],
|
|
|
|
"realm": [kwargs['realm'], "Realm id"],
|
2017-11-10 03:49:42 +01:00
|
|
|
"code_block_processor_disabled": [kwargs['code_block_processor_disabled'],
|
|
|
|
"Disabled for email gateway"]
|
2016-10-14 05:23:15 +02:00
|
|
|
}
|
|
|
|
|
2017-10-27 08:28:23 +02:00
|
|
|
super().__init__(*args, **kwargs)
|
2019-01-20 09:10:58 +01:00
|
|
|
self.set_output_format('html')
|
2016-10-14 05:23:15 +02:00
|
|
|
|
2019-01-20 09:10:58 +01:00
|
|
|
def build_parser(self) -> markdown.Markdown:
|
|
|
|
# Build the parser using selected default features from py-markdown.
|
|
|
|
# The complete list of all available processors can be found in the
|
|
|
|
# super().build_parser() function.
|
|
|
|
#
|
|
|
|
# Note: for any py-markdown updates, manually check if we want any
|
|
|
|
# of the new features added upstream or not; they wouldn't get
|
|
|
|
# included by default.
|
|
|
|
self.preprocessors = self.build_preprocessors()
|
|
|
|
self.parser = self.build_block_parser()
|
|
|
|
self.inlinePatterns = self.build_inlinepatterns()
|
|
|
|
self.treeprocessors = self.build_treeprocessors()
|
|
|
|
self.postprocessors = self.build_postprocessors()
|
|
|
|
self.handle_zephyr_mirror()
|
|
|
|
return self
|
|
|
|
|
|
|
|
def build_preprocessors(self) -> markdown.util.Registry:
|
2019-01-28 21:24:06 +01:00
|
|
|
# We disable the following preprocessors from upstream:
|
|
|
|
#
|
|
|
|
# html_block - insecure
|
|
|
|
# reference - references don't make sense in a chat context.
|
2019-01-20 09:10:58 +01:00
|
|
|
preprocessors = markdown.util.Registry()
|
|
|
|
preprocessors.register(AutoNumberOListPreprocessor(self), 'auto_number_olist', 40)
|
|
|
|
preprocessors.register(BugdownUListPreprocessor(self), 'hanging_ulists', 35)
|
|
|
|
preprocessors.register(markdown.preprocessors.NormalizeWhitespace(self), 'normalize_whitespace', 30)
|
|
|
|
preprocessors.register(fenced_code.FencedBlockPreprocessor(self), 'fenced_code_block', 25)
|
|
|
|
preprocessors.register(AlertWordsNotificationProcessor(self), 'custom_text_notifications', 20)
|
|
|
|
return preprocessors
|
|
|
|
|
|
|
|
def build_block_parser(self) -> markdown.util.Registry:
|
2019-01-28 21:24:06 +01:00
|
|
|
# We disable the following blockparsers from upstream:
|
|
|
|
#
|
|
|
|
# indent - replaced by ours
|
|
|
|
# hashheader - disabled, since headers look bad and don't make sense in a chat context.
|
|
|
|
# setextheader - disabled, since headers look bad and don't make sense in a chat context.
|
|
|
|
# olist - replaced by ours
|
|
|
|
# ulist - replaced by ours
|
|
|
|
# quote - replaced by ours
|
2019-01-20 09:10:58 +01:00
|
|
|
parser = markdown.blockprocessors.BlockParser(self)
|
|
|
|
parser.blockprocessors.register(markdown.blockprocessors.EmptyBlockProcessor(parser), 'empty', 85)
|
|
|
|
if not self.getConfig('code_block_processor_disabled'):
|
|
|
|
parser.blockprocessors.register(markdown.blockprocessors.CodeBlockProcessor(parser), 'code', 80)
|
|
|
|
# We get priority 75 from 'table' extension
|
|
|
|
parser.blockprocessors.register(markdown.blockprocessors.HRProcessor(parser), 'hr', 70)
|
|
|
|
parser.blockprocessors.register(UListProcessor(parser), 'ulist', 65)
|
|
|
|
parser.blockprocessors.register(ListIndentProcessor(parser), 'indent', 60)
|
|
|
|
parser.blockprocessors.register(BlockQuoteProcessor(parser), 'quote', 55)
|
|
|
|
parser.blockprocessors.register(markdown.blockprocessors.ParagraphProcessor(parser), 'paragraph', 50)
|
|
|
|
return parser
|
|
|
|
|
|
|
|
def build_inlinepatterns(self) -> markdown.util.Registry:
|
2019-01-28 21:24:06 +01:00
|
|
|
# We disable the following upstream inline patterns:
|
|
|
|
#
|
|
|
|
# backtick - replaced by ours
|
|
|
|
# escape - probably will re-add at some point.
|
|
|
|
# link - replaced by ours
|
|
|
|
# image_link - replaced by ours
|
|
|
|
# autolink - replaced by ours
|
|
|
|
# automail - replaced by ours
|
|
|
|
# linebreak - we use nl2br and consider that good enough
|
|
|
|
# html - insecure
|
|
|
|
# reference - references not useful
|
|
|
|
# image_reference - references not useful
|
|
|
|
# short_reference - references not useful
|
|
|
|
# ---------------------------------------------------
|
|
|
|
# strong_em - for these three patterns,
|
|
|
|
# strong2 - we have our own versions where
|
|
|
|
# emphasis2 - we disable _ for bold and emphasis
|
|
|
|
|
2019-01-20 09:10:58 +01:00
|
|
|
# Declare regexes for clean single line calls to .register().
|
|
|
|
NOT_STRONG_RE = markdown.inlinepatterns.NOT_STRONG_RE
|
2016-11-08 07:26:38 +01:00
|
|
|
# Custom strikethrough syntax: ~~foo~~
|
2019-01-20 09:10:58 +01:00
|
|
|
DEL_RE = r'(?<!~)(\~\~)([^~\n]+?)(\~\~)(?!~)'
|
|
|
|
# Custom bold syntax: **foo** but not __foo__
|
2018-05-11 01:42:51 +02:00
|
|
|
# the text inside ** must start and end with a word character;
|
2016-11-03 07:56:28 +01:00
|
|
|
# this is needed for things like "const char *x = (char *)y"
|
2019-01-20 09:10:58 +01:00
|
|
|
EMPHASIS_RE = r'(\*)(?!\s+)([^\*^\n]+)(?<!\s)\*'
|
|
|
|
ENTITY_RE = markdown.inlinepatterns.ENTITY_RE
|
|
|
|
STRONG_EM_RE = r'(\*\*\*)(?!\s+)([^\*^\n]+)(?<!\s)\*\*\*'
|
|
|
|
# Inline code block without whitespace stripping
|
|
|
|
BACKTICK_RE = r'(?:(?<!\\)((?:\\{2})+)(?=`+)|(?<!\\)(`+)(.+?)(?<!`)\3(?!`))'
|
|
|
|
|
2019-01-28 21:24:06 +01:00
|
|
|
# Add Inline Patterns. We use a custom numbering of the
|
|
|
|
# rules that preserves the order from upstream but leaves
|
|
|
|
# space for us to add our own.
|
2019-01-20 09:10:58 +01:00
|
|
|
reg = markdown.util.Registry()
|
|
|
|
reg.register(BacktickPattern(BACKTICK_RE), 'backtick', 105)
|
|
|
|
reg.register(markdown.inlinepatterns.DoubleTagPattern(STRONG_EM_RE, 'strong,em'), 'strong_em', 100)
|
|
|
|
reg.register(UserMentionPattern(mention.find_mentions, self), 'usermention', 95)
|
|
|
|
reg.register(Tex(r'\B(?<!\$)\$\$(?P<body>[^\n_$](\\\$|[^$\n])*)\$\$(?!\$)\B'), 'tex', 90)
|
2019-01-22 20:16:39 +01:00
|
|
|
reg.register(StreamPattern(get_compiled_stream_link_regex(), self), 'stream', 85)
|
2019-01-20 09:10:58 +01:00
|
|
|
reg.register(Avatar(AVATAR_REGEX, self), 'avatar', 80)
|
|
|
|
reg.register(ModalLink(r'!modal_link\((?P<relative_url>[^)]*), (?P<text>[^)]*)\)'), 'modal_link', 75)
|
2013-11-12 23:48:05 +01:00
|
|
|
# Note that !gravatar syntax should be deprecated long term.
|
2019-01-20 09:10:58 +01:00
|
|
|
reg.register(Avatar(GRAVATAR_REGEX, self), 'gravatar', 70)
|
|
|
|
reg.register(UserGroupMentionPattern(mention.user_group_mentions, self), 'usergroupmention', 65)
|
|
|
|
reg.register(AtomicLinkPattern(get_link_re(), self), 'link', 60)
|
|
|
|
reg.register(AutoLink(get_web_link_regex(), self), 'autolink', 55)
|
|
|
|
# Reserve priority 45-54 for Realm Filters
|
|
|
|
reg = self.register_realm_filters(reg)
|
|
|
|
reg.register(markdown.inlinepatterns.HtmlInlineProcessor(ENTITY_RE, self), 'entity', 40)
|
|
|
|
reg.register(markdown.inlinepatterns.SimpleTagPattern(r'(\*\*)([^\n]+?)\2', 'strong'), 'strong', 35)
|
|
|
|
reg.register(markdown.inlinepatterns.SimpleTagPattern(EMPHASIS_RE, 'em'), 'emphasis', 30)
|
|
|
|
reg.register(markdown.inlinepatterns.SimpleTagPattern(DEL_RE, 'del'), 'del', 25)
|
|
|
|
reg.register(markdown.inlinepatterns.SimpleTextInlineProcessor(NOT_STRONG_RE), 'not_strong', 20)
|
|
|
|
reg.register(Emoji(EMOJI_REGEX, self), 'emoji', 15)
|
|
|
|
reg.register(EmoticonTranslation(emoticon_regex, self), 'translate_emoticons', 10)
|
|
|
|
# We get priority 5 from 'nl2br' extension
|
|
|
|
reg.register(UnicodeEmoji(unicode_emoji_regex), 'unicodeemoji', 0)
|
|
|
|
return reg
|
|
|
|
|
|
|
|
def register_realm_filters(self, inlinePatterns: markdown.util.Registry) -> markdown.util.Registry:
|
2016-02-13 19:17:15 +01:00
|
|
|
for (pattern, format_string, id) in self.getConfig("realm_filters"):
|
2019-01-20 09:10:58 +01:00
|
|
|
inlinePatterns.register(RealmFilterPattern(pattern, format_string, self),
|
|
|
|
'realm_filters/%s' % (pattern), 45)
|
|
|
|
return inlinePatterns
|
|
|
|
|
|
|
|
def build_treeprocessors(self) -> markdown.util.Registry:
|
2019-01-28 21:24:06 +01:00
|
|
|
# Here we build all the processors from upstream, plus a few of our own.
|
2019-01-20 09:10:58 +01:00
|
|
|
treeprocessors = markdown.util.Registry()
|
|
|
|
# We get priority 30 from 'hilite' extension
|
|
|
|
treeprocessors.register(markdown.treeprocessors.InlineProcessor(self), 'inline', 25)
|
|
|
|
treeprocessors.register(markdown.treeprocessors.PrettifyTreeprocessor(self), 'prettify', 20)
|
|
|
|
treeprocessors.register(InlineInterestingLinkProcessor(self), 'inline_interesting_links', 15)
|
2013-11-15 19:53:04 +01:00
|
|
|
if settings.CAMO_URI:
|
2019-01-20 09:10:58 +01:00
|
|
|
treeprocessors.register(InlineHttpsProcessor(self), 'rewrite_to_https', 10)
|
|
|
|
return treeprocessors
|
|
|
|
|
|
|
|
def build_postprocessors(self) -> markdown.util.Registry:
|
2019-01-28 21:24:06 +01:00
|
|
|
# These are the default python-markdown processors, unmodified.
|
2019-01-20 09:10:58 +01:00
|
|
|
postprocessors = markdown.util.Registry()
|
|
|
|
postprocessors.register(markdown.postprocessors.RawHtmlPostprocessor(self), 'raw_html', 20)
|
|
|
|
postprocessors.register(markdown.postprocessors.AndSubstitutePostprocessor(), 'amp_substitute', 15)
|
|
|
|
postprocessors.register(markdown.postprocessors.UnescapePostprocessor(), 'unescape', 10)
|
|
|
|
return postprocessors
|
|
|
|
|
|
|
|
def getConfig(self, key: str, default: str='') -> Any:
|
|
|
|
""" Return a setting for the given key or an empty string. """
|
|
|
|
if key in self.config:
|
|
|
|
return self.config[key][0]
|
|
|
|
else:
|
|
|
|
return default
|
2013-03-01 19:20:53 +01:00
|
|
|
|
2019-01-20 09:10:58 +01:00
|
|
|
def handle_zephyr_mirror(self) -> None:
|
2016-12-31 03:08:43 +01:00
|
|
|
if self.getConfig("realm") == ZEPHYR_MIRROR_BUGDOWN_KEY:
|
2016-07-27 02:04:11 +02:00
|
|
|
# Disable almost all inline patterns for zephyr mirror
|
|
|
|
# users' traffic that is mirrored. Note that
|
|
|
|
# inline_interesting_links is a treeprocessor and thus is
|
|
|
|
# not removed
|
2019-01-20 09:10:58 +01:00
|
|
|
self.inlinePatterns = get_sub_registry(self.inlinePatterns, ['autolink'])
|
|
|
|
self.treeprocessors = get_sub_registry(self.treeprocessors, ['inline_interesting_links',
|
|
|
|
'rewrite_to_https'])
|
|
|
|
# insert new 'inline' processor because we have changed self.inlinePatterns
|
2018-12-20 08:28:40 +01:00
|
|
|
# but InlineProcessor copies md as self.md in __init__.
|
2019-01-20 09:10:58 +01:00
|
|
|
self.treeprocessors.register(markdown.treeprocessors.InlineProcessor(self), 'inline', 25)
|
|
|
|
self.preprocessors = get_sub_registry(self.preprocessors, ['custom_text_notifications'])
|
|
|
|
self.parser.blockprocessors = get_sub_registry(self.parser.blockprocessors, ['paragraph'])
|
2013-06-05 17:45:57 +02:00
|
|
|
|
2017-11-03 12:13:17 +01:00
|
|
|
md_engines = {} # type: Dict[Tuple[int, bool], markdown.Markdown]
|
2018-05-11 01:42:51 +02:00
|
|
|
realm_filter_data = {} # type: Dict[int, List[Tuple[str, str, int]]]
|
2013-06-05 17:45:57 +02:00
|
|
|
|
2017-11-05 11:15:10 +01:00
|
|
|
def make_md_engine(realm_filters_key: int, email_gateway: bool) -> None:
|
2017-11-11 16:00:46 +01:00
|
|
|
md_engine_key = (realm_filters_key, email_gateway)
|
|
|
|
if md_engine_key in md_engines:
|
|
|
|
del md_engines[md_engine_key]
|
|
|
|
|
|
|
|
realm_filters = realm_filter_data[realm_filters_key]
|
2018-11-03 15:09:59 +01:00
|
|
|
md_engines[md_engine_key] = build_engine(
|
|
|
|
realm_filters=realm_filters,
|
|
|
|
realm_filters_key=realm_filters_key,
|
|
|
|
email_gateway=email_gateway,
|
|
|
|
)
|
|
|
|
|
|
|
|
def build_engine(realm_filters: List[Tuple[str, str, int]],
|
|
|
|
realm_filters_key: int,
|
|
|
|
email_gateway: bool) -> markdown.Markdown:
|
2019-01-20 09:10:58 +01:00
|
|
|
engine = Bugdown(
|
|
|
|
realm_filters=realm_filters,
|
|
|
|
realm=realm_filters_key,
|
|
|
|
code_block_processor_disabled=email_gateway,
|
|
|
|
extensions = [
|
2018-11-03 19:04:52 +01:00
|
|
|
nl2br.makeExtension(),
|
|
|
|
tables.makeExtension(),
|
2017-01-24 07:06:13 +01:00
|
|
|
codehilite.makeExtension(
|
|
|
|
linenums=False,
|
|
|
|
guess_lang=False
|
|
|
|
),
|
2019-01-20 09:10:58 +01:00
|
|
|
])
|
2018-11-03 15:09:59 +01:00
|
|
|
return engine
|
2013-06-05 17:45:57 +02:00
|
|
|
|
2018-11-08 17:21:14 +01:00
|
|
|
def topic_links(realm_filters_key: int, topic_name: str) -> List[str]:
|
2018-05-11 01:42:51 +02:00
|
|
|
matches = [] # type: List[str]
|
2013-07-12 22:29:25 +02:00
|
|
|
|
2017-01-17 06:48:46 +01:00
|
|
|
realm_filters = realm_filters_for_realm(realm_filters_key)
|
2016-06-01 04:46:42 +02:00
|
|
|
|
|
|
|
for realm_filter in realm_filters:
|
|
|
|
pattern = prepare_realm_pattern(realm_filter[0])
|
2018-11-08 17:21:14 +01:00
|
|
|
for m in re.finditer(pattern, topic_name):
|
2016-06-01 04:46:42 +02:00
|
|
|
matches += [realm_filter[1] % m.groupdict()]
|
|
|
|
return matches
|
2013-12-11 20:06:37 +01:00
|
|
|
|
2017-11-05 11:15:10 +01:00
|
|
|
def maybe_update_markdown_engines(realm_filters_key: Optional[int], email_gateway: bool) -> None:
|
2017-01-17 06:48:46 +01:00
|
|
|
# If realm_filters_key is None, load all filters
|
2017-11-11 16:00:46 +01:00
|
|
|
global realm_filter_data
|
2017-01-17 06:48:46 +01:00
|
|
|
if realm_filters_key is None:
|
2013-12-11 20:06:37 +01:00
|
|
|
all_filters = all_realm_filters()
|
2016-12-31 03:08:43 +01:00
|
|
|
all_filters[DEFAULT_BUGDOWN_KEY] = []
|
2017-09-27 10:06:17 +02:00
|
|
|
for realm_filters_key, filters in all_filters.items():
|
2017-11-11 16:00:46 +01:00
|
|
|
realm_filter_data[realm_filters_key] = filters
|
|
|
|
make_md_engine(realm_filters_key, email_gateway)
|
2014-01-29 20:01:54 +01:00
|
|
|
# Hack to ensure that getConfig("realm") is right for mirrored Zephyrs
|
2017-11-11 16:00:46 +01:00
|
|
|
realm_filter_data[ZEPHYR_MIRROR_BUGDOWN_KEY] = []
|
|
|
|
make_md_engine(ZEPHYR_MIRROR_BUGDOWN_KEY, False)
|
2013-12-11 20:06:37 +01:00
|
|
|
else:
|
2017-01-17 06:48:46 +01:00
|
|
|
realm_filters = realm_filters_for_realm(realm_filters_key)
|
2017-11-03 12:13:17 +01:00
|
|
|
if realm_filters_key not in realm_filter_data or \
|
2017-11-14 00:17:50 +01:00
|
|
|
realm_filter_data[realm_filters_key] != realm_filters:
|
|
|
|
# Realm filters data has changed, update `realm_filter_data` and any
|
|
|
|
# of the existing markdown engines using this set of realm filters.
|
2017-11-11 16:00:46 +01:00
|
|
|
realm_filter_data[realm_filters_key] = realm_filters
|
2017-11-14 00:17:50 +01:00
|
|
|
for email_gateway_flag in [True, False]:
|
|
|
|
if (realm_filters_key, email_gateway_flag) in md_engines:
|
|
|
|
# Update only existing engines (if any); don't create new ones.
|
|
|
|
make_md_engine(realm_filters_key, email_gateway_flag)
|
|
|
|
|
|
|
|
if (realm_filters_key, email_gateway) not in md_engines:
|
|
|
|
# The Markdown engine corresponding to this key doesn't exist, so create one.
|
2017-11-11 16:00:46 +01:00
|
|
|
make_md_engine(realm_filters_key, email_gateway)
|
2013-12-11 20:06:37 +01:00
|
|
|
|
2012-10-25 21:38:47 +02:00
|
|
|
# We want to log Markdown parser failures, but shouldn't log the actual input
|
|
|
|
# message for privacy reasons. The compromise is to replace all alphanumeric
|
|
|
|
# characters with 'x'.
|
|
|
|
#
|
|
|
|
# We also use repr() to improve reproducibility, and to escape terminal control
|
|
|
|
# codes, which can do surprisingly nasty things.
|
2017-11-03 03:12:25 +01:00
|
|
|
_privacy_re = re.compile('\\w', flags=re.UNICODE)
|
2018-05-11 01:42:51 +02:00
|
|
|
def privacy_clean_markdown(content: str) -> str:
|
2016-10-11 16:33:51 +02:00
|
|
|
return repr(_privacy_re.sub('x', content))
|
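# Illustrative only: alphanumerics are masked while message structure
# survives, which keeps the logs useful without leaking content.
def _example_privacy_clean_markdown() -> None:
    assert privacy_clean_markdown('ping @**Hamlet**, see #123!') == \
        repr('xxxx @**xxxxxx**, xxx #xxx!')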
2012-10-25 21:38:47 +02:00
|
|
|
|
2017-11-05 11:15:10 +01:00
|
|
|
def log_bugdown_error(msg: str) -> None:
|
2016-09-15 21:51:27 +02:00
|
|
|
"""We use this unusual logging approach to log the bugdown error, in
|
2017-11-30 23:06:21 +01:00
|
|
|
order to prevent AdminNotifyHandler from sending the sanitized
|
2016-09-15 21:51:27 +02:00
|
|
|
original markdown formatting into another Zulip message, which
|
|
|
|
could cause an infinite exception loop."""
|
2018-07-03 07:25:29 +02:00
|
|
|
bugdown_logger.error(msg)
|
2016-09-15 21:51:27 +02:00
|
|
|
|
2018-05-11 01:42:51 +02:00
|
|
|
def get_email_info(realm_id: int, emails: Set[str]) -> Dict[str, FullNameInfo]:
|
2017-09-14 22:11:34 +02:00
|
|
|
if not emails:
|
|
|
|
return dict()
|
|
|
|
|
|
|
|
q_list = {
|
|
|
|
Q(email__iexact=email.strip().lower())
|
|
|
|
for email in emails
|
|
|
|
}
|
|
|
|
|
|
|
|
rows = UserProfile.objects.filter(
|
|
|
|
realm_id=realm_id
|
|
|
|
).filter(
|
|
|
|
functools.reduce(lambda a, b: a | b, q_list),
|
|
|
|
).values(
|
|
|
|
'id',
|
|
|
|
'email',
|
|
|
|
)
|
|
|
|
|
|
|
|
dct = {
|
|
|
|
row['email'].strip().lower(): row
|
|
|
|
for row in rows
|
|
|
|
}
|
|
|
|
return dct
|
|
|
|
|
2018-11-02 09:15:46 +01:00
|
|
|
def get_possible_mentions_info(realm_id: int, mention_texts: Set[str]) -> List[FullNameInfo]:
|
2018-11-28 23:07:23 +01:00
|
|
|
if not mention_texts:
|
2018-11-02 09:15:46 +01:00
|
|
|
return list()
|
2017-09-14 19:47:22 +02:00
|
|
|
|
2018-11-02 09:15:46 +01:00
|
|
|
# Remove the trailing part of the `name|id` mention syntax,
|
|
|
|
# thus storing only full names in full_names.
|
|
|
|
full_names = set()
|
2018-08-18 23:21:47 +02:00
|
|
|
name_re = r'(?P<full_name>.+)\|\d+$'
|
2018-11-02 09:15:46 +01:00
|
|
|
for mention_text in mention_texts:
|
2018-11-28 23:07:23 +01:00
|
|
|
name_syntax_match = re.match(name_re, mention_text)
|
2018-08-18 23:21:47 +02:00
|
|
|
if name_syntax_match:
|
2018-11-02 09:15:46 +01:00
|
|
|
full_names.add(name_syntax_match.group("full_name"))
|
|
|
|
else:
|
|
|
|
full_names.add(mention_text)
|
2018-08-18 23:21:47 +02:00
|
|
|
|
2017-09-14 19:47:22 +02:00
|
|
|
q_list = {
|
2018-11-02 09:15:46 +01:00
|
|
|
Q(full_name__iexact=full_name)
|
|
|
|
for full_name in full_names
|
2017-09-14 19:47:22 +02:00
|
|
|
}
|
|
|
|
|
|
|
|
rows = UserProfile.objects.filter(
|
2017-10-13 00:06:24 +02:00
|
|
|
realm_id=realm_id,
|
|
|
|
is_active=True,
|
2017-09-14 19:47:22 +02:00
|
|
|
).filter(
|
|
|
|
functools.reduce(lambda a, b: a | b, q_list),
|
|
|
|
).values(
|
|
|
|
'id',
|
|
|
|
'full_name',
|
|
|
|
'email',
|
|
|
|
)
|
2018-11-02 09:15:46 +01:00
|
|
|
return list(rows)
|
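# Illustrative only: the name_re above strips the "|<id>" suffix from the
# @**full name|id** mention syntax; the names used here are made up.
def _example_mention_name_re() -> None:
    name_re = r'(?P<full_name>.+)\|\d+$'
    match = re.match(name_re, 'King Hamlet|42')
    assert match is not None and match.group('full_name') == 'King Hamlet'
    # Plain mentions have no numeric suffix and simply do not match.
    assert re.match(name_re, 'King Hamlet') is None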
2017-09-14 19:47:22 +02:00
|
|
|
|
2017-11-05 11:37:41 +01:00
|
|
|
class MentionData:
|
2018-05-11 01:42:51 +02:00
|
|
|
def __init__(self, realm_id: int, content: str) -> None:
|
2018-11-28 23:07:23 +01:00
|
|
|
mention_texts = possible_mentions(content)
|
2018-11-02 09:15:46 +01:00
|
|
|
possible_mentions_info = get_possible_mentions_info(realm_id, mention_texts)
|
|
|
|
self.full_name_info = {
|
|
|
|
row['full_name'].lower(): row
|
|
|
|
for row in possible_mentions_info
|
|
|
|
}
|
2018-08-18 23:12:18 +02:00
|
|
|
self.user_id_info = {
|
|
|
|
row['id']: row
|
2018-11-02 09:15:46 +01:00
|
|
|
for row in possible_mentions_info
|
2017-10-24 17:36:27 +02:00
|
|
|
}
|
2018-11-02 19:17:07 +01:00
|
|
|
self.init_user_group_data(realm_id=realm_id, content=content)
|
2017-10-24 02:47:09 +02:00
|
|
|
|
2018-11-02 19:17:07 +01:00
|
|
|
def init_user_group_data(self,
|
|
|
|
realm_id: int,
|
|
|
|
content: str) -> None:
|
2017-09-25 09:47:15 +02:00
|
|
|
user_group_names = possible_user_group_mentions(content)
|
|
|
|
self.user_group_name_info = get_user_group_name_info(realm_id, user_group_names)
|
2018-11-02 19:17:07 +01:00
|
|
|
self.user_group_members = defaultdict(list) # type: Dict[int, List[int]]
|
2017-10-27 14:47:54 +02:00
|
|
|
group_ids = [group.id for group in self.user_group_name_info.values()]
|
2018-11-02 19:17:07 +01:00
|
|
|
|
|
|
|
if not group_ids:
|
|
|
|
# Early-return to avoid the cost of hitting the ORM,
|
|
|
|
# which shows up in profiles.
|
|
|
|
return
|
|
|
|
|
2017-10-27 14:47:54 +02:00
|
|
|
membership = UserGroupMembership.objects.filter(user_group_id__in=group_ids)
|
|
|
|
for info in membership.values('user_group_id', 'user_profile_id'):
|
|
|
|
group_id = info['user_group_id']
|
|
|
|
user_profile_id = info['user_profile_id']
|
|
|
|
self.user_group_members[group_id].append(user_profile_id)
|
2017-09-25 09:47:15 +02:00
|
|
|
|
2018-11-02 09:24:27 +01:00
|
|
|
def get_user_by_name(self, name: str) -> Optional[FullNameInfo]:
|
|
|
|
# warning: get_user_by_name is not dependable if two
|
|
|
|
# users with the same full name are mentioned. Use
|
|
|
|
# get_user_by_id where possible.
|
2017-10-24 02:47:09 +02:00
|
|
|
return self.full_name_info.get(name.lower(), None)
|
|
|
|
|
2018-08-18 23:12:18 +02:00
|
|
|
def get_user_by_id(self, id: str) -> Optional[FullNameInfo]:
|
|
|
|
return self.user_id_info.get(int(id), None)
|
|
|
|
|
2017-11-05 11:15:10 +01:00
|
|
|
def get_user_ids(self) -> Set[int]:
|
2017-10-24 17:36:27 +02:00
|
|
|
"""
|
|
|
|
Returns the user IDs that might have been mentioned by this
|
|
|
|
content. Note that because this data structure has not parsed
|
|
|
|
the message and does not know about escaping/code blocks, this
|
|
|
|
will overestimate the list of user ids.
|
|
|
|
"""
|
2018-08-18 23:12:18 +02:00
|
|
|
return set(self.user_id_info.keys())
|
2017-10-24 17:36:27 +02:00
|
|
|
|
2018-05-11 01:42:51 +02:00
|
|
|
def get_user_group(self, name: str) -> Optional[UserGroup]:
|
2017-09-25 09:47:15 +02:00
|
|
|
return self.user_group_name_info.get(name.lower(), None)
|
|
|
|
|
2017-11-05 11:15:10 +01:00
|
|
|
def get_group_members(self, user_group_id: int) -> List[int]:
|
2017-10-27 14:47:54 +02:00
|
|
|
return self.user_group_members.get(user_group_id, [])
|
|
|
|
|
2018-05-11 01:42:51 +02:00
|
|
|
def get_user_group_name_info(realm_id: int, user_group_names: Set[str]) -> Dict[str, UserGroup]:
|
2017-09-25 09:47:15 +02:00
|
|
|
if not user_group_names:
|
|
|
|
return dict()
|
|
|
|
|
|
|
|
rows = UserGroup.objects.filter(realm_id=realm_id,
|
|
|
|
name__in=user_group_names)
|
|
|
|
dct = {row.name.lower(): row for row in rows}
|
|
|
|
return dct
|
|
|
|
|
2018-05-11 01:42:51 +02:00
|
|
|
def get_stream_name_info(realm: Realm, stream_names: Set[str]) -> Dict[str, FullNameInfo]:
|
2017-09-15 00:25:38 +02:00
|
|
|
if not stream_names:
|
|
|
|
return dict()
|
|
|
|
|
|
|
|
q_list = {
|
|
|
|
Q(name=name)
|
|
|
|
for name in stream_names
|
|
|
|
}
|
|
|
|
|
|
|
|
rows = get_active_streams(
|
|
|
|
realm=realm,
|
|
|
|
).filter(
|
|
|
|
functools.reduce(lambda a, b: a | b, q_list),
|
|
|
|
).values(
|
|
|
|
'id',
|
|
|
|
'name',
|
|
|
|
)
|
|
|
|
|
|
|
|
dct = {
|
|
|
|
row['name']: row
|
|
|
|
for row in rows
|
|
|
|
}
|
|
|
|
return dct
|
|
|
|
|
|
|
|
|
2018-05-11 01:42:51 +02:00
|
|
|
def do_convert(content: str,
|
2017-11-05 11:15:10 +01:00
|
|
|
message: Optional[Message]=None,
|
|
|
|
message_realm: Optional[Realm]=None,
|
2018-05-11 01:42:51 +02:00
|
|
|
possible_words: Optional[Set[str]]=None,
|
2017-11-05 11:15:10 +01:00
|
|
|
sent_by_bot: Optional[bool]=False,
|
2018-11-02 12:50:09 +01:00
|
|
|
translate_emoticons: Optional[bool]=False,
|
2017-11-05 11:15:10 +01:00
|
|
|
mention_data: Optional[MentionData]=None,
|
2018-05-11 01:42:51 +02:00
|
|
|
email_gateway: Optional[bool]=False) -> str:
|
2013-08-06 21:32:15 +02:00
|
|
|
"""Convert Markdown to HTML, with Zulip-specific settings and hacks."""
|
2017-01-22 06:29:11 +01:00
|
|
|
# This logic is a bit convoluted, but the overall goal is to support a range of use cases:
|
|
|
|
# * Nothing is passed in other than content -> just run default options (e.g. for docs)
|
|
|
|
# * message is passed, but no realm is -> look up realm from message
|
|
|
|
# * message_realm is passed -> use that realm for bugdown purposes
|
2017-05-26 02:08:16 +02:00
|
|
|
if message is not None:
|
2017-01-18 23:19:18 +01:00
|
|
|
if message_realm is None:
|
|
|
|
message_realm = message.get_realm()
|
2017-01-22 06:29:11 +01:00
|
|
|
if message_realm is None:
|
|
|
|
realm_filters_key = DEFAULT_BUGDOWN_KEY
|
|
|
|
else:
|
|
|
|
realm_filters_key = message_realm.id
|
|
|
|
|
2019-01-29 21:06:27 +01:00
|
|
|
if message and hasattr(message, 'id') and message.id:
|
|
|
|
logging_message_id = 'id# ' + str(message.id)
|
|
|
|
else:
|
|
|
|
logging_message_id = 'unknown'
|
|
|
|
|
2018-11-02 02:07:43 +01:00
|
|
|
if message is not None and message_realm is not None:
|
|
|
|
if message_realm.is_zephyr_mirror_realm:
|
|
|
|
if message.sending_client.name == "zephyr_mirror":
|
|
|
|
# Use slightly customized Markdown processor for content
|
|
|
|
# delivered via zephyr_mirror
|
|
|
|
realm_filters_key = ZEPHYR_MIRROR_BUGDOWN_KEY
|
2017-01-22 06:29:11 +01:00
|
|
|
|
2017-11-11 15:30:19 +01:00
|
|
|
maybe_update_markdown_engines(realm_filters_key, email_gateway)
|
2017-11-03 12:13:17 +01:00
|
|
|
md_engine_key = (realm_filters_key, email_gateway)
|
2012-11-20 20:15:55 +01:00
|
|
|
|
2017-11-03 12:13:17 +01:00
|
|
|
    if md_engine_key in md_engines:
        _md_engine = md_engines[md_engine_key]
    else:
        if DEFAULT_BUGDOWN_KEY not in md_engines:
            maybe_update_markdown_engines(realm_filters_key=None, email_gateway=False)

        _md_engine = md_engines[(DEFAULT_BUGDOWN_KEY, email_gateway)]
    # Reset the parser; otherwise it will get slower over time.
    _md_engine.reset()

    # Filters such as UserMentionPattern need a message.
    _md_engine.zulip_message = message
    _md_engine.zulip_realm = message_realm
    _md_engine.zulip_db_data = None  # for now

    # Pre-fetch data from the DB that is used in the bugdown thread
    if message is not None:
        assert message_realm is not None  # ensured above if message is not None
        if possible_words is None:
            possible_words = set()  # Set[str]

        # Here we fetch the data structures needed to render
        # mentions/avatars/stream mentions from the database, but only
        # if there is syntax in the message that might use them, since
        # the fetches are somewhat expensive and these types of syntax
        # are uncommon enough that it's a useful optimization.

        if mention_data is None:
            mention_data = MentionData(message_realm.id, content)

        emails = possible_avatar_emails(content)
        email_info = get_email_info(message_realm.id, emails)

        stream_names = possible_linked_stream_names(content)
        stream_name_info = get_stream_name_info(message_realm, stream_names)

        if content_has_emoji_syntax(content):
            active_realm_emoji = message_realm.get_active_emoji()
        else:
            active_realm_emoji = dict()

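        # Hand everything fetched above to the rendering extensions via
        # zulip_db_data, so patterns such as mention, stream, and emoji
        # lookups don't need to issue their own queries while the message
        # renders.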
        _md_engine.zulip_db_data = {
            'possible_words': possible_words,
            'email_info': email_info,
            'mention_data': mention_data,
            'active_realm_emoji': active_realm_emoji,
            'realm_uri': message_realm.uri,
            'sent_by_bot': sent_by_bot,
            'stream_names': stream_name_info,
            'translate_emoticons': translate_emoticons,
        }

    try:
        # Spend at most 5 seconds rendering; this protects the backend
        # from being overloaded by bugs (e.g. markdown logic that is
        # extremely inefficient in corner cases) as well as user
        # errors (e.g. a realm filter that makes some syntax
        # infinite-loop).
        rendered_content = timeout(5, _md_engine.convert, content)

        # Throw an exception if the content is huge; this protects the
        # rest of the codebase from any bugs where we end up rendering
        # something huge.
        if len(rendered_content) > MAX_MESSAGE_LENGTH * 10:
            raise BugdownRenderingException('Rendered content exceeds %s characters (message %s)' %
                                            (MAX_MESSAGE_LENGTH * 10, logging_message_id))
        return rendered_content
    except Exception:
        cleaned = privacy_clean_markdown(content)
        # NOTE: Don't change this message without also changing the
        # logic in logging_handlers.py or we can create recursive
        # exceptions.
        exception_message = ('Exception in Markdown parser: %sInput (sanitized) was: %s\n (message %s)'
                             % (traceback.format_exc(), cleaned, logging_message_id))
        bugdown_logger.exception(exception_message)

        raise BugdownRenderingException()
    finally:
        # These next three lines are slightly paranoid, since
        # we always set these right before actually using the
        # engine, but better safe than sorry.
        _md_engine.zulip_message = None
        _md_engine.zulip_realm = None
        _md_engine.zulip_db_data = None
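
# Per-process counters for Markdown rendering; convert() below brackets each
# do_convert() call with bugdown_stats_start()/bugdown_stats_finish() so that
# callers can report cumulative rendering time and request counts.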
bugdown_time_start = 0.0
bugdown_total_time = 0.0
bugdown_total_requests = 0

def get_bugdown_time() -> float:
    return bugdown_total_time

def get_bugdown_requests() -> int:
    return bugdown_total_requests

def bugdown_stats_start() -> None:
    global bugdown_time_start
    bugdown_time_start = time.time()

def bugdown_stats_finish() -> None:
    global bugdown_total_time
    global bugdown_total_requests
    global bugdown_time_start
    bugdown_total_requests += 1
    bugdown_total_time += (time.time() - bugdown_time_start)

def convert(content: str,
            message: Optional[Message]=None,
            message_realm: Optional[Realm]=None,
            possible_words: Optional[Set[str]]=None,
            sent_by_bot: Optional[bool]=False,
            translate_emoticons: Optional[bool]=False,
            mention_data: Optional[MentionData]=None,
            email_gateway: Optional[bool]=False) -> str:
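    """Thin wrapper around do_convert() that records rendering time and
    request counts in the module-level bugdown stats counters above."""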
    bugdown_stats_start()
    ret = do_convert(content, message, message_realm,
                     possible_words, sent_by_bot, translate_emoticons,
                     mention_data, email_gateway)
    bugdown_stats_finish()
    return ret
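
# Illustrative usage sketch (hypothetical, for orientation only).  Rendering
# standalone Markdown, e.g. for documentation pages or tests, passes just
# `content`, which takes the DEFAULT_BUGDOWN_KEY path in do_convert():
#
#     rendered = convert('Hello **Zulip**!')
#     # rendered is HTML along the lines of '<p>Hello <strong>Zulip</strong>!</p>'
#
# Message rendering additionally passes `message` and `message_realm` so that
# realm filters, mentions, and stream links resolve against the right realm.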