zulip/zerver/lib/url_preview/preview.py

import re
from typing import Any, Callable, Dict, Optional
from typing.re import Match

import magic
import requests
from django.conf import settings
from django.utils.encoding import smart_text

from version import ZULIP_VERSION
from zerver.lib.cache import cache_with_key, get_cache_with_key, preview_url_cache_key
from zerver.lib.pysa import mark_sanitized
from zerver.lib.url_preview.oembed import get_oembed_data
from zerver.lib.url_preview.parsers import GenericParser, OpenGraphParser

# FIXME: Is the database cache the right production backend, or should this be
# memcached? Also unresolved: what happens when a site's opengraph data changes?
# Development uses the in-memory cache, to make it easy to develop this code.
CACHE_NAME = "in-memory" if settings.DEVELOPMENT else "database"
# Pattern used to decide whether a string is a link we may preview.
# Based on django.core.validators.URLValidator, with ftp support removed.
# The pattern is anchored at both ends and accepts, in order: an http/https
# scheme, then either a dotted domain name or four dot-separated groups of
# 1-3 digits (the digit groups are not range-checked), an optional port, and
# an optional path or query made of non-whitespace characters.
# re.IGNORECASE is what lets the upper-case-only character classes match
# lower-case input as well.
link_regex = re.compile(
    r'^(?:http)s?://'  # http:// or https://
    r'(?:(?:[A-Z0-9](?:[A-Z0-9-]{0,61}[A-Z0-9])?\.)+(?:[A-Z]{2,6}\.?|[A-Z0-9-]{2,}\.?)|'  # domain...
    r'\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})'  # ...or ip
    r'(?::\d+)?'  # optional port
    r'(?:/?|[/?]\S+)$', re.IGNORECASE)

# Identify as a Chrome build, since some sites refuse to work on old browsers.
# The Zulip version and the server's external host are embedded in the string.
ZULIP_URL_PREVIEW_USER_AGENT = (
    'Mozilla/5.0 AppleWebKit/537.36 '
    '(KHTML, like Gecko; compatible; ZulipURLPreview/{version}; +{external_host}) '
    'Chrome/81.0.4044.113 Safari/537.36'
).format(external_host=settings.EXTERNAL_HOST, version=ZULIP_VERSION)

# FIXME: pyoembed uses neither this timeout nor this header when it tries to
# autodiscover!
TIMEOUT = 15
HEADERS = {'User-Agent': ZULIP_URL_PREVIEW_USER_AGENT}


def is_link(url: str) -> Optional[Match[str]]:
    """Match ``url`` against ``link_regex``.

    The input is first passed through ``smart_text``. Returns the match
    object when the whole string matches the pattern and ``None`` otherwise,
    so the result can be used directly in a boolean test.
    """
    return link_regex.match(smart_text(url))

def guess_mimetype_from_content(response: requests.Response) -> str:
    """Guess a response's MIME type by sniffing the start of its body.

    Only the first chunk yielded by ``response.iter_content(1000)`` is
    inspected; if the body yields nothing, an empty buffer is sniffed instead.
    The guess comes from the ``magic`` library in MIME mode.
    """
    sniffer = magic.Magic(mime=True)
    # next() with a default stands in for catching StopIteration on an empty body.
    first_chunk = next(response.iter_content(1000), '')
    return sniffer.from_buffer(first_chunk)

def valid_content_type(url: str) -> bool:
    """Return whether ``url`` appears to serve an HTML page.

    Issues a streaming GET with the module's ``HEADERS`` and ``TIMEOUT``.
    Returns ``False`` if the request itself raises a ``requests``
    exception or the response status is not OK. Otherwise the
    ``content-type`` header is consulted; when it is missing or claims
    ``text/html``, the claim is checked by sniffing the start of the body.

    Exceptions raised while reading the body for sniffing are not caught
    here and propagate to the caller.
    """
    try:
        response = requests.get(url, stream=True, headers=HEADERS, timeout=TIMEOUT)
    except requests.RequestException:
        return False

    # With stream=True the connection stays open until the body is fully
    # consumed or the response is closed. We read at most one chunk, so close
    # the response explicitly on every exit path.
    with response:
        if not response.ok:
            return False

        content_type = response.headers.get('content-type')
        # Be accommodating of bad servers: assume content may be html if no content-type header
        if not content_type or content_type.startswith('text/html'):
            # Verify that the content is actually HTML if the server claims it is
            content_type = guess_mimetype_from_content(response)
        return content_type.startswith('text/html')

def catch_network_errors(func: Callable[..., Any]) -> Callable[..., Any]:
    """Decorator that converts ``requests`` network failures into ``None``.

    The returned wrapper forwards all positional and keyword arguments to
    ``func`` and returns its result. If ``func`` raises
    ``requests.exceptions.RequestException`` (or a subclass), the exception
    is swallowed and ``None`` is returned; any other exception propagates.
    """
    # Imported locally so the decorator is self-contained.
    from functools import wraps

    # Preserve the wrapped function's name, docstring and other metadata.
    @wraps(func)
    def wrapper(*args: Any, **kwargs: Any) -> Any:
        try:
            return func(*args, **kwargs)
        except requests.exceptions.RequestException:
            # Deliberate best-effort: a network failure means "no result".
            return None
    return wrapper

@catch_network_errors
@cache_with_key(preview_url_cache_key, cache_name=CACHE_NAME, with_statsd_key="urlpreview_data")
def get_link_embed_data(url: str,
                        maxwidth: Optional[int]=640,
                        maxheight: Optional[int]=480) -> Optional[Dict[str, Any]]:
    """Collect preview data (title, description, image, oEmbed) for ``url``.

    Args:
        url: The link to preview.
        maxwidth: Forwarded to ``get_oembed_data``.
        maxheight: Forwarded to ``get_oembed_data``.

    Returns:
        ``None`` if ``url`` does not match ``link_regex`` or does not look
        like an HTML page. Otherwise a dict: the oEmbed data as-is when it
        has a truthy ``'oembed'`` entry, else the oEmbed data (possibly
        empty) with missing ``title``/``description``/``image`` keys filled
        in from Open Graph tags first and generic page metadata second.

    ``requests`` network errors raised anywhere in the body are turned into
    a ``None`` result by the outer ``catch_network_errors`` decorator.
    NOTE(review): results presumably go through ``cache_with_key`` under
    ``preview_url_cache_key`` -- confirm the caching semantics in
    zerver.lib.cache.
    """
    if not is_link(url):
        return None

    if not valid_content_type(url):
        return None

    # We are using two different mechanisms to get the embed data
    # 1. Use OEmbed data, if found, for photo and video "type" sites
    # 2. Otherwise, use a combination of Open Graph tags and Meta tags
    data = get_oembed_data(url, maxwidth=maxwidth, maxheight=maxheight) or {}
    if data.get('oembed'):
        return data

    response = requests.get(mark_sanitized(url), stream=True, headers=HEADERS, timeout=TIMEOUT)
    # The request is streamed, so close the response on every path; on the
    # non-OK path the body is never read and nothing else would release it.
    with response:
        if not response.ok:
            return data
        page_html = response.text

    # Sources in priority order; a key already set (by oEmbed or an earlier
    # source) is never overwritten. A parser returning nothing counts as empty.
    parsed_sources = (
        OpenGraphParser(page_html).extract_data() or {},
        GenericParser(page_html).extract_data() or {},
    )
    for parsed in parsed_sources:
        # Only copy the explicit whitelist of generic properties.
        for key in ['title', 'description', 'image']:
            if not data.get(key) and parsed.get(key):
                data[key] = parsed[key]
    return data

@get_cache_with_key(preview_url_cache_key, cache_name=CACHE_NAME)
def link_embed_data_from_cache(url: str, maxwidth: Optional[int]=640, maxheight: Optional[int]=480) -> Any:
    """Stub whose behavior comes entirely from ``get_cache_with_key``.

    The function body itself does nothing and yields ``None``.
    NOTE(review): presumably the decorator answers from the cache using
    ``preview_url_cache_key`` without calling this body on a hit -- confirm
    against zerver.lib.cache.
    """
    return None
Add oembed/Open Graph/Meta tags data retrieval from inline links. This change adds support for displaying inline open graph previews for links posted into Zulip. It is designed to interact correctly with message editing. This adds the new settings.INLINE_URL_EMBED_PREVIEW setting to control whether this feature is enabled. By default, this setting is currently disabled, so that we can burn it in for a bit before it impacts users more broadly. Eventually, we may want to make this manageable via a (set of?) per-realm settings. E.g. I can imagine a realm wanting to be able to enable/disable it for certain URLs. 2016-10-27 12:06:44 +02:00			`import re`
python: Sort imports with isort. Fixes #2665. Regenerated by tabbott with `lint --fix` after a rebase and change in parameters. Note from tabbott: In a few cases, this converts technical debt in the form of unsorted imports into different technical debt in the form of our largest files having very long, ugly import sequences at the start. I expect this change will increase pressure for us to split those files, which isn't a bad thing. Signed-off-by: Anders Kaseorg <anders@zulip.com> 2020-06-11 00:54:34 +02:00			`from typing import Any, Callable, Dict, Optional`
			`from typing.re import Match`
url preview: Cleanup import ordering. 2019-05-02 17:13:20 +02:00
python: Sort imports with isort. Fixes #2665. Regenerated by tabbott with `lint --fix` after a rebase and change in parameters. Note from tabbott: In a few cases, this converts technical debt in the form of unsorted imports into different technical debt in the form of our largest files having very long, ugly import sequences at the start. I expect this change will increase pressure for us to split those files, which isn't a bad thing. Signed-off-by: Anders Kaseorg <anders@zulip.com> 2020-06-11 00:54:34 +02:00			`import magic`
			`import requests`
url preview: Use in-memory caching in dev environment. 2019-05-02 17:14:08 +02:00			`from django.conf import settings`
url preview: Cleanup import ordering. 2019-05-02 17:13:20 +02:00			`from django.utils.encoding import smart_text`

url preview: Set a custom user agent for requests. Some sites seem to block the default user agent of the requests library. Using a custom user agent lets us show previews for some of these sites. 2019-05-05 20:56:54 +02:00			`from version import ZULIP_VERSION`
preview: Hash cache keys for preview urls. We don't want really long urls to lead to truncated keys, or we could theoretically have two different urls get mixed up previews. Also, this suppresses warnings about exceeding the 250 char limit. Finally, this gives the key a proper prefix. 2018-10-14 14:41:15 +02:00			`from zerver.lib.cache import cache_with_key, get_cache_with_key, preview_url_cache_key`
pysa: Introduce sanitizers, models, and inline marking safe. This commit adds three `.pysa` model files: `false_positives.pysa` for ruling out false positive flows with `Sanitize` annotations, `req_lib.pysa` for educating pysa about Zulip's `REQ()` pattern for extracting user input, and `redirects.pysa` for capturing the risk of open redirects within Zulip code. Additionally, this commit introduces `mark_sanitized`, an identity function which can be used to selectively clear taint in cases where `Sanitize` models will not work. This commit also puts `mark_sanitized` to work removing known false positive flows. 2019-12-20 00:00:45 +01:00			`from zerver.lib.pysa import mark_sanitized`
Add oembed/Open Graph/Meta tags data retrieval from inline links. This change adds support for displaying inline open graph previews for links posted into Zulip. It is designed to interact correctly with message editing. This adds the new settings.INLINE_URL_EMBED_PREVIEW setting to control whether this feature is enabled. By default, this setting is currently disabled, so that we can burn it in for a bit before it impacts users more broadly. Eventually, we may want to make this manageable via a (set of?) per-realm settings. E.g. I can imagine a realm wanting to be able to enable/disable it for certain URLs. 2016-10-27 12:06:44 +02:00			`from zerver.lib.url_preview.oembed import get_oembed_data`
python: Sort imports with isort. Fixes #2665. Regenerated by tabbott with `lint --fix` after a rebase and change in parameters. Note from tabbott: In a few cases, this converts technical debt in the form of unsorted imports into different technical debt in the form of our largest files having very long, ugly import sequences at the start. I expect this change will increase pressure for us to split those files, which isn't a bad thing. Signed-off-by: Anders Kaseorg <anders@zulip.com> 2020-06-11 00:54:34 +02:00			`from zerver.lib.url_preview.parsers import GenericParser, OpenGraphParser`
Add oembed/Open Graph/Meta tags data retrieval from inline links. This change adds support for displaying inline open graph previews for links posted into Zulip. It is designed to interact correctly with message editing. This adds the new settings.INLINE_URL_EMBED_PREVIEW setting to control whether this feature is enabled. By default, this setting is currently disabled, so that we can burn it in for a bit before it impacts users more broadly. Eventually, we may want to make this manageable via a (set of?) per-realm settings. E.g. I can imagine a realm wanting to be able to enable/disable it for certain URLs. 2016-10-27 12:06:44 +02:00
url preview: Use in-memory caching in dev environment. 2019-05-02 17:14:08 +02:00			`# FIXME: Should we use a database cache or a memcached in production? What if`
			`# opengraph data is changed for a site?`
			`# Use an in-memory cache for development, to make it easy to develop this code`
			`CACHE_NAME = "database" if not settings.DEVELOPMENT else "in-memory"`
Add oembed/Open Graph/Meta tags data retrieval from inline links. This change adds support for displaying inline open graph previews for links posted into Zulip. It is designed to interact correctly with message editing. This adds the new settings.INLINE_URL_EMBED_PREVIEW setting to control whether this feature is enabled. By default, this setting is currently disabled, so that we can burn it in for a bit before it impacts users more broadly. Eventually, we may want to make this manageable via a (set of?) per-realm settings. E.g. I can imagine a realm wanting to be able to enable/disable it for certain URLs. 2016-10-27 12:06:44 +02:00			`# Based on django.core.validators.URLValidator, with ftp support removed.`
			`link_regex = re.compile(`
			`r'^(?:http)s?://' # http:// or https://`
			`r'(?:(?:[A-Z0-9](?:[A-Z0-9-]{0,61}[A-Z0-9])?\.)+(?:[A-Z]{2,6}\.?|[A-Z0-9-]{2,}\.?)|' # domain...`
			`r'\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})' # ...or ip`
			`r'(?::\d+)?' # optional port`
			`r'(?:/?|[/?]\S+)$', re.IGNORECASE)`
url preview: Use Chrome's user agent instead of a Zulip one. Some sites don't render correctly unless you are one of the latest browsers. YouTube Music, for instance, changes the page title to "Your browser is deprecated, please upgrade.", which makes our URL previews look bad. 2020-04-23 21:26:31 +02:00
			`# Use Chrome User-Agent, since some sites refuse to work on old browsers`
			`ZULIP_URL_PREVIEW_USER_AGENT = (`
			`'Mozilla/5.0 AppleWebKit/537.36 (KHTML, like Gecko; compatible; ZulipURLPreview/{version}; '`
			`'+{external_host}) Chrome/81.0.4044.113 Safari/537.36'`
			`).format(version=ZULIP_VERSION, external_host=settings.EXTERNAL_HOST)`

url preview: Timeout requests after 15 seconds. 2019-05-06 04:49:47 +02:00			`# FIXME: This header and timeout are not used by pyoembed, when trying to autodiscover!`
url preview: Use Chrome's user agent instead of a Zulip one. Some sites don't render correctly unless you are one of the latest browsers. YouTube Music, for instance, changes the page title to "Your browser is deprecated, please upgrade.", which makes our URL previews look bad. 2020-04-23 21:26:31 +02:00			`HEADERS = {'User-Agent': ZULIP_URL_PREVIEW_USER_AGENT}`
url preview: Timeout requests after 15 seconds. 2019-05-06 04:49:47 +02:00			`TIMEOUT = 15`
Add oembed/Open Graph/Meta tags data retrieval from inline links. This change adds support for displaying inline open graph previews for links posted into Zulip. It is designed to interact correctly with message editing. This adds the new settings.INLINE_URL_EMBED_PREVIEW setting to control whether this feature is enabled. By default, this setting is currently disabled, so that we can burn it in for a bit before it impacts users more broadly. Eventually, we may want to make this manageable via a (set of?) per-realm settings. E.g. I can imagine a realm wanting to be able to enable/disable it for certain URLs. 2016-10-27 12:06:44 +02:00

zerver/lib: Change use of typing.Text to str. 2018-05-10 19:13:36 +02:00			`def is_link(url: str) -> Match[str]:`
preview.py: Fix error raised on uploading file with unicode filename. 2017-06-16 00:23:35 +02:00			`return link_regex.match(smart_text(url))`
Add oembed/Open Graph/Meta tags data retrieval from inline links. This change adds support for displaying inline open graph previews for links posted into Zulip. It is designed to interact correctly with message editing. This adds the new settings.INLINE_URL_EMBED_PREVIEW setting to control whether this feature is enabled. By default, this setting is currently disabled, so that we can burn it in for a bit before it impacts users more broadly. Eventually, we may want to make this manageable via a (set of?) per-realm settings. E.g. I can imagine a realm wanting to be able to enable/disable it for certain URLs. 2016-10-27 12:06:44 +02:00
url preview: Confirm content-type before trying to show previews. Currently, we only show previews for URLs which are HTML pages, which could contain other media. We don't show previews for links to non-HTML pages, like pdf documents or audio/video files. To verify that the URL posted is an HTML page, we verify the content-type of the page, either using server headers or by sniffing the content. Closes #8358 2019-05-04 17:54:18 +02:00			`def guess_mimetype_from_content(response: requests.Response) -> str:`
			`mime_magic = magic.Magic(mime=True)`
			`try:`
			`content = next(response.iter_content(1000))`
			`except StopIteration:`
			`content = ''`
			`return mime_magic.from_buffer(content)`

			`def valid_content_type(url: str) -> bool:`
			`try:`
url preview: Timeout requests after 15 seconds. 2019-05-06 04:49:47 +02:00			`response = requests.get(url, stream=True, headers=HEADERS, timeout=TIMEOUT)`
url preview: Confirm content-type before trying to show previews. Currently, we only show previews for URLs which are HTML pages, which could contain other media. We don't show previews for links to non-HTML pages, like pdf documents or audio/video files. To verify that the URL posted is an HTML page, we verify the content-type of the page, either using server headers or by sniffing the content. Closes #8358 2019-05-04 17:54:18 +02:00			`except requests.RequestException:`
			`return False`

			`if not response.ok:`
			`return False`

			`content_type = response.headers.get('content-type')`
			`# Be accommodating of bad servers: assume content may be html if no content-type header`
			`if not content_type or content_type.startswith('text/html'):`
			`# Verify that the content is actually HTML if the server claims it is`
			`content_type = guess_mimetype_from_content(response)`
			`return content_type.startswith('text/html')`

url preview: Don't cache embed data when fetch has network errors. 2019-05-10 14:29:33 +02:00			`def catch_network_errors(func: Callable[..., Any]) -> Callable[..., Any]:`
			`def wrapper(*args: Any, **kwargs: Any) -> Any:`
			`try:`
			`return func(*args, **kwargs)`
			`except requests.exceptions.RequestException:`
			`pass`
			`return wrapper`

			`@catch_network_errors`
preview: Hash cache keys for preview urls. We don't want really long urls to lead to truncated keys, or we could theoretically have two different urls get mixed up previews. Also, this suppresses warnings about exceeding the 250 char limit. Finally, this gives the key a proper prefix. 2018-10-14 14:41:15 +02:00			`@cache_with_key(preview_url_cache_key, cache_name=CACHE_NAME, with_statsd_key="urlpreview_data")`
zerver/lib: Change use of typing.Text to str. 2018-05-10 19:13:36 +02:00			`def get_link_embed_data(url: str,`
zerver/lib: Use python 3 syntax for typing. Extracted from a larger commit by tabbott because these changes will not create significant merge conflicts. 2017-11-05 11:15:10 +01:00			`maxwidth: Optional[int]=640,`
mypy: Improve typing of oembed data, to Dict[str, Any]. 2018-06-16 23:00:17 +02:00			`maxheight: Optional[int]=480) -> Optional[Dict[str, Any]]:`
Add oembed/Open Graph/Meta tags data retrieval from inline links. This change adds support for displaying inline open graph previews for links posted into Zulip. It is designed to interact correctly with message editing. This adds the new settings.INLINE_URL_EMBED_PREVIEW setting to control whether this feature is enabled. By default, this setting is currently disabled, so that we can burn it in for a bit before it impacts users more broadly. Eventually, we may want to make this manageable via a (set of?) per-realm settings. E.g. I can imagine a realm wanting to be able to enable/disable it for certain URLs. 2016-10-27 12:06:44 +02:00			`if not is_link(url):`
			`return None`
url preview: Confirm content-type before trying to show previews. Currently, we only show previews for URLs which are HTML pages, which could contain other media. We don't show previews for links to non-HTML pages, like pdf documents or audio/video files. To verify that the URL posted is an HTML page, we verify the content-type of the page, either using server headers or by sniffing the content. Closes #8358 2019-05-04 17:54:18 +02:00
			`if not valid_content_type(url):`
			`return None`

url preview: Use oEmbed html for videos. Ensure that the html is safe, before using it. The html is considered if it is in an iframe with a http/https src, based on the recommendations here: https://oembed.com/#section3 We directly embed the `iframe` html into the lightbox overlay. 2019-05-02 18:58:39 +02:00			`# We are using two different mechanisms to get the embed data`
			`# 1. Use OEmbed data, if found, for photo and video "type" sites`
			`# 2. Otherwise, use a combination of Open Graph tags and Meta tags`
url preview: Don't cache embed data when fetch has network errors. 2019-05-10 14:29:33 +02:00			`data = get_oembed_data(url, maxwidth=maxwidth, maxheight=maxheight) or {}`
url preview: Show inline images as previews for oEmbed photo pages. 2019-05-26 06:27:01 +02:00			`if data.get('oembed'):`
			`return data`
url_preview: Fix parsing of open graph tags. Our open graph parser logic sloppily mixed data obtained by parsing open graph properties with trusted data set by our oembed parser. We fix this by consistently using our explicit whitelist of generic properties (image, title, and description) in both places where we interact with open graph properties. The fixes are redundant with each other, but doing both helps in making the intent of the code clearer. This issue fixed here was originally reported as an XSS vulnerability in the upcoming Inline URL Previews feature found by Graham Bleaney and Ibrahim Mohamed using Pysa. The recent Oembed changes close that vulnerability, but this change is still worth doing to make the implementation do what it looks like it does. 2019-12-12 02:10:50 +01:00
pysa: Introduce sanitizers, models, and inline marking safe. This commit adds three `.pysa` model files: `false_positives.pysa` for ruling out false positive flows with `Sanitize` annotations, `req_lib.pysa` for educating pysa about Zulip's `REQ()` pattern for extracting user input, and `redirects.pysa` for capturing the risk of open redirects within Zulip code. Additionally, this commit introduces `mark_sanitized`, an identity function which can be used to selectively clear taint in cases where `Sanitize` models will not work. This commit also puts `mark_sanitized` to work removing known false positive flows. 2019-12-20 00:00:45 +01:00			`response = requests.get(mark_sanitized(url), stream=True, headers=HEADERS, timeout=TIMEOUT)`
Add oembed/Open Graph/Meta tags data retrieval from inline links. This change adds support for displaying inline open graph previews for links posted into Zulip. It is designed to interact correctly with message editing. This adds the new settings.INLINE_URL_EMBED_PREVIEW setting to control whether this feature is enabled. By default, this setting is currently disabled, so that we can burn it in for a bit before it impacts users more broadly. Eventually, we may want to make this manageable via a (set of?) per-realm settings. E.g. I can imagine a realm wanting to be able to enable/disable it for certain URLs. 2016-10-27 12:06:44 +02:00			`if response.ok:`
			`og_data = OpenGraphParser(response.text).extract_data()`
url_preview: Fix parsing of open graph tags. Our open graph parser logic sloppily mixed data obtained by parsing open graph properties with trusted data set by our oembed parser. We fix this by consistently using our explicit whitelist of generic properties (image, title, and description) in both places where we interact with open graph properties. The fixes are redundant with each other, but doing both helps in making the intent of the code clearer. This issue fixed here was originally reported as an XSS vulnerability in the upcoming Inline URL Previews feature found by Graham Bleaney and Ibrahim Mohamed using Pysa. The recent Oembed changes close that vulnerability, but this change is still worth doing to make the implementation do what it looks like it does. 2019-12-12 02:10:50 +01:00			`for key in ['title', 'description', 'image']:`
			`if not data.get(key) and og_data.get(key):`
			`data[key] = og_data[key]`

Add oembed/Open Graph/Meta tags data retrieval from inline links. This change adds support for displaying inline open graph previews for links posted into Zulip. It is designed to interact correctly with message editing. This adds the new settings.INLINE_URL_EMBED_PREVIEW setting to control whether this feature is enabled. By default, this setting is currently disabled, so that we can burn it in for a bit before it impacts users more broadly. Eventually, we may want to make this manageable via a (set of?) per-realm settings. E.g. I can imagine a realm wanting to be able to enable/disable it for certain URLs. 2016-10-27 12:06:44 +02:00			`generic_data = GenericParser(response.text).extract_data() or {}`
			`for key in ['title', 'description', 'image']:`
			`if not data.get(key) and generic_data.get(key):`
			`data[key] = generic_data[key]`
			`return data`

preview: Hash cache keys for preview urls. We don't want really long urls to lead to truncated keys, or we could theoretically have two different urls get mixed up previews. Also, this suppresses warnings about exceeding the 250 char limit. Finally, this gives the key a proper prefix. 2018-10-14 14:41:15 +02:00			`@get_cache_with_key(preview_url_cache_key, cache_name=CACHE_NAME)`
zerver/lib: Change use of typing.Text to str. 2018-05-10 19:13:36 +02:00			`def link_embed_data_from_cache(url: str, maxwidth: Optional[int]=640, maxheight: Optional[int]=480) -> Any:`
Add oembed/Open Graph/Meta tags data retrieval from inline links. This change adds support for displaying inline open graph previews for links posted into Zulip. It is designed to interact correctly with message editing. This adds the new settings.INLINE_URL_EMBED_PREVIEW setting to control whether this feature is enabled. By default, this setting is currently disabled, so that we can burn it in for a bit before it impacts users more broadly. Eventually, we may want to make this manageable via a (set of?) per-realm settings. E.g. I can imagine a realm wanting to be able to enable/disable it for certain URLs. 2016-10-27 12:06:44 +02:00			`return`